Implement job metadata extraction and task management features. Add validation for frame range limits, enhance job and task data structures, and introduce new API endpoints for metadata and task retrieval. Update client-side components to handle metadata extraction and display task statuses. Improve error handling in API responses.
This commit is contained in:
@@ -185,6 +185,117 @@ func (s *Server) handleUpdateTaskProgress(w http.ResponseWriter, r *http.Request
|
||||
s.respondJSON(w, http.StatusOK, map[string]string{"message": "Progress updated"})
|
||||
}
|
||||
|
||||
// handleUpdateTaskStep handles step start/complete events from runners
|
||||
func (s *Server) handleUpdateTaskStep(w http.ResponseWriter, r *http.Request) {
|
||||
// Get runner ID from context (set by runnerAuthMiddleware)
|
||||
runnerID, ok := r.Context().Value("runner_id").(int64)
|
||||
if !ok {
|
||||
s.respondError(w, http.StatusUnauthorized, "runner_id not found in context")
|
||||
return
|
||||
}
|
||||
|
||||
taskID, err := parseID(r, "id")
|
||||
if err != nil {
|
||||
s.respondError(w, http.StatusBadRequest, err.Error())
|
||||
return
|
||||
}
|
||||
|
||||
var req struct {
|
||||
StepName string `json:"step_name"`
|
||||
Status string `json:"status"` // "pending", "running", "completed", "failed", "skipped"
|
||||
DurationMs *int `json:"duration_ms,omitempty"`
|
||||
ErrorMessage string `json:"error_message,omitempty"`
|
||||
}
|
||||
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
|
||||
s.respondError(w, http.StatusBadRequest, "Invalid request body")
|
||||
return
|
||||
}
|
||||
|
||||
// Verify task belongs to runner
|
||||
var taskRunnerID sql.NullInt64
|
||||
err = s.db.QueryRow("SELECT runner_id FROM tasks WHERE id = ?", taskID).Scan(&taskRunnerID)
|
||||
if err == sql.ErrNoRows {
|
||||
s.respondError(w, http.StatusNotFound, "Task not found")
|
||||
return
|
||||
}
|
||||
if err != nil {
|
||||
s.respondError(w, http.StatusInternalServerError, fmt.Sprintf("Failed to verify task: %v", err))
|
||||
return
|
||||
}
|
||||
if !taskRunnerID.Valid || taskRunnerID.Int64 != runnerID {
|
||||
s.respondError(w, http.StatusForbidden, "Task does not belong to this runner")
|
||||
return
|
||||
}
|
||||
|
||||
now := time.Now()
|
||||
var stepID int64
|
||||
|
||||
// Check if step already exists
|
||||
var existingStepID sql.NullInt64
|
||||
err = s.db.QueryRow(
|
||||
`SELECT id FROM task_steps WHERE task_id = ? AND step_name = ?`,
|
||||
taskID, req.StepName,
|
||||
).Scan(&existingStepID)
|
||||
|
||||
if err == sql.ErrNoRows || !existingStepID.Valid {
|
||||
// Create new step
|
||||
var startedAt *time.Time
|
||||
var completedAt *time.Time
|
||||
if req.Status == string(types.StepStatusRunning) || req.Status == string(types.StepStatusCompleted) || req.Status == string(types.StepStatusFailed) {
|
||||
startedAt = &now
|
||||
}
|
||||
if req.Status == string(types.StepStatusCompleted) || req.Status == string(types.StepStatusFailed) {
|
||||
completedAt = &now
|
||||
}
|
||||
|
||||
result, err := s.db.Exec(
|
||||
`INSERT INTO task_steps (task_id, step_name, status, started_at, completed_at, duration_ms, error_message)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?)`,
|
||||
taskID, req.StepName, req.Status, startedAt, completedAt, req.DurationMs, req.ErrorMessage,
|
||||
)
|
||||
if err != nil {
|
||||
s.respondError(w, http.StatusInternalServerError, fmt.Sprintf("Failed to create step: %v", err))
|
||||
return
|
||||
}
|
||||
stepID, _ = result.LastInsertId()
|
||||
} else {
|
||||
// Update existing step
|
||||
stepID = existingStepID.Int64
|
||||
var startedAt *time.Time
|
||||
var completedAt *time.Time
|
||||
|
||||
// Get existing started_at if status is running/completed/failed
|
||||
if req.Status == string(types.StepStatusRunning) || req.Status == string(types.StepStatusCompleted) || req.Status == string(types.StepStatusFailed) {
|
||||
var existingStartedAt sql.NullTime
|
||||
s.db.QueryRow(`SELECT started_at FROM task_steps WHERE id = ?`, stepID).Scan(&existingStartedAt)
|
||||
if existingStartedAt.Valid {
|
||||
startedAt = &existingStartedAt.Time
|
||||
} else {
|
||||
startedAt = &now
|
||||
}
|
||||
}
|
||||
|
||||
if req.Status == string(types.StepStatusCompleted) || req.Status == string(types.StepStatusFailed) {
|
||||
completedAt = &now
|
||||
}
|
||||
|
||||
_, err = s.db.Exec(
|
||||
`UPDATE task_steps SET status = ?, started_at = ?, completed_at = ?, duration_ms = ?, error_message = ?
|
||||
WHERE id = ?`,
|
||||
req.Status, startedAt, completedAt, req.DurationMs, req.ErrorMessage, stepID,
|
||||
)
|
||||
if err != nil {
|
||||
s.respondError(w, http.StatusInternalServerError, fmt.Sprintf("Failed to update step: %v", err))
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
s.respondJSON(w, http.StatusOK, map[string]interface{}{
|
||||
"step_id": stepID,
|
||||
"message": "Step updated successfully",
|
||||
})
|
||||
}
|
||||
|
||||
// handleDownloadFileForRunner allows runners to download job files
|
||||
func (s *Server) handleDownloadFileForRunner(w http.ResponseWriter, r *http.Request) {
|
||||
jobID, err := parseID(r, "jobId")
|
||||
@@ -396,6 +507,7 @@ type WSTaskAssignment struct {
|
||||
OutputFormat string `json:"output_format"`
|
||||
FrameStart int `json:"frame_start"`
|
||||
FrameEnd int `json:"frame_end"`
|
||||
TaskType string `json:"task_type"`
|
||||
InputFiles []string `json:"input_files"`
|
||||
}
|
||||
|
||||
@@ -633,20 +745,41 @@ func (s *Server) handleWebSocketTaskComplete(runnerID int64, taskUpdate WSTaskUp
|
||||
taskUpdate.TaskID,
|
||||
).Scan(&jobID, &frameStart, &frameEnd)
|
||||
if err == nil {
|
||||
// Count total tasks that are pending, running, completed, or failed (failed tasks are retried, so we count them)
|
||||
// Tasks in any other state (e.g. cancelled) are excluded from the total
|
||||
var totalTasks, completedTasks int
|
||||
s.db.QueryRow(`SELECT COUNT(*) FROM tasks WHERE job_id = ?`, jobID).Scan(&totalTasks)
|
||||
s.db.QueryRow(
|
||||
`SELECT COUNT(*) FROM tasks WHERE job_id = ? AND status IN (?, ?, ?, ?)`,
|
||||
jobID, types.TaskStatusPending, types.TaskStatusRunning, types.TaskStatusCompleted, types.TaskStatusFailed,
|
||||
).Scan(&totalTasks)
|
||||
s.db.QueryRow(
|
||||
`SELECT COUNT(*) FROM tasks WHERE job_id = ? AND status = ?`,
|
||||
jobID, types.TaskStatusCompleted,
|
||||
).Scan(&completedTasks)
|
||||
|
||||
progress := float64(completedTasks) / float64(totalTasks) * 100.0
|
||||
// Handle edge cases: division by zero and all tasks cancelled
|
||||
var progress float64
|
||||
if totalTasks == 0 {
|
||||
// All tasks cancelled or no tasks, set progress to 0
|
||||
progress = 0.0
|
||||
} else {
|
||||
progress = float64(completedTasks) / float64(totalTasks) * 100.0
|
||||
}
|
||||
|
||||
var jobStatus string
|
||||
var outputFormat string
|
||||
s.db.QueryRow(`SELECT output_format FROM jobs WHERE id = ?`, jobID).Scan(&outputFormat)
|
||||
|
||||
if completedTasks == totalTasks {
|
||||
// Check if all non-cancelled tasks are completed
|
||||
var pendingOrRunningTasks int
|
||||
s.db.QueryRow(
|
||||
`SELECT COUNT(*) FROM tasks
|
||||
WHERE job_id = ? AND status IN (?, ?)`,
|
||||
jobID, types.TaskStatusPending, types.TaskStatusRunning,
|
||||
).Scan(&pendingOrRunningTasks)
|
||||
|
||||
if pendingOrRunningTasks == 0 && totalTasks > 0 {
|
||||
// All tasks are either completed or failed/cancelled
|
||||
jobStatus = string(types.JobStatusCompleted)
|
||||
s.db.Exec(
|
||||
`UPDATE jobs SET status = ?, progress = ?, completed_at = ? WHERE id = ?`,
|
||||
@@ -654,7 +787,20 @@ func (s *Server) handleWebSocketTaskComplete(runnerID int64, taskUpdate WSTaskUp
|
||||
)
|
||||
|
||||
if outputFormat == "MP4" {
|
||||
go s.generateMP4Video(jobID)
|
||||
// Create a video generation task instead of calling generateMP4Video directly
|
||||
// This prevents race conditions when multiple runners complete frames simultaneously
|
||||
videoTaskTimeout := 86400 // 24 hours for video generation
|
||||
_, err := s.db.Exec(
|
||||
`INSERT INTO tasks (job_id, frame_start, frame_end, task_type, status, timeout_seconds, max_retries)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?)`,
|
||||
jobID, 0, 0, types.TaskTypeVideoGeneration, types.TaskStatusPending, videoTaskTimeout, 1,
|
||||
)
|
||||
if err != nil {
|
||||
log.Printf("Failed to create video generation task for job %d: %v", jobID, err)
|
||||
} else {
|
||||
// Try to distribute the task immediately
|
||||
go s.distributeTasksToRunners()
|
||||
}
|
||||
}
|
||||
} else {
|
||||
jobStatus = string(types.JobStatusRunning)
|
||||
@@ -712,7 +858,7 @@ func (s *Server) broadcastLogToFrontend(taskID int64, logEntry WSLogEntry) {
|
||||
func (s *Server) distributeTasksToRunners() {
|
||||
// Get all pending tasks
|
||||
rows, err := s.db.Query(
|
||||
`SELECT t.id, t.job_id, t.frame_start, t.frame_end, j.allow_parallel_runners, j.status as job_status
|
||||
`SELECT t.id, t.job_id, t.frame_start, t.frame_end, t.task_type, j.allow_parallel_runners, j.status as job_status
|
||||
FROM tasks t
|
||||
JOIN jobs j ON t.job_id = j.id
|
||||
WHERE t.status = ? AND j.status != ?
|
||||
@@ -731,6 +877,7 @@ func (s *Server) distributeTasksToRunners() {
|
||||
JobID int64
|
||||
FrameStart int
|
||||
FrameEnd int
|
||||
TaskType string
|
||||
AllowParallelRunners bool
|
||||
}
|
||||
|
||||
@@ -740,11 +887,12 @@ func (s *Server) distributeTasksToRunners() {
|
||||
JobID int64
|
||||
FrameStart int
|
||||
FrameEnd int
|
||||
TaskType string
|
||||
AllowParallelRunners bool
|
||||
}
|
||||
var allowParallel int
|
||||
var jobStatus string
|
||||
err := rows.Scan(&t.TaskID, &t.JobID, &t.FrameStart, &t.FrameEnd, &allowParallel, &jobStatus)
|
||||
err := rows.Scan(&t.TaskID, &t.JobID, &t.FrameStart, &t.FrameEnd, &t.TaskType, &allowParallel, &jobStatus)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
@@ -770,13 +918,6 @@ func (s *Server) distributeTasksToRunners() {
|
||||
|
||||
// Distribute tasks to runners
|
||||
for _, task := range pendingTasks {
|
||||
// Check if task is already assigned
|
||||
var assignedRunnerID sql.NullInt64
|
||||
err := s.db.QueryRow("SELECT runner_id FROM tasks WHERE id = ?", task.TaskID).Scan(&assignedRunnerID)
|
||||
if err == nil && assignedRunnerID.Valid {
|
||||
continue // Already assigned
|
||||
}
|
||||
|
||||
// Find available runner
|
||||
var selectedRunnerID int64
|
||||
for _, runnerID := range connectedRunners {
|
||||
@@ -812,9 +953,40 @@ func (s *Server) distributeTasksToRunners() {
|
||||
continue // No available runner
|
||||
}
|
||||
|
||||
// Assign task to runner
|
||||
// Atomically assign task to runner using UPDATE with WHERE runner_id IS NULL
|
||||
// This prevents race conditions when multiple goroutines try to assign the same task
|
||||
now := time.Now()
|
||||
result, err := s.db.Exec(
|
||||
`UPDATE tasks SET runner_id = ?, status = ?, started_at = ?
|
||||
WHERE id = ? AND runner_id IS NULL AND status = ?`,
|
||||
selectedRunnerID, types.TaskStatusRunning, now, task.TaskID, types.TaskStatusPending,
|
||||
)
|
||||
if err != nil {
|
||||
log.Printf("Failed to atomically assign task %d: %v", task.TaskID, err)
|
||||
continue
|
||||
}
|
||||
|
||||
// Check if the update actually affected a row (task was successfully assigned)
|
||||
rowsAffected, err := result.RowsAffected()
|
||||
if err != nil {
|
||||
log.Printf("Failed to get rows affected for task %d: %v", task.TaskID, err)
|
||||
continue
|
||||
}
|
||||
|
||||
if rowsAffected == 0 {
|
||||
// Task was already assigned by another goroutine, skip
|
||||
continue
|
||||
}
|
||||
|
||||
// Task was successfully assigned, send via WebSocket
|
||||
if err := s.assignTaskToRunner(selectedRunnerID, task.TaskID); err != nil {
|
||||
log.Printf("Failed to assign task %d to runner %d: %v", task.TaskID, selectedRunnerID, err)
|
||||
log.Printf("Failed to send task %d to runner %d: %v", task.TaskID, selectedRunnerID, err)
|
||||
// Rollback the assignment if WebSocket send fails
|
||||
s.db.Exec(
|
||||
`UPDATE tasks SET runner_id = NULL, status = ?, started_at = NULL
|
||||
WHERE id = ?`,
|
||||
types.TaskStatusPending, task.TaskID,
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -831,20 +1003,20 @@ func (s *Server) assignTaskToRunner(runnerID int64, taskID int64) error {
|
||||
|
||||
// Get task details
|
||||
var task WSTaskAssignment
|
||||
var jobName, outputFormat string
|
||||
var jobName, outputFormat, taskType string
|
||||
err := s.db.QueryRow(
|
||||
`SELECT t.job_id, t.frame_start, t.frame_end, j.name, j.output_format
|
||||
`SELECT t.job_id, t.frame_start, t.frame_end, t.task_type, j.name, j.output_format
|
||||
FROM tasks t JOIN jobs j ON t.job_id = j.id WHERE t.id = ?`,
|
||||
taskID,
|
||||
).Scan(&task.JobID, &task.FrameStart, &task.FrameEnd, &jobName, &outputFormat)
|
||||
).Scan(&task.JobID, &task.FrameStart, &task.FrameEnd, &taskType, &jobName, &outputFormat)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
task.TaskID = taskID
|
||||
task.JobID = task.JobID
|
||||
task.JobName = jobName
|
||||
task.OutputFormat = outputFormat
|
||||
task.TaskType = taskType
|
||||
|
||||
// Get input files
|
||||
rows, err := s.db.Query(
|
||||
@@ -861,14 +1033,15 @@ func (s *Server) assignTaskToRunner(runnerID int64, taskID int64) error {
|
||||
}
|
||||
}
|
||||
|
||||
// Assign task to runner in database
|
||||
now := time.Now()
|
||||
_, err = s.db.Exec(
|
||||
`UPDATE tasks SET runner_id = ?, status = ?, started_at = ? WHERE id = ?`,
|
||||
runnerID, types.TaskStatusRunning, now, taskID,
|
||||
)
|
||||
// Note: Task is already assigned in database by the atomic update in distributeTasksToRunners
|
||||
// We just need to verify it's still assigned to this runner
|
||||
var assignedRunnerID sql.NullInt64
|
||||
err = s.db.QueryRow("SELECT runner_id FROM tasks WHERE id = ?", taskID).Scan(&assignedRunnerID)
|
||||
if err != nil {
|
||||
return err
|
||||
return fmt.Errorf("task not found: %w", err)
|
||||
}
|
||||
if !assignedRunnerID.Valid || assignedRunnerID.Int64 != runnerID {
|
||||
return fmt.Errorf("task %d is not assigned to runner %d", taskID, runnerID)
|
||||
}
|
||||
|
||||
// Send task via WebSocket
|
||||
|
||||
Reference in New Issue
Block a user