Implement job metadata extraction and task management features. Add validation for frame range limits, enhance job and task data structures, and introduce new API endpoints for metadata and task retrieval. Update client-side components to handle metadata extraction and display task statuses. Improve error handling in API responses.

This commit is contained in:
2025-11-22 06:37:32 -06:00
parent 27a09aedd6
commit c9ade39ad9
10 changed files with 1078 additions and 88 deletions

View File

@@ -185,6 +185,117 @@ func (s *Server) handleUpdateTaskProgress(w http.ResponseWriter, r *http.Request
s.respondJSON(w, http.StatusOK, map[string]string{"message": "Progress updated"})
}
// handleUpdateTaskStep handles step start/complete events from runners.
//
// The request body is JSON with a required "step_name", a "status", and
// optional "duration_ms" and "error_message" fields. The step row for
// (task, step_name) is created on first report and updated afterwards.
//
// Timestamp rules, derived from the reported status:
//   - running, completed, failed: started_at is set (an already stored
//     started_at is preserved on update, otherwise the current time is used).
//   - completed, failed: completed_at is set to the current time.
//   - any other status: both timestamps are written as NULL.
//
// Responses:
//   - 200 with {"step_id", "message"} on success
//   - 400 for an invalid task ID, undecodable body, or empty step_name
//   - 401 when no runner ID is present in the request context
//   - 403 when the task is not assigned to the calling runner
//   - 404 when the task does not exist
//   - 500 on database failures
func (s *Server) handleUpdateTaskStep(w http.ResponseWriter, r *http.Request) {
	// Get runner ID from context (set by runnerAuthMiddleware)
	runnerID, ok := r.Context().Value("runner_id").(int64)
	if !ok {
		s.respondError(w, http.StatusUnauthorized, "runner_id not found in context")
		return
	}

	taskID, err := parseID(r, "id")
	if err != nil {
		s.respondError(w, http.StatusBadRequest, err.Error())
		return
	}

	var req struct {
		StepName     string `json:"step_name"`
		Status       string `json:"status"` // "pending", "running", "completed", "failed", "skipped"
		DurationMs   *int   `json:"duration_ms,omitempty"`
		ErrorMessage string `json:"error_message,omitempty"`
	}
	if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
		s.respondError(w, http.StatusBadRequest, "Invalid request body")
		return
	}
	// A step is identified by (task_id, step_name); an empty name cannot
	// identify anything meaningful, so reject it up front.
	if req.StepName == "" {
		s.respondError(w, http.StatusBadRequest, "step_name is required")
		return
	}

	// Verify task belongs to runner
	var taskRunnerID sql.NullInt64
	err = s.db.QueryRow("SELECT runner_id FROM tasks WHERE id = ?", taskID).Scan(&taskRunnerID)
	if err == sql.ErrNoRows {
		s.respondError(w, http.StatusNotFound, "Task not found")
		return
	}
	if err != nil {
		s.respondError(w, http.StatusInternalServerError, fmt.Sprintf("Failed to verify task: %v", err))
		return
	}
	if !taskRunnerID.Valid || taskRunnerID.Int64 != runnerID {
		s.respondError(w, http.StatusForbidden, "Task does not belong to this runner")
		return
	}

	// Work out which timestamps the reported status implies.
	var started, finished bool
	switch req.Status {
	case string(types.StepStatusRunning):
		started = true
	case string(types.StepStatusCompleted), string(types.StepStatusFailed):
		started, finished = true, true
	}

	now := time.Now()
	var startedAt, completedAt *time.Time
	if finished {
		completedAt = &now
	}

	// Check if step already exists
	var existingStepID sql.NullInt64
	err = s.db.QueryRow(
		`SELECT id FROM task_steps WHERE task_id = ? AND step_name = ?`,
		taskID, req.StepName,
	).Scan(&existingStepID)
	if err != nil && err != sql.ErrNoRows {
		// A real lookup failure must not be mistaken for "no such step",
		// otherwise a duplicate step row would be inserted.
		s.respondError(w, http.StatusInternalServerError, fmt.Sprintf("Failed to look up step: %v", err))
		return
	}

	var stepID int64
	if err == sql.ErrNoRows || !existingStepID.Valid {
		// Create new step
		if started {
			startedAt = &now
		}
		result, err := s.db.Exec(
			`INSERT INTO task_steps (task_id, step_name, status, started_at, completed_at, duration_ms, error_message)
			VALUES (?, ?, ?, ?, ?, ?, ?)`,
			taskID, req.StepName, req.Status, startedAt, completedAt, req.DurationMs, req.ErrorMessage,
		)
		if err != nil {
			s.respondError(w, http.StatusInternalServerError, fmt.Sprintf("Failed to create step: %v", err))
			return
		}
		stepID, err = result.LastInsertId()
		if err != nil {
			// The row was written; only the ID is unavailable, so report
			// the problem without failing the request.
			log.Printf("Failed to get inserted step ID for task %d step %q: %v", taskID, req.StepName, err)
		}
	} else {
		// Update existing step
		stepID = existingStepID.Int64
		if started {
			// Keep the originally recorded start time when there is one;
			// fall back to the current time otherwise.
			startedAt = &now
			var existingStartedAt sql.NullTime
			scanErr := s.db.QueryRow(`SELECT started_at FROM task_steps WHERE id = ?`, stepID).Scan(&existingStartedAt)
			if scanErr != nil {
				log.Printf("Failed to read started_at for step %d, using current time: %v", stepID, scanErr)
			} else if existingStartedAt.Valid {
				startedAt = &existingStartedAt.Time
			}
		}
		_, err = s.db.Exec(
			`UPDATE task_steps SET status = ?, started_at = ?, completed_at = ?, duration_ms = ?, error_message = ?
			WHERE id = ?`,
			req.Status, startedAt, completedAt, req.DurationMs, req.ErrorMessage, stepID,
		)
		if err != nil {
			s.respondError(w, http.StatusInternalServerError, fmt.Sprintf("Failed to update step: %v", err))
			return
		}
	}

	s.respondJSON(w, http.StatusOK, map[string]interface{}{
		"step_id": stepID,
		"message": "Step updated successfully",
	})
}
// handleDownloadFileForRunner allows runners to download job files
func (s *Server) handleDownloadFileForRunner(w http.ResponseWriter, r *http.Request) {
jobID, err := parseID(r, "jobId")
@@ -396,6 +507,7 @@ type WSTaskAssignment struct {
OutputFormat string `json:"output_format"`
FrameStart int `json:"frame_start"`
FrameEnd int `json:"frame_end"`
TaskType string `json:"task_type"`
InputFiles []string `json:"input_files"`
}
@@ -633,20 +745,41 @@ func (s *Server) handleWebSocketTaskComplete(runnerID int64, taskUpdate WSTaskUp
taskUpdate.TaskID,
).Scan(&jobID, &frameStart, &frameEnd)
if err == nil {
// Count total tasks excluding failed ones (failed tasks are retried, so we count them)
// We exclude tasks that are in a terminal failed state with max retries exceeded
var totalTasks, completedTasks int
s.db.QueryRow(`SELECT COUNT(*) FROM tasks WHERE job_id = ?`, jobID).Scan(&totalTasks)
s.db.QueryRow(
`SELECT COUNT(*) FROM tasks WHERE job_id = ? AND status IN (?, ?, ?, ?)`,
jobID, types.TaskStatusPending, types.TaskStatusRunning, types.TaskStatusCompleted, types.TaskStatusFailed,
).Scan(&totalTasks)
s.db.QueryRow(
`SELECT COUNT(*) FROM tasks WHERE job_id = ? AND status = ?`,
jobID, types.TaskStatusCompleted,
).Scan(&completedTasks)
progress := float64(completedTasks) / float64(totalTasks) * 100.0
// Handle edge cases: division by zero and all tasks cancelled
var progress float64
if totalTasks == 0 {
// All tasks cancelled or no tasks, set progress to 0
progress = 0.0
} else {
progress = float64(completedTasks) / float64(totalTasks) * 100.0
}
var jobStatus string
var outputFormat string
s.db.QueryRow(`SELECT output_format FROM jobs WHERE id = ?`, jobID).Scan(&outputFormat)
if completedTasks == totalTasks {
// Check if all non-cancelled tasks are completed
var pendingOrRunningTasks int
s.db.QueryRow(
`SELECT COUNT(*) FROM tasks
WHERE job_id = ? AND status IN (?, ?)`,
jobID, types.TaskStatusPending, types.TaskStatusRunning,
).Scan(&pendingOrRunningTasks)
if pendingOrRunningTasks == 0 && totalTasks > 0 {
// All tasks are either completed or failed/cancelled
jobStatus = string(types.JobStatusCompleted)
s.db.Exec(
`UPDATE jobs SET status = ?, progress = ?, completed_at = ? WHERE id = ?`,
@@ -654,7 +787,20 @@ func (s *Server) handleWebSocketTaskComplete(runnerID int64, taskUpdate WSTaskUp
)
if outputFormat == "MP4" {
go s.generateMP4Video(jobID)
// Create a video generation task instead of calling generateMP4Video directly
// This prevents race conditions when multiple runners complete frames simultaneously
videoTaskTimeout := 86400 // 24 hours for video generation
_, err := s.db.Exec(
`INSERT INTO tasks (job_id, frame_start, frame_end, task_type, status, timeout_seconds, max_retries)
VALUES (?, ?, ?, ?, ?, ?, ?)`,
jobID, 0, 0, types.TaskTypeVideoGeneration, types.TaskStatusPending, videoTaskTimeout, 1,
)
if err != nil {
log.Printf("Failed to create video generation task for job %d: %v", jobID, err)
} else {
// Try to distribute the task immediately
go s.distributeTasksToRunners()
}
}
} else {
jobStatus = string(types.JobStatusRunning)
@@ -712,7 +858,7 @@ func (s *Server) broadcastLogToFrontend(taskID int64, logEntry WSLogEntry) {
func (s *Server) distributeTasksToRunners() {
// Get all pending tasks
rows, err := s.db.Query(
`SELECT t.id, t.job_id, t.frame_start, t.frame_end, j.allow_parallel_runners, j.status as job_status
`SELECT t.id, t.job_id, t.frame_start, t.frame_end, t.task_type, j.allow_parallel_runners, j.status as job_status
FROM tasks t
JOIN jobs j ON t.job_id = j.id
WHERE t.status = ? AND j.status != ?
@@ -731,6 +877,7 @@ func (s *Server) distributeTasksToRunners() {
JobID int64
FrameStart int
FrameEnd int
TaskType string
AllowParallelRunners bool
}
@@ -740,11 +887,12 @@ func (s *Server) distributeTasksToRunners() {
JobID int64
FrameStart int
FrameEnd int
TaskType string
AllowParallelRunners bool
}
var allowParallel int
var jobStatus string
err := rows.Scan(&t.TaskID, &t.JobID, &t.FrameStart, &t.FrameEnd, &allowParallel, &jobStatus)
err := rows.Scan(&t.TaskID, &t.JobID, &t.FrameStart, &t.FrameEnd, &t.TaskType, &allowParallel, &jobStatus)
if err != nil {
continue
}
@@ -770,13 +918,6 @@ func (s *Server) distributeTasksToRunners() {
// Distribute tasks to runners
for _, task := range pendingTasks {
// Check if task is already assigned
var assignedRunnerID sql.NullInt64
err := s.db.QueryRow("SELECT runner_id FROM tasks WHERE id = ?", task.TaskID).Scan(&assignedRunnerID)
if err == nil && assignedRunnerID.Valid {
continue // Already assigned
}
// Find available runner
var selectedRunnerID int64
for _, runnerID := range connectedRunners {
@@ -812,9 +953,40 @@ func (s *Server) distributeTasksToRunners() {
continue // No available runner
}
// Assign task to runner
// Atomically assign task to runner using UPDATE with WHERE runner_id IS NULL
// This prevents race conditions when multiple goroutines try to assign the same task
now := time.Now()
result, err := s.db.Exec(
`UPDATE tasks SET runner_id = ?, status = ?, started_at = ?
WHERE id = ? AND runner_id IS NULL AND status = ?`,
selectedRunnerID, types.TaskStatusRunning, now, task.TaskID, types.TaskStatusPending,
)
if err != nil {
log.Printf("Failed to atomically assign task %d: %v", task.TaskID, err)
continue
}
// Check if the update actually affected a row (task was successfully assigned)
rowsAffected, err := result.RowsAffected()
if err != nil {
log.Printf("Failed to get rows affected for task %d: %v", task.TaskID, err)
continue
}
if rowsAffected == 0 {
// Task was already assigned by another goroutine, skip
continue
}
// Task was successfully assigned, send via WebSocket
if err := s.assignTaskToRunner(selectedRunnerID, task.TaskID); err != nil {
log.Printf("Failed to assign task %d to runner %d: %v", task.TaskID, selectedRunnerID, err)
log.Printf("Failed to send task %d to runner %d: %v", task.TaskID, selectedRunnerID, err)
// Rollback the assignment if WebSocket send fails
s.db.Exec(
`UPDATE tasks SET runner_id = NULL, status = ?, started_at = NULL
WHERE id = ?`,
types.TaskStatusPending, task.TaskID,
)
}
}
}
@@ -831,20 +1003,20 @@ func (s *Server) assignTaskToRunner(runnerID int64, taskID int64) error {
// Get task details
var task WSTaskAssignment
var jobName, outputFormat string
var jobName, outputFormat, taskType string
err := s.db.QueryRow(
`SELECT t.job_id, t.frame_start, t.frame_end, j.name, j.output_format
`SELECT t.job_id, t.frame_start, t.frame_end, t.task_type, j.name, j.output_format
FROM tasks t JOIN jobs j ON t.job_id = j.id WHERE t.id = ?`,
taskID,
).Scan(&task.JobID, &task.FrameStart, &task.FrameEnd, &jobName, &outputFormat)
).Scan(&task.JobID, &task.FrameStart, &task.FrameEnd, &taskType, &jobName, &outputFormat)
if err != nil {
return err
}
task.TaskID = taskID
task.JobID = task.JobID
task.JobName = jobName
task.OutputFormat = outputFormat
task.TaskType = taskType
// Get input files
rows, err := s.db.Query(
@@ -861,14 +1033,15 @@ func (s *Server) assignTaskToRunner(runnerID int64, taskID int64) error {
}
}
// Assign task to runner in database
now := time.Now()
_, err = s.db.Exec(
`UPDATE tasks SET runner_id = ?, status = ?, started_at = ? WHERE id = ?`,
runnerID, types.TaskStatusRunning, now, taskID,
)
// Note: Task is already assigned in database by the atomic update in distributeTasksToRunners
// We just need to verify it's still assigned to this runner
var assignedRunnerID sql.NullInt64
err = s.db.QueryRow("SELECT runner_id FROM tasks WHERE id = ?", taskID).Scan(&assignedRunnerID)
if err != nil {
return err
return fmt.Errorf("task not found: %w", err)
}
if !assignedRunnerID.Valid || assignedRunnerID.Int64 != runnerID {
return fmt.Errorf("task %d is not assigned to runner %d", taskID, runnerID)
}
// Send task via WebSocket