Refactor runner and installation scripts for improved functionality
- Removed the `--disable-hiprt` flag from the runner command, simplifying the rendering options for users.
- Updated the `jiggablend-runner` script and README to reflect the removal of the HIPRT control flag, enhancing clarity in usage instructions.
- Enhanced the installation script to provide clearer examples for running the jiggablend manager and runner, improving user experience during setup.
- Implemented a more robust GPU backend detection mechanism, allowing for better compatibility with various hardware configurations.
This commit is contained in:
@@ -765,7 +765,7 @@ func (s *Manager) handleDownloadJobContext(w http.ResponseWriter, r *http.Reques
|
||||
|
||||
// Set appropriate headers for tar file
|
||||
w.Header().Set("Content-Type", "application/x-tar")
|
||||
w.Header().Set("Content-Disposition", "attachment; filename=context.tar")
|
||||
w.Header().Set("Content-Disposition", "attachment; filename=\"context.tar\"")
|
||||
|
||||
// Stream the file to the response
|
||||
io.Copy(w, file)
|
||||
@@ -821,7 +821,7 @@ func (s *Manager) handleDownloadJobContextWithToken(w http.ResponseWriter, r *ht
|
||||
|
||||
// Set appropriate headers for tar file
|
||||
w.Header().Set("Content-Type", "application/x-tar")
|
||||
w.Header().Set("Content-Disposition", "attachment; filename=context.tar")
|
||||
w.Header().Set("Content-Disposition", "attachment; filename=\"context.tar\"")
|
||||
|
||||
// Stream the file to the response
|
||||
io.Copy(w, file)
|
||||
@@ -836,7 +836,7 @@ func (s *Manager) handleUploadFileFromRunner(w http.ResponseWriter, r *http.Requ
|
||||
return
|
||||
}
|
||||
|
||||
err = r.ParseMultipartForm(MaxUploadSize) // 50 GB (for large output files)
|
||||
err = r.ParseMultipartForm(s.maxUploadSize)
|
||||
if err != nil {
|
||||
s.respondError(w, http.StatusBadRequest, fmt.Sprintf("Failed to parse multipart form: %v", err))
|
||||
return
|
||||
@@ -944,7 +944,7 @@ func (s *Manager) handleUploadFileWithToken(w http.ResponseWriter, r *http.Reque
|
||||
return
|
||||
}
|
||||
|
||||
err = r.ParseMultipartForm(MaxUploadSize) // 50 GB (for large output files)
|
||||
err = r.ParseMultipartForm(s.maxUploadSize)
|
||||
if err != nil {
|
||||
s.respondError(w, http.StatusBadRequest, fmt.Sprintf("Failed to parse multipart form: %v", err))
|
||||
return
|
||||
@@ -1228,7 +1228,7 @@ func (s *Manager) handleDownloadFileForRunner(w http.ResponseWriter, r *http.Req
|
||||
|
||||
// Set headers
|
||||
w.Header().Set("Content-Type", contentType)
|
||||
w.Header().Set("Content-Disposition", fmt.Sprintf("attachment; filename=%s", decodedFileName))
|
||||
w.Header().Set("Content-Disposition", fmt.Sprintf("attachment; filename=%q", decodedFileName))
|
||||
|
||||
// Stream file
|
||||
io.Copy(w, file)
|
||||
@@ -1476,70 +1476,49 @@ func (s *Manager) handleRunnerJobWebSocket(w http.ResponseWriter, r *http.Reques
|
||||
}
|
||||
}
|
||||
case "runner_heartbeat":
|
||||
// Lookup runner ID from job's assigned_runner_id
|
||||
var assignedRunnerID sql.NullInt64
|
||||
err := s.db.With(func(db *sql.DB) error {
|
||||
return db.QueryRow(
|
||||
"SELECT assigned_runner_id FROM jobs WHERE id = ?",
|
||||
jobID,
|
||||
).Scan(&assignedRunnerID)
|
||||
})
|
||||
if err != nil {
|
||||
log.Printf("Failed to lookup runner for job %d heartbeat: %v", jobID, err)
|
||||
// Send error response
|
||||
response := map[string]interface{}{
|
||||
"type": "error",
|
||||
"message": "Failed to process heartbeat",
|
||||
}
|
||||
s.sendWebSocketMessage(conn, response)
|
||||
continue
|
||||
}
|
||||
|
||||
if !assignedRunnerID.Valid {
|
||||
log.Printf("Job %d has no assigned runner, skipping heartbeat update", jobID)
|
||||
// Send acknowledgment but no database update
|
||||
response := map[string]interface{}{
|
||||
"type": "heartbeat_ack",
|
||||
"timestamp": time.Now().Unix(),
|
||||
"message": "No assigned runner for this job",
|
||||
}
|
||||
s.sendWebSocketMessage(conn, response)
|
||||
continue
|
||||
}
|
||||
|
||||
runnerID := assignedRunnerID.Int64
|
||||
|
||||
// Update runner heartbeat
|
||||
err = s.db.With(func(db *sql.DB) error {
|
||||
_, err := db.Exec(
|
||||
"UPDATE runners SET last_heartbeat = ?, status = ? WHERE id = ?",
|
||||
time.Now(), types.RunnerStatusOnline, runnerID,
|
||||
)
|
||||
return err
|
||||
})
|
||||
if err != nil {
|
||||
log.Printf("Failed to update runner %d heartbeat for job %d: %v", runnerID, jobID, err)
|
||||
// Send error response
|
||||
response := map[string]interface{}{
|
||||
"type": "error",
|
||||
"message": "Failed to update heartbeat",
|
||||
}
|
||||
s.sendWebSocketMessage(conn, response)
|
||||
continue
|
||||
}
|
||||
|
||||
// Send acknowledgment
|
||||
response := map[string]interface{}{
|
||||
"type": "heartbeat_ack",
|
||||
"timestamp": time.Now().Unix(),
|
||||
}
|
||||
s.sendWebSocketMessage(conn, response)
|
||||
|
||||
s.handleWSRunnerHeartbeat(conn, jobID)
|
||||
continue
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// handleWSRunnerHeartbeat processes a runner heartbeat received over a job WebSocket.
|
||||
func (s *Manager) handleWSRunnerHeartbeat(conn *websocket.Conn, jobID int64) {
|
||||
var assignedRunnerID sql.NullInt64
|
||||
err := s.db.With(func(db *sql.DB) error {
|
||||
return db.QueryRow(
|
||||
"SELECT assigned_runner_id FROM jobs WHERE id = ?", jobID,
|
||||
).Scan(&assignedRunnerID)
|
||||
})
|
||||
if err != nil {
|
||||
log.Printf("Failed to lookup runner for job %d heartbeat: %v", jobID, err)
|
||||
s.sendWebSocketMessage(conn, map[string]interface{}{"type": "error", "message": "Failed to process heartbeat"})
|
||||
return
|
||||
}
|
||||
|
||||
if !assignedRunnerID.Valid {
|
||||
log.Printf("Job %d has no assigned runner, skipping heartbeat update", jobID)
|
||||
s.sendWebSocketMessage(conn, map[string]interface{}{"type": "heartbeat_ack", "timestamp": time.Now().Unix(), "message": "No assigned runner for this job"})
|
||||
return
|
||||
}
|
||||
|
||||
runnerID := assignedRunnerID.Int64
|
||||
err = s.db.With(func(db *sql.DB) error {
|
||||
_, err := db.Exec(
|
||||
"UPDATE runners SET last_heartbeat = ?, status = ? WHERE id = ?",
|
||||
time.Now(), types.RunnerStatusOnline, runnerID,
|
||||
)
|
||||
return err
|
||||
})
|
||||
if err != nil {
|
||||
log.Printf("Failed to update runner %d heartbeat for job %d: %v", runnerID, jobID, err)
|
||||
s.sendWebSocketMessage(conn, map[string]interface{}{"type": "error", "message": "Failed to update heartbeat"})
|
||||
return
|
||||
}
|
||||
|
||||
s.sendWebSocketMessage(conn, map[string]interface{}{"type": "heartbeat_ack", "timestamp": time.Now().Unix()})
|
||||
}
|
||||
|
||||
// handleWebSocketLog handles log entries from WebSocket
|
||||
func (s *Manager) handleWebSocketLog(runnerID int64, logEntry WSLogEntry) {
|
||||
// Store log in database
|
||||
@@ -1948,241 +1927,226 @@ func (s *Manager) cleanupJobStatusUpdateMutex(jobID int64) {
|
||||
// This function is serialized per jobID to prevent race conditions when multiple tasks
|
||||
// complete concurrently and trigger status updates simultaneously.
|
||||
func (s *Manager) updateJobStatusFromTasks(jobID int64) {
|
||||
// Serialize updates per job to prevent race conditions
|
||||
mu := s.getJobStatusUpdateMutex(jobID)
|
||||
mu.Lock()
|
||||
defer mu.Unlock()
|
||||
|
||||
now := time.Now()
|
||||
|
||||
// All jobs now use parallel runners (one task per frame), so we always use task-based progress
|
||||
|
||||
// Get current job status to detect changes
|
||||
var currentStatus string
|
||||
err := s.db.With(func(conn *sql.DB) error {
|
||||
return conn.QueryRow(`SELECT status FROM jobs WHERE id = ?`, jobID).Scan(&currentStatus)
|
||||
})
|
||||
currentStatus, err := s.getJobStatus(jobID)
|
||||
if err != nil {
|
||||
log.Printf("Failed to get current job status for job %d: %v", jobID, err)
|
||||
return
|
||||
}
|
||||
|
||||
// Cancellation is terminal from the user's perspective.
|
||||
// Do not allow asynchronous task updates to revive cancelled jobs.
|
||||
if currentStatus == string(types.JobStatusCancelled) {
|
||||
return
|
||||
}
|
||||
|
||||
// Count total tasks and completed tasks
|
||||
var totalTasks, completedTasks int
|
||||
err = s.db.With(func(conn *sql.DB) error {
|
||||
err := conn.QueryRow(
|
||||
counts, err := s.getJobTaskCounts(jobID)
|
||||
if err != nil {
|
||||
log.Printf("Failed to count tasks for job %d: %v", jobID, err)
|
||||
return
|
||||
}
|
||||
|
||||
progress := counts.progress()
|
||||
|
||||
if counts.pendingOrRunning == 0 && counts.total > 0 {
|
||||
s.handleAllTasksFinished(jobID, currentStatus, counts, progress)
|
||||
} else {
|
||||
s.handleTasksInProgress(jobID, currentStatus, counts, progress)
|
||||
}
|
||||
}
|
||||
|
||||
// jobTaskCounts holds task state counts for a job.
type jobTaskCounts struct {
	// total counts tasks that are pending, running, completed or failed.
	total int
	// completed counts tasks in the completed state.
	completed int
	// pendingOrRunning counts tasks that are either pending or running.
	pendingOrRunning int
	// failed counts tasks in the failed state.
	failed int
	// running counts tasks in the running state.
	running int
}

// progress reports the share of completed tasks as a percentage in the range
// 0-100. A job without any counted tasks reports 0.
func (c *jobTaskCounts) progress() float64 {
	if c.total != 0 {
		done, all := float64(c.completed), float64(c.total)
		return done / all * 100.0
	}
	return 0.0
}
|
||||
|
||||
func (s *Manager) getJobStatus(jobID int64) (string, error) {
|
||||
var status string
|
||||
err := s.db.With(func(conn *sql.DB) error {
|
||||
return conn.QueryRow(`SELECT status FROM jobs WHERE id = ?`, jobID).Scan(&status)
|
||||
})
|
||||
return status, err
|
||||
}
|
||||
|
||||
func (s *Manager) getJobTaskCounts(jobID int64) (*jobTaskCounts, error) {
|
||||
c := &jobTaskCounts{}
|
||||
err := s.db.With(func(conn *sql.DB) error {
|
||||
if err := conn.QueryRow(
|
||||
`SELECT COUNT(*) FROM tasks WHERE job_id = ? AND status IN (?, ?, ?, ?)`,
|
||||
jobID, types.TaskStatusPending, types.TaskStatusRunning, types.TaskStatusCompleted, types.TaskStatusFailed,
|
||||
).Scan(&totalTasks)
|
||||
if err != nil {
|
||||
).Scan(&c.total); err != nil {
|
||||
return err
|
||||
}
|
||||
return conn.QueryRow(
|
||||
if err := conn.QueryRow(
|
||||
`SELECT COUNT(*) FROM tasks WHERE job_id = ? AND status = ?`,
|
||||
jobID, types.TaskStatusCompleted,
|
||||
).Scan(&completedTasks)
|
||||
).Scan(&c.completed); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := conn.QueryRow(
|
||||
`SELECT COUNT(*) FROM tasks WHERE job_id = ? AND status IN (?, ?)`,
|
||||
jobID, types.TaskStatusPending, types.TaskStatusRunning,
|
||||
).Scan(&c.pendingOrRunning); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := conn.QueryRow(
|
||||
`SELECT COUNT(*) FROM tasks WHERE job_id = ? AND status = ?`,
|
||||
jobID, types.TaskStatusFailed,
|
||||
).Scan(&c.failed); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := conn.QueryRow(
|
||||
`SELECT COUNT(*) FROM tasks WHERE job_id = ? AND status = ?`,
|
||||
jobID, types.TaskStatusRunning,
|
||||
).Scan(&c.running); err != nil {
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
})
|
||||
if err != nil {
|
||||
log.Printf("Failed to count completed tasks for job %d: %v", jobID, err)
|
||||
return
|
||||
}
|
||||
|
||||
// Calculate progress
|
||||
var progress float64
|
||||
if totalTasks == 0 {
|
||||
// All tasks cancelled or no tasks, set progress to 0
|
||||
progress = 0.0
|
||||
} else {
|
||||
// Standard task-based progress
|
||||
progress = float64(completedTasks) / float64(totalTasks) * 100.0
|
||||
}
|
||||
return c, err
|
||||
}
|
||||
|
||||
// handleAllTasksFinished handles the case where no pending/running tasks remain.
|
||||
func (s *Manager) handleAllTasksFinished(jobID int64, currentStatus string, counts *jobTaskCounts, progress float64) {
|
||||
now := time.Now()
|
||||
var jobStatus string
|
||||
|
||||
// Check if all non-cancelled tasks are completed
|
||||
var pendingOrRunningTasks int
|
||||
err = s.db.With(func(conn *sql.DB) error {
|
||||
return conn.QueryRow(
|
||||
`SELECT COUNT(*) FROM tasks
|
||||
WHERE job_id = ? AND status IN (?, ?)`,
|
||||
jobID, types.TaskStatusPending, types.TaskStatusRunning,
|
||||
).Scan(&pendingOrRunningTasks)
|
||||
})
|
||||
if err != nil {
|
||||
log.Printf("Failed to count pending/running tasks for job %d: %v", jobID, err)
|
||||
return
|
||||
if counts.failed > 0 {
|
||||
jobStatus = s.handleFailedTasks(jobID, currentStatus, &progress)
|
||||
if jobStatus == "" {
|
||||
return // retry handled; early exit
|
||||
}
|
||||
} else {
|
||||
jobStatus = string(types.JobStatusCompleted)
|
||||
progress = 100.0
|
||||
}
|
||||
|
||||
if pendingOrRunningTasks == 0 && totalTasks > 0 {
|
||||
// All tasks are either completed or failed/cancelled
|
||||
// Check if any tasks failed
|
||||
var failedTasks int
|
||||
s.db.With(func(conn *sql.DB) error {
|
||||
conn.QueryRow(
|
||||
`SELECT COUNT(*) FROM tasks WHERE job_id = ? AND status = ?`,
|
||||
jobID, types.TaskStatusFailed,
|
||||
).Scan(&failedTasks)
|
||||
return nil
|
||||
})
|
||||
s.setJobFinalStatus(jobID, currentStatus, jobStatus, progress, now, counts)
|
||||
}
|
||||
|
||||
if failedTasks > 0 {
|
||||
// Some tasks failed - check if job has retries left
|
||||
var retryCount, maxRetries int
|
||||
err := s.db.With(func(conn *sql.DB) error {
|
||||
return conn.QueryRow(
|
||||
`SELECT retry_count, max_retries FROM jobs WHERE id = ?`,
|
||||
jobID,
|
||||
).Scan(&retryCount, &maxRetries)
|
||||
})
|
||||
if err != nil {
|
||||
log.Printf("Failed to get retry info for job %d: %v", jobID, err)
|
||||
// Fall back to marking job as failed
|
||||
jobStatus = string(types.JobStatusFailed)
|
||||
} else if retryCount < maxRetries {
|
||||
// Job has retries left - reset failed tasks and redistribute
|
||||
if err := s.resetFailedTasksAndRedistribute(jobID); err != nil {
|
||||
log.Printf("Failed to reset failed tasks for job %d: %v", jobID, err)
|
||||
// If reset fails, mark job as failed
|
||||
jobStatus = string(types.JobStatusFailed)
|
||||
} else {
|
||||
// Tasks reset successfully - job remains in running/pending state
|
||||
// Don't update job status, just update progress
|
||||
jobStatus = currentStatus // Keep current status
|
||||
// Recalculate progress after reset (failed tasks are now pending again)
|
||||
var newTotalTasks, newCompletedTasks int
|
||||
s.db.With(func(conn *sql.DB) error {
|
||||
conn.QueryRow(
|
||||
`SELECT COUNT(*) FROM tasks WHERE job_id = ? AND status IN (?, ?, ?, ?)`,
|
||||
jobID, types.TaskStatusPending, types.TaskStatusRunning, types.TaskStatusCompleted, types.TaskStatusFailed,
|
||||
).Scan(&newTotalTasks)
|
||||
conn.QueryRow(
|
||||
`SELECT COUNT(*) FROM tasks WHERE job_id = ? AND status = ?`,
|
||||
jobID, types.TaskStatusCompleted,
|
||||
).Scan(&newCompletedTasks)
|
||||
return nil
|
||||
})
|
||||
if newTotalTasks > 0 {
|
||||
progress = float64(newCompletedTasks) / float64(newTotalTasks) * 100.0
|
||||
}
|
||||
// Update progress only
|
||||
err := s.db.With(func(conn *sql.DB) error {
|
||||
_, err := conn.Exec(
|
||||
`UPDATE jobs SET progress = ? WHERE id = ?`,
|
||||
progress, jobID,
|
||||
)
|
||||
return err
|
||||
})
|
||||
if err != nil {
|
||||
log.Printf("Failed to update job %d progress: %v", jobID, err)
|
||||
} else {
|
||||
// Broadcast job update via WebSocket
|
||||
s.broadcastJobUpdate(jobID, "job_update", map[string]interface{}{
|
||||
"status": jobStatus,
|
||||
"progress": progress,
|
||||
})
|
||||
}
|
||||
return // Exit early since we've handled the retry
|
||||
}
|
||||
} else {
|
||||
// No retries left - mark job as failed and cancel active tasks
|
||||
jobStatus = string(types.JobStatusFailed)
|
||||
if err := s.cancelActiveTasksForJob(jobID); err != nil {
|
||||
log.Printf("Failed to cancel active tasks for job %d: %v", jobID, err)
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// All tasks completed successfully
|
||||
jobStatus = string(types.JobStatusCompleted)
|
||||
progress = 100.0 // Ensure progress is 100% when all tasks complete
|
||||
// handleFailedTasks decides whether to retry or mark the job failed.
|
||||
// Returns "" if a retry was triggered (caller should return early),
|
||||
// or the final status string.
|
||||
func (s *Manager) handleFailedTasks(jobID int64, currentStatus string, progress *float64) string {
|
||||
var retryCount, maxRetries int
|
||||
err := s.db.With(func(conn *sql.DB) error {
|
||||
return conn.QueryRow(
|
||||
`SELECT retry_count, max_retries FROM jobs WHERE id = ?`, jobID,
|
||||
).Scan(&retryCount, &maxRetries)
|
||||
})
|
||||
if err != nil {
|
||||
log.Printf("Failed to get retry info for job %d: %v", jobID, err)
|
||||
return string(types.JobStatusFailed)
|
||||
}
|
||||
|
||||
if retryCount < maxRetries {
|
||||
if err := s.resetFailedTasksAndRedistribute(jobID); err != nil {
|
||||
log.Printf("Failed to reset failed tasks for job %d: %v", jobID, err)
|
||||
return string(types.JobStatusFailed)
|
||||
}
|
||||
|
||||
// Update job status (if we didn't return early from retry logic)
|
||||
if jobStatus != "" {
|
||||
err := s.db.With(func(conn *sql.DB) error {
|
||||
_, err := conn.Exec(
|
||||
`UPDATE jobs SET status = ?, progress = ?, completed_at = ? WHERE id = ?`,
|
||||
jobStatus, progress, now, jobID,
|
||||
)
|
||||
return err
|
||||
})
|
||||
if err != nil {
|
||||
log.Printf("Failed to update job %d status to %s: %v", jobID, jobStatus, err)
|
||||
} else {
|
||||
// Only log if status actually changed
|
||||
if currentStatus != jobStatus {
|
||||
log.Printf("Updated job %d status from %s to %s (progress: %.1f%%, completed tasks: %d/%d)", jobID, currentStatus, jobStatus, progress, completedTasks, totalTasks)
|
||||
}
|
||||
// Broadcast job update via WebSocket
|
||||
s.broadcastJobUpdate(jobID, "job_update", map[string]interface{}{
|
||||
"status": jobStatus,
|
||||
"progress": progress,
|
||||
"completed_at": now,
|
||||
})
|
||||
// Clean up mutex for jobs in final states (completed or failed)
|
||||
// No more status updates will occur for these jobs
|
||||
if jobStatus == string(types.JobStatusCompleted) || jobStatus == string(types.JobStatusFailed) {
|
||||
s.cleanupJobStatusUpdateMutex(jobID)
|
||||
}
|
||||
}
|
||||
// Recalculate progress after reset
|
||||
counts, err := s.getJobTaskCounts(jobID)
|
||||
if err == nil && counts.total > 0 {
|
||||
*progress = counts.progress()
|
||||
}
|
||||
|
||||
// Encode tasks are now created immediately when the job is created
|
||||
// with a condition that prevents assignment until all render tasks are completed.
|
||||
// No need to create them here anymore.
|
||||
} else {
|
||||
// Job has pending or running tasks - determine if it's running or still pending
|
||||
var runningTasks int
|
||||
s.db.With(func(conn *sql.DB) error {
|
||||
conn.QueryRow(
|
||||
`SELECT COUNT(*) FROM tasks WHERE job_id = ? AND status = ?`,
|
||||
jobID, types.TaskStatusRunning,
|
||||
).Scan(&runningTasks)
|
||||
return nil
|
||||
})
|
||||
|
||||
if runningTasks > 0 {
|
||||
// Has running tasks - job is running
|
||||
jobStatus = string(types.JobStatusRunning)
|
||||
var startedAt sql.NullTime
|
||||
s.db.With(func(conn *sql.DB) error {
|
||||
conn.QueryRow(`SELECT started_at FROM jobs WHERE id = ?`, jobID).Scan(&startedAt)
|
||||
if !startedAt.Valid {
|
||||
conn.Exec(`UPDATE jobs SET started_at = ? WHERE id = ?`, now, jobID)
|
||||
}
|
||||
return nil
|
||||
})
|
||||
} else {
|
||||
// All tasks are pending - job is pending
|
||||
jobStatus = string(types.JobStatusPending)
|
||||
}
|
||||
|
||||
err := s.db.With(func(conn *sql.DB) error {
|
||||
_, err := conn.Exec(
|
||||
`UPDATE jobs SET status = ?, progress = ? WHERE id = ?`,
|
||||
jobStatus, progress, jobID,
|
||||
)
|
||||
err = s.db.With(func(conn *sql.DB) error {
|
||||
_, err := conn.Exec(`UPDATE jobs SET progress = ? WHERE id = ?`, *progress, jobID)
|
||||
return err
|
||||
})
|
||||
if err != nil {
|
||||
log.Printf("Failed to update job %d status to %s: %v", jobID, jobStatus, err)
|
||||
log.Printf("Failed to update job %d progress: %v", jobID, err)
|
||||
} else {
|
||||
// Only log if status actually changed
|
||||
if currentStatus != jobStatus {
|
||||
log.Printf("Updated job %d status from %s to %s (progress: %.1f%%, completed: %d/%d, pending: %d, running: %d)", jobID, currentStatus, jobStatus, progress, completedTasks, totalTasks, pendingOrRunningTasks-runningTasks, runningTasks)
|
||||
}
|
||||
// Broadcast job update during execution (not just on completion)
|
||||
s.broadcastJobUpdate(jobID, "job_update", map[string]interface{}{
|
||||
"status": jobStatus,
|
||||
"progress": progress,
|
||||
"status": currentStatus,
|
||||
"progress": *progress,
|
||||
})
|
||||
}
|
||||
return "" // retry handled
|
||||
}
|
||||
|
||||
// No retries left
|
||||
if err := s.cancelActiveTasksForJob(jobID); err != nil {
|
||||
log.Printf("Failed to cancel active tasks for job %d: %v", jobID, err)
|
||||
}
|
||||
return string(types.JobStatusFailed)
|
||||
}
|
||||
|
||||
// setJobFinalStatus persists the terminal job status and broadcasts the update.
|
||||
func (s *Manager) setJobFinalStatus(jobID int64, currentStatus, jobStatus string, progress float64, now time.Time, counts *jobTaskCounts) {
|
||||
err := s.db.With(func(conn *sql.DB) error {
|
||||
_, err := conn.Exec(
|
||||
`UPDATE jobs SET status = ?, progress = ?, completed_at = ? WHERE id = ?`,
|
||||
jobStatus, progress, now, jobID,
|
||||
)
|
||||
return err
|
||||
})
|
||||
if err != nil {
|
||||
log.Printf("Failed to update job %d status to %s: %v", jobID, jobStatus, err)
|
||||
return
|
||||
}
|
||||
if currentStatus != jobStatus {
|
||||
log.Printf("Updated job %d status from %s to %s (progress: %.1f%%, completed tasks: %d/%d)", jobID, currentStatus, jobStatus, progress, counts.completed, counts.total)
|
||||
}
|
||||
s.broadcastJobUpdate(jobID, "job_update", map[string]interface{}{
|
||||
"status": jobStatus,
|
||||
"progress": progress,
|
||||
"completed_at": now,
|
||||
})
|
||||
if jobStatus == string(types.JobStatusCompleted) || jobStatus == string(types.JobStatusFailed) {
|
||||
s.cleanupJobStatusUpdateMutex(jobID)
|
||||
}
|
||||
}
|
||||
|
||||
// handleTasksInProgress handles the case where tasks are still pending or running.
|
||||
func (s *Manager) handleTasksInProgress(jobID int64, currentStatus string, counts *jobTaskCounts, progress float64) {
|
||||
now := time.Now()
|
||||
var jobStatus string
|
||||
|
||||
if counts.running > 0 {
|
||||
jobStatus = string(types.JobStatusRunning)
|
||||
s.db.With(func(conn *sql.DB) error {
|
||||
var startedAt sql.NullTime
|
||||
conn.QueryRow(`SELECT started_at FROM jobs WHERE id = ?`, jobID).Scan(&startedAt)
|
||||
if !startedAt.Valid {
|
||||
conn.Exec(`UPDATE jobs SET started_at = ? WHERE id = ?`, now, jobID)
|
||||
}
|
||||
return nil
|
||||
})
|
||||
} else {
|
||||
jobStatus = string(types.JobStatusPending)
|
||||
}
|
||||
|
||||
err := s.db.With(func(conn *sql.DB) error {
|
||||
_, err := conn.Exec(
|
||||
`UPDATE jobs SET status = ?, progress = ? WHERE id = ?`,
|
||||
jobStatus, progress, jobID,
|
||||
)
|
||||
return err
|
||||
})
|
||||
if err != nil {
|
||||
log.Printf("Failed to update job %d status to %s: %v", jobID, jobStatus, err)
|
||||
return
|
||||
}
|
||||
if currentStatus != jobStatus {
|
||||
pending := counts.pendingOrRunning - counts.running
|
||||
log.Printf("Updated job %d status from %s to %s (progress: %.1f%%, completed: %d/%d, pending: %d, running: %d)", jobID, currentStatus, jobStatus, progress, counts.completed, counts.total, pending, counts.running)
|
||||
}
|
||||
s.broadcastJobUpdate(jobID, "job_update", map[string]interface{}{
|
||||
"status": jobStatus,
|
||||
"progress": progress,
|
||||
})
|
||||
}
|
||||
|
||||
// broadcastLogToFrontend broadcasts log to connected frontend clients
|
||||
|
||||
Reference in New Issue
Block a user