Refactor runner and installation scripts for improved functionality

- Removed the `--disable-hiprt` flag from the runner command, simplifying the rendering options for users.
- Updated the `jiggablend-runner` script and README to reflect the removal of the HIPRT control flag, enhancing clarity in usage instructions.
- Enhanced the installation script to provide clearer examples for running the jiggablend manager and runner, improving user experience during setup.
- Implemented a more robust GPU backend detection mechanism, allowing for better compatibility with various hardware configurations.
This commit is contained in:
2026-03-14 21:08:06 -05:00
parent 28cb50492c
commit 16d6a95058
30 changed files with 1041 additions and 782 deletions

View File

@@ -765,7 +765,7 @@ func (s *Manager) handleDownloadJobContext(w http.ResponseWriter, r *http.Reques
// Set appropriate headers for tar file
w.Header().Set("Content-Type", "application/x-tar")
w.Header().Set("Content-Disposition", "attachment; filename=context.tar")
w.Header().Set("Content-Disposition", "attachment; filename=\"context.tar\"")
// Stream the file to the response
io.Copy(w, file)
@@ -821,7 +821,7 @@ func (s *Manager) handleDownloadJobContextWithToken(w http.ResponseWriter, r *ht
// Set appropriate headers for tar file
w.Header().Set("Content-Type", "application/x-tar")
w.Header().Set("Content-Disposition", "attachment; filename=context.tar")
w.Header().Set("Content-Disposition", "attachment; filename=\"context.tar\"")
// Stream the file to the response
io.Copy(w, file)
@@ -836,7 +836,7 @@ func (s *Manager) handleUploadFileFromRunner(w http.ResponseWriter, r *http.Requ
return
}
err = r.ParseMultipartForm(MaxUploadSize) // 50 GB (for large output files)
err = r.ParseMultipartForm(s.maxUploadSize)
if err != nil {
s.respondError(w, http.StatusBadRequest, fmt.Sprintf("Failed to parse multipart form: %v", err))
return
@@ -944,7 +944,7 @@ func (s *Manager) handleUploadFileWithToken(w http.ResponseWriter, r *http.Reque
return
}
err = r.ParseMultipartForm(MaxUploadSize) // 50 GB (for large output files)
err = r.ParseMultipartForm(s.maxUploadSize)
if err != nil {
s.respondError(w, http.StatusBadRequest, fmt.Sprintf("Failed to parse multipart form: %v", err))
return
@@ -1228,7 +1228,7 @@ func (s *Manager) handleDownloadFileForRunner(w http.ResponseWriter, r *http.Req
// Set headers
w.Header().Set("Content-Type", contentType)
w.Header().Set("Content-Disposition", fmt.Sprintf("attachment; filename=%s", decodedFileName))
w.Header().Set("Content-Disposition", fmt.Sprintf("attachment; filename=%q", decodedFileName))
// Stream file
io.Copy(w, file)
@@ -1476,70 +1476,49 @@ func (s *Manager) handleRunnerJobWebSocket(w http.ResponseWriter, r *http.Reques
}
}
case "runner_heartbeat":
// Lookup runner ID from job's assigned_runner_id
var assignedRunnerID sql.NullInt64
err := s.db.With(func(db *sql.DB) error {
return db.QueryRow(
"SELECT assigned_runner_id FROM jobs WHERE id = ?",
jobID,
).Scan(&assignedRunnerID)
})
if err != nil {
log.Printf("Failed to lookup runner for job %d heartbeat: %v", jobID, err)
// Send error response
response := map[string]interface{}{
"type": "error",
"message": "Failed to process heartbeat",
}
s.sendWebSocketMessage(conn, response)
continue
}
if !assignedRunnerID.Valid {
log.Printf("Job %d has no assigned runner, skipping heartbeat update", jobID)
// Send acknowledgment but no database update
response := map[string]interface{}{
"type": "heartbeat_ack",
"timestamp": time.Now().Unix(),
"message": "No assigned runner for this job",
}
s.sendWebSocketMessage(conn, response)
continue
}
runnerID := assignedRunnerID.Int64
// Update runner heartbeat
err = s.db.With(func(db *sql.DB) error {
_, err := db.Exec(
"UPDATE runners SET last_heartbeat = ?, status = ? WHERE id = ?",
time.Now(), types.RunnerStatusOnline, runnerID,
)
return err
})
if err != nil {
log.Printf("Failed to update runner %d heartbeat for job %d: %v", runnerID, jobID, err)
// Send error response
response := map[string]interface{}{
"type": "error",
"message": "Failed to update heartbeat",
}
s.sendWebSocketMessage(conn, response)
continue
}
// Send acknowledgment
response := map[string]interface{}{
"type": "heartbeat_ack",
"timestamp": time.Now().Unix(),
}
s.sendWebSocketMessage(conn, response)
s.handleWSRunnerHeartbeat(conn, jobID)
continue
}
}
}
// handleWSRunnerHeartbeat services a "runner_heartbeat" message that arrived on
// the WebSocket belonging to job jobID.
//
// It reads the job's assigned_runner_id and, when one is set, stamps that
// runner's last_heartbeat with the current time and marks it online. The
// outcome is reported back on conn:
//   - "heartbeat_ack" after a successful update;
//   - "heartbeat_ack" with an explanatory message when the job has no assigned
//     runner (nothing is written in that case);
//   - "error" when either the lookup or the update fails.
func (s *Manager) handleWSRunnerHeartbeat(conn *websocket.Conn, jobID int64) {
	// Step 1: resolve which runner (if any) currently owns this job.
	var assigned sql.NullInt64
	lookup := func(db *sql.DB) error {
		row := db.QueryRow("SELECT assigned_runner_id FROM jobs WHERE id = ?", jobID)
		return row.Scan(&assigned)
	}
	if err := s.db.With(lookup); err != nil {
		log.Printf("Failed to lookup runner for job %d heartbeat: %v", jobID, err)
		s.sendWebSocketMessage(conn, map[string]interface{}{
			"type":    "error",
			"message": "Failed to process heartbeat",
		})
		return
	}

	// A NULL assigned_runner_id is acknowledged but leaves the runners table untouched.
	if !assigned.Valid {
		log.Printf("Job %d has no assigned runner, skipping heartbeat update", jobID)
		s.sendWebSocketMessage(conn, map[string]interface{}{
			"type":      "heartbeat_ack",
			"timestamp": time.Now().Unix(),
			"message":   "No assigned runner for this job",
		})
		return
	}

	// Step 2: record the heartbeat against the resolved runner.
	runnerID := assigned.Int64
	touch := func(db *sql.DB) error {
		_, execErr := db.Exec(
			"UPDATE runners SET last_heartbeat = ?, status = ? WHERE id = ?",
			time.Now(), types.RunnerStatusOnline, runnerID,
		)
		return execErr
	}
	if err := s.db.With(touch); err != nil {
		log.Printf("Failed to update runner %d heartbeat for job %d: %v", runnerID, jobID, err)
		s.sendWebSocketMessage(conn, map[string]interface{}{
			"type":    "error",
			"message": "Failed to update heartbeat",
		})
		return
	}

	s.sendWebSocketMessage(conn, map[string]interface{}{
		"type":      "heartbeat_ack",
		"timestamp": time.Now().Unix(),
	})
}
// handleWebSocketLog handles log entries from WebSocket
func (s *Manager) handleWebSocketLog(runnerID int64, logEntry WSLogEntry) {
// Store log in database
@@ -1948,241 +1927,226 @@ func (s *Manager) cleanupJobStatusUpdateMutex(jobID int64) {
// This function is serialized per jobID to prevent race conditions when multiple tasks
// complete concurrently and trigger status updates simultaneously.
func (s *Manager) updateJobStatusFromTasks(jobID int64) {
// Serialize updates per job to prevent race conditions
mu := s.getJobStatusUpdateMutex(jobID)
mu.Lock()
defer mu.Unlock()
now := time.Now()
// All jobs now use parallel runners (one task per frame), so we always use task-based progress
// Get current job status to detect changes
var currentStatus string
err := s.db.With(func(conn *sql.DB) error {
return conn.QueryRow(`SELECT status FROM jobs WHERE id = ?`, jobID).Scan(&currentStatus)
})
currentStatus, err := s.getJobStatus(jobID)
if err != nil {
log.Printf("Failed to get current job status for job %d: %v", jobID, err)
return
}
// Cancellation is terminal from the user's perspective.
// Do not allow asynchronous task updates to revive cancelled jobs.
if currentStatus == string(types.JobStatusCancelled) {
return
}
// Count total tasks and completed tasks
var totalTasks, completedTasks int
err = s.db.With(func(conn *sql.DB) error {
err := conn.QueryRow(
counts, err := s.getJobTaskCounts(jobID)
if err != nil {
log.Printf("Failed to count tasks for job %d: %v", jobID, err)
return
}
progress := counts.progress()
if counts.pendingOrRunning == 0 && counts.total > 0 {
s.handleAllTasksFinished(jobID, currentStatus, counts, progress)
} else {
s.handleTasksInProgress(jobID, currentStatus, counts, progress)
}
}
// jobTaskCounts is a snapshot of how many of a job's tasks are in each state.
// It is filled in by getJobTaskCounts; tasks in any state other than pending,
// running, completed or failed are not counted in any field.
type jobTaskCounts struct {
	// total is the number of tasks that are pending, running, completed or failed.
	total int
	// completed is the number of tasks that finished successfully.
	completed int
	// pendingOrRunning is the number of tasks that are still pending or running.
	pendingOrRunning int
	// failed is the number of tasks in the failed state.
	failed int
	// running is the number of tasks currently running.
	running int
}

// progress reports the share of counted tasks that have completed, as a
// percentage (completed / total * 100). It returns 0 when total is zero so the
// division is never attempted on an empty job.
func (c *jobTaskCounts) progress() float64 {
	if c.total != 0 {
		return float64(c.completed) / float64(c.total) * 100.0
	}
	return 0.0
}
// getJobStatus reads the status column of the jobs row identified by jobID.
// It returns whatever error the query or scan produced; in that case the
// returned status is the empty string left by the failed scan.
func (s *Manager) getJobStatus(jobID int64) (string, error) {
	var status string
	lookup := func(conn *sql.DB) error {
		row := conn.QueryRow(`SELECT status FROM jobs WHERE id = ?`, jobID)
		return row.Scan(&status)
	}
	err := s.db.With(lookup)
	return status, err
}
func (s *Manager) getJobTaskCounts(jobID int64) (*jobTaskCounts, error) {
c := &jobTaskCounts{}
err := s.db.With(func(conn *sql.DB) error {
if err := conn.QueryRow(
`SELECT COUNT(*) FROM tasks WHERE job_id = ? AND status IN (?, ?, ?, ?)`,
jobID, types.TaskStatusPending, types.TaskStatusRunning, types.TaskStatusCompleted, types.TaskStatusFailed,
).Scan(&totalTasks)
if err != nil {
).Scan(&c.total); err != nil {
return err
}
return conn.QueryRow(
if err := conn.QueryRow(
`SELECT COUNT(*) FROM tasks WHERE job_id = ? AND status = ?`,
jobID, types.TaskStatusCompleted,
).Scan(&completedTasks)
).Scan(&c.completed); err != nil {
return err
}
if err := conn.QueryRow(
`SELECT COUNT(*) FROM tasks WHERE job_id = ? AND status IN (?, ?)`,
jobID, types.TaskStatusPending, types.TaskStatusRunning,
).Scan(&c.pendingOrRunning); err != nil {
return err
}
if err := conn.QueryRow(
`SELECT COUNT(*) FROM tasks WHERE job_id = ? AND status = ?`,
jobID, types.TaskStatusFailed,
).Scan(&c.failed); err != nil {
return err
}
if err := conn.QueryRow(
`SELECT COUNT(*) FROM tasks WHERE job_id = ? AND status = ?`,
jobID, types.TaskStatusRunning,
).Scan(&c.running); err != nil {
return err
}
return nil
})
if err != nil {
log.Printf("Failed to count completed tasks for job %d: %v", jobID, err)
return
}
// Calculate progress
var progress float64
if totalTasks == 0 {
// All tasks cancelled or no tasks, set progress to 0
progress = 0.0
} else {
// Standard task-based progress
progress = float64(completedTasks) / float64(totalTasks) * 100.0
}
return c, err
}
// handleAllTasksFinished handles the case where no pending/running tasks remain.
func (s *Manager) handleAllTasksFinished(jobID int64, currentStatus string, counts *jobTaskCounts, progress float64) {
now := time.Now()
var jobStatus string
// Check if all non-cancelled tasks are completed
var pendingOrRunningTasks int
err = s.db.With(func(conn *sql.DB) error {
return conn.QueryRow(
`SELECT COUNT(*) FROM tasks
WHERE job_id = ? AND status IN (?, ?)`,
jobID, types.TaskStatusPending, types.TaskStatusRunning,
).Scan(&pendingOrRunningTasks)
})
if err != nil {
log.Printf("Failed to count pending/running tasks for job %d: %v", jobID, err)
return
if counts.failed > 0 {
jobStatus = s.handleFailedTasks(jobID, currentStatus, &progress)
if jobStatus == "" {
return // retry handled; early exit
}
} else {
jobStatus = string(types.JobStatusCompleted)
progress = 100.0
}
if pendingOrRunningTasks == 0 && totalTasks > 0 {
// All tasks are either completed or failed/cancelled
// Check if any tasks failed
var failedTasks int
s.db.With(func(conn *sql.DB) error {
conn.QueryRow(
`SELECT COUNT(*) FROM tasks WHERE job_id = ? AND status = ?`,
jobID, types.TaskStatusFailed,
).Scan(&failedTasks)
return nil
})
s.setJobFinalStatus(jobID, currentStatus, jobStatus, progress, now, counts)
}
if failedTasks > 0 {
// Some tasks failed - check if job has retries left
var retryCount, maxRetries int
err := s.db.With(func(conn *sql.DB) error {
return conn.QueryRow(
`SELECT retry_count, max_retries FROM jobs WHERE id = ?`,
jobID,
).Scan(&retryCount, &maxRetries)
})
if err != nil {
log.Printf("Failed to get retry info for job %d: %v", jobID, err)
// Fall back to marking job as failed
jobStatus = string(types.JobStatusFailed)
} else if retryCount < maxRetries {
// Job has retries left - reset failed tasks and redistribute
if err := s.resetFailedTasksAndRedistribute(jobID); err != nil {
log.Printf("Failed to reset failed tasks for job %d: %v", jobID, err)
// If reset fails, mark job as failed
jobStatus = string(types.JobStatusFailed)
} else {
// Tasks reset successfully - job remains in running/pending state
// Don't update job status, just update progress
jobStatus = currentStatus // Keep current status
// Recalculate progress after reset (failed tasks are now pending again)
var newTotalTasks, newCompletedTasks int
s.db.With(func(conn *sql.DB) error {
conn.QueryRow(
`SELECT COUNT(*) FROM tasks WHERE job_id = ? AND status IN (?, ?, ?, ?)`,
jobID, types.TaskStatusPending, types.TaskStatusRunning, types.TaskStatusCompleted, types.TaskStatusFailed,
).Scan(&newTotalTasks)
conn.QueryRow(
`SELECT COUNT(*) FROM tasks WHERE job_id = ? AND status = ?`,
jobID, types.TaskStatusCompleted,
).Scan(&newCompletedTasks)
return nil
})
if newTotalTasks > 0 {
progress = float64(newCompletedTasks) / float64(newTotalTasks) * 100.0
}
// Update progress only
err := s.db.With(func(conn *sql.DB) error {
_, err := conn.Exec(
`UPDATE jobs SET progress = ? WHERE id = ?`,
progress, jobID,
)
return err
})
if err != nil {
log.Printf("Failed to update job %d progress: %v", jobID, err)
} else {
// Broadcast job update via WebSocket
s.broadcastJobUpdate(jobID, "job_update", map[string]interface{}{
"status": jobStatus,
"progress": progress,
})
}
return // Exit early since we've handled the retry
}
} else {
// No retries left - mark job as failed and cancel active tasks
jobStatus = string(types.JobStatusFailed)
if err := s.cancelActiveTasksForJob(jobID); err != nil {
log.Printf("Failed to cancel active tasks for job %d: %v", jobID, err)
}
}
} else {
// All tasks completed successfully
jobStatus = string(types.JobStatusCompleted)
progress = 100.0 // Ensure progress is 100% when all tasks complete
// handleFailedTasks decides whether to retry or mark the job failed.
// Returns "" if a retry was triggered (caller should return early),
// or the final status string.
func (s *Manager) handleFailedTasks(jobID int64, currentStatus string, progress *float64) string {
var retryCount, maxRetries int
err := s.db.With(func(conn *sql.DB) error {
return conn.QueryRow(
`SELECT retry_count, max_retries FROM jobs WHERE id = ?`, jobID,
).Scan(&retryCount, &maxRetries)
})
if err != nil {
log.Printf("Failed to get retry info for job %d: %v", jobID, err)
return string(types.JobStatusFailed)
}
if retryCount < maxRetries {
if err := s.resetFailedTasksAndRedistribute(jobID); err != nil {
log.Printf("Failed to reset failed tasks for job %d: %v", jobID, err)
return string(types.JobStatusFailed)
}
// Update job status (if we didn't return early from retry logic)
if jobStatus != "" {
err := s.db.With(func(conn *sql.DB) error {
_, err := conn.Exec(
`UPDATE jobs SET status = ?, progress = ?, completed_at = ? WHERE id = ?`,
jobStatus, progress, now, jobID,
)
return err
})
if err != nil {
log.Printf("Failed to update job %d status to %s: %v", jobID, jobStatus, err)
} else {
// Only log if status actually changed
if currentStatus != jobStatus {
log.Printf("Updated job %d status from %s to %s (progress: %.1f%%, completed tasks: %d/%d)", jobID, currentStatus, jobStatus, progress, completedTasks, totalTasks)
}
// Broadcast job update via WebSocket
s.broadcastJobUpdate(jobID, "job_update", map[string]interface{}{
"status": jobStatus,
"progress": progress,
"completed_at": now,
})
// Clean up mutex for jobs in final states (completed or failed)
// No more status updates will occur for these jobs
if jobStatus == string(types.JobStatusCompleted) || jobStatus == string(types.JobStatusFailed) {
s.cleanupJobStatusUpdateMutex(jobID)
}
}
// Recalculate progress after reset
counts, err := s.getJobTaskCounts(jobID)
if err == nil && counts.total > 0 {
*progress = counts.progress()
}
// Encode tasks are now created immediately when the job is created
// with a condition that prevents assignment until all render tasks are completed.
// No need to create them here anymore.
} else {
// Job has pending or running tasks - determine if it's running or still pending
var runningTasks int
s.db.With(func(conn *sql.DB) error {
conn.QueryRow(
`SELECT COUNT(*) FROM tasks WHERE job_id = ? AND status = ?`,
jobID, types.TaskStatusRunning,
).Scan(&runningTasks)
return nil
})
if runningTasks > 0 {
// Has running tasks - job is running
jobStatus = string(types.JobStatusRunning)
var startedAt sql.NullTime
s.db.With(func(conn *sql.DB) error {
conn.QueryRow(`SELECT started_at FROM jobs WHERE id = ?`, jobID).Scan(&startedAt)
if !startedAt.Valid {
conn.Exec(`UPDATE jobs SET started_at = ? WHERE id = ?`, now, jobID)
}
return nil
})
} else {
// All tasks are pending - job is pending
jobStatus = string(types.JobStatusPending)
}
err := s.db.With(func(conn *sql.DB) error {
_, err := conn.Exec(
`UPDATE jobs SET status = ?, progress = ? WHERE id = ?`,
jobStatus, progress, jobID,
)
err = s.db.With(func(conn *sql.DB) error {
_, err := conn.Exec(`UPDATE jobs SET progress = ? WHERE id = ?`, *progress, jobID)
return err
})
if err != nil {
log.Printf("Failed to update job %d status to %s: %v", jobID, jobStatus, err)
log.Printf("Failed to update job %d progress: %v", jobID, err)
} else {
// Only log if status actually changed
if currentStatus != jobStatus {
log.Printf("Updated job %d status from %s to %s (progress: %.1f%%, completed: %d/%d, pending: %d, running: %d)", jobID, currentStatus, jobStatus, progress, completedTasks, totalTasks, pendingOrRunningTasks-runningTasks, runningTasks)
}
// Broadcast job update during execution (not just on completion)
s.broadcastJobUpdate(jobID, "job_update", map[string]interface{}{
"status": jobStatus,
"progress": progress,
"status": currentStatus,
"progress": *progress,
})
}
return "" // retry handled
}
// No retries left
if err := s.cancelActiveTasksForJob(jobID); err != nil {
log.Printf("Failed to cancel active tasks for job %d: %v", jobID, err)
}
return string(types.JobStatusFailed)
}
// setJobFinalStatus writes jobStatus, progress and completed_at (= now) to the
// jobs row for jobID, then broadcasts a "job_update" carrying the same values.
//
// currentStatus and counts are used only for logging: a line is emitted when
// the status actually changed. If the database write fails, the failure is
// logged and nothing is broadcast. Once a completed or failed status has been
// stored, the per-job status-update mutex is cleaned up.
func (s *Manager) setJobFinalStatus(jobID int64, currentStatus, jobStatus string, progress float64, now time.Time, counts *jobTaskCounts) {
	persist := func(conn *sql.DB) error {
		_, execErr := conn.Exec(
			`UPDATE jobs SET status = ?, progress = ?, completed_at = ? WHERE id = ?`,
			jobStatus, progress, now, jobID,
		)
		return execErr
	}
	if err := s.db.With(persist); err != nil {
		log.Printf("Failed to update job %d status to %s: %v", jobID, jobStatus, err)
		return
	}

	// Stay quiet when the job was already in this state.
	if jobStatus != currentStatus {
		log.Printf("Updated job %d status from %s to %s (progress: %.1f%%, completed tasks: %d/%d)", jobID, currentStatus, jobStatus, progress, counts.completed, counts.total)
	}

	payload := map[string]interface{}{
		"status":       jobStatus,
		"progress":     progress,
		"completed_at": now,
	}
	s.broadcastJobUpdate(jobID, "job_update", payload)

	// Completed and failed jobs release their status-update mutex.
	switch jobStatus {
	case string(types.JobStatusCompleted), string(types.JobStatusFailed):
		s.cleanupJobStatusUpdateMutex(jobID)
	}
}
// handleTasksInProgress updates a job that is not being finalized by
// updateJobStatusFromTasks (its caller routes here whenever the
// "all tasks finished" condition does not hold).
//
// The job is marked running when at least one task is running, otherwise
// pending. When it is running, started_at is stamped the first time only.
// The new status and progress are then persisted and broadcast as a
// "job_update". currentStatus is used only to decide whether the transition
// is worth logging.
//
// Failing to stamp started_at is logged but does not abort the status update;
// failing to persist the status is logged and suppresses the broadcast.
func (s *Manager) handleTasksInProgress(jobID int64, currentStatus string, counts *jobTaskCounts, progress float64) {
	jobStatus := string(types.JobStatusPending)
	if counts.running > 0 {
		jobStatus = string(types.JobStatusRunning)

		// Stamp started_at only if it is still NULL. Doing this as a single
		// conditional UPDATE (rather than SELECT-then-UPDATE with ignored
		// errors) means a failed read can never overwrite an existing
		// timestamp, and any database error is surfaced in the log.
		err := s.db.With(func(conn *sql.DB) error {
			_, err := conn.Exec(
				`UPDATE jobs SET started_at = ? WHERE id = ? AND started_at IS NULL`,
				time.Now(), jobID,
			)
			return err
		})
		if err != nil {
			log.Printf("Failed to set started_at for job %d: %v", jobID, err)
		}
	}

	err := s.db.With(func(conn *sql.DB) error {
		_, err := conn.Exec(
			`UPDATE jobs SET status = ?, progress = ? WHERE id = ?`,
			jobStatus, progress, jobID,
		)
		return err
	})
	if err != nil {
		log.Printf("Failed to update job %d status to %s: %v", jobID, jobStatus, err)
		return
	}

	// Only log real transitions; progress-only updates would be too noisy.
	if currentStatus != jobStatus {
		pending := counts.pendingOrRunning - counts.running
		log.Printf("Updated job %d status from %s to %s (progress: %.1f%%, completed: %d/%d, pending: %d, running: %d)", jobID, currentStatus, jobStatus, progress, counts.completed, counts.total, pending, counts.running)
	}

	// Broadcast on every call so clients see progress while the job executes.
	s.broadcastJobUpdate(jobID, "job_update", map[string]interface{}{
		"status":   jobStatus,
		"progress": progress,
	})
}
// broadcastLogToFrontend broadcasts log to connected frontend clients