redo
This commit is contained in:
@@ -81,11 +81,18 @@ func main() {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Start heartbeat loop
|
// Start WebSocket connection with reconnection
|
||||||
|
go client.ConnectWebSocketWithReconnect()
|
||||||
|
|
||||||
|
// Start heartbeat loop (for WebSocket ping/pong and HTTP fallback)
|
||||||
go client.HeartbeatLoop()
|
go client.HeartbeatLoop()
|
||||||
|
|
||||||
// Start task processing loop
|
// ProcessTasks is now handled via WebSocket, but kept for HTTP fallback
|
||||||
client.ProcessTasks()
|
// WebSocket will handle task assignment automatically
|
||||||
|
log.Printf("Runner started, connecting to manager via WebSocket...")
|
||||||
|
|
||||||
|
// Block forever
|
||||||
|
select {}
|
||||||
}
|
}
|
||||||
|
|
||||||
func loadSecrets(path string) (*SecretsFile, error) {
|
func loadSecrets(path string) (*SecretsFile, error) {
|
||||||
|
|||||||
1
go.mod
1
go.mod
@@ -7,6 +7,7 @@ require (
|
|||||||
github.com/go-chi/chi/v5 v5.2.3 // indirect
|
github.com/go-chi/chi/v5 v5.2.3 // indirect
|
||||||
github.com/go-chi/cors v1.2.2 // indirect
|
github.com/go-chi/cors v1.2.2 // indirect
|
||||||
github.com/google/uuid v1.6.0 // indirect
|
github.com/google/uuid v1.6.0 // indirect
|
||||||
|
github.com/gorilla/websocket v1.5.3 // indirect
|
||||||
github.com/mattn/go-sqlite3 v1.14.32 // indirect
|
github.com/mattn/go-sqlite3 v1.14.32 // indirect
|
||||||
golang.org/x/oauth2 v0.33.0 // indirect
|
golang.org/x/oauth2 v0.33.0 // indirect
|
||||||
)
|
)
|
||||||
|
|||||||
2
go.sum
2
go.sum
@@ -6,6 +6,8 @@ github.com/go-chi/cors v1.2.2 h1:Jmey33TE+b+rB7fT8MUy1u0I4L+NARQlK6LhzKPSyQE=
|
|||||||
github.com/go-chi/cors v1.2.2/go.mod h1:sSbTewc+6wYHBBCW7ytsFSn836hqM7JxpglAy2Vzc58=
|
github.com/go-chi/cors v1.2.2/go.mod h1:sSbTewc+6wYHBBCW7ytsFSn836hqM7JxpglAy2Vzc58=
|
||||||
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
|
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
|
||||||
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
|
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
|
||||||
|
github.com/gorilla/websocket v1.5.3 h1:saDtZ6Pbx/0u+bgYQ3q96pZgCzfhKXGPqt7kZ72aNNg=
|
||||||
|
github.com/gorilla/websocket v1.5.3/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE=
|
||||||
github.com/mattn/go-sqlite3 v1.14.32 h1:JD12Ag3oLy1zQA+BNn74xRgaBbdhbNIDYvQUEuuErjs=
|
github.com/mattn/go-sqlite3 v1.14.32 h1:JD12Ag3oLy1zQA+BNn74xRgaBbdhbNIDYvQUEuuErjs=
|
||||||
github.com/mattn/go-sqlite3 v1.14.32/go.mod h1:Uh1q+B4BYcTPb+yiD3kU8Ct7aC0hY9fxUwlHK0RXw+Y=
|
github.com/mattn/go-sqlite3 v1.14.32/go.mod h1:Uh1q+B4BYcTPb+yiD3kU8Ct7aC0hY9fxUwlHK0RXw+Y=
|
||||||
golang.org/x/oauth2 v0.33.0 h1:4Q+qn+E5z8gPRJfmRy7C2gGG3T4jIprK6aSYgTXGRpo=
|
golang.org/x/oauth2 v0.33.0 h1:4Q+qn+E5z8gPRJfmRy7C2gGG3T4jIprK6aSYgTXGRpo=
|
||||||
|
|||||||
@@ -5,10 +5,14 @@ import (
|
|||||||
"encoding/json"
|
"encoding/json"
|
||||||
"fmt"
|
"fmt"
|
||||||
"io"
|
"io"
|
||||||
|
"log"
|
||||||
"net/http"
|
"net/http"
|
||||||
|
"strconv"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
"fuego/pkg/types"
|
"fuego/pkg/types"
|
||||||
|
|
||||||
|
"github.com/go-chi/chi/v5"
|
||||||
)
|
)
|
||||||
|
|
||||||
// handleCreateJob creates a new job
|
// handleCreateJob creates a new job
|
||||||
@@ -39,10 +43,19 @@ func (s *Server) handleCreateJob(w http.ResponseWriter, r *http.Request) {
|
|||||||
req.OutputFormat = "PNG"
|
req.OutputFormat = "PNG"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Default allow_parallel_runners to true if not provided
|
||||||
|
allowParallelRunners := true
|
||||||
|
if req.AllowParallelRunners != nil {
|
||||||
|
allowParallelRunners = *req.AllowParallelRunners
|
||||||
|
}
|
||||||
|
|
||||||
|
// Set job timeout to 24 hours (86400 seconds)
|
||||||
|
jobTimeout := 86400
|
||||||
|
|
||||||
result, err := s.db.Exec(
|
result, err := s.db.Exec(
|
||||||
`INSERT INTO jobs (user_id, name, status, progress, frame_start, frame_end, output_format)
|
`INSERT INTO jobs (user_id, name, status, progress, frame_start, frame_end, output_format, allow_parallel_runners, timeout_seconds)
|
||||||
VALUES (?, ?, ?, ?, ?, ?, ?)`,
|
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)`,
|
||||||
userID, req.Name, types.JobStatusPending, 0.0, req.FrameStart, req.FrameEnd, req.OutputFormat,
|
userID, req.Name, types.JobStatusPending, 0.0, req.FrameStart, req.FrameEnd, req.OutputFormat, allowParallelRunners, jobTimeout,
|
||||||
)
|
)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
s.respondError(w, http.StatusInternalServerError, fmt.Sprintf("Failed to create job: %v", err))
|
s.respondError(w, http.StatusInternalServerError, fmt.Sprintf("Failed to create job: %v", err))
|
||||||
@@ -51,11 +64,21 @@ func (s *Server) handleCreateJob(w http.ResponseWriter, r *http.Request) {
|
|||||||
|
|
||||||
jobID, _ := result.LastInsertId()
|
jobID, _ := result.LastInsertId()
|
||||||
|
|
||||||
|
// Determine task timeout based on output format
|
||||||
|
// 5 minutes (300 seconds) for frame tasks, 24 hours (86400 seconds) for FFmpeg video generation
|
||||||
|
taskTimeout := 300 // Default: 5 minutes for frame rendering
|
||||||
|
if req.OutputFormat == "MP4" {
|
||||||
|
// For MP4, we'll create frame tasks with 5 min timeout
|
||||||
|
// Video generation tasks will be created later with 24h timeout
|
||||||
|
taskTimeout = 300
|
||||||
|
}
|
||||||
|
|
||||||
// Create tasks for the job (one task per frame for simplicity, could be batched)
|
// Create tasks for the job (one task per frame for simplicity, could be batched)
|
||||||
for frame := req.FrameStart; frame <= req.FrameEnd; frame++ {
|
for frame := req.FrameStart; frame <= req.FrameEnd; frame++ {
|
||||||
_, err = s.db.Exec(
|
_, err = s.db.Exec(
|
||||||
`INSERT INTO tasks (job_id, frame_start, frame_end, status) VALUES (?, ?, ?, ?)`,
|
`INSERT INTO tasks (job_id, frame_start, frame_end, status, timeout_seconds, max_retries)
|
||||||
jobID, frame, frame, types.TaskStatusPending,
|
VALUES (?, ?, ?, ?, ?, ?)`,
|
||||||
|
jobID, frame, frame, types.TaskStatusPending, taskTimeout, 3,
|
||||||
)
|
)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
s.respondError(w, http.StatusInternalServerError, fmt.Sprintf("Failed to create tasks: %v", err))
|
s.respondError(w, http.StatusInternalServerError, fmt.Sprintf("Failed to create tasks: %v", err))
|
||||||
@@ -72,9 +95,14 @@ func (s *Server) handleCreateJob(w http.ResponseWriter, r *http.Request) {
|
|||||||
FrameStart: req.FrameStart,
|
FrameStart: req.FrameStart,
|
||||||
FrameEnd: req.FrameEnd,
|
FrameEnd: req.FrameEnd,
|
||||||
OutputFormat: req.OutputFormat,
|
OutputFormat: req.OutputFormat,
|
||||||
|
AllowParallelRunners: allowParallelRunners,
|
||||||
|
TimeoutSeconds: jobTimeout,
|
||||||
CreatedAt: time.Now(),
|
CreatedAt: time.Now(),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Immediately try to distribute tasks to connected runners
|
||||||
|
go s.distributeTasksToRunners()
|
||||||
|
|
||||||
s.respondJSON(w, http.StatusCreated, job)
|
s.respondJSON(w, http.StatusCreated, job)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -88,7 +116,7 @@ func (s *Server) handleListJobs(w http.ResponseWriter, r *http.Request) {
|
|||||||
|
|
||||||
rows, err := s.db.Query(
|
rows, err := s.db.Query(
|
||||||
`SELECT id, user_id, name, status, progress, frame_start, frame_end, output_format,
|
`SELECT id, user_id, name, status, progress, frame_start, frame_end, output_format,
|
||||||
created_at, started_at, completed_at, error_message
|
allow_parallel_runners, timeout_seconds, created_at, started_at, completed_at, error_message
|
||||||
FROM jobs WHERE user_id = ? ORDER BY created_at DESC`,
|
FROM jobs WHERE user_id = ? ORDER BY created_at DESC`,
|
||||||
userID,
|
userID,
|
||||||
)
|
)
|
||||||
@@ -105,7 +133,7 @@ func (s *Server) handleListJobs(w http.ResponseWriter, r *http.Request) {
|
|||||||
|
|
||||||
err := rows.Scan(
|
err := rows.Scan(
|
||||||
&job.ID, &job.UserID, &job.Name, &job.Status, &job.Progress,
|
&job.ID, &job.UserID, &job.Name, &job.Status, &job.Progress,
|
||||||
&job.FrameStart, &job.FrameEnd, &job.OutputFormat,
|
&job.FrameStart, &job.FrameEnd, &job.OutputFormat, &job.AllowParallelRunners, &job.TimeoutSeconds,
|
||||||
&job.CreatedAt, &startedAt, &completedAt, &job.ErrorMessage,
|
&job.CreatedAt, &startedAt, &completedAt, &job.ErrorMessage,
|
||||||
)
|
)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@@ -145,12 +173,12 @@ func (s *Server) handleGetJob(w http.ResponseWriter, r *http.Request) {
|
|||||||
|
|
||||||
err = s.db.QueryRow(
|
err = s.db.QueryRow(
|
||||||
`SELECT id, user_id, name, status, progress, frame_start, frame_end, output_format,
|
`SELECT id, user_id, name, status, progress, frame_start, frame_end, output_format,
|
||||||
created_at, started_at, completed_at, error_message
|
allow_parallel_runners, timeout_seconds, created_at, started_at, completed_at, error_message
|
||||||
FROM jobs WHERE id = ? AND user_id = ?`,
|
FROM jobs WHERE id = ? AND user_id = ?`,
|
||||||
jobID, userID,
|
jobID, userID,
|
||||||
).Scan(
|
).Scan(
|
||||||
&job.ID, &job.UserID, &job.Name, &job.Status, &job.Progress,
|
&job.ID, &job.UserID, &job.Name, &job.Status, &job.Progress,
|
||||||
&job.FrameStart, &job.FrameEnd, &job.OutputFormat,
|
&job.FrameStart, &job.FrameEnd, &job.OutputFormat, &job.AllowParallelRunners, &job.TimeoutSeconds,
|
||||||
&job.CreatedAt, &startedAt, &completedAt, &job.ErrorMessage,
|
&job.CreatedAt, &startedAt, &completedAt, &job.ErrorMessage,
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -496,3 +524,454 @@ func (s *Server) handleStreamVideo(w http.ResponseWriter, r *http.Request) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// handleGetTaskLogs retrieves logs for a specific task
|
||||||
|
func (s *Server) handleGetTaskLogs(w http.ResponseWriter, r *http.Request) {
|
||||||
|
userID, err := getUserID(r)
|
||||||
|
if err != nil {
|
||||||
|
s.respondError(w, http.StatusUnauthorized, err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
jobID, err := parseID(r, "id")
|
||||||
|
if err != nil {
|
||||||
|
s.respondError(w, http.StatusBadRequest, err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
taskIDStr := chi.URLParam(r, "taskId")
|
||||||
|
taskID, err := strconv.ParseInt(taskIDStr, 10, 64)
|
||||||
|
if err != nil {
|
||||||
|
s.respondError(w, http.StatusBadRequest, "Invalid task ID")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// Verify job belongs to user
|
||||||
|
var jobUserID int64
|
||||||
|
err = s.db.QueryRow("SELECT user_id FROM jobs WHERE id = ?", jobID).Scan(&jobUserID)
|
||||||
|
if err == sql.ErrNoRows {
|
||||||
|
s.respondError(w, http.StatusNotFound, "Job not found")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if err != nil {
|
||||||
|
s.respondError(w, http.StatusInternalServerError, fmt.Sprintf("Failed to verify job: %v", err))
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if jobUserID != userID {
|
||||||
|
s.respondError(w, http.StatusForbidden, "Access denied")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// Verify task belongs to job
|
||||||
|
var taskJobID int64
|
||||||
|
err = s.db.QueryRow("SELECT job_id FROM tasks WHERE id = ?", taskID).Scan(&taskJobID)
|
||||||
|
if err == sql.ErrNoRows {
|
||||||
|
s.respondError(w, http.StatusNotFound, "Task not found")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if err != nil {
|
||||||
|
s.respondError(w, http.StatusInternalServerError, fmt.Sprintf("Failed to verify task: %v", err))
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if taskJobID != jobID {
|
||||||
|
s.respondError(w, http.StatusBadRequest, "Task does not belong to this job")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// Get query parameters for filtering
|
||||||
|
stepName := r.URL.Query().Get("step_name")
|
||||||
|
logLevel := r.URL.Query().Get("log_level")
|
||||||
|
limitStr := r.URL.Query().Get("limit")
|
||||||
|
limit := 1000 // default
|
||||||
|
if limitStr != "" {
|
||||||
|
if l, err := strconv.Atoi(limitStr); err == nil && l > 0 {
|
||||||
|
limit = l
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Build query
|
||||||
|
query := `SELECT id, task_id, runner_id, log_level, message, step_name, created_at
|
||||||
|
FROM task_logs WHERE task_id = ?`
|
||||||
|
args := []interface{}{taskID}
|
||||||
|
if stepName != "" {
|
||||||
|
query += " AND step_name = ?"
|
||||||
|
args = append(args, stepName)
|
||||||
|
}
|
||||||
|
if logLevel != "" {
|
||||||
|
query += " AND log_level = ?"
|
||||||
|
args = append(args, logLevel)
|
||||||
|
}
|
||||||
|
query += " ORDER BY created_at ASC LIMIT ?"
|
||||||
|
args = append(args, limit)
|
||||||
|
|
||||||
|
rows, err := s.db.Query(query, args...)
|
||||||
|
if err != nil {
|
||||||
|
s.respondError(w, http.StatusInternalServerError, fmt.Sprintf("Failed to query logs: %v", err))
|
||||||
|
return
|
||||||
|
}
|
||||||
|
defer rows.Close()
|
||||||
|
|
||||||
|
logs := []types.TaskLog{}
|
||||||
|
for rows.Next() {
|
||||||
|
var log types.TaskLog
|
||||||
|
var runnerID sql.NullInt64
|
||||||
|
err := rows.Scan(
|
||||||
|
&log.ID, &log.TaskID, &runnerID, &log.LogLevel, &log.Message,
|
||||||
|
&log.StepName, &log.CreatedAt,
|
||||||
|
)
|
||||||
|
if err != nil {
|
||||||
|
s.respondError(w, http.StatusInternalServerError, fmt.Sprintf("Failed to scan log: %v", err))
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if runnerID.Valid {
|
||||||
|
log.RunnerID = &runnerID.Int64
|
||||||
|
}
|
||||||
|
logs = append(logs, log)
|
||||||
|
}
|
||||||
|
|
||||||
|
s.respondJSON(w, http.StatusOK, logs)
|
||||||
|
}
|
||||||
|
|
||||||
|
// handleGetTaskSteps retrieves step timeline for a specific task
|
||||||
|
func (s *Server) handleGetTaskSteps(w http.ResponseWriter, r *http.Request) {
|
||||||
|
userID, err := getUserID(r)
|
||||||
|
if err != nil {
|
||||||
|
s.respondError(w, http.StatusUnauthorized, err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
jobID, err := parseID(r, "id")
|
||||||
|
if err != nil {
|
||||||
|
s.respondError(w, http.StatusBadRequest, err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
taskIDStr := chi.URLParam(r, "taskId")
|
||||||
|
taskID, err := strconv.ParseInt(taskIDStr, 10, 64)
|
||||||
|
if err != nil {
|
||||||
|
s.respondError(w, http.StatusBadRequest, "Invalid task ID")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// Verify job belongs to user
|
||||||
|
var jobUserID int64
|
||||||
|
err = s.db.QueryRow("SELECT user_id FROM jobs WHERE id = ?", jobID).Scan(&jobUserID)
|
||||||
|
if err == sql.ErrNoRows {
|
||||||
|
s.respondError(w, http.StatusNotFound, "Job not found")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if err != nil {
|
||||||
|
s.respondError(w, http.StatusInternalServerError, fmt.Sprintf("Failed to verify job: %v", err))
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if jobUserID != userID {
|
||||||
|
s.respondError(w, http.StatusForbidden, "Access denied")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// Verify task belongs to job
|
||||||
|
var taskJobID int64
|
||||||
|
err = s.db.QueryRow("SELECT job_id FROM tasks WHERE id = ?", taskID).Scan(&taskJobID)
|
||||||
|
if err == sql.ErrNoRows {
|
||||||
|
s.respondError(w, http.StatusNotFound, "Task not found")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if err != nil {
|
||||||
|
s.respondError(w, http.StatusInternalServerError, fmt.Sprintf("Failed to verify task: %v", err))
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if taskJobID != jobID {
|
||||||
|
s.respondError(w, http.StatusBadRequest, "Task does not belong to this job")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
rows, err := s.db.Query(
|
||||||
|
`SELECT id, task_id, step_name, status, started_at, completed_at, duration_ms, error_message
|
||||||
|
FROM task_steps WHERE task_id = ? ORDER BY started_at ASC`,
|
||||||
|
taskID,
|
||||||
|
)
|
||||||
|
if err != nil {
|
||||||
|
s.respondError(w, http.StatusInternalServerError, fmt.Sprintf("Failed to query steps: %v", err))
|
||||||
|
return
|
||||||
|
}
|
||||||
|
defer rows.Close()
|
||||||
|
|
||||||
|
steps := []types.TaskStep{}
|
||||||
|
for rows.Next() {
|
||||||
|
var step types.TaskStep
|
||||||
|
var startedAt, completedAt sql.NullTime
|
||||||
|
var durationMs sql.NullInt64
|
||||||
|
err := rows.Scan(
|
||||||
|
&step.ID, &step.TaskID, &step.StepName, &step.Status,
|
||||||
|
&startedAt, &completedAt, &durationMs, &step.ErrorMessage,
|
||||||
|
)
|
||||||
|
if err != nil {
|
||||||
|
s.respondError(w, http.StatusInternalServerError, fmt.Sprintf("Failed to scan step: %v", err))
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if startedAt.Valid {
|
||||||
|
step.StartedAt = &startedAt.Time
|
||||||
|
}
|
||||||
|
if completedAt.Valid {
|
||||||
|
step.CompletedAt = &completedAt.Time
|
||||||
|
}
|
||||||
|
if durationMs.Valid {
|
||||||
|
duration := int(durationMs.Int64)
|
||||||
|
step.DurationMs = &duration
|
||||||
|
}
|
||||||
|
steps = append(steps, step)
|
||||||
|
}
|
||||||
|
|
||||||
|
s.respondJSON(w, http.StatusOK, steps)
|
||||||
|
}
|
||||||
|
|
||||||
|
// handleRetryTask retries a failed task
|
||||||
|
func (s *Server) handleRetryTask(w http.ResponseWriter, r *http.Request) {
|
||||||
|
userID, err := getUserID(r)
|
||||||
|
if err != nil {
|
||||||
|
s.respondError(w, http.StatusUnauthorized, err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
jobID, err := parseID(r, "id")
|
||||||
|
if err != nil {
|
||||||
|
s.respondError(w, http.StatusBadRequest, err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
taskIDStr := chi.URLParam(r, "taskId")
|
||||||
|
taskID, err := strconv.ParseInt(taskIDStr, 10, 64)
|
||||||
|
if err != nil {
|
||||||
|
s.respondError(w, http.StatusBadRequest, "Invalid task ID")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// Verify job belongs to user
|
||||||
|
var jobUserID int64
|
||||||
|
err = s.db.QueryRow("SELECT user_id FROM jobs WHERE id = ?", jobID).Scan(&jobUserID)
|
||||||
|
if err == sql.ErrNoRows {
|
||||||
|
s.respondError(w, http.StatusNotFound, "Job not found")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if err != nil {
|
||||||
|
s.respondError(w, http.StatusInternalServerError, fmt.Sprintf("Failed to verify job: %v", err))
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if jobUserID != userID {
|
||||||
|
s.respondError(w, http.StatusForbidden, "Access denied")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// Verify task belongs to job and is in a retryable state
|
||||||
|
var taskJobID int64
|
||||||
|
var taskStatus string
|
||||||
|
var retryCount, maxRetries int
|
||||||
|
err = s.db.QueryRow(
|
||||||
|
"SELECT job_id, status, retry_count, max_retries FROM tasks WHERE id = ?",
|
||||||
|
taskID,
|
||||||
|
).Scan(&taskJobID, &taskStatus, &retryCount, &maxRetries)
|
||||||
|
if err == sql.ErrNoRows {
|
||||||
|
s.respondError(w, http.StatusNotFound, "Task not found")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if err != nil {
|
||||||
|
s.respondError(w, http.StatusInternalServerError, fmt.Sprintf("Failed to verify task: %v", err))
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if taskJobID != jobID {
|
||||||
|
s.respondError(w, http.StatusBadRequest, "Task does not belong to this job")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
if taskStatus != string(types.TaskStatusFailed) {
|
||||||
|
s.respondError(w, http.StatusBadRequest, "Task is not in failed state")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
if retryCount >= maxRetries {
|
||||||
|
s.respondError(w, http.StatusBadRequest, "Maximum retries exceeded")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// Reset task to pending
|
||||||
|
_, err = s.db.Exec(
|
||||||
|
`UPDATE tasks SET status = ?, runner_id = NULL, current_step = NULL,
|
||||||
|
error_message = NULL, started_at = NULL, completed_at = NULL
|
||||||
|
WHERE id = ?`,
|
||||||
|
types.TaskStatusPending, taskID,
|
||||||
|
)
|
||||||
|
if err != nil {
|
||||||
|
s.respondError(w, http.StatusInternalServerError, fmt.Sprintf("Failed to retry task: %v", err))
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
s.respondJSON(w, http.StatusOK, map[string]string{"message": "Task queued for retry"})
|
||||||
|
}
|
||||||
|
|
||||||
|
// handleStreamTaskLogsWebSocket streams task logs via WebSocket
|
||||||
|
// Note: This is called after auth middleware, so userID is already verified
|
||||||
|
func (s *Server) handleStreamTaskLogsWebSocket(w http.ResponseWriter, r *http.Request) {
|
||||||
|
userID, err := getUserID(r)
|
||||||
|
if err != nil {
|
||||||
|
http.Error(w, "Unauthorized", http.StatusUnauthorized)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
jobID, err := parseID(r, "id")
|
||||||
|
if err != nil {
|
||||||
|
s.respondError(w, http.StatusBadRequest, err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
taskIDStr := chi.URLParam(r, "taskId")
|
||||||
|
taskID, err := strconv.ParseInt(taskIDStr, 10, 64)
|
||||||
|
if err != nil {
|
||||||
|
s.respondError(w, http.StatusBadRequest, "Invalid task ID")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// Verify job belongs to user
|
||||||
|
var jobUserID int64
|
||||||
|
err = s.db.QueryRow("SELECT user_id FROM jobs WHERE id = ?", jobID).Scan(&jobUserID)
|
||||||
|
if err == sql.ErrNoRows {
|
||||||
|
s.respondError(w, http.StatusNotFound, "Job not found")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if err != nil {
|
||||||
|
s.respondError(w, http.StatusInternalServerError, fmt.Sprintf("Failed to verify job: %v", err))
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if jobUserID != userID {
|
||||||
|
s.respondError(w, http.StatusForbidden, "Access denied")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// Verify task belongs to job
|
||||||
|
var taskJobID int64
|
||||||
|
err = s.db.QueryRow("SELECT job_id FROM tasks WHERE id = ?", taskID).Scan(&taskJobID)
|
||||||
|
if err == sql.ErrNoRows {
|
||||||
|
s.respondError(w, http.StatusNotFound, "Task not found")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if err != nil {
|
||||||
|
s.respondError(w, http.StatusInternalServerError, fmt.Sprintf("Failed to verify task: %v", err))
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if taskJobID != jobID {
|
||||||
|
s.respondError(w, http.StatusBadRequest, "Task does not belong to this job")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// Upgrade to WebSocket
|
||||||
|
conn, err := s.wsUpgrader.Upgrade(w, r, nil)
|
||||||
|
if err != nil {
|
||||||
|
log.Printf("Failed to upgrade WebSocket: %v", err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
defer conn.Close()
|
||||||
|
|
||||||
|
key := fmt.Sprintf("%d:%d", jobID, taskID)
|
||||||
|
s.frontendConnsMu.Lock()
|
||||||
|
s.frontendConns[key] = conn
|
||||||
|
s.frontendConnsMu.Unlock()
|
||||||
|
|
||||||
|
defer func() {
|
||||||
|
s.frontendConnsMu.Lock()
|
||||||
|
delete(s.frontendConns, key)
|
||||||
|
s.frontendConnsMu.Unlock()
|
||||||
|
}()
|
||||||
|
|
||||||
|
// Send initial connection message
|
||||||
|
conn.WriteJSON(map[string]interface{}{
|
||||||
|
"type": "connected",
|
||||||
|
"timestamp": time.Now().Unix(),
|
||||||
|
})
|
||||||
|
|
||||||
|
// Get last log ID to start streaming from
|
||||||
|
lastIDStr := r.URL.Query().Get("last_id")
|
||||||
|
lastID := int64(0)
|
||||||
|
if lastIDStr != "" {
|
||||||
|
if id, err := strconv.ParseInt(lastIDStr, 10, 64); err == nil {
|
||||||
|
lastID = id
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Send existing logs
|
||||||
|
rows, err := s.db.Query(
|
||||||
|
`SELECT id, task_id, runner_id, log_level, message, step_name, created_at
|
||||||
|
FROM task_logs WHERE task_id = ? AND id > ? ORDER BY created_at ASC LIMIT 100`,
|
||||||
|
taskID, lastID,
|
||||||
|
)
|
||||||
|
if err == nil {
|
||||||
|
defer rows.Close()
|
||||||
|
for rows.Next() {
|
||||||
|
var log types.TaskLog
|
||||||
|
var runnerID sql.NullInt64
|
||||||
|
err := rows.Scan(
|
||||||
|
&log.ID, &log.TaskID, &runnerID, &log.LogLevel, &log.Message,
|
||||||
|
&log.StepName, &log.CreatedAt,
|
||||||
|
)
|
||||||
|
if err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if runnerID.Valid {
|
||||||
|
log.RunnerID = &runnerID.Int64
|
||||||
|
}
|
||||||
|
if log.ID > lastID {
|
||||||
|
lastID = log.ID
|
||||||
|
}
|
||||||
|
|
||||||
|
conn.WriteJSON(map[string]interface{}{
|
||||||
|
"type": "log",
|
||||||
|
"data": log,
|
||||||
|
"timestamp": time.Now().Unix(),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Poll for new logs and send them
|
||||||
|
ticker := time.NewTicker(1 * time.Second)
|
||||||
|
defer ticker.Stop()
|
||||||
|
|
||||||
|
ctx := r.Context()
|
||||||
|
for {
|
||||||
|
select {
|
||||||
|
case <-ctx.Done():
|
||||||
|
return
|
||||||
|
case <-ticker.C:
|
||||||
|
rows, err := s.db.Query(
|
||||||
|
`SELECT id, task_id, runner_id, log_level, message, step_name, created_at
|
||||||
|
FROM task_logs WHERE task_id = ? AND id > ? ORDER BY created_at ASC LIMIT 100`,
|
||||||
|
taskID, lastID,
|
||||||
|
)
|
||||||
|
if err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
for rows.Next() {
|
||||||
|
var log types.TaskLog
|
||||||
|
var runnerID sql.NullInt64
|
||||||
|
err := rows.Scan(
|
||||||
|
&log.ID, &log.TaskID, &runnerID, &log.LogLevel, &log.Message,
|
||||||
|
&log.StepName, &log.CreatedAt,
|
||||||
|
)
|
||||||
|
if err != nil {
|
||||||
|
rows.Close()
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if runnerID.Valid {
|
||||||
|
log.RunnerID = &runnerID.Int64
|
||||||
|
}
|
||||||
|
if log.ID > lastID {
|
||||||
|
lastID = log.ID
|
||||||
|
}
|
||||||
|
|
||||||
|
conn.WriteJSON(map[string]interface{}{
|
||||||
|
"type": "log",
|
||||||
|
"data": log,
|
||||||
|
"timestamp": time.Now().Unix(),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
rows.Close()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
@@ -2,18 +2,20 @@ package api
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
|
"crypto/subtle"
|
||||||
"database/sql"
|
"database/sql"
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"fmt"
|
"fmt"
|
||||||
"io"
|
"io"
|
||||||
"log"
|
"log"
|
||||||
"net/http"
|
"net/http"
|
||||||
"strings"
|
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
"github.com/go-chi/chi/v5"
|
|
||||||
"fuego/internal/auth"
|
"fuego/internal/auth"
|
||||||
"fuego/pkg/types"
|
"fuego/pkg/types"
|
||||||
|
|
||||||
|
"github.com/go-chi/chi/v5"
|
||||||
|
"github.com/gorilla/websocket"
|
||||||
)
|
)
|
||||||
|
|
||||||
// handleListRunners lists all runners
|
// handleListRunners lists all runners
|
||||||
@@ -163,206 +165,6 @@ func (s *Server) handleRegisterRunner(w http.ResponseWriter, r *http.Request) {
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
// handleRunnerHeartbeat updates runner heartbeat
|
|
||||||
func (s *Server) handleRunnerHeartbeat(w http.ResponseWriter, r *http.Request) {
|
|
||||||
runnerID, ok := r.Context().Value("runner_id").(int64)
|
|
||||||
if !ok {
|
|
||||||
s.respondError(w, http.StatusBadRequest, "runner_id not found in context")
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
_, err := s.db.Exec(
|
|
||||||
`UPDATE runners SET last_heartbeat = ?, status = ? WHERE id = ?`,
|
|
||||||
time.Now(), types.RunnerStatusOnline, runnerID,
|
|
||||||
)
|
|
||||||
if err != nil {
|
|
||||||
s.respondError(w, http.StatusInternalServerError, fmt.Sprintf("Failed to update heartbeat: %v", err))
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
s.respondJSON(w, http.StatusOK, map[string]string{"message": "Heartbeat updated"})
|
|
||||||
}
|
|
||||||
|
|
||||||
// handleGetRunnerTasks gets pending tasks for a runner
|
|
||||||
func (s *Server) handleGetRunnerTasks(w http.ResponseWriter, r *http.Request) {
|
|
||||||
runnerID, ok := r.Context().Value("runner_id").(int64)
|
|
||||||
if !ok {
|
|
||||||
s.respondError(w, http.StatusBadRequest, "runner_id not found in context")
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
// Get pending tasks
|
|
||||||
rows, err := s.db.Query(
|
|
||||||
`SELECT t.id, t.job_id, t.runner_id, t.frame_start, t.frame_end, t.status, t.output_path,
|
|
||||||
t.created_at, t.started_at, t.completed_at, t.error_message,
|
|
||||||
j.name as job_name, j.output_format
|
|
||||||
FROM tasks t
|
|
||||||
JOIN jobs j ON t.job_id = j.id
|
|
||||||
WHERE t.status = ? AND j.status != ?
|
|
||||||
ORDER BY t.created_at ASC
|
|
||||||
LIMIT 10`,
|
|
||||||
types.TaskStatusPending, types.JobStatusCancelled,
|
|
||||||
)
|
|
||||||
if err != nil {
|
|
||||||
s.respondError(w, http.StatusInternalServerError, fmt.Sprintf("Failed to query tasks: %v", err))
|
|
||||||
return
|
|
||||||
}
|
|
||||||
defer rows.Close()
|
|
||||||
|
|
||||||
tasks := []map[string]interface{}{}
|
|
||||||
for rows.Next() {
|
|
||||||
var task types.Task
|
|
||||||
var runnerID sql.NullInt64
|
|
||||||
var startedAt, completedAt sql.NullTime
|
|
||||||
var jobName, outputFormat string
|
|
||||||
|
|
||||||
err := rows.Scan(
|
|
||||||
&task.ID, &task.JobID, &runnerID, &task.FrameStart, &task.FrameEnd,
|
|
||||||
&task.Status, &task.OutputPath, &task.CreatedAt,
|
|
||||||
&startedAt, &completedAt, &task.ErrorMessage,
|
|
||||||
&jobName, &outputFormat,
|
|
||||||
)
|
|
||||||
if err != nil {
|
|
||||||
s.respondError(w, http.StatusInternalServerError, fmt.Sprintf("Failed to scan task: %v", err))
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
if runnerID.Valid {
|
|
||||||
task.RunnerID = &runnerID.Int64
|
|
||||||
}
|
|
||||||
if startedAt.Valid {
|
|
||||||
task.StartedAt = &startedAt.Time
|
|
||||||
}
|
|
||||||
if completedAt.Valid {
|
|
||||||
task.CompletedAt = &completedAt.Time
|
|
||||||
}
|
|
||||||
|
|
||||||
// Get input files for the job
|
|
||||||
var inputFiles []string
|
|
||||||
fileRows, err := s.db.Query(
|
|
||||||
`SELECT file_path FROM job_files WHERE job_id = ? AND file_type = ?`,
|
|
||||||
task.JobID, types.JobFileTypeInput,
|
|
||||||
)
|
|
||||||
if err == nil {
|
|
||||||
for fileRows.Next() {
|
|
||||||
var filePath string
|
|
||||||
if err := fileRows.Scan(&filePath); err == nil {
|
|
||||||
inputFiles = append(inputFiles, filePath)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
fileRows.Close()
|
|
||||||
}
|
|
||||||
|
|
||||||
tasks = append(tasks, map[string]interface{}{
|
|
||||||
"task": task,
|
|
||||||
"job_name": jobName,
|
|
||||||
"output_format": outputFormat,
|
|
||||||
"input_files": inputFiles,
|
|
||||||
})
|
|
||||||
|
|
||||||
// Assign task to runner
|
|
||||||
_, err = s.db.Exec(
|
|
||||||
`UPDATE tasks SET runner_id = ?, status = ? WHERE id = ?`,
|
|
||||||
runnerID, types.TaskStatusRunning, task.ID,
|
|
||||||
)
|
|
||||||
if err != nil {
|
|
||||||
s.respondError(w, http.StatusInternalServerError, fmt.Sprintf("Failed to assign task: %v", err))
|
|
||||||
return
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
s.respondJSON(w, http.StatusOK, tasks)
|
|
||||||
}
|
|
||||||
|
|
||||||
// handleCompleteTask marks a task as completed
|
|
||||||
func (s *Server) handleCompleteTask(w http.ResponseWriter, r *http.Request) {
|
|
||||||
taskID, err := parseID(r, "id")
|
|
||||||
if err != nil {
|
|
||||||
s.respondError(w, http.StatusBadRequest, err.Error())
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
var req struct {
|
|
||||||
OutputPath string `json:"output_path"`
|
|
||||||
Success bool `json:"success"`
|
|
||||||
Error string `json:"error,omitempty"`
|
|
||||||
}
|
|
||||||
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
|
|
||||||
s.respondError(w, http.StatusBadRequest, "Invalid request body")
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
status := types.TaskStatusCompleted
|
|
||||||
if !req.Success {
|
|
||||||
status = types.TaskStatusFailed
|
|
||||||
}
|
|
||||||
|
|
||||||
now := time.Now()
|
|
||||||
_, err = s.db.Exec(
|
|
||||||
`UPDATE tasks SET status = ?, output_path = ?, completed_at = ?, error_message = ? WHERE id = ?`,
|
|
||||||
status, req.OutputPath, now, req.Error, taskID,
|
|
||||||
)
|
|
||||||
if err != nil {
|
|
||||||
s.respondError(w, http.StatusInternalServerError, fmt.Sprintf("Failed to update task: %v", err))
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
// Update job progress
|
|
||||||
var jobID int64
|
|
||||||
var frameStart, frameEnd int
|
|
||||||
err = s.db.QueryRow(
|
|
||||||
`SELECT job_id, frame_start, frame_end FROM tasks WHERE id = ?`,
|
|
||||||
taskID,
|
|
||||||
).Scan(&jobID, &frameStart, &frameEnd)
|
|
||||||
if err == nil {
|
|
||||||
// Count completed tasks
|
|
||||||
var totalTasks, completedTasks int
|
|
||||||
s.db.QueryRow(
|
|
||||||
`SELECT COUNT(*) FROM tasks WHERE job_id = ?`,
|
|
||||||
jobID,
|
|
||||||
).Scan(&totalTasks)
|
|
||||||
s.db.QueryRow(
|
|
||||||
`SELECT COUNT(*) FROM tasks WHERE job_id = ? AND status = ?`,
|
|
||||||
jobID, types.TaskStatusCompleted,
|
|
||||||
).Scan(&completedTasks)
|
|
||||||
|
|
||||||
progress := float64(completedTasks) / float64(totalTasks) * 100.0
|
|
||||||
|
|
||||||
// Update job status
|
|
||||||
var jobStatus string
|
|
||||||
var outputFormat string
|
|
||||||
s.db.QueryRow(`SELECT output_format FROM jobs WHERE id = ?`, jobID).Scan(&outputFormat)
|
|
||||||
|
|
||||||
if completedTasks == totalTasks {
|
|
||||||
jobStatus = string(types.JobStatusCompleted)
|
|
||||||
now := time.Now()
|
|
||||||
s.db.Exec(
|
|
||||||
`UPDATE jobs SET status = ?, progress = ?, completed_at = ? WHERE id = ?`,
|
|
||||||
jobStatus, progress, now, jobID,
|
|
||||||
)
|
|
||||||
|
|
||||||
// For MP4 jobs, create a video generation task
|
|
||||||
if outputFormat == "MP4" {
|
|
||||||
go s.generateMP4Video(jobID)
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
jobStatus = string(types.JobStatusRunning)
|
|
||||||
var startedAt sql.NullTime
|
|
||||||
s.db.QueryRow(`SELECT started_at FROM jobs WHERE id = ?`, jobID).Scan(&startedAt)
|
|
||||||
if !startedAt.Valid {
|
|
||||||
now := time.Now()
|
|
||||||
s.db.Exec(`UPDATE jobs SET started_at = ? WHERE id = ?`, now, jobID)
|
|
||||||
}
|
|
||||||
s.db.Exec(
|
|
||||||
`UPDATE jobs SET status = ?, progress = ? WHERE id = ?`,
|
|
||||||
jobStatus, progress, jobID,
|
|
||||||
)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
s.respondJSON(w, http.StatusOK, map[string]string{"message": "Task completed"})
|
|
||||||
}
|
|
||||||
|
|
||||||
// handleUpdateTaskProgress updates task progress
|
// handleUpdateTaskProgress updates task progress
|
||||||
func (s *Server) handleUpdateTaskProgress(w http.ResponseWriter, r *http.Request) {
|
func (s *Server) handleUpdateTaskProgress(w http.ResponseWriter, r *http.Request) {
|
||||||
_, err := parseID(r, "id")
|
_, err := parseID(r, "id")
|
||||||
@@ -516,12 +318,12 @@ func (s *Server) handleGetJobStatusForRunner(w http.ResponseWriter, r *http.Requ
|
|||||||
|
|
||||||
err = s.db.QueryRow(
|
err = s.db.QueryRow(
|
||||||
`SELECT id, user_id, name, status, progress, frame_start, frame_end, output_format,
|
`SELECT id, user_id, name, status, progress, frame_start, frame_end, output_format,
|
||||||
created_at, started_at, completed_at, error_message
|
allow_parallel_runners, created_at, started_at, completed_at, error_message
|
||||||
FROM jobs WHERE id = ?`,
|
FROM jobs WHERE id = ?`,
|
||||||
jobID,
|
jobID,
|
||||||
).Scan(
|
).Scan(
|
||||||
&job.ID, &job.UserID, &job.Name, &job.Status, &job.Progress,
|
&job.ID, &job.UserID, &job.Name, &job.Status, &job.Progress,
|
||||||
&job.FrameStart, &job.FrameEnd, &job.OutputFormat,
|
&job.FrameStart, &job.FrameEnd, &job.OutputFormat, &job.AllowParallelRunners,
|
||||||
&job.CreatedAt, &startedAt, &completedAt, &job.ErrorMessage,
|
&job.CreatedAt, &startedAt, &completedAt, &job.ErrorMessage,
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -580,3 +382,500 @@ func (s *Server) handleGetJobFilesForRunner(w http.ResponseWriter, r *http.Reque
|
|||||||
s.respondJSON(w, http.StatusOK, files)
|
s.respondJSON(w, http.StatusOK, files)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// WebSocket message types

// WSMessage is the envelope for every message exchanged over the
// runner <-> manager WebSocket. Data holds a type-specific JSON payload
// that is decoded lazily based on Type.
type WSMessage struct {
	Type      string          `json:"type"`      // e.g. "heartbeat", "log_entry", "task_update", "task_complete", "task_assignment"
	Data      json.RawMessage `json:"data"`      // payload, decoded per Type
	Timestamp int64           `json:"timestamp"` // Unix seconds at send time
}
|
||||||
|
|
||||||
|
// WSTaskAssignment is the payload pushed manager -> runner (Type
// "task_assignment") to assign a render task.
type WSTaskAssignment struct {
	TaskID       int64    `json:"task_id"`
	JobID        int64    `json:"job_id"`
	JobName      string   `json:"job_name"`
	OutputFormat string   `json:"output_format"`
	FrameStart   int      `json:"frame_start"`
	FrameEnd     int      `json:"frame_end"`
	InputFiles   []string `json:"input_files"` // job input file paths (fetched over HTTP, not WebSocket)
}
|
||||||
|
|
||||||
|
// WSLogEntry is the payload sent runner -> manager (Type "log_entry")
// to report a single task log line.
type WSLogEntry struct {
	TaskID   int64  `json:"task_id"`
	LogLevel string `json:"log_level"`
	Message  string `json:"message"`
	StepName string `json:"step_name,omitempty"`
}
|
||||||
|
|
||||||
|
// WSTaskUpdate is the payload sent runner -> manager for task progress
// (Type "task_update") and completion (Type "task_complete").
type WSTaskUpdate struct {
	TaskID     int64  `json:"task_id"`
	Status     string `json:"status"`
	OutputPath string `json:"output_path,omitempty"` // set on completion
	Success    bool   `json:"success"`
	Error      string `json:"error,omitempty"`
}
|
||||||
|
|
||||||
|
// handleRunnerWebSocket handles WebSocket connections from runners.
//
// It authenticates the runner from query parameters (runner_id,
// timestamp, signature) — the signed-header scheme used by
// runnerAuthMiddleware does not fit a WebSocket handshake — then
// upgrades the connection, registers it for task distribution, and
// processes inbound messages until the connection closes.
func (s *Server) handleRunnerWebSocket(w http.ResponseWriter, r *http.Request) {
	// Get runner ID and signature from query params
	runnerIDStr := r.URL.Query().Get("runner_id")
	signature := r.URL.Query().Get("signature")
	timestampStr := r.URL.Query().Get("timestamp")

	if runnerIDStr == "" || signature == "" || timestampStr == "" {
		s.respondError(w, http.StatusBadRequest, "runner_id, signature, and timestamp required")
		return
	}

	var runnerID int64
	_, err := fmt.Sscanf(runnerIDStr, "%d", &runnerID)
	if err != nil {
		s.respondError(w, http.StatusBadRequest, "invalid runner_id")
		return
	}

	// Get runner secret (only registered/verified runners have one on file).
	runnerSecret, err := s.secrets.GetRunnerSecret(runnerID)
	if err != nil {
		s.respondError(w, http.StatusUnauthorized, "runner not found or not verified")
		return
	}

	// Verify signature
	var timestamp int64
	_, err = fmt.Sscanf(timestampStr, "%d", &timestamp)
	if err != nil {
		s.respondError(w, http.StatusBadRequest, "invalid timestamp")
		return
	}

	// Verify signature manually (similar to HTTP auth)
	timestampTime := time.Unix(timestamp, 0)

	// Check timestamp is not too old (limits the replay-attack window)
	if time.Since(timestampTime) > 5*time.Minute {
		s.respondError(w, http.StatusUnauthorized, "timestamp too old")
		return
	}

	// Check timestamp is not in the future (allow 1 minute clock skew)
	if timestampTime.After(time.Now().Add(1 * time.Minute)) {
		s.respondError(w, http.StatusUnauthorized, "timestamp in future")
		return
	}

	// Build the message that should be signed: GET <path> with empty body.
	path := r.URL.Path
	expectedSig := auth.SignRequest("GET", path, "", runnerSecret, timestampTime)

	// Compare signatures (constant-time, to avoid a timing side channel)
	if subtle.ConstantTimeCompare([]byte(signature), []byte(expectedSig)) != 1 {
		s.respondError(w, http.StatusUnauthorized, "invalid signature")
		return
	}

	// Upgrade to WebSocket
	conn, err := s.wsUpgrader.Upgrade(w, r, nil)
	if err != nil {
		log.Printf("Failed to upgrade WebSocket: %v", err)
		return
	}
	defer conn.Close()

	// Register connection, closing any stale one first so the registry
	// holds at most one connection per runner.
	s.runnerConnsMu.Lock()
	// Remove old connection if exists
	if oldConn, exists := s.runnerConns[runnerID]; exists {
		oldConn.Close()
	}
	s.runnerConns[runnerID] = conn
	s.runnerConnsMu.Unlock()

	// Update runner status to online
	_, _ = s.db.Exec(
		`UPDATE runners SET status = ?, last_heartbeat = ? WHERE id = ?`,
		types.RunnerStatusOnline, time.Now(), runnerID,
	)

	// Cleanup on disconnect: deregister the connection and mark the
	// runner offline. DB errors are deliberately ignored (best-effort).
	defer func() {
		s.runnerConnsMu.Lock()
		delete(s.runnerConns, runnerID)
		s.runnerConnsMu.Unlock()
		_, _ = s.db.Exec(
			`UPDATE runners SET status = ? WHERE id = ?`,
			types.RunnerStatusOffline, runnerID,
		)
	}()

	// Each pong from the runner refreshes its heartbeat timestamp.
	conn.SetPongHandler(func(string) error {
		_, _ = s.db.Exec(
			`UPDATE runners SET last_heartbeat = ?, status = ? WHERE id = ?`,
			time.Now(), types.RunnerStatusOnline, runnerID,
		)
		return nil
	})

	// Send ping every 30 seconds. The goroutine exits once this runner's
	// entry is gone from the registry or a ping write fails.
	go func() {
		ticker := time.NewTicker(30 * time.Second)
		defer ticker.Stop()
		for {
			select {
			case <-ticker.C:
				s.runnerConnsMu.RLock()
				conn, exists := s.runnerConns[runnerID]
				s.runnerConnsMu.RUnlock()
				if !exists {
					return
				}
				if err := conn.WriteControl(websocket.PingMessage, []byte{}, time.Now().Add(10*time.Second)); err != nil {
					return
				}
			}
		}
	}()

	// Handle incoming messages until a read error / close.
	for {
		var msg WSMessage
		err := conn.ReadJSON(&msg)
		if err != nil {
			// Only log abnormal closures; normal disconnects are expected.
			if websocket.IsUnexpectedCloseError(err, websocket.CloseGoingAway, websocket.CloseAbnormalClosure) {
				log.Printf("WebSocket error for runner %d: %v", runnerID, err)
			}
			break
		}

		switch msg.Type {
		case "heartbeat":
			// Update heartbeat
			_, _ = s.db.Exec(
				`UPDATE runners SET last_heartbeat = ?, status = ? WHERE id = ?`,
				time.Now(), types.RunnerStatusOnline, runnerID,
			)

		case "log_entry":
			// Malformed payloads are silently dropped.
			var logEntry WSLogEntry
			if err := json.Unmarshal(msg.Data, &logEntry); err == nil {
				s.handleWebSocketLog(runnerID, logEntry)
			}

		case "task_update":
			var taskUpdate WSTaskUpdate
			if err := json.Unmarshal(msg.Data, &taskUpdate); err == nil {
				s.handleWebSocketTaskUpdate(runnerID, taskUpdate)
			}

		case "task_complete":
			var taskUpdate WSTaskUpdate
			if err := json.Unmarshal(msg.Data, &taskUpdate); err == nil {
				s.handleWebSocketTaskComplete(runnerID, taskUpdate)
			}
		}
	}
}
|
||||||
|
|
||||||
|
// handleWebSocketLog handles log entries from WebSocket
|
||||||
|
func (s *Server) handleWebSocketLog(runnerID int64, logEntry WSLogEntry) {
|
||||||
|
// Store log in database
|
||||||
|
_, err := s.db.Exec(
|
||||||
|
`INSERT INTO task_logs (task_id, runner_id, log_level, message, step_name, created_at)
|
||||||
|
VALUES (?, ?, ?, ?, ?, ?)`,
|
||||||
|
logEntry.TaskID, runnerID, logEntry.LogLevel, logEntry.Message, logEntry.StepName, time.Now(),
|
||||||
|
)
|
||||||
|
if err != nil {
|
||||||
|
log.Printf("Failed to store log: %v", err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// Broadcast to frontend clients
|
||||||
|
s.broadcastLogToFrontend(logEntry.TaskID, logEntry)
|
||||||
|
}
|
||||||
|
|
||||||
|
// handleWebSocketTaskUpdate handles task status updates from WebSocket
|
||||||
|
func (s *Server) handleWebSocketTaskUpdate(runnerID int64, taskUpdate WSTaskUpdate) {
|
||||||
|
// This can be used for progress updates
|
||||||
|
// For now, we'll just log it
|
||||||
|
log.Printf("Task %d update from runner %d: %s", taskUpdate.TaskID, runnerID, taskUpdate.Status)
|
||||||
|
}
|
||||||
|
|
||||||
|
// handleWebSocketTaskComplete handles task completion from WebSocket
|
||||||
|
func (s *Server) handleWebSocketTaskComplete(runnerID int64, taskUpdate WSTaskUpdate) {
|
||||||
|
// Verify task belongs to runner
|
||||||
|
var taskRunnerID sql.NullInt64
|
||||||
|
err := s.db.QueryRow("SELECT runner_id FROM tasks WHERE id = ?", taskUpdate.TaskID).Scan(&taskRunnerID)
|
||||||
|
if err != nil || !taskRunnerID.Valid || taskRunnerID.Int64 != runnerID {
|
||||||
|
log.Printf("Task %d does not belong to runner %d", taskUpdate.TaskID, runnerID)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
status := types.TaskStatusCompleted
|
||||||
|
if !taskUpdate.Success {
|
||||||
|
status = types.TaskStatusFailed
|
||||||
|
}
|
||||||
|
|
||||||
|
now := time.Now()
|
||||||
|
_, err = s.db.Exec(
|
||||||
|
`UPDATE tasks SET status = ?, output_path = ?, completed_at = ?, error_message = ? WHERE id = ?`,
|
||||||
|
status, taskUpdate.OutputPath, now, taskUpdate.Error, taskUpdate.TaskID,
|
||||||
|
)
|
||||||
|
if err != nil {
|
||||||
|
log.Printf("Failed to update task: %v", err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// Update job progress
|
||||||
|
var jobID int64
|
||||||
|
var frameStart, frameEnd int
|
||||||
|
err = s.db.QueryRow(
|
||||||
|
`SELECT job_id, frame_start, frame_end FROM tasks WHERE id = ?`,
|
||||||
|
taskUpdate.TaskID,
|
||||||
|
).Scan(&jobID, &frameStart, &frameEnd)
|
||||||
|
if err == nil {
|
||||||
|
var totalTasks, completedTasks int
|
||||||
|
s.db.QueryRow(`SELECT COUNT(*) FROM tasks WHERE job_id = ?`, jobID).Scan(&totalTasks)
|
||||||
|
s.db.QueryRow(
|
||||||
|
`SELECT COUNT(*) FROM tasks WHERE job_id = ? AND status = ?`,
|
||||||
|
jobID, types.TaskStatusCompleted,
|
||||||
|
).Scan(&completedTasks)
|
||||||
|
|
||||||
|
progress := float64(completedTasks) / float64(totalTasks) * 100.0
|
||||||
|
|
||||||
|
var jobStatus string
|
||||||
|
var outputFormat string
|
||||||
|
s.db.QueryRow(`SELECT output_format FROM jobs WHERE id = ?`, jobID).Scan(&outputFormat)
|
||||||
|
|
||||||
|
if completedTasks == totalTasks {
|
||||||
|
jobStatus = string(types.JobStatusCompleted)
|
||||||
|
s.db.Exec(
|
||||||
|
`UPDATE jobs SET status = ?, progress = ?, completed_at = ? WHERE id = ?`,
|
||||||
|
jobStatus, progress, now, jobID,
|
||||||
|
)
|
||||||
|
|
||||||
|
if outputFormat == "MP4" {
|
||||||
|
go s.generateMP4Video(jobID)
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
jobStatus = string(types.JobStatusRunning)
|
||||||
|
var startedAt sql.NullTime
|
||||||
|
s.db.QueryRow(`SELECT started_at FROM jobs WHERE id = ?`, jobID).Scan(&startedAt)
|
||||||
|
if !startedAt.Valid {
|
||||||
|
s.db.Exec(`UPDATE jobs SET started_at = ? WHERE id = ?`, now, jobID)
|
||||||
|
}
|
||||||
|
s.db.Exec(
|
||||||
|
`UPDATE jobs SET status = ?, progress = ? WHERE id = ?`,
|
||||||
|
jobStatus, progress, jobID,
|
||||||
|
)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// broadcastLogToFrontend broadcasts log to connected frontend clients
|
||||||
|
func (s *Server) broadcastLogToFrontend(taskID int64, logEntry WSLogEntry) {
|
||||||
|
// Get job_id from task
|
||||||
|
var jobID int64
|
||||||
|
err := s.db.QueryRow("SELECT job_id FROM tasks WHERE id = ?", taskID).Scan(&jobID)
|
||||||
|
if err != nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
key := fmt.Sprintf("%d:%d", jobID, taskID)
|
||||||
|
s.frontendConnsMu.RLock()
|
||||||
|
conn, exists := s.frontendConns[key]
|
||||||
|
s.frontendConnsMu.RUnlock()
|
||||||
|
|
||||||
|
if exists && conn != nil {
|
||||||
|
// Get full log entry from database for consistency
|
||||||
|
var log types.TaskLog
|
||||||
|
var runnerID sql.NullInt64
|
||||||
|
err := s.db.QueryRow(
|
||||||
|
`SELECT id, task_id, runner_id, log_level, message, step_name, created_at
|
||||||
|
FROM task_logs WHERE task_id = ? AND message = ? ORDER BY id DESC LIMIT 1`,
|
||||||
|
taskID, logEntry.Message,
|
||||||
|
).Scan(&log.ID, &log.TaskID, &runnerID, &log.LogLevel, &log.Message, &log.StepName, &log.CreatedAt)
|
||||||
|
if err == nil {
|
||||||
|
if runnerID.Valid {
|
||||||
|
log.RunnerID = &runnerID.Int64
|
||||||
|
}
|
||||||
|
msg := map[string]interface{}{
|
||||||
|
"type": "log",
|
||||||
|
"data": log,
|
||||||
|
"timestamp": time.Now().Unix(),
|
||||||
|
}
|
||||||
|
conn.WriteJSON(msg)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// distributeTasksToRunners pushes available tasks to connected runners.
//
// It scans pending tasks (oldest first, capped at 100, skipping
// cancelled jobs), then for each task picks an idle connected runner —
// honoring allow_parallel_runners — and assigns the task over the
// runner's WebSocket via assignTaskToRunner. Called periodically from
// the recovery loop; assignment checks race with other callers, so the
// per-task "already assigned" re-check below narrows (but does not
// eliminate) the window.
func (s *Server) distributeTasksToRunners() {
	// Get all pending tasks
	rows, err := s.db.Query(
		`SELECT t.id, t.job_id, t.frame_start, t.frame_end, j.allow_parallel_runners, j.status as job_status
		FROM tasks t
		JOIN jobs j ON t.job_id = j.id
		WHERE t.status = ? AND j.status != ?
		ORDER BY t.created_at ASC
		LIMIT 100`,
		types.TaskStatusPending, types.JobStatusCancelled,
	)
	if err != nil {
		log.Printf("Failed to query pending tasks: %v", err)
		return
	}
	defer rows.Close()

	var pendingTasks []struct {
		TaskID               int64
		JobID                int64
		FrameStart           int
		FrameEnd             int
		AllowParallelRunners bool
	}

	for rows.Next() {
		var t struct {
			TaskID               int64
			JobID                int64
			FrameStart           int
			FrameEnd             int
			AllowParallelRunners bool
		}
		var allowParallel int
		var jobStatus string // scanned only to satisfy the SELECT list
		err := rows.Scan(&t.TaskID, &t.JobID, &t.FrameStart, &t.FrameEnd, &allowParallel, &jobStatus)
		if err != nil {
			continue
		}
		// SQLite stores booleans as 0/1 integers.
		t.AllowParallelRunners = allowParallel == 1
		pendingTasks = append(pendingTasks, t)
	}

	if len(pendingTasks) == 0 {
		return
	}

	// Snapshot the IDs of currently connected runners.
	s.runnerConnsMu.RLock()
	connectedRunners := make([]int64, 0, len(s.runnerConns))
	for runnerID := range s.runnerConns {
		connectedRunners = append(connectedRunners, runnerID)
	}
	s.runnerConnsMu.RUnlock()

	if len(connectedRunners) == 0 {
		return
	}

	// Distribute tasks to runners
	for _, task := range pendingTasks {
		// Check if task is already assigned (may have been picked up
		// since the query above).
		var assignedRunnerID sql.NullInt64
		err := s.db.QueryRow("SELECT runner_id FROM tasks WHERE id = ?", task.TaskID).Scan(&assignedRunnerID)
		if err == nil && assignedRunnerID.Valid {
			continue // Already assigned
		}

		// Find available runner
		var selectedRunnerID int64
		for _, runnerID := range connectedRunners {
			// Check if runner is busy (has running tasks)
			var runningCount int
			s.db.QueryRow(
				`SELECT COUNT(*) FROM tasks WHERE runner_id = ? AND status = ?`,
				runnerID, types.TaskStatusRunning,
			).Scan(&runningCount)

			if runningCount > 0 {
				continue // Runner is busy
			}

			// For non-parallel jobs, skip runners that already hold
			// pending/running tasks for this job.
			// NOTE(review): this SQL counts tasks on the CANDIDATE
			// runner itself, while the original comment claimed it
			// checked for "another runner" — confirm the intended
			// single-runner-per-job semantics.
			if !task.AllowParallelRunners {
				var jobTaskCount int
				s.db.QueryRow(
					`SELECT COUNT(*) FROM tasks
					WHERE job_id = ? AND runner_id = ? AND status IN (?, ?)`,
					task.JobID, runnerID, types.TaskStatusPending, types.TaskStatusRunning,
				).Scan(&jobTaskCount)
				if jobTaskCount > 0 {
					continue
				}
			}

			selectedRunnerID = runnerID
			break
		}

		if selectedRunnerID == 0 {
			continue // No available runner
		}

		// Assign task to runner (DB update + WebSocket push).
		if err := s.assignTaskToRunner(selectedRunnerID, task.TaskID); err != nil {
			log.Printf("Failed to assign task %d to runner %d: %v", task.TaskID, selectedRunnerID, err)
		}
	}
}
|
||||||
|
|
||||||
|
// assignTaskToRunner sends a task to a runner via WebSocket
|
||||||
|
func (s *Server) assignTaskToRunner(runnerID int64, taskID int64) error {
|
||||||
|
s.runnerConnsMu.RLock()
|
||||||
|
conn, exists := s.runnerConns[runnerID]
|
||||||
|
s.runnerConnsMu.RUnlock()
|
||||||
|
|
||||||
|
if !exists {
|
||||||
|
return fmt.Errorf("runner %d not connected", runnerID)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Get task details
|
||||||
|
var task WSTaskAssignment
|
||||||
|
var jobName, outputFormat string
|
||||||
|
err := s.db.QueryRow(
|
||||||
|
`SELECT t.job_id, t.frame_start, t.frame_end, j.name, j.output_format
|
||||||
|
FROM tasks t JOIN jobs j ON t.job_id = j.id WHERE t.id = ?`,
|
||||||
|
taskID,
|
||||||
|
).Scan(&task.JobID, &task.FrameStart, &task.FrameEnd, &jobName, &outputFormat)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
task.TaskID = taskID
|
||||||
|
task.JobID = task.JobID
|
||||||
|
task.JobName = jobName
|
||||||
|
task.OutputFormat = outputFormat
|
||||||
|
|
||||||
|
// Get input files
|
||||||
|
rows, err := s.db.Query(
|
||||||
|
`SELECT file_path FROM job_files WHERE job_id = ? AND file_type = ?`,
|
||||||
|
task.JobID, types.JobFileTypeInput,
|
||||||
|
)
|
||||||
|
if err == nil {
|
||||||
|
defer rows.Close()
|
||||||
|
for rows.Next() {
|
||||||
|
var filePath string
|
||||||
|
if err := rows.Scan(&filePath); err == nil {
|
||||||
|
task.InputFiles = append(task.InputFiles, filePath)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Assign task to runner in database
|
||||||
|
now := time.Now()
|
||||||
|
_, err = s.db.Exec(
|
||||||
|
`UPDATE tasks SET runner_id = ?, status = ?, started_at = ? WHERE id = ?`,
|
||||||
|
runnerID, types.TaskStatusRunning, now, taskID,
|
||||||
|
)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
// Send task via WebSocket
|
||||||
|
msg := WSMessage{
|
||||||
|
Type: "task_assignment",
|
||||||
|
Timestamp: time.Now().Unix(),
|
||||||
|
}
|
||||||
|
msg.Data, _ = json.Marshal(task)
|
||||||
|
return conn.WriteJSON(msg)
|
||||||
|
}
|
||||||
|
|||||||
@@ -1,19 +1,23 @@
|
|||||||
package api
|
package api
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"database/sql"
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"fmt"
|
"fmt"
|
||||||
"log"
|
"log"
|
||||||
"net/http"
|
"net/http"
|
||||||
"strconv"
|
"strconv"
|
||||||
|
"sync"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
"github.com/go-chi/chi/v5"
|
"github.com/go-chi/chi/v5"
|
||||||
"github.com/go-chi/chi/v5/middleware"
|
"github.com/go-chi/chi/v5/middleware"
|
||||||
"github.com/go-chi/cors"
|
"github.com/go-chi/cors"
|
||||||
|
"github.com/gorilla/websocket"
|
||||||
"fuego/internal/auth"
|
"fuego/internal/auth"
|
||||||
"fuego/internal/database"
|
"fuego/internal/database"
|
||||||
"fuego/internal/storage"
|
"fuego/internal/storage"
|
||||||
|
"fuego/pkg/types"
|
||||||
)
|
)
|
||||||
|
|
||||||
// Server represents the API server
|
// Server represents the API server
|
||||||
@@ -23,6 +27,13 @@ type Server struct {
|
|||||||
secrets *auth.Secrets
|
secrets *auth.Secrets
|
||||||
storage *storage.Storage
|
storage *storage.Storage
|
||||||
router *chi.Mux
|
router *chi.Mux
|
||||||
|
|
||||||
|
// WebSocket connections
|
||||||
|
wsUpgrader websocket.Upgrader
|
||||||
|
runnerConns map[int64]*websocket.Conn
|
||||||
|
runnerConnsMu sync.RWMutex
|
||||||
|
frontendConns map[string]*websocket.Conn // key: "jobId:taskId"
|
||||||
|
frontendConnsMu sync.RWMutex
|
||||||
}
|
}
|
||||||
|
|
||||||
// NewServer creates a new API server
|
// NewServer creates a new API server
|
||||||
@@ -38,10 +49,20 @@ func NewServer(db *database.DB, auth *auth.Auth, storage *storage.Storage) (*Ser
|
|||||||
secrets: secrets,
|
secrets: secrets,
|
||||||
storage: storage,
|
storage: storage,
|
||||||
router: chi.NewRouter(),
|
router: chi.NewRouter(),
|
||||||
|
wsUpgrader: websocket.Upgrader{
|
||||||
|
CheckOrigin: func(r *http.Request) bool {
|
||||||
|
return true // Allow all origins for now
|
||||||
|
},
|
||||||
|
ReadBufferSize: 1024,
|
||||||
|
WriteBufferSize: 1024,
|
||||||
|
},
|
||||||
|
runnerConns: make(map[int64]*websocket.Conn),
|
||||||
|
frontendConns: make(map[string]*websocket.Conn),
|
||||||
}
|
}
|
||||||
|
|
||||||
s.setupMiddleware()
|
s.setupMiddleware()
|
||||||
s.setupRoutes()
|
s.setupRoutes()
|
||||||
|
s.StartBackgroundTasks()
|
||||||
|
|
||||||
return s, nil
|
return s, nil
|
||||||
}
|
}
|
||||||
@@ -87,6 +108,10 @@ func (s *Server) setupRoutes() {
|
|||||||
r.Get("/{id}/files", s.handleListJobFiles)
|
r.Get("/{id}/files", s.handleListJobFiles)
|
||||||
r.Get("/{id}/files/{fileId}/download", s.handleDownloadJobFile)
|
r.Get("/{id}/files/{fileId}/download", s.handleDownloadJobFile)
|
||||||
r.Get("/{id}/video", s.handleStreamVideo)
|
r.Get("/{id}/video", s.handleStreamVideo)
|
||||||
|
r.Get("/{id}/tasks/{taskId}/logs", s.handleGetTaskLogs)
|
||||||
|
r.Get("/{id}/tasks/{taskId}/logs/ws", s.handleStreamTaskLogsWebSocket)
|
||||||
|
r.Get("/{id}/tasks/{taskId}/steps", s.handleGetTaskSteps)
|
||||||
|
r.Post("/{id}/tasks/{taskId}/retry", s.handleRetryTask)
|
||||||
})
|
})
|
||||||
|
|
||||||
s.router.Route("/api/runners", func(r chi.Router) {
|
s.router.Route("/api/runners", func(r chi.Router) {
|
||||||
@@ -118,14 +143,14 @@ func (s *Server) setupRoutes() {
|
|||||||
// Registration doesn't require auth (uses token)
|
// Registration doesn't require auth (uses token)
|
||||||
r.Post("/register", s.handleRegisterRunner)
|
r.Post("/register", s.handleRegisterRunner)
|
||||||
|
|
||||||
// All other endpoints require runner authentication
|
// WebSocket endpoint (auth handled in handler)
|
||||||
|
r.Get("/ws", s.handleRunnerWebSocket)
|
||||||
|
|
||||||
|
// File operations still use HTTP (WebSocket not suitable for large files)
|
||||||
r.Group(func(r chi.Router) {
|
r.Group(func(r chi.Router) {
|
||||||
r.Use(func(next http.Handler) http.Handler {
|
r.Use(func(next http.Handler) http.Handler {
|
||||||
return http.HandlerFunc(s.runnerAuthMiddleware(next.ServeHTTP))
|
return http.HandlerFunc(s.runnerAuthMiddleware(next.ServeHTTP))
|
||||||
})
|
})
|
||||||
r.Post("/heartbeat", s.handleRunnerHeartbeat)
|
|
||||||
r.Get("/tasks", s.handleGetRunnerTasks)
|
|
||||||
r.Post("/tasks/{id}/complete", s.handleCompleteTask)
|
|
||||||
r.Post("/tasks/{id}/progress", s.handleUpdateTaskProgress)
|
r.Post("/tasks/{id}/progress", s.handleUpdateTaskProgress)
|
||||||
r.Get("/files/{jobId}/{fileName}", s.handleDownloadFileForRunner)
|
r.Get("/files/{jobId}/{fileName}", s.handleDownloadFileForRunner)
|
||||||
r.Post("/files/{jobId}/upload", s.handleUploadFileFromRunner)
|
r.Post("/files/{jobId}/upload", s.handleUploadFileFromRunner)
|
||||||
@@ -282,3 +307,207 @@ func parseID(r *http.Request, param string) (int64, error) {
|
|||||||
return id, nil
|
return id, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// StartBackgroundTasks starts the background goroutines for error
// recovery and task distribution (see recoverStuckTasks). Called once
// from NewServer; the goroutines run for the life of the process.
func (s *Server) StartBackgroundTasks() {
	go s.recoverStuckTasks()
}
|
||||||
|
|
||||||
|
// recoverStuckTasks periodically checks for dead runners and stuck
// tasks. Every 10 seconds it finds runners with stale heartbeats,
// resets (or fails) their running tasks according to the task's retry
// budget, and marks the runner offline; it also checks task timeouts
// via recoverTaskTimeouts. A second ticker triggers WebSocket task
// distribution every 5 seconds. Runs forever; start once as a
// goroutine from StartBackgroundTasks.
func (s *Server) recoverStuckTasks() {
	ticker := time.NewTicker(10 * time.Second)
	defer ticker.Stop()

	// Also distribute tasks every 5 seconds
	distributeTicker := time.NewTicker(5 * time.Second)
	defer distributeTicker.Stop()

	go func() {
		for range distributeTicker.C {
			s.distributeTasksToRunners()
		}
	}()

	for range ticker.C {
		// Each pass runs inside a closure so its defers fire at the end
		// of the iteration and a panic in one pass cannot kill the loop.
		func() {
			defer func() {
				if r := recover(); r != nil {
					log.Printf("Panic in recoverStuckTasks: %v", r)
				}
			}()

			// Find dead runners (no heartbeat for 90 seconds)
			rows, err := s.db.Query(
				`SELECT id FROM runners
				WHERE last_heartbeat < datetime('now', '-90 seconds')
				AND status = ?`,
				types.RunnerStatusOnline,
			)
			if err != nil {
				log.Printf("Failed to query dead runners: %v", err)
				return
			}
			defer rows.Close()

			var deadRunnerIDs []int64
			for rows.Next() {
				var runnerID int64
				if err := rows.Scan(&runnerID); err == nil {
					deadRunnerIDs = append(deadRunnerIDs, runnerID)
				}
			}
			// Close eagerly so the queries below don't run with this
			// result set still open (the deferred Close becomes a no-op).
			rows.Close()

			if len(deadRunnerIDs) == 0 {
				// Check for task timeouts
				s.recoverTaskTimeouts()
				return
			}

			// Reset tasks assigned to dead runners
			for _, runnerID := range deadRunnerIDs {
				// Get tasks assigned to this runner
				taskRows, err := s.db.Query(
					`SELECT id, retry_count, max_retries FROM tasks
					WHERE runner_id = ? AND status = ?`,
					runnerID, types.TaskStatusRunning,
				)
				if err != nil {
					log.Printf("Failed to query tasks for runner %d: %v", runnerID, err)
					continue
				}

				var tasksToReset []struct {
					ID         int64
					RetryCount int
					MaxRetries int
				}

				for taskRows.Next() {
					var t struct {
						ID         int64
						RetryCount int
						MaxRetries int
					}
					if err := taskRows.Scan(&t.ID, &t.RetryCount, &t.MaxRetries); err == nil {
						tasksToReset = append(tasksToReset, t)
					}
				}
				taskRows.Close()

				// Reset or fail tasks depending on remaining retry budget.
				for _, task := range tasksToReset {
					if task.RetryCount >= task.MaxRetries {
						// Mark as failed
						_, err = s.db.Exec(
							`UPDATE tasks SET status = ?, error_message = ?, runner_id = NULL
							WHERE id = ?`,
							types.TaskStatusFailed, "Runner died, max retries exceeded", task.ID,
						)
						if err != nil {
							log.Printf("Failed to mark task %d as failed: %v", task.ID, err)
						}
					} else {
						// Reset to pending
						_, err = s.db.Exec(
							`UPDATE tasks SET status = ?, runner_id = NULL, current_step = NULL,
							retry_count = retry_count + 1 WHERE id = ?`,
							types.TaskStatusPending, task.ID,
						)
						if err != nil {
							log.Printf("Failed to reset task %d: %v", task.ID, err)
						} else {
							// Add log entry so the UI shows why the task restarted.
							_, _ = s.db.Exec(
								`INSERT INTO task_logs (task_id, log_level, message, step_name, created_at)
								VALUES (?, ?, ?, ?, ?)`,
								task.ID, types.LogLevelWarn, fmt.Sprintf("Runner died, task reset (retry %d/%d)", task.RetryCount+1, task.MaxRetries),
								"", time.Now(),
							)
						}
					}
				}

				// Mark runner as offline
				_, _ = s.db.Exec(
					`UPDATE runners SET status = ? WHERE id = ?`,
					types.RunnerStatusOffline, runnerID,
				)
			}

			// Check for task timeouts
			s.recoverTaskTimeouts()

			// Distribute newly recovered tasks
			s.distributeTasksToRunners()
		}()
	}
}
|
||||||
|
|
||||||
|
// recoverTaskTimeouts handles tasks that have exceeded their timeout
|
||||||
|
func (s *Server) recoverTaskTimeouts() {
|
||||||
|
// Find tasks running longer than their timeout
|
||||||
|
rows, err := s.db.Query(
|
||||||
|
`SELECT t.id, t.runner_id, t.retry_count, t.max_retries, t.timeout_seconds, t.started_at
|
||||||
|
FROM tasks t
|
||||||
|
WHERE t.status = ?
|
||||||
|
AND t.started_at IS NOT NULL
|
||||||
|
AND (t.timeout_seconds IS NULL OR
|
||||||
|
datetime(t.started_at, '+' || t.timeout_seconds || ' seconds') < datetime('now'))`,
|
||||||
|
types.TaskStatusRunning,
|
||||||
|
)
|
||||||
|
if err != nil {
|
||||||
|
log.Printf("Failed to query timed out tasks: %v", err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
defer rows.Close()
|
||||||
|
|
||||||
|
for rows.Next() {
|
||||||
|
var taskID int64
|
||||||
|
var runnerID sql.NullInt64
|
||||||
|
var retryCount, maxRetries int
|
||||||
|
var timeoutSeconds sql.NullInt64
|
||||||
|
var startedAt time.Time
|
||||||
|
|
||||||
|
err := rows.Scan(&taskID, &runnerID, &retryCount, &maxRetries, &timeoutSeconds, &startedAt)
|
||||||
|
if err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
// Use default timeout if not set (5 minutes for frame tasks, 24 hours for FFmpeg)
|
||||||
|
timeout := 300 // 5 minutes default
|
||||||
|
if timeoutSeconds.Valid {
|
||||||
|
timeout = int(timeoutSeconds.Int64)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check if actually timed out
|
||||||
|
if time.Since(startedAt).Seconds() < float64(timeout) {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
if retryCount >= maxRetries {
|
||||||
|
// Mark as failed
|
||||||
|
_, err = s.db.Exec(
|
||||||
|
`UPDATE tasks SET status = ?, error_message = ?, runner_id = NULL
|
||||||
|
WHERE id = ?`,
|
||||||
|
types.TaskStatusFailed, "Task timeout exceeded, max retries reached", taskID,
|
||||||
|
)
|
||||||
|
} else {
|
||||||
|
// Reset to pending
|
||||||
|
_, err = s.db.Exec(
|
||||||
|
`UPDATE tasks SET status = ?, runner_id = NULL, current_step = NULL,
|
||||||
|
retry_count = retry_count + 1 WHERE id = ?`,
|
||||||
|
types.TaskStatusPending, taskID,
|
||||||
|
)
|
||||||
|
if err == nil {
|
||||||
|
// Add log entry
|
||||||
|
_, _ = s.db.Exec(
|
||||||
|
`INSERT INTO task_logs (task_id, log_level, message, step_name, created_at)
|
||||||
|
VALUES (?, ?, ?, ?, ?)`,
|
||||||
|
taskID, types.LogLevelWarn, fmt.Sprintf("Task timeout exceeded, resetting (retry %d/%d)", retryCount+1, maxRetries),
|
||||||
|
"", time.Now(),
|
||||||
|
)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -119,14 +119,43 @@ func (db *DB) migrate() error {
|
|||||||
FOREIGN KEY (created_by) REFERENCES users(id) ON DELETE SET NULL
|
FOREIGN KEY (created_by) REFERENCES users(id) ON DELETE SET NULL
|
||||||
);
|
);
|
||||||
|
|
||||||
|
CREATE TABLE IF NOT EXISTS task_logs (
|
||||||
|
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||||
|
task_id INTEGER NOT NULL,
|
||||||
|
runner_id INTEGER,
|
||||||
|
log_level TEXT NOT NULL,
|
||||||
|
message TEXT NOT NULL,
|
||||||
|
step_name TEXT,
|
||||||
|
created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
||||||
|
FOREIGN KEY (task_id) REFERENCES tasks(id) ON DELETE CASCADE,
|
||||||
|
FOREIGN KEY (runner_id) REFERENCES runners(id) ON DELETE SET NULL
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE TABLE IF NOT EXISTS task_steps (
|
||||||
|
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||||
|
task_id INTEGER NOT NULL,
|
||||||
|
step_name TEXT NOT NULL,
|
||||||
|
status TEXT NOT NULL DEFAULT 'pending',
|
||||||
|
started_at DATETIME,
|
||||||
|
completed_at DATETIME,
|
||||||
|
duration_ms INTEGER,
|
||||||
|
error_message TEXT,
|
||||||
|
FOREIGN KEY (task_id) REFERENCES tasks(id) ON DELETE CASCADE
|
||||||
|
);
|
||||||
|
|
||||||
CREATE INDEX IF NOT EXISTS idx_jobs_user_id ON jobs(user_id);
|
CREATE INDEX IF NOT EXISTS idx_jobs_user_id ON jobs(user_id);
|
||||||
CREATE INDEX IF NOT EXISTS idx_jobs_status ON jobs(status);
|
CREATE INDEX IF NOT EXISTS idx_jobs_status ON jobs(status);
|
||||||
CREATE INDEX IF NOT EXISTS idx_tasks_job_id ON tasks(job_id);
|
CREATE INDEX IF NOT EXISTS idx_tasks_job_id ON tasks(job_id);
|
||||||
CREATE INDEX IF NOT EXISTS idx_tasks_runner_id ON tasks(runner_id);
|
CREATE INDEX IF NOT EXISTS idx_tasks_runner_id ON tasks(runner_id);
|
||||||
CREATE INDEX IF NOT EXISTS idx_tasks_status ON tasks(status);
|
CREATE INDEX IF NOT EXISTS idx_tasks_status ON tasks(status);
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_tasks_started_at ON tasks(started_at);
|
||||||
CREATE INDEX IF NOT EXISTS idx_job_files_job_id ON job_files(job_id);
|
CREATE INDEX IF NOT EXISTS idx_job_files_job_id ON job_files(job_id);
|
||||||
CREATE INDEX IF NOT EXISTS idx_registration_tokens_token ON registration_tokens(token);
|
CREATE INDEX IF NOT EXISTS idx_registration_tokens_token ON registration_tokens(token);
|
||||||
CREATE INDEX IF NOT EXISTS idx_registration_tokens_expires_at ON registration_tokens(expires_at);
|
CREATE INDEX IF NOT EXISTS idx_registration_tokens_expires_at ON registration_tokens(expires_at);
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_task_logs_task_id_created_at ON task_logs(task_id, created_at);
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_task_logs_runner_id ON task_logs(runner_id);
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_task_steps_task_id ON task_steps(task_id);
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_runners_last_heartbeat ON runners(last_heartbeat);
|
||||||
`
|
`
|
||||||
|
|
||||||
if _, err := db.Exec(schema); err != nil {
|
if _, err := db.Exec(schema); err != nil {
|
||||||
@@ -142,6 +171,15 @@ func (db *DB) migrate() error {
|
|||||||
`ALTER TABLE runners ADD COLUMN runner_secret TEXT`,
|
`ALTER TABLE runners ADD COLUMN runner_secret TEXT`,
|
||||||
`ALTER TABLE runners ADD COLUMN manager_secret TEXT`,
|
`ALTER TABLE runners ADD COLUMN manager_secret TEXT`,
|
||||||
`ALTER TABLE runners ADD COLUMN verified BOOLEAN NOT NULL DEFAULT 0`,
|
`ALTER TABLE runners ADD COLUMN verified BOOLEAN NOT NULL DEFAULT 0`,
|
||||||
|
// Add allow_parallel_runners to jobs if it doesn't exist
|
||||||
|
`ALTER TABLE jobs ADD COLUMN allow_parallel_runners BOOLEAN NOT NULL DEFAULT 1`,
|
||||||
|
// Add timeout_seconds to jobs if it doesn't exist
|
||||||
|
`ALTER TABLE jobs ADD COLUMN timeout_seconds INTEGER DEFAULT 86400`,
|
||||||
|
// Add new columns to tasks if they don't exist
|
||||||
|
`ALTER TABLE tasks ADD COLUMN current_step TEXT`,
|
||||||
|
`ALTER TABLE tasks ADD COLUMN retry_count INTEGER DEFAULT 0`,
|
||||||
|
`ALTER TABLE tasks ADD COLUMN max_retries INTEGER DEFAULT 3`,
|
||||||
|
`ALTER TABLE tasks ADD COLUMN timeout_seconds INTEGER`,
|
||||||
}
|
}
|
||||||
|
|
||||||
for _, migration := range migrations {
|
for _, migration := range migrations {
|
||||||
|
|||||||
@@ -11,12 +11,17 @@ import (
|
|||||||
"log"
|
"log"
|
||||||
"mime/multipart"
|
"mime/multipart"
|
||||||
"net/http"
|
"net/http"
|
||||||
|
"net/url"
|
||||||
"os"
|
"os"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
"sort"
|
"sort"
|
||||||
"strings"
|
"strings"
|
||||||
|
"sync"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
|
"github.com/gorilla/websocket"
|
||||||
|
"fuego/pkg/types"
|
||||||
)
|
)
|
||||||
|
|
||||||
// Client represents a runner client
|
// Client represents a runner client
|
||||||
@@ -29,6 +34,9 @@ type Client struct {
|
|||||||
runnerID int64
|
runnerID int64
|
||||||
runnerSecret string
|
runnerSecret string
|
||||||
managerSecret string
|
managerSecret string
|
||||||
|
wsConn *websocket.Conn
|
||||||
|
wsConnMu sync.Mutex
|
||||||
|
stopChan chan struct{}
|
||||||
}
|
}
|
||||||
|
|
||||||
// NewClient creates a new runner client
|
// NewClient creates a new runner client
|
||||||
@@ -39,6 +47,7 @@ func NewClient(managerURL, name, hostname, ipAddress string) *Client {
|
|||||||
hostname: hostname,
|
hostname: hostname,
|
||||||
ipAddress: ipAddress,
|
ipAddress: ipAddress,
|
||||||
httpClient: &http.Client{Timeout: 30 * time.Second},
|
httpClient: &http.Client{Timeout: 30 * time.Second},
|
||||||
|
stopChan: make(chan struct{}),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -121,81 +130,219 @@ func (c *Client) doSignedRequest(method, path string, body []byte) (*http.Respon
|
|||||||
return c.httpClient.Do(req)
|
return c.httpClient.Do(req)
|
||||||
}
|
}
|
||||||
|
|
||||||
// HeartbeatLoop sends periodic heartbeats to the manager
|
// ConnectWebSocket establishes a WebSocket connection to the manager
|
||||||
|
func (c *Client) ConnectWebSocket() error {
|
||||||
|
if c.runnerID == 0 || c.runnerSecret == "" {
|
||||||
|
return fmt.Errorf("runner not authenticated")
|
||||||
|
}
|
||||||
|
|
||||||
|
// Build WebSocket URL with authentication
|
||||||
|
timestamp := time.Now().Unix()
|
||||||
|
path := "/api/runner/ws"
|
||||||
|
// Sign the request
|
||||||
|
message := fmt.Sprintf("GET\n%s\n\n%d", path, timestamp)
|
||||||
|
h := hmac.New(sha256.New, []byte(c.runnerSecret))
|
||||||
|
h.Write([]byte(message))
|
||||||
|
signature := hex.EncodeToString(h.Sum(nil))
|
||||||
|
|
||||||
|
// Convert HTTP URL to WebSocket URL
|
||||||
|
wsURL := strings.Replace(c.managerURL, "http://", "ws://", 1)
|
||||||
|
wsURL = strings.Replace(wsURL, "https://", "wss://", 1)
|
||||||
|
wsURL = fmt.Sprintf("%s%s?runner_id=%d&signature=%s×tamp=%d",
|
||||||
|
wsURL, path, c.runnerID, signature, timestamp)
|
||||||
|
|
||||||
|
// Parse URL
|
||||||
|
u, err := url.Parse(wsURL)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("invalid WebSocket URL: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Connect
|
||||||
|
dialer := websocket.Dialer{
|
||||||
|
HandshakeTimeout: 10 * time.Second,
|
||||||
|
}
|
||||||
|
conn, _, err := dialer.Dial(u.String(), nil)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("failed to connect WebSocket: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
c.wsConnMu.Lock()
|
||||||
|
if c.wsConn != nil {
|
||||||
|
c.wsConn.Close()
|
||||||
|
}
|
||||||
|
c.wsConn = conn
|
||||||
|
c.wsConnMu.Unlock()
|
||||||
|
|
||||||
|
log.Printf("WebSocket connected to manager")
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// ConnectWebSocketWithReconnect connects with automatic reconnection
|
||||||
|
func (c *Client) ConnectWebSocketWithReconnect() {
|
||||||
|
backoff := 1 * time.Second
|
||||||
|
maxBackoff := 60 * time.Second
|
||||||
|
|
||||||
|
for {
|
||||||
|
err := c.ConnectWebSocket()
|
||||||
|
if err == nil {
|
||||||
|
backoff = 1 * time.Second // Reset on success
|
||||||
|
c.HandleWebSocketMessages()
|
||||||
|
} else {
|
||||||
|
log.Printf("WebSocket connection failed: %v, retrying in %v", err, backoff)
|
||||||
|
time.Sleep(backoff)
|
||||||
|
backoff *= 2
|
||||||
|
if backoff > maxBackoff {
|
||||||
|
backoff = maxBackoff
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check if we should stop
|
||||||
|
select {
|
||||||
|
case <-c.stopChan:
|
||||||
|
return
|
||||||
|
default:
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// HandleWebSocketMessages handles incoming WebSocket messages
|
||||||
|
func (c *Client) HandleWebSocketMessages() {
|
||||||
|
c.wsConnMu.Lock()
|
||||||
|
conn := c.wsConn
|
||||||
|
c.wsConnMu.Unlock()
|
||||||
|
|
||||||
|
if conn == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// Set pong handler
|
||||||
|
conn.SetPongHandler(func(string) error {
|
||||||
|
return nil
|
||||||
|
})
|
||||||
|
|
||||||
|
// Handle messages
|
||||||
|
for {
|
||||||
|
var msg map[string]interface{}
|
||||||
|
err := conn.ReadJSON(&msg)
|
||||||
|
if err != nil {
|
||||||
|
if websocket.IsUnexpectedCloseError(err, websocket.CloseGoingAway, websocket.CloseAbnormalClosure) {
|
||||||
|
log.Printf("WebSocket error: %v", err)
|
||||||
|
}
|
||||||
|
c.wsConnMu.Lock()
|
||||||
|
c.wsConn = nil
|
||||||
|
c.wsConnMu.Unlock()
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
msgType, _ := msg["type"].(string)
|
||||||
|
switch msgType {
|
||||||
|
case "task_assignment":
|
||||||
|
c.handleTaskAssignment(msg)
|
||||||
|
case "ping":
|
||||||
|
// Respond to ping with pong (automatic)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// handleTaskAssignment handles a task assignment message
|
||||||
|
func (c *Client) handleTaskAssignment(msg map[string]interface{}) {
|
||||||
|
data, ok := msg["data"].(map[string]interface{})
|
||||||
|
if !ok {
|
||||||
|
log.Printf("Invalid task assignment message")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
taskID, _ := data["task_id"].(float64)
|
||||||
|
jobID, _ := data["job_id"].(float64)
|
||||||
|
jobName, _ := data["job_name"].(string)
|
||||||
|
outputFormat, _ := data["output_format"].(string)
|
||||||
|
frameStart, _ := data["frame_start"].(float64)
|
||||||
|
frameEnd, _ := data["frame_end"].(float64)
|
||||||
|
inputFilesRaw, _ := data["input_files"].([]interface{})
|
||||||
|
|
||||||
|
if len(inputFilesRaw) == 0 {
|
||||||
|
log.Printf("No input files for task %v", taskID)
|
||||||
|
c.sendTaskComplete(int64(taskID), "", false, "No input files")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// Convert to task map format
|
||||||
|
taskMap := map[string]interface{}{
|
||||||
|
"id": taskID,
|
||||||
|
"job_id": jobID,
|
||||||
|
"frame_start": frameStart,
|
||||||
|
"frame_end": frameEnd,
|
||||||
|
}
|
||||||
|
|
||||||
|
// Process the task
|
||||||
|
go func() {
|
||||||
|
if err := c.processTask(taskMap, jobName, outputFormat, inputFilesRaw); err != nil {
|
||||||
|
log.Printf("Failed to process task %v: %v", taskID, err)
|
||||||
|
c.sendTaskComplete(int64(taskID), "", false, err.Error())
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
}
|
||||||
|
|
||||||
|
// HeartbeatLoop sends periodic heartbeats via WebSocket
|
||||||
func (c *Client) HeartbeatLoop() {
|
func (c *Client) HeartbeatLoop() {
|
||||||
ticker := time.NewTicker(30 * time.Second)
|
ticker := time.NewTicker(30 * time.Second)
|
||||||
defer ticker.Stop()
|
defer ticker.Stop()
|
||||||
|
|
||||||
for range ticker.C {
|
for range ticker.C {
|
||||||
req := map[string]interface{}{}
|
c.wsConnMu.RLock()
|
||||||
body, _ := json.Marshal(req)
|
conn := c.wsConn
|
||||||
|
c.wsConnMu.RUnlock()
|
||||||
|
|
||||||
resp, err := c.doSignedRequest("POST", "/api/runner/heartbeat?runner_id="+fmt.Sprintf("%d", c.runnerID), body)
|
if conn != nil {
|
||||||
if err != nil {
|
// Send heartbeat via WebSocket
|
||||||
log.Printf("Heartbeat failed: %v", err)
|
msg := map[string]interface{}{
|
||||||
continue
|
"type": "heartbeat",
|
||||||
|
"timestamp": time.Now().Unix(),
|
||||||
}
|
}
|
||||||
resp.Body.Close()
|
if err := conn.WriteJSON(msg); err != nil {
|
||||||
}
|
log.Printf("Failed to send heartbeat: %v", err)
|
||||||
}
|
|
||||||
|
|
||||||
// ProcessTasks polls for tasks and processes them
|
|
||||||
func (c *Client) ProcessTasks() {
|
|
||||||
ticker := time.NewTicker(5 * time.Second)
|
|
||||||
defer ticker.Stop()
|
|
||||||
|
|
||||||
for range ticker.C {
|
|
||||||
tasks, err := c.getTasks()
|
|
||||||
if err != nil {
|
|
||||||
log.Printf("Failed to get tasks: %v", err)
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
|
|
||||||
for _, taskData := range tasks {
|
|
||||||
taskMap, ok := taskData["task"].(map[string]interface{})
|
|
||||||
if !ok {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
|
|
||||||
jobName, _ := taskData["job_name"].(string)
|
|
||||||
outputFormat, _ := taskData["output_format"].(string)
|
|
||||||
inputFilesRaw, _ := taskData["input_files"].([]interface{})
|
|
||||||
|
|
||||||
if len(inputFilesRaw) == 0 {
|
|
||||||
log.Printf("No input files for task %v", taskMap["id"])
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
|
|
||||||
// Process the task
|
|
||||||
if err := c.processTask(taskMap, jobName, outputFormat, inputFilesRaw); err != nil {
|
|
||||||
taskID, _ := taskMap["id"].(float64)
|
|
||||||
log.Printf("Failed to process task %v: %v", taskID, err)
|
|
||||||
c.completeTask(int64(taskID), "", false, err.Error())
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// getTasks fetches tasks from the manager
|
// sendLog sends a log entry to the manager via WebSocket
|
||||||
func (c *Client) getTasks() ([]map[string]interface{}, error) {
|
func (c *Client) sendLog(taskID int64, logLevel types.LogLevel, message, stepName string) {
|
||||||
path := fmt.Sprintf("/api/runner/tasks?runner_id=%d", c.runnerID)
|
c.wsConnMu.RLock()
|
||||||
resp, err := c.doSignedRequest("GET", path, nil)
|
conn := c.wsConn
|
||||||
if err != nil {
|
c.wsConnMu.RUnlock()
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
defer resp.Body.Close()
|
|
||||||
|
|
||||||
if resp.StatusCode != http.StatusOK {
|
if conn != nil {
|
||||||
body, _ := io.ReadAll(resp.Body)
|
msg := map[string]interface{}{
|
||||||
return nil, fmt.Errorf("failed to get tasks: %s", string(body))
|
"type": "log_entry",
|
||||||
|
"data": map[string]interface{}{
|
||||||
|
"task_id": taskID,
|
||||||
|
"log_level": string(logLevel),
|
||||||
|
"message": message,
|
||||||
|
"step_name": stepName,
|
||||||
|
},
|
||||||
|
"timestamp": time.Now().Unix(),
|
||||||
|
}
|
||||||
|
if err := conn.WriteJSON(msg); err != nil {
|
||||||
|
log.Printf("Failed to send log: %v", err)
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
log.Printf("WebSocket not connected, cannot send log")
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
var tasks []map[string]interface{}
|
// sendStepUpdate sends a step start/complete event to the manager
|
||||||
if err := json.NewDecoder(resp.Body).Decode(&tasks); err != nil {
|
func (c *Client) sendStepUpdate(taskID int64, stepName string, status types.StepStatus, errorMsg string) {
|
||||||
return nil, err
|
// This would ideally be a separate endpoint, but for now we'll use logs
|
||||||
|
msg := fmt.Sprintf("Step %s: %s", stepName, status)
|
||||||
|
if errorMsg != "" {
|
||||||
|
msg += " - " + errorMsg
|
||||||
}
|
}
|
||||||
|
logLevel := types.LogLevelInfo
|
||||||
return tasks, nil
|
if status == types.StepStatusFailed {
|
||||||
|
logLevel = types.LogLevelError
|
||||||
|
}
|
||||||
|
c.sendLog(taskID, logLevel, msg, stepName)
|
||||||
}
|
}
|
||||||
|
|
||||||
// processTask processes a single task
|
// processTask processes a single task
|
||||||
@@ -205,6 +352,7 @@ func (c *Client) processTask(task map[string]interface{}, jobName, outputFormat
|
|||||||
frameStart := int(task["frame_start"].(float64))
|
frameStart := int(task["frame_start"].(float64))
|
||||||
frameEnd := int(task["frame_end"].(float64))
|
frameEnd := int(task["frame_end"].(float64))
|
||||||
|
|
||||||
|
c.sendLog(taskID, types.LogLevelInfo, fmt.Sprintf("Starting task: job %d, frames %d-%d, format: %s", jobID, frameStart, frameEnd, outputFormat), "")
|
||||||
log.Printf("Processing task %d: job %d, frames %d-%d, format: %s", taskID, jobID, frameStart, frameEnd, outputFormat)
|
log.Printf("Processing task %d: job %d, frames %d-%d, format: %s", taskID, jobID, frameStart, frameEnd, outputFormat)
|
||||||
|
|
||||||
// Create work directory
|
// Create work directory
|
||||||
@@ -214,11 +362,14 @@ func (c *Client) processTask(task map[string]interface{}, jobName, outputFormat
|
|||||||
}
|
}
|
||||||
defer os.RemoveAll(workDir)
|
defer os.RemoveAll(workDir)
|
||||||
|
|
||||||
// Download input files
|
// Step: download
|
||||||
|
c.sendStepUpdate(taskID, "download", types.StepStatusRunning, "")
|
||||||
|
c.sendLog(taskID, types.LogLevelInfo, "Downloading input files...", "download")
|
||||||
blendFile := ""
|
blendFile := ""
|
||||||
for _, filePath := range inputFiles {
|
for _, filePath := range inputFiles {
|
||||||
filePathStr := filePath.(string)
|
filePathStr := filePath.(string)
|
||||||
if err := c.downloadFile(filePathStr, workDir); err != nil {
|
if err := c.downloadFile(filePathStr, workDir); err != nil {
|
||||||
|
c.sendStepUpdate(taskID, "download", types.StepStatusFailed, err.Error())
|
||||||
return fmt.Errorf("failed to download file %s: %w", filePathStr, err)
|
return fmt.Errorf("failed to download file %s: %w", filePathStr, err)
|
||||||
}
|
}
|
||||||
if filepath.Ext(filePathStr) == ".blend" {
|
if filepath.Ext(filePathStr) == ".blend" {
|
||||||
@@ -227,8 +378,12 @@ func (c *Client) processTask(task map[string]interface{}, jobName, outputFormat
|
|||||||
}
|
}
|
||||||
|
|
||||||
if blendFile == "" {
|
if blendFile == "" {
|
||||||
return fmt.Errorf("no .blend file found in input files")
|
err := fmt.Errorf("no .blend file found in input files")
|
||||||
|
c.sendStepUpdate(taskID, "download", types.StepStatusFailed, err.Error())
|
||||||
|
return err
|
||||||
}
|
}
|
||||||
|
c.sendStepUpdate(taskID, "download", types.StepStatusCompleted, "")
|
||||||
|
c.sendLog(taskID, types.LogLevelInfo, "Input files downloaded successfully", "download")
|
||||||
|
|
||||||
// Render frames
|
// Render frames
|
||||||
outputDir := filepath.Join(workDir, "output")
|
outputDir := filepath.Join(workDir, "output")
|
||||||
@@ -244,30 +399,60 @@ func (c *Client) processTask(task map[string]interface{}, jobName, outputFormat
|
|||||||
|
|
||||||
outputPattern := filepath.Join(outputDir, fmt.Sprintf("frame_%%04d.%s", strings.ToLower(renderFormat)))
|
outputPattern := filepath.Join(outputDir, fmt.Sprintf("frame_%%04d.%s", strings.ToLower(renderFormat)))
|
||||||
|
|
||||||
|
// Step: render_blender
|
||||||
|
c.sendStepUpdate(taskID, "render_blender", types.StepStatusRunning, "")
|
||||||
|
c.sendLog(taskID, types.LogLevelInfo, fmt.Sprintf("Starting Blender render for frame %d...", frameStart), "render_blender")
|
||||||
|
|
||||||
// Execute Blender
|
// Execute Blender
|
||||||
cmd := exec.Command("blender", "-b", blendFile, "-o", outputPattern, "-f", fmt.Sprintf("%d", frameStart))
|
cmd := exec.Command("blender", "-b", blendFile, "-o", outputPattern, "-f", fmt.Sprintf("%d", frameStart))
|
||||||
cmd.Dir = workDir
|
cmd.Dir = workDir
|
||||||
output, err := cmd.CombinedOutput()
|
output, err := cmd.CombinedOutput()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("blender failed: %w\nOutput: %s", err, string(output))
|
errMsg := fmt.Sprintf("blender failed: %w\nOutput: %s", err, string(output))
|
||||||
|
c.sendLog(taskID, types.LogLevelError, errMsg, "render_blender")
|
||||||
|
c.sendStepUpdate(taskID, "render_blender", types.StepStatusFailed, errMsg)
|
||||||
|
return fmt.Errorf(errMsg)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Find rendered output file
|
// Find rendered output file
|
||||||
outputFile := filepath.Join(outputDir, fmt.Sprintf("frame_%04d.%s", frameStart, strings.ToLower(renderFormat)))
|
outputFile := filepath.Join(outputDir, fmt.Sprintf("frame_%04d.%s", frameStart, strings.ToLower(renderFormat)))
|
||||||
if _, err := os.Stat(outputFile); os.IsNotExist(err) {
|
if _, err := os.Stat(outputFile); os.IsNotExist(err) {
|
||||||
return fmt.Errorf("output file not found: %s", outputFile)
|
errMsg := fmt.Sprintf("output file not found: %s", outputFile)
|
||||||
|
c.sendLog(taskID, types.LogLevelError, errMsg, "render_blender")
|
||||||
|
c.sendStepUpdate(taskID, "render_blender", types.StepStatusFailed, errMsg)
|
||||||
|
return fmt.Errorf(errMsg)
|
||||||
}
|
}
|
||||||
|
c.sendLog(taskID, types.LogLevelInfo, fmt.Sprintf("Blender render completed for frame %d", frameStart), "render_blender")
|
||||||
|
c.sendStepUpdate(taskID, "render_blender", types.StepStatusCompleted, "")
|
||||||
|
|
||||||
|
// Step: upload or upload_frames
|
||||||
|
uploadStepName := "upload"
|
||||||
|
if outputFormat == "MP4" {
|
||||||
|
uploadStepName = "upload_frames"
|
||||||
|
}
|
||||||
|
c.sendStepUpdate(taskID, uploadStepName, types.StepStatusRunning, "")
|
||||||
|
c.sendLog(taskID, types.LogLevelInfo, "Uploading output file...", uploadStepName)
|
||||||
|
|
||||||
// Upload frame file
|
|
||||||
outputPath, err := c.uploadFile(jobID, outputFile)
|
outputPath, err := c.uploadFile(jobID, outputFile)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("failed to upload output: %w", err)
|
errMsg := fmt.Sprintf("failed to upload output: %w", err)
|
||||||
|
c.sendLog(taskID, types.LogLevelError, errMsg, uploadStepName)
|
||||||
|
c.sendStepUpdate(taskID, uploadStepName, types.StepStatusFailed, errMsg)
|
||||||
|
return fmt.Errorf(errMsg)
|
||||||
}
|
}
|
||||||
|
c.sendLog(taskID, types.LogLevelInfo, "Output file uploaded successfully", uploadStepName)
|
||||||
|
c.sendStepUpdate(taskID, uploadStepName, types.StepStatusCompleted, "")
|
||||||
|
|
||||||
|
// Step: complete
|
||||||
|
c.sendStepUpdate(taskID, "complete", types.StepStatusRunning, "")
|
||||||
|
c.sendLog(taskID, types.LogLevelInfo, "Task completed successfully", "complete")
|
||||||
|
|
||||||
// Mark task as complete
|
// Mark task as complete
|
||||||
if err := c.completeTask(taskID, outputPath, true, ""); err != nil {
|
if err := c.completeTask(taskID, outputPath, true, ""); err != nil {
|
||||||
|
c.sendStepUpdate(taskID, "complete", types.StepStatusFailed, err.Error())
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
c.sendStepUpdate(taskID, "complete", types.StepStatusCompleted, "")
|
||||||
|
|
||||||
// For MP4 format, check if all frames are done and generate video
|
// For MP4 format, check if all frames are done and generate video
|
||||||
if outputFormat == "MP4" {
|
if outputFormat == "MP4" {
|
||||||
@@ -599,29 +784,33 @@ func (c *Client) uploadFile(jobID int64, filePath string) (string, error) {
|
|||||||
return result.FilePath, nil
|
return result.FilePath, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// completeTask marks a task as complete
|
// completeTask marks a task as complete via WebSocket (or HTTP fallback)
|
||||||
func (c *Client) completeTask(taskID int64, outputPath string, success bool, errorMsg string) error {
|
func (c *Client) completeTask(taskID int64, outputPath string, success bool, errorMsg string) error {
|
||||||
req := map[string]interface{}{
|
return c.sendTaskComplete(taskID, outputPath, success, errorMsg)
|
||||||
|
}
|
||||||
|
|
||||||
|
// sendTaskComplete sends task completion via WebSocket
|
||||||
|
func (c *Client) sendTaskComplete(taskID int64, outputPath string, success bool, errorMsg string) error {
|
||||||
|
c.wsConnMu.RLock()
|
||||||
|
conn := c.wsConn
|
||||||
|
c.wsConnMu.RUnlock()
|
||||||
|
|
||||||
|
if conn != nil {
|
||||||
|
msg := map[string]interface{}{
|
||||||
|
"type": "task_complete",
|
||||||
|
"data": map[string]interface{}{
|
||||||
|
"task_id": taskID,
|
||||||
"output_path": outputPath,
|
"output_path": outputPath,
|
||||||
"success": success,
|
"success": success,
|
||||||
|
"error": errorMsg,
|
||||||
|
},
|
||||||
|
"timestamp": time.Now().Unix(),
|
||||||
}
|
}
|
||||||
if !success {
|
if err := conn.WriteJSON(msg); err != nil {
|
||||||
req["error"] = errorMsg
|
return fmt.Errorf("failed to send task completion: %w", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
body, _ := json.Marshal(req)
|
|
||||||
path := fmt.Sprintf("/api/runner/tasks/%d/complete?runner_id=%d", taskID, c.runnerID)
|
|
||||||
resp, err := c.doSignedRequest("POST", path, body)
|
|
||||||
if err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
defer resp.Body.Close()
|
|
||||||
|
|
||||||
if resp.StatusCode != http.StatusOK {
|
|
||||||
body, _ := io.ReadAll(resp.Body)
|
|
||||||
return fmt.Errorf("failed to complete task: %s", string(body))
|
|
||||||
}
|
|
||||||
|
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
return fmt.Errorf("WebSocket not connected, cannot complete task")
|
||||||
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -33,6 +33,8 @@ type Job struct {
|
|||||||
FrameStart int `json:"frame_start"`
|
FrameStart int `json:"frame_start"`
|
||||||
FrameEnd int `json:"frame_end"`
|
FrameEnd int `json:"frame_end"`
|
||||||
OutputFormat string `json:"output_format"` // PNG, JPEG, EXR, etc.
|
OutputFormat string `json:"output_format"` // PNG, JPEG, EXR, etc.
|
||||||
|
AllowParallelRunners bool `json:"allow_parallel_runners"` // Allow multiple runners to work on this job
|
||||||
|
TimeoutSeconds int `json:"timeout_seconds"` // Job-level timeout (24 hours default)
|
||||||
CreatedAt time.Time `json:"created_at"`
|
CreatedAt time.Time `json:"created_at"`
|
||||||
StartedAt *time.Time `json:"started_at,omitempty"`
|
StartedAt *time.Time `json:"started_at,omitempty"`
|
||||||
CompletedAt *time.Time `json:"completed_at,omitempty"`
|
CompletedAt *time.Time `json:"completed_at,omitempty"`
|
||||||
@@ -78,6 +80,10 @@ type Task struct {
|
|||||||
FrameStart int `json:"frame_start"`
|
FrameStart int `json:"frame_start"`
|
||||||
FrameEnd int `json:"frame_end"`
|
FrameEnd int `json:"frame_end"`
|
||||||
Status TaskStatus `json:"status"`
|
Status TaskStatus `json:"status"`
|
||||||
|
CurrentStep string `json:"current_step,omitempty"`
|
||||||
|
RetryCount int `json:"retry_count"`
|
||||||
|
MaxRetries int `json:"max_retries"`
|
||||||
|
TimeoutSeconds *int `json:"timeout_seconds,omitempty"` // Task timeout (5 min for frames, 24h for FFmpeg)
|
||||||
OutputPath string `json:"output_path,omitempty"`
|
OutputPath string `json:"output_path,omitempty"`
|
||||||
CreatedAt time.Time `json:"created_at"`
|
CreatedAt time.Time `json:"created_at"`
|
||||||
StartedAt *time.Time `json:"started_at,omitempty"`
|
StartedAt *time.Time `json:"started_at,omitempty"`
|
||||||
@@ -110,6 +116,7 @@ type CreateJobRequest struct {
|
|||||||
FrameStart int `json:"frame_start"`
|
FrameStart int `json:"frame_start"`
|
||||||
FrameEnd int `json:"frame_end"`
|
FrameEnd int `json:"frame_end"`
|
||||||
OutputFormat string `json:"output_format"`
|
OutputFormat string `json:"output_format"`
|
||||||
|
AllowParallelRunners *bool `json:"allow_parallel_runners,omitempty"` // Optional, defaults to true
|
||||||
}
|
}
|
||||||
|
|
||||||
// UpdateJobProgressRequest represents a request to update job progress
|
// UpdateJobProgressRequest represents a request to update job progress
|
||||||
@@ -125,3 +132,70 @@ type RegisterRunnerRequest struct {
|
|||||||
Capabilities string `json:"capabilities"`
|
Capabilities string `json:"capabilities"`
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// LogLevel represents the level of a log entry
|
||||||
|
type LogLevel string
|
||||||
|
|
||||||
|
const (
|
||||||
|
LogLevelInfo LogLevel = "INFO"
|
||||||
|
LogLevelWarn LogLevel = "WARN"
|
||||||
|
LogLevelError LogLevel = "ERROR"
|
||||||
|
LogLevelDebug LogLevel = "DEBUG"
|
||||||
|
)
|
||||||
|
|
||||||
|
// TaskLog represents a log entry for a task
|
||||||
|
type TaskLog struct {
|
||||||
|
ID int64 `json:"id"`
|
||||||
|
TaskID int64 `json:"task_id"`
|
||||||
|
RunnerID *int64 `json:"runner_id,omitempty"`
|
||||||
|
LogLevel LogLevel `json:"log_level"`
|
||||||
|
Message string `json:"message"`
|
||||||
|
StepName string `json:"step_name,omitempty"`
|
||||||
|
CreatedAt time.Time `json:"created_at"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// StepStatus represents the status of a task step
|
||||||
|
type StepStatus string
|
||||||
|
|
||||||
|
const (
|
||||||
|
StepStatusPending StepStatus = "pending"
|
||||||
|
StepStatusRunning StepStatus = "running"
|
||||||
|
StepStatusCompleted StepStatus = "completed"
|
||||||
|
StepStatusFailed StepStatus = "failed"
|
||||||
|
StepStatusSkipped StepStatus = "skipped"
|
||||||
|
)
|
||||||
|
|
||||||
|
// TaskStep represents an execution step within a task
|
||||||
|
type TaskStep struct {
|
||||||
|
ID int64 `json:"id"`
|
||||||
|
TaskID int64 `json:"task_id"`
|
||||||
|
StepName string `json:"step_name"`
|
||||||
|
Status StepStatus `json:"status"`
|
||||||
|
StartedAt *time.Time `json:"started_at,omitempty"`
|
||||||
|
CompletedAt *time.Time `json:"completed_at,omitempty"`
|
||||||
|
DurationMs *int `json:"duration_ms,omitempty"`
|
||||||
|
ErrorMessage string `json:"error_message,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// TaskAnnotation represents an annotation (warning/error) for a task
|
||||||
|
type TaskAnnotation struct {
|
||||||
|
ID int64 `json:"id"`
|
||||||
|
TaskID int64 `json:"task_id"`
|
||||||
|
StepName string `json:"step_name,omitempty"`
|
||||||
|
Level LogLevel `json:"level"`
|
||||||
|
Message string `json:"message"`
|
||||||
|
Line *int `json:"line,omitempty"`
|
||||||
|
CreatedAt time.Time `json:"created_at"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// SendTaskLogRequest represents a request to send task logs
|
||||||
|
type SendTaskLogRequest struct {
|
||||||
|
Logs []TaskLogEntry `json:"logs"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// TaskLogEntry represents a single log entry in a batch request
|
||||||
|
type TaskLogEntry struct {
|
||||||
|
LogLevel LogLevel `json:"log_level"`
|
||||||
|
Message string `json:"message"`
|
||||||
|
StepName string `json:"step_name,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
import { useState, useEffect } from 'react';
|
import { useState, useEffect, useRef } from 'react';
|
||||||
import { jobs } from '../utils/api';
|
import { jobs } from '../utils/api';
|
||||||
import VideoPlayer from './VideoPlayer';
|
import VideoPlayer from './VideoPlayer';
|
||||||
|
|
||||||
@@ -7,13 +7,39 @@ export default function JobDetails({ job, onClose, onUpdate }) {
|
|||||||
const [files, setFiles] = useState([]);
|
const [files, setFiles] = useState([]);
|
||||||
const [loading, setLoading] = useState(true);
|
const [loading, setLoading] = useState(true);
|
||||||
const [videoUrl, setVideoUrl] = useState(null);
|
const [videoUrl, setVideoUrl] = useState(null);
|
||||||
|
const [selectedTaskId, setSelectedTaskId] = useState(null);
|
||||||
|
const [taskLogs, setTaskLogs] = useState([]);
|
||||||
|
const [taskSteps, setTaskSteps] = useState([]);
|
||||||
|
const [streaming, setStreaming] = useState(false);
|
||||||
|
const wsRef = useRef(null);
|
||||||
|
|
||||||
useEffect(() => {
|
useEffect(() => {
|
||||||
loadDetails();
|
loadDetails();
|
||||||
const interval = setInterval(loadDetails, 2000);
|
const interval = setInterval(loadDetails, 2000);
|
||||||
return () => clearInterval(interval);
|
return () => {
|
||||||
|
clearInterval(interval);
|
||||||
|
if (wsRef.current) {
|
||||||
|
wsRef.current.close();
|
||||||
|
}
|
||||||
|
};
|
||||||
}, [job.id]);
|
}, [job.id]);
|
||||||
|
|
||||||
|
useEffect(() => {
|
||||||
|
if (selectedTaskId && jobDetails.status === 'running') {
|
||||||
|
startLogStream();
|
||||||
|
} else if (wsRef.current) {
|
||||||
|
wsRef.current.close();
|
||||||
|
wsRef.current = null;
|
||||||
|
setStreaming(false);
|
||||||
|
}
|
||||||
|
return () => {
|
||||||
|
if (wsRef.current) {
|
||||||
|
wsRef.current.close();
|
||||||
|
wsRef.current = null;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
}, [selectedTaskId, jobDetails.status]);
|
||||||
|
|
||||||
const loadDetails = async () => {
|
const loadDetails = async () => {
|
||||||
try {
|
try {
|
||||||
const [details, fileList] = await Promise.all([
|
const [details, fileList] = await Promise.all([
|
||||||
@@ -41,6 +67,90 @@ export default function JobDetails({ job, onClose, onUpdate }) {
|
|||||||
window.open(jobs.downloadFile(job.id, fileId), '_blank');
|
window.open(jobs.downloadFile(job.id, fileId), '_blank');
|
||||||
};
|
};
|
||||||
|
|
||||||
|
const loadTaskLogs = async (taskId) => {
|
||||||
|
try {
|
||||||
|
const [logs, steps] = await Promise.all([
|
||||||
|
jobs.getTaskLogs(job.id, taskId),
|
||||||
|
jobs.getTaskSteps(job.id, taskId),
|
||||||
|
]);
|
||||||
|
setTaskLogs(logs);
|
||||||
|
setTaskSteps(steps);
|
||||||
|
} catch (error) {
|
||||||
|
console.error('Failed to load task logs:', error);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
const startLogStream = () => {
|
||||||
|
if (!selectedTaskId || streaming) return;
|
||||||
|
|
||||||
|
setStreaming(true);
|
||||||
|
const ws = jobs.streamTaskLogsWebSocket(job.id, selectedTaskId);
|
||||||
|
wsRef.current = ws;
|
||||||
|
|
||||||
|
ws.onmessage = (event) => {
|
||||||
|
try {
|
||||||
|
const data = JSON.parse(event.data);
|
||||||
|
if (data.type === 'log' && data.data) {
|
||||||
|
setTaskLogs((prev) => [...prev, data.data]);
|
||||||
|
} else if (data.type === 'connected') {
|
||||||
|
// Connection established
|
||||||
|
}
|
||||||
|
} catch (error) {
|
||||||
|
console.error('Failed to parse log message:', error);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
ws.onerror = (error) => {
|
||||||
|
console.error('WebSocket error:', error);
|
||||||
|
setStreaming(false);
|
||||||
|
};
|
||||||
|
|
||||||
|
ws.onclose = () => {
|
||||||
|
setStreaming(false);
|
||||||
|
// Auto-reconnect if job is still running
|
||||||
|
if (jobDetails.status === 'running' && selectedTaskId) {
|
||||||
|
setTimeout(() => {
|
||||||
|
if (jobDetails.status === 'running') {
|
||||||
|
startLogStream();
|
||||||
|
}
|
||||||
|
}, 2000);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
|
const handleTaskClick = async (taskId) => {
|
||||||
|
setSelectedTaskId(taskId);
|
||||||
|
await loadTaskLogs(taskId);
|
||||||
|
};
|
||||||
|
|
||||||
|
const getLogLevelColor = (level) => {
|
||||||
|
switch (level) {
|
||||||
|
case 'ERROR':
|
||||||
|
return 'text-red-600';
|
||||||
|
case 'WARN':
|
||||||
|
return 'text-yellow-600';
|
||||||
|
case 'DEBUG':
|
||||||
|
return 'text-gray-500';
|
||||||
|
default:
|
||||||
|
return 'text-gray-900';
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
const getStepStatusIcon = (status) => {
|
||||||
|
switch (status) {
|
||||||
|
case 'completed':
|
||||||
|
return '✓';
|
||||||
|
case 'failed':
|
||||||
|
return '✗';
|
||||||
|
case 'running':
|
||||||
|
return '⏳';
|
||||||
|
case 'skipped':
|
||||||
|
return '⏸';
|
||||||
|
default:
|
||||||
|
return '○';
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
const outputFiles = files.filter((f) => f.file_type === 'output');
|
const outputFiles = files.filter((f) => f.file_type === 'output');
|
||||||
const inputFiles = files.filter((f) => f.file_type === 'input');
|
const inputFiles = files.filter((f) => f.file_type === 'input');
|
||||||
|
|
||||||
@@ -156,6 +266,75 @@ export default function JobDetails({ job, onClose, onUpdate }) {
|
|||||||
<p>{jobDetails.error_message}</p>
|
<p>{jobDetails.error_message}</p>
|
||||||
</div>
|
</div>
|
||||||
)}
|
)}
|
||||||
|
|
||||||
|
<div>
|
||||||
|
<h3 className="text-lg font-semibold text-gray-900 mb-3">
|
||||||
|
Task Execution
|
||||||
|
</h3>
|
||||||
|
<div className="space-y-4">
|
||||||
|
{taskSteps.length > 0 && (
|
||||||
|
<div className="bg-gray-50 rounded-lg p-4">
|
||||||
|
<h4 className="font-medium text-gray-900 mb-2">Steps</h4>
|
||||||
|
<div className="space-y-2">
|
||||||
|
{taskSteps.map((step) => (
|
||||||
|
<div
|
||||||
|
key={step.id}
|
||||||
|
className="flex items-center justify-between p-2 bg-white rounded"
|
||||||
|
>
|
||||||
|
<div className="flex items-center gap-2">
|
||||||
|
<span className="text-lg">
|
||||||
|
{getStepStatusIcon(step.status)}
|
||||||
|
</span>
|
||||||
|
<span className="font-medium">{step.step_name}</span>
|
||||||
|
</div>
|
||||||
|
{step.duration_ms && (
|
||||||
|
<span className="text-sm text-gray-600">
|
||||||
|
{(step.duration_ms / 1000).toFixed(2)}s
|
||||||
|
</span>
|
||||||
|
)}
|
||||||
|
</div>
|
||||||
|
))}
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
)}
|
||||||
|
|
||||||
|
{selectedTaskId && (
|
||||||
|
<div className="bg-gray-50 rounded-lg p-4">
|
||||||
|
<h4 className="font-medium text-gray-900 mb-2">
|
||||||
|
Logs {streaming && <span className="text-sm text-green-600">(streaming)</span>}
|
||||||
|
</h4>
|
||||||
|
<div className="bg-black text-green-400 font-mono text-sm p-3 rounded max-h-96 overflow-y-auto">
|
||||||
|
{taskLogs.length === 0 ? (
|
||||||
|
<p className="text-gray-500">No logs yet...</p>
|
||||||
|
) : (
|
||||||
|
taskLogs.map((log) => (
|
||||||
|
<div
|
||||||
|
key={log.id}
|
||||||
|
className={`${getLogLevelColor(log.log_level)} mb-1`}
|
||||||
|
>
|
||||||
|
<span className="text-gray-500">
|
||||||
|
[{new Date(log.created_at).toLocaleTimeString()}]
|
||||||
|
</span>
|
||||||
|
{log.step_name && (
|
||||||
|
<span className="text-blue-400 ml-2">
|
||||||
|
[{log.step_name}]
|
||||||
|
</span>
|
||||||
|
)}
|
||||||
|
<span className="ml-2">{log.message}</span>
|
||||||
|
</div>
|
||||||
|
))
|
||||||
|
)}
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
)}
|
||||||
|
|
||||||
|
{!selectedTaskId && (
|
||||||
|
<p className="text-gray-600 text-sm">
|
||||||
|
Select a task to view logs and steps
|
||||||
|
</p>
|
||||||
|
)}
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
</>
|
</>
|
||||||
)}
|
)}
|
||||||
</div>
|
</div>
|
||||||
|
|||||||
@@ -7,6 +7,7 @@ export default function JobSubmission({ onSuccess }) {
|
|||||||
frame_start: 1,
|
frame_start: 1,
|
||||||
frame_end: 10,
|
frame_end: 10,
|
||||||
output_format: 'PNG',
|
output_format: 'PNG',
|
||||||
|
allow_parallel_runners: true,
|
||||||
});
|
});
|
||||||
const [file, setFile] = useState(null);
|
const [file, setFile] = useState(null);
|
||||||
const [submitting, setSubmitting] = useState(false);
|
const [submitting, setSubmitting] = useState(false);
|
||||||
@@ -32,6 +33,7 @@ export default function JobSubmission({ onSuccess }) {
|
|||||||
frame_start: parseInt(formData.frame_start),
|
frame_start: parseInt(formData.frame_start),
|
||||||
frame_end: parseInt(formData.frame_end),
|
frame_end: parseInt(formData.frame_end),
|
||||||
output_format: formData.output_format,
|
output_format: formData.output_format,
|
||||||
|
allow_parallel_runners: formData.allow_parallel_runners,
|
||||||
});
|
});
|
||||||
|
|
||||||
// Upload file
|
// Upload file
|
||||||
@@ -43,6 +45,7 @@ export default function JobSubmission({ onSuccess }) {
|
|||||||
frame_start: 1,
|
frame_start: 1,
|
||||||
frame_end: 10,
|
frame_end: 10,
|
||||||
output_format: 'PNG',
|
output_format: 'PNG',
|
||||||
|
allow_parallel_runners: true,
|
||||||
});
|
});
|
||||||
setFile(null);
|
setFile(null);
|
||||||
e.target.reset();
|
e.target.reset();
|
||||||
@@ -127,6 +130,19 @@ export default function JobSubmission({ onSuccess }) {
|
|||||||
</select>
|
</select>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
<div className="flex items-center">
|
||||||
|
<input
|
||||||
|
type="checkbox"
|
||||||
|
id="allow_parallel_runners"
|
||||||
|
checked={formData.allow_parallel_runners}
|
||||||
|
onChange={(e) => setFormData({ ...formData, allow_parallel_runners: e.target.checked })}
|
||||||
|
className="h-4 w-4 text-purple-600 focus:ring-purple-500 border-gray-300 rounded"
|
||||||
|
/>
|
||||||
|
<label htmlFor="allow_parallel_runners" className="ml-2 block text-sm text-gray-700">
|
||||||
|
Allow multiple runners to work on this job simultaneously
|
||||||
|
</label>
|
||||||
|
</div>
|
||||||
|
|
||||||
<div>
|
<div>
|
||||||
<label className="block text-sm font-medium text-gray-700 mb-2">
|
<label className="block text-sm font-medium text-gray-700 mb-2">
|
||||||
Blender File (.blend)
|
Blender File (.blend)
|
||||||
|
|||||||
@@ -87,6 +87,31 @@ export const jobs = {
|
|||||||
async getVideoUrl(jobId) {
|
async getVideoUrl(jobId) {
|
||||||
return `${API_BASE}/jobs/${jobId}/video`;
|
return `${API_BASE}/jobs/${jobId}/video`;
|
||||||
},
|
},
|
||||||
|
|
||||||
|
async getTaskLogs(jobId, taskId, options = {}) {
|
||||||
|
const params = new URLSearchParams();
|
||||||
|
if (options.stepName) params.append('step_name', options.stepName);
|
||||||
|
if (options.logLevel) params.append('log_level', options.logLevel);
|
||||||
|
if (options.limit) params.append('limit', options.limit.toString());
|
||||||
|
const query = params.toString();
|
||||||
|
return api.get(`/jobs/${jobId}/tasks/${taskId}/logs${query ? '?' + query : ''}`);
|
||||||
|
},
|
||||||
|
|
||||||
|
async getTaskSteps(jobId, taskId) {
|
||||||
|
return api.get(`/jobs/${jobId}/tasks/${taskId}/steps`);
|
||||||
|
},
|
||||||
|
|
||||||
|
streamTaskLogsWebSocket(jobId, taskId, lastId = 0) {
|
||||||
|
// Convert HTTP to WebSocket URL
|
||||||
|
const wsProtocol = window.location.protocol === 'https:' ? 'wss:' : 'ws:';
|
||||||
|
const wsHost = window.location.host;
|
||||||
|
const url = `${wsProtocol}//${wsHost}${API_BASE}/jobs/${jobId}/tasks/${taskId}/logs/ws?last_id=${lastId}`;
|
||||||
|
return new WebSocket(url);
|
||||||
|
},
|
||||||
|
|
||||||
|
async retryTask(jobId, taskId) {
|
||||||
|
return api.post(`/jobs/${jobId}/tasks/${taskId}/retry`);
|
||||||
|
},
|
||||||
};
|
};
|
||||||
|
|
||||||
export const runners = {
|
export const runners = {
|
||||||
|
|||||||
Reference in New Issue
Block a user