@@ -15,6 +15,7 @@ import (
 	"sort"
 	"strconv"
 	"strings"
+	"sync"
 	"time"
 
 	"jiggablend/pkg/types"
@@ -144,7 +145,7 @@ func (s *Server) handleRegisterRunner(w http.ResponseWriter, r *http.Request) {
 		registration_token, runner_secret, manager_secret, verified, priority)
 		VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
 		RETURNING id`,
-		req.Name, req.Hostname, req.IPAddress, types.RunnerStatusOnline, time.Now(), req.Capabilities,
+		req.Name, req.Hostname, "", types.RunnerStatusOnline, time.Now(), req.Capabilities,
 		req.RegistrationToken, runnerSecret, managerSecret, true, priority,
 	).Scan(&runnerID)
 	if err != nil {
@@ -157,7 +158,6 @@ func (s *Server) handleRegisterRunner(w http.ResponseWriter, r *http.Request) {
 		"id":             runnerID,
 		"name":           req.Name,
 		"hostname":       req.Hostname,
-		"ip_address":     req.IPAddress,
 		"status":         types.RunnerStatusOnline,
 		"runner_secret":  runnerSecret,
 		"manager_secret": managerSecret,
@@ -683,14 +683,25 @@ func (s *Server) handleRunnerWebSocket(w http.ResponseWriter, r *http.Request) {
 	defer conn.Close()
 
 	// Register connection (must be done before any distribution checks)
+	// Close old connection outside lock to avoid blocking
+	var oldConn *websocket.Conn
 	s.runnerConnsMu.Lock()
 	// Remove old connection if exists
-	if oldConn, exists := s.runnerConns[runnerID]; exists {
-		oldConn.Close()
+	if existingConn, exists := s.runnerConns[runnerID]; exists {
+		oldConn = existingConn
 	}
 	s.runnerConns[runnerID] = conn
 	s.runnerConnsMu.Unlock()
 
+	// Close old connection outside lock (if it existed)
+	if oldConn != nil {
+		oldConn.Close()
+	}
+
+	// Create a write mutex for this connection
+	s.runnerConnsWriteMuMu.Lock()
+	s.runnerConnsWriteMu[runnerID] = &sync.Mutex{}
+	s.runnerConnsWriteMuMu.Unlock()
+
 	// Update runner status to online
 	_, _ = s.db.Exec(
 		`UPDATE runners SET status = ?, last_heartbeat = ? WHERE id = ?`,
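
Note: the hunk above changes the registration sequence in two ways: the previous connection is now recorded under runnerConnsMu but closed only after the lock is released (Close can block on network I/O, and holding the lock across it would stall every other registration), and each connection gets its own write mutex for serializing outbound frames. A minimal standalone sketch of the swap-under-lock, close-outside-lock pattern; the connRegistry type and its names are illustrative, not the PR's actual types:

    package sketch

    import (
        "io"
        "sync"
    )

    // connRegistry holds one connection per runner ID.
    type connRegistry struct {
        mu    sync.Mutex
        conns map[int64]io.Closer
    }

    // replace swaps in the new connection under the lock, but closes the
    // old one only after the lock is released, so a slow Close cannot
    // block other goroutines touching the registry.
    func (r *connRegistry) replace(id int64, c io.Closer) {
        var old io.Closer
        r.mu.Lock()
        if existing, ok := r.conns[id]; ok {
            old = existing
        }
        r.conns[id] = c
        r.mu.Unlock()

        if old != nil {
            old.Close() // may block, but no longer holds the registry lock
        }
    }
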
@@ -698,12 +709,8 @@ func (s *Server) handleRunnerWebSocket(w http.ResponseWriter, r *http.Request) {
 	)
 
 	// Immediately try to distribute pending tasks to this newly connected runner
-	// Use a small delay to ensure connection registration is fully visible to other goroutines
 	log.Printf("Runner %d connected, distributing pending tasks", runnerID)
-	go func() {
-		time.Sleep(50 * time.Millisecond) // Small delay to ensure map update is visible
-		s.distributeTasksToRunners()
-	}()
+	s.triggerTaskDistribution()
 
 	// Note: We don't log to task logs here because we don't know which tasks will be assigned yet
 	// Task assignment logging happens in distributeTasksToRunners
@@ -713,6 +720,9 @@ func (s *Server) handleRunnerWebSocket(w http.ResponseWriter, r *http.Request) {
 		s.runnerConnsMu.Lock()
 		delete(s.runnerConns, runnerID)
 		s.runnerConnsMu.Unlock()
+		s.runnerConnsWriteMuMu.Lock()
+		delete(s.runnerConnsWriteMu, runnerID)
+		s.runnerConnsWriteMuMu.Unlock()
 		_, _ = s.db.Exec(
 			`UPDATE runners SET status = ? WHERE id = ?`,
 			types.RunnerStatusOffline, runnerID,
@@ -743,15 +753,28 @@ func (s *Server) handleRunnerWebSocket(w http.ResponseWriter, r *http.Request) {
 		defer ticker.Stop()
 		for range ticker.C {
 			s.runnerConnsMu.RLock()
-			conn, exists := s.runnerConns[runnerID]
+			currentConn, exists := s.runnerConns[runnerID]
 			s.runnerConnsMu.RUnlock()
-			if !exists {
+			if !exists || currentConn != conn {
+				// Connection was replaced or removed
 				return
 			}
+			// Get write mutex for this connection
+			s.runnerConnsWriteMuMu.RLock()
+			writeMu, hasMu := s.runnerConnsWriteMu[runnerID]
+			s.runnerConnsWriteMuMu.RUnlock()
+			if !hasMu || writeMu == nil {
+				return
+			}
 			// Send ping - runner should respond with pong automatically
+			// Reset read deadline before sending ping to ensure we can receive pong
 			conn.SetReadDeadline(time.Now().Add(90 * time.Second)) // Increased to 90 seconds
-			if err := conn.WriteControl(websocket.PingMessage, []byte{}, time.Now().Add(10*time.Second)); err != nil {
+			writeMu.Lock()
+			err := conn.WriteControl(websocket.PingMessage, []byte{}, time.Now().Add(10*time.Second))
+			writeMu.Unlock()
+			if err != nil {
+				// Write failed - connection is likely dead, read loop will detect and cleanup
 				log.Printf("Failed to send ping to runner %d: %v", runnerID, err)
 				return
 			}
 		}
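
Note: two details in this hunk are easy to miss. The old loop re-declared conn from the map, so the check could never notice that the map entry had been replaced by a newer connection; comparing currentConn against the outer conn is what lets a pinger for a superseded connection exit instead of resetting read deadlines on the new one. Also, gorilla/websocket documents Close and WriteControl as safe to call concurrently with all other methods, so the writeMu around the ping is defensive; it is WriteJSON/WriteMessage that must never run concurrently. A condensed sketch of the exit condition, with illustrative names and ticker interval:

    package sketch

    import (
        "sync"
        "time"

        "github.com/gorilla/websocket"
    )

    // pingUntilReplaced pings conn until it is removed from conns or
    // replaced by a newer connection registered under the same runner ID.
    func pingUntilReplaced(mu *sync.RWMutex, conns map[int64]*websocket.Conn, runnerID int64, conn *websocket.Conn) {
        ticker := time.NewTicker(30 * time.Second)
        defer ticker.Stop()
        for range ticker.C {
            mu.RLock()
            current, ok := conns[runnerID]
            mu.RUnlock()
            if !ok || current != conn {
                return // removed or replaced; the new conn has its own pinger
            }
            // WriteControl is documented as safe for concurrent use.
            if err := conn.WriteControl(websocket.PingMessage, nil, time.Now().Add(10*time.Second)); err != nil {
                return // write failed; the read loop will observe the dead conn
            }
        }
    }
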
@@ -1191,7 +1214,7 @@ func (s *Server) updateJobStatusFromTasks(jobID int64) {
 			// Update job status to ensure it's marked as running (has pending video task)
 			s.updateJobStatusFromTasks(jobID)
 			// Try to distribute the task immediately
-			go s.distributeTasksToRunners()
+			s.triggerTaskDistribution()
 		}
 	} else {
 		log.Printf("Skipping video generation task creation for job %d (video task already exists)", jobID)
@@ -1284,7 +1307,20 @@ func (s *Server) broadcastLogToFrontend(taskID int64, logEntry WSLogEntry) {
 	}
 }
 
+// triggerTaskDistribution triggers task distribution in a serialized manner
+func (s *Server) triggerTaskDistribution() {
+	go func() {
+		// Try to acquire lock - if already running, skip
+		if !s.taskDistMu.TryLock() {
+			return // Distribution already in progress
+		}
+		defer s.taskDistMu.Unlock()
+		s.distributeTasksToRunners()
+	}()
+}
+
 // distributeTasksToRunners pushes available tasks to connected runners
+// This function should only be called while holding the taskDistMu lock
 func (s *Server) distributeTasksToRunners() {
 	// Quick check: if there are no pending tasks, skip the expensive query
 	var pendingCount int
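
Note: triggerTaskDistribution relies on sync.Mutex.TryLock (available since Go 1.18) to coalesce bursts of triggers: if a distribution pass is already running, the new trigger is dropped rather than queued behind it. The trade-off is that work arriving mid-pass stays pending until the next event fires a fresh trigger. A sketch of a variant that closes that gap with a dirty flag; this is a hypothetical extension, not part of the PR:

    package sketch

    import (
        "sync"
        "sync/atomic"
    )

    // coalescer runs fn at most once at a time; a trigger that arrives
    // while fn is running schedules one more run instead of being dropped.
    type coalescer struct {
        mu    sync.Mutex
        dirty atomic.Bool
        fn    func()
    }

    func (c *coalescer) trigger() {
        c.dirty.Store(true)
        go func() {
            for c.dirty.Load() {
                if !c.mu.TryLock() {
                    return // the goroutine holding the lock will re-check dirty
                }
                c.dirty.Store(false)
                c.fn()
                c.mu.Unlock()
            }
        }()
    }
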
@@ -1677,21 +1713,25 @@ func (s *Server) distributeTasksToRunners() {
 			continue
 		}
 
-		err = tx.Commit()
-		if err != nil {
-			log.Printf("Failed to commit transaction for task %d: %v", task.TaskID, err)
-			continue
-		}
-
 		// Check if the update actually affected a row (task was successfully assigned)
 		rowsAffected, err := result.RowsAffected()
 		if err != nil {
+			tx.Rollback()
 			log.Printf("Failed to get rows affected for task %d: %v", task.TaskID, err)
 			continue
 		}
 
 		if rowsAffected == 0 {
 			// Task was already assigned by another goroutine, skip
+			tx.Rollback()
 			continue
 		}
+
+		// Commit the assignment before attempting WebSocket send
+		// If send fails, we'll rollback in a separate transaction
+		err = tx.Commit()
+		if err != nil {
+			log.Printf("Failed to commit transaction for task %d: %v", task.TaskID, err)
+			continue
+		}
 
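Note: the reordering is the substantive fix here. Previously the transaction was committed before RowsAffected was inspected, so by the time the code learned it had lost the race there was nothing left to roll back. With the check inside the open transaction, the guarded UPDATE behaves like a compare-and-set and a lost race is undone by a plain Rollback. A minimal sketch of the claim pattern; the table shape and status values are illustrative:

    package sketch

    import "database/sql"

    // claimTask atomically assigns a pending task to a runner. It returns
    // false if another distributor claimed the task first.
    func claimTask(db *sql.DB, taskID, runnerID int64) (bool, error) {
        tx, err := db.Begin()
        if err != nil {
            return false, err
        }
        res, err := tx.Exec(
            `UPDATE tasks SET runner_id = ?, status = 'assigned'
             WHERE id = ? AND status = 'pending'`, // compare-and-set guard
            runnerID, taskID,
        )
        if err != nil {
            tx.Rollback()
            return false, err
        }
        n, err := res.RowsAffected()
        if err != nil || n == 0 {
            tx.Rollback() // lost the race (or driver error); keep nothing
            return false, err
        }
        return true, tx.Commit()
    }
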
@@ -1702,27 +1742,41 @@ func (s *Server) distributeTasksToRunners() {
 			"started_at": now,
 		})
 
-		// Task was successfully assigned, send via WebSocket
+		// Task was successfully assigned in database, now send via WebSocket
 		log.Printf("Assigned task %d (type: %s, job: %d) to runner %d", task.TaskID, task.TaskType, task.JobID, selectedRunnerID)
 
-		// Update job status to running if this is the first task starting
-		s.updateJobStatusFromTasks(task.JobID)
-
 		// Log runner assignment to task logs
 		s.logTaskEvent(task.TaskID, nil, types.LogLevelInfo, fmt.Sprintf("Task assigned to runner %d", selectedRunnerID), "")
 
+		// Attempt to send task to runner via WebSocket
 		if err := s.assignTaskToRunner(selectedRunnerID, task.TaskID); err != nil {
 			log.Printf("Failed to send task %d to runner %d: %v", task.TaskID, selectedRunnerID, err)
 			// Log assignment failure
 			s.logTaskEvent(task.TaskID, nil, types.LogLevelError, fmt.Sprintf("Failed to send task to runner %d: %v", selectedRunnerID, err), "")
-			// Rollback the assignment if WebSocket send fails
-			s.db.Exec(
-				`UPDATE tasks SET runner_id = NULL, status = ?, started_at = NULL
-				WHERE id = ?`,
-				types.TaskStatusPending, task.TaskID,
-			)
-			// Log rollback
-			s.logTaskEvent(task.TaskID, nil, types.LogLevelWarn, fmt.Sprintf("Task assignment rolled back - runner %d connection failed", selectedRunnerID), "")
+			// Roll back the assignment in a new transaction if the WebSocket send fails
+			rollbackTx, rollbackErr := s.db.Begin()
+			if rollbackErr == nil {
+				_, rollbackErr = rollbackTx.Exec(
+					`UPDATE tasks SET runner_id = NULL, status = ?, started_at = NULL
+					WHERE id = ? AND runner_id = ?`,
+					types.TaskStatusPending, task.TaskID, selectedRunnerID,
+				)
+				if rollbackErr == nil {
+					rollbackTx.Commit()
+					// Log rollback
+					s.logTaskEvent(task.TaskID, nil, types.LogLevelWarn, fmt.Sprintf("Task assignment rolled back - runner %d connection failed", selectedRunnerID), "")
+					// Update job status after rollback
+					s.updateJobStatusFromTasks(task.JobID)
+					// Trigger redistribution
+					s.triggerTaskDistribution()
+				} else {
+					rollbackTx.Rollback()
+					log.Printf("Failed to rollback task %d assignment: %v", task.TaskID, rollbackErr)
+				}
+			}
+		} else {
+			// WebSocket send succeeded, update job status
+			s.updateJobStatusFromTasks(task.JobID)
 		}
 	}
 }
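
Note: the important delta from the replaced rollback is the WHERE id = ? AND runner_id = ? guard: the task is un-assigned only while it is still held by the runner whose send failed, so a concurrent reassignment to another runner is left untouched. A compressed sketch; column names follow the hunk, the status value is illustrative:

    package sketch

    import "database/sql"

    // releaseTask rolls back an assignment, but only if the task is still
    // held by the runner whose WebSocket send failed.
    func releaseTask(db *sql.DB, taskID, runnerID int64) error {
        _, err := db.Exec(
            `UPDATE tasks SET runner_id = NULL, status = 'pending', started_at = NULL
             WHERE id = ? AND runner_id = ?`, // no-op if someone else owns it now
            taskID, runnerID,
        )
        return err
    }
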
@@ -1805,13 +1859,34 @@ func (s *Server) assignTaskToRunner(runnerID int64, taskID int64) error {
 		return fmt.Errorf("task %d is not assigned to runner %d", taskID, runnerID)
 	}
 
-	// Send task via WebSocket
+	// Send task via WebSocket with write mutex protection
 	msg := WSMessage{
 		Type:      "task_assignment",
 		Timestamp: time.Now().Unix(),
 	}
 	msg.Data, _ = json.Marshal(task)
-	return conn.WriteJSON(msg)
+
+	// Get write mutex for this connection
+	s.runnerConnsWriteMuMu.RLock()
+	writeMu, hasMu := s.runnerConnsWriteMu[runnerID]
+	s.runnerConnsWriteMuMu.RUnlock()
+
+	if !hasMu || writeMu == nil {
+		return fmt.Errorf("runner %d write mutex not found", runnerID)
+	}
+
+	// Re-check connection is still valid before writing
+	s.runnerConnsMu.RLock()
+	_, stillExists := s.runnerConns[runnerID]
+	s.runnerConnsMu.RUnlock()
+	if !stillExists {
+		return fmt.Errorf("runner %d disconnected", runnerID)
+	}
+
+	writeMu.Lock()
+	err = conn.WriteJSON(msg)
+	writeMu.Unlock()
+	return err
 }
 
 // redistributeRunnerTasks resets tasks assigned to a disconnected/dead runner and redistributes them
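
Note: a small race window remains in this function: the connection can be replaced between the stillExists check and the WriteJSON, at which point the writeMu looked up by runnerID guards the new connection while conn still points at the old one. Bundling each connection with its own mutex in a single map entry removes that pairing hazard. The sketch below is a possible refactor, not what the PR does:

    package sketch

    import (
        "fmt"
        "sync"

        "github.com/gorilla/websocket"
    )

    // runnerConn bundles a connection with its write mutex so a lookup can
    // never pair a connection with another connection's mutex.
    type runnerConn struct {
        conn    *websocket.Conn
        writeMu sync.Mutex
    }

    type registry struct {
        mu    sync.RWMutex
        conns map[int64]*runnerConn
    }

    // sendJSON snapshots the bundle under one lock, then writes under the
    // bundle's own mutex. If the connection is replaced after the snapshot,
    // the write fails on the stale *websocket.Conn instead of interleaving
    // frames on the live one.
    func (r *registry) sendJSON(runnerID int64, msg any) error {
        r.mu.RLock()
        rc, ok := r.conns[runnerID]
        r.mu.RUnlock()
        if !ok {
            return fmt.Errorf("runner %d disconnected", runnerID)
        }
        rc.writeMu.Lock()
        defer rc.writeMu.Unlock()
        return rc.conn.WriteJSON(msg)
    }
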
@@ -1883,7 +1958,7 @@ func (s *Server) redistributeRunnerTasks(runnerID int64) {
 	}
 
 	// Immediately redistribute the reset tasks
-	go s.distributeTasksToRunners()
+	s.triggerTaskDistribution()
 }
 
 // logTaskEvent logs an event to a task's log (manager-side logging)