Refactor runner and API components to remove IP address handling. Update client and server logic to streamline runner registration and task distribution. Introduce write mutexes for connection management to enhance concurrency control. Clean up whitespace and improve code readability across multiple files.

This commit is contained in:
2025-11-24 22:58:56 -06:00
parent 3217bbfe4d
commit a53ea4dce7
9 changed files with 133 additions and 67 deletions

View File

@@ -15,6 +15,7 @@ import (
"sort"
"strconv"
"strings"
"sync"
"time"
"jiggablend/pkg/types"
@@ -144,7 +145,7 @@ func (s *Server) handleRegisterRunner(w http.ResponseWriter, r *http.Request) {
registration_token, runner_secret, manager_secret, verified, priority)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
RETURNING id`,
req.Name, req.Hostname, req.IPAddress, types.RunnerStatusOnline, time.Now(), req.Capabilities,
req.Name, req.Hostname, "", types.RunnerStatusOnline, time.Now(), req.Capabilities,
req.RegistrationToken, runnerSecret, managerSecret, true, priority,
).Scan(&runnerID)
if err != nil {
@@ -157,7 +158,6 @@ func (s *Server) handleRegisterRunner(w http.ResponseWriter, r *http.Request) {
"id": runnerID,
"name": req.Name,
"hostname": req.Hostname,
"ip_address": req.IPAddress,
"status": types.RunnerStatusOnline,
"runner_secret": runnerSecret,
"manager_secret": managerSecret,
@@ -683,14 +683,25 @@ func (s *Server) handleRunnerWebSocket(w http.ResponseWriter, r *http.Request) {
defer conn.Close()
// Register connection (must be done before any distribution checks)
// Close old connection outside lock to avoid blocking
var oldConn *websocket.Conn
s.runnerConnsMu.Lock()
// Remove old connection if exists
if oldConn, exists := s.runnerConns[runnerID]; exists {
oldConn.Close()
if existingConn, exists := s.runnerConns[runnerID]; exists {
oldConn = existingConn
}
s.runnerConns[runnerID] = conn
s.runnerConnsMu.Unlock()
// Close old connection outside lock (if it existed)
if oldConn != nil {
oldConn.Close()
}
// Create a write mutex for this connection
s.runnerConnsWriteMuMu.Lock()
s.runnerConnsWriteMu[runnerID] = &sync.Mutex{}
s.runnerConnsWriteMuMu.Unlock()
// Update runner status to online
_, _ = s.db.Exec(
`UPDATE runners SET status = ?, last_heartbeat = ? WHERE id = ?`,
@@ -698,12 +709,8 @@ func (s *Server) handleRunnerWebSocket(w http.ResponseWriter, r *http.Request) {
)
// Immediately try to distribute pending tasks to this newly connected runner
// Use a small delay to ensure connection registration is fully visible to other goroutines
log.Printf("Runner %d connected, distributing pending tasks", runnerID)
go func() {
time.Sleep(50 * time.Millisecond) // Small delay to ensure map update is visible
s.distributeTasksToRunners()
}()
s.triggerTaskDistribution()
// Note: We don't log to task logs here because we don't know which tasks will be assigned yet
// Task assignment logging happens in distributeTasksToRunners
@@ -713,6 +720,9 @@ func (s *Server) handleRunnerWebSocket(w http.ResponseWriter, r *http.Request) {
s.runnerConnsMu.Lock()
delete(s.runnerConns, runnerID)
s.runnerConnsMu.Unlock()
s.runnerConnsWriteMuMu.Lock()
delete(s.runnerConnsWriteMu, runnerID)
s.runnerConnsWriteMuMu.Unlock()
_, _ = s.db.Exec(
`UPDATE runners SET status = ? WHERE id = ?`,
types.RunnerStatusOffline, runnerID,
@@ -743,15 +753,28 @@ func (s *Server) handleRunnerWebSocket(w http.ResponseWriter, r *http.Request) {
defer ticker.Stop()
for range ticker.C {
s.runnerConnsMu.RLock()
conn, exists := s.runnerConns[runnerID]
currentConn, exists := s.runnerConns[runnerID]
s.runnerConnsMu.RUnlock()
if !exists {
if !exists || currentConn != conn {
// Connection was replaced or removed
return
}
// Get write mutex for this connection
s.runnerConnsWriteMuMu.RLock()
writeMu, hasMu := s.runnerConnsWriteMu[runnerID]
s.runnerConnsWriteMuMu.RUnlock()
if !hasMu || writeMu == nil {
return
}
// Send ping - runner should respond with pong automatically
// Reset read deadline before sending ping to ensure we can receive pong
conn.SetReadDeadline(time.Now().Add(90 * time.Second)) // Increased to 90 seconds
if err := conn.WriteControl(websocket.PingMessage, []byte{}, time.Now().Add(10*time.Second)); err != nil {
writeMu.Lock()
err := conn.WriteControl(websocket.PingMessage, []byte{}, time.Now().Add(10*time.Second))
writeMu.Unlock()
if err != nil {
// Write failed - connection is likely dead, read loop will detect and cleanup
log.Printf("Failed to send ping to runner %d: %v", runnerID, err)
return
}
}
@@ -1191,7 +1214,7 @@ func (s *Server) updateJobStatusFromTasks(jobID int64) {
// Update job status to ensure it's marked as running (has pending video task)
s.updateJobStatusFromTasks(jobID)
// Try to distribute the task immediately
go s.distributeTasksToRunners()
s.triggerTaskDistribution()
}
} else {
log.Printf("Skipping video generation task creation for job %d (video task already exists)", jobID)
@@ -1284,7 +1307,20 @@ func (s *Server) broadcastLogToFrontend(taskID int64, logEntry WSLogEntry) {
}
}
// triggerTaskDistribution triggers task distribution in a serialized manner.
// It spawns a goroutine that runs distributeTasksToRunners under s.taskDistMu,
// so at most one distribution pass executes at a time; concurrent triggers are
// coalesced rather than queued.
// NOTE(review): sync.Mutex.TryLock requires Go 1.18+ — confirm go.mod.
// NOTE(review): a trigger that arrives while a pass is already in flight is
// silently dropped, not deferred. Tasks that become pending during that
// in-flight pass are only picked up by the NEXT trigger (or a periodic sweep,
// if one exists outside this view) — verify callers tolerate this lost-wakeup
// window.
func (s *Server) triggerTaskDistribution() {
go func() {
// Try to acquire lock - if already running, skip
if !s.taskDistMu.TryLock() {
return // Distribution already in progress
}
// Unlock when this pass completes so the next trigger can run.
defer s.taskDistMu.Unlock()
s.distributeTasksToRunners()
}()
}
// distributeTasksToRunners pushes available tasks to connected runners
// This function should only be called while holding taskDistMu lock
func (s *Server) distributeTasksToRunners() {
// Quick check: if there are no pending tasks, skip the expensive query
var pendingCount int
@@ -1677,21 +1713,25 @@ func (s *Server) distributeTasksToRunners() {
continue
}
err = tx.Commit()
if err != nil {
log.Printf("Failed to commit transaction for task %d: %v", task.TaskID, err)
continue
}
// Check if the update actually affected a row (task was successfully assigned)
rowsAffected, err := result.RowsAffected()
if err != nil {
tx.Rollback()
log.Printf("Failed to get rows affected for task %d: %v", task.TaskID, err)
continue
}
if rowsAffected == 0 {
// Task was already assigned by another goroutine, skip
tx.Rollback()
continue
}
// Commit the assignment before attempting WebSocket send
// If send fails, we'll rollback in a separate transaction
err = tx.Commit()
if err != nil {
log.Printf("Failed to commit transaction for task %d: %v", task.TaskID, err)
continue
}
@@ -1702,27 +1742,41 @@ func (s *Server) distributeTasksToRunners() {
"started_at": now,
})
// Task was successfully assigned, send via WebSocket
// Task was successfully assigned in database, now send via WebSocket
log.Printf("Assigned task %d (type: %s, job: %d) to runner %d", task.TaskID, task.TaskType, task.JobID, selectedRunnerID)
// Update job status to running if this is the first task starting
s.updateJobStatusFromTasks(task.JobID)
// Log runner assignment to task logs
s.logTaskEvent(task.TaskID, nil, types.LogLevelInfo, fmt.Sprintf("Task assigned to runner %d", selectedRunnerID), "")
// Attempt to send task to runner via WebSocket
if err := s.assignTaskToRunner(selectedRunnerID, task.TaskID); err != nil {
log.Printf("Failed to send task %d to runner %d: %v", task.TaskID, selectedRunnerID, err)
// Log assignment failure
s.logTaskEvent(task.TaskID, nil, types.LogLevelError, fmt.Sprintf("Failed to send task to runner %d: %v", selectedRunnerID, err), "")
// Rollback the assignment if WebSocket send fails
s.db.Exec(
`UPDATE tasks SET runner_id = NULL, status = ?, started_at = NULL
WHERE id = ?`,
types.TaskStatusPending, task.TaskID,
)
// Log rollback
s.logTaskEvent(task.TaskID, nil, types.LogLevelWarn, fmt.Sprintf("Task assignment rolled back - runner %d connection failed", selectedRunnerID), "")
// Rollback the assignment if WebSocket send fails using a new transaction
rollbackTx, rollbackErr := s.db.Begin()
if rollbackErr == nil {
_, rollbackErr = rollbackTx.Exec(
`UPDATE tasks SET runner_id = NULL, status = ?, started_at = NULL
WHERE id = ? AND runner_id = ?`,
types.TaskStatusPending, task.TaskID, selectedRunnerID,
)
if rollbackErr == nil {
rollbackTx.Commit()
// Log rollback
s.logTaskEvent(task.TaskID, nil, types.LogLevelWarn, fmt.Sprintf("Task assignment rolled back - runner %d connection failed", selectedRunnerID), "")
// Update job status after rollback
s.updateJobStatusFromTasks(task.JobID)
// Trigger redistribution
s.triggerTaskDistribution()
} else {
rollbackTx.Rollback()
log.Printf("Failed to rollback task %d assignment: %v", task.TaskID, rollbackErr)
}
}
} else {
// WebSocket send succeeded, update job status
s.updateJobStatusFromTasks(task.JobID)
}
}
}
@@ -1805,13 +1859,34 @@ func (s *Server) assignTaskToRunner(runnerID int64, taskID int64) error {
return fmt.Errorf("task %d is not assigned to runner %d", taskID, runnerID)
}
// Send task via WebSocket
// Send task via WebSocket with write mutex protection
msg := WSMessage{
Type: "task_assignment",
Timestamp: time.Now().Unix(),
}
msg.Data, _ = json.Marshal(task)
return conn.WriteJSON(msg)
// Get write mutex for this connection
s.runnerConnsWriteMuMu.RLock()
writeMu, hasMu := s.runnerConnsWriteMu[runnerID]
s.runnerConnsWriteMuMu.RUnlock()
if !hasMu || writeMu == nil {
return fmt.Errorf("runner %d write mutex not found", runnerID)
}
// Re-check connection is still valid before writing
s.runnerConnsMu.RLock()
_, stillExists := s.runnerConns[runnerID]
s.runnerConnsMu.RUnlock()
if !stillExists {
return fmt.Errorf("runner %d disconnected", runnerID)
}
writeMu.Lock()
err = conn.WriteJSON(msg)
writeMu.Unlock()
return err
}
// redistributeRunnerTasks resets tasks assigned to a disconnected/dead runner and redistributes them
@@ -1883,7 +1958,7 @@ func (s *Server) redistributeRunnerTasks(runnerID int64) {
}
// Immediately redistribute the reset tasks
go s.distributeTasksToRunners()
s.triggerTaskDistribution()
}
// logTaskEvent logs an event to a task's log (manager-side logging)