Refactor runner and API components to remove IP address handling. Update client and server logic to streamline runner registration and task distribution. Introduce write mutexes for connection management to enhance concurrency control. Clean up whitespace and improve code readability across multiple files.
This commit is contained in:
@@ -15,6 +15,7 @@ import (
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"jiggablend/pkg/types"
|
||||
@@ -144,7 +145,7 @@ func (s *Server) handleRegisterRunner(w http.ResponseWriter, r *http.Request) {
|
||||
registration_token, runner_secret, manager_secret, verified, priority)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
||||
RETURNING id`,
|
||||
req.Name, req.Hostname, req.IPAddress, types.RunnerStatusOnline, time.Now(), req.Capabilities,
|
||||
req.Name, req.Hostname, "", types.RunnerStatusOnline, time.Now(), req.Capabilities,
|
||||
req.RegistrationToken, runnerSecret, managerSecret, true, priority,
|
||||
).Scan(&runnerID)
|
||||
if err != nil {
|
||||
@@ -157,7 +158,6 @@ func (s *Server) handleRegisterRunner(w http.ResponseWriter, r *http.Request) {
|
||||
"id": runnerID,
|
||||
"name": req.Name,
|
||||
"hostname": req.Hostname,
|
||||
"ip_address": req.IPAddress,
|
||||
"status": types.RunnerStatusOnline,
|
||||
"runner_secret": runnerSecret,
|
||||
"manager_secret": managerSecret,
|
||||
@@ -683,14 +683,25 @@ func (s *Server) handleRunnerWebSocket(w http.ResponseWriter, r *http.Request) {
|
||||
defer conn.Close()
|
||||
|
||||
// Register connection (must be done before any distribution checks)
|
||||
// Close old connection outside lock to avoid blocking
|
||||
var oldConn *websocket.Conn
|
||||
s.runnerConnsMu.Lock()
|
||||
// Remove old connection if exists
|
||||
if oldConn, exists := s.runnerConns[runnerID]; exists {
|
||||
oldConn.Close()
|
||||
if existingConn, exists := s.runnerConns[runnerID]; exists {
|
||||
oldConn = existingConn
|
||||
}
|
||||
s.runnerConns[runnerID] = conn
|
||||
s.runnerConnsMu.Unlock()
|
||||
|
||||
// Close old connection outside lock (if it existed)
|
||||
if oldConn != nil {
|
||||
oldConn.Close()
|
||||
}
|
||||
|
||||
// Create a write mutex for this connection
|
||||
s.runnerConnsWriteMuMu.Lock()
|
||||
s.runnerConnsWriteMu[runnerID] = &sync.Mutex{}
|
||||
s.runnerConnsWriteMuMu.Unlock()
|
||||
|
||||
// Update runner status to online
|
||||
_, _ = s.db.Exec(
|
||||
`UPDATE runners SET status = ?, last_heartbeat = ? WHERE id = ?`,
|
||||
@@ -698,12 +709,8 @@ func (s *Server) handleRunnerWebSocket(w http.ResponseWriter, r *http.Request) {
|
||||
)
|
||||
|
||||
// Immediately try to distribute pending tasks to this newly connected runner
|
||||
// Use a small delay to ensure connection registration is fully visible to other goroutines
|
||||
log.Printf("Runner %d connected, distributing pending tasks", runnerID)
|
||||
go func() {
|
||||
time.Sleep(50 * time.Millisecond) // Small delay to ensure map update is visible
|
||||
s.distributeTasksToRunners()
|
||||
}()
|
||||
s.triggerTaskDistribution()
|
||||
|
||||
// Note: We don't log to task logs here because we don't know which tasks will be assigned yet
|
||||
// Task assignment logging happens in distributeTasksToRunners
|
||||
@@ -713,6 +720,9 @@ func (s *Server) handleRunnerWebSocket(w http.ResponseWriter, r *http.Request) {
|
||||
s.runnerConnsMu.Lock()
|
||||
delete(s.runnerConns, runnerID)
|
||||
s.runnerConnsMu.Unlock()
|
||||
s.runnerConnsWriteMuMu.Lock()
|
||||
delete(s.runnerConnsWriteMu, runnerID)
|
||||
s.runnerConnsWriteMuMu.Unlock()
|
||||
_, _ = s.db.Exec(
|
||||
`UPDATE runners SET status = ? WHERE id = ?`,
|
||||
types.RunnerStatusOffline, runnerID,
|
||||
@@ -743,15 +753,28 @@ func (s *Server) handleRunnerWebSocket(w http.ResponseWriter, r *http.Request) {
|
||||
defer ticker.Stop()
|
||||
for range ticker.C {
|
||||
s.runnerConnsMu.RLock()
|
||||
conn, exists := s.runnerConns[runnerID]
|
||||
currentConn, exists := s.runnerConns[runnerID]
|
||||
s.runnerConnsMu.RUnlock()
|
||||
if !exists {
|
||||
if !exists || currentConn != conn {
|
||||
// Connection was replaced or removed
|
||||
return
|
||||
}
|
||||
// Get write mutex for this connection
|
||||
s.runnerConnsWriteMuMu.RLock()
|
||||
writeMu, hasMu := s.runnerConnsWriteMu[runnerID]
|
||||
s.runnerConnsWriteMuMu.RUnlock()
|
||||
if !hasMu || writeMu == nil {
|
||||
return
|
||||
}
|
||||
// Send ping - runner should respond with pong automatically
|
||||
// Reset read deadline before sending ping to ensure we can receive pong
|
||||
conn.SetReadDeadline(time.Now().Add(90 * time.Second)) // Increased to 90 seconds
|
||||
if err := conn.WriteControl(websocket.PingMessage, []byte{}, time.Now().Add(10*time.Second)); err != nil {
|
||||
writeMu.Lock()
|
||||
err := conn.WriteControl(websocket.PingMessage, []byte{}, time.Now().Add(10*time.Second))
|
||||
writeMu.Unlock()
|
||||
if err != nil {
|
||||
// Write failed - connection is likely dead, read loop will detect and cleanup
|
||||
log.Printf("Failed to send ping to runner %d: %v", runnerID, err)
|
||||
return
|
||||
}
|
||||
}
|
||||
@@ -1191,7 +1214,7 @@ func (s *Server) updateJobStatusFromTasks(jobID int64) {
|
||||
// Update job status to ensure it's marked as running (has pending video task)
|
||||
s.updateJobStatusFromTasks(jobID)
|
||||
// Try to distribute the task immediately
|
||||
go s.distributeTasksToRunners()
|
||||
s.triggerTaskDistribution()
|
||||
}
|
||||
} else {
|
||||
log.Printf("Skipping video generation task creation for job %d (video task already exists)", jobID)
|
||||
@@ -1284,7 +1307,20 @@ func (s *Server) broadcastLogToFrontend(taskID int64, logEntry WSLogEntry) {
|
||||
}
|
||||
}
|
||||
|
||||
// triggerTaskDistribution triggers task distribution in a serialized manner
|
||||
func (s *Server) triggerTaskDistribution() {
|
||||
go func() {
|
||||
// Try to acquire lock - if already running, skip
|
||||
if !s.taskDistMu.TryLock() {
|
||||
return // Distribution already in progress
|
||||
}
|
||||
defer s.taskDistMu.Unlock()
|
||||
s.distributeTasksToRunners()
|
||||
}()
|
||||
}
|
||||
|
||||
// distributeTasksToRunners pushes available tasks to connected runners
|
||||
// This function should only be called while holding taskDistMu lock
|
||||
func (s *Server) distributeTasksToRunners() {
|
||||
// Quick check: if there are no pending tasks, skip the expensive query
|
||||
var pendingCount int
|
||||
@@ -1677,21 +1713,25 @@ func (s *Server) distributeTasksToRunners() {
|
||||
continue
|
||||
}
|
||||
|
||||
err = tx.Commit()
|
||||
if err != nil {
|
||||
log.Printf("Failed to commit transaction for task %d: %v", task.TaskID, err)
|
||||
continue
|
||||
}
|
||||
|
||||
// Check if the update actually affected a row (task was successfully assigned)
|
||||
rowsAffected, err := result.RowsAffected()
|
||||
if err != nil {
|
||||
tx.Rollback()
|
||||
log.Printf("Failed to get rows affected for task %d: %v", task.TaskID, err)
|
||||
continue
|
||||
}
|
||||
|
||||
if rowsAffected == 0 {
|
||||
// Task was already assigned by another goroutine, skip
|
||||
tx.Rollback()
|
||||
continue
|
||||
}
|
||||
|
||||
// Commit the assignment before attempting WebSocket send
|
||||
// If send fails, we'll rollback in a separate transaction
|
||||
err = tx.Commit()
|
||||
if err != nil {
|
||||
log.Printf("Failed to commit transaction for task %d: %v", task.TaskID, err)
|
||||
continue
|
||||
}
|
||||
|
||||
@@ -1702,27 +1742,41 @@ func (s *Server) distributeTasksToRunners() {
|
||||
"started_at": now,
|
||||
})
|
||||
|
||||
// Task was successfully assigned, send via WebSocket
|
||||
// Task was successfully assigned in database, now send via WebSocket
|
||||
log.Printf("Assigned task %d (type: %s, job: %d) to runner %d", task.TaskID, task.TaskType, task.JobID, selectedRunnerID)
|
||||
|
||||
// Update job status to running if this is the first task starting
|
||||
s.updateJobStatusFromTasks(task.JobID)
|
||||
|
||||
// Log runner assignment to task logs
|
||||
s.logTaskEvent(task.TaskID, nil, types.LogLevelInfo, fmt.Sprintf("Task assigned to runner %d", selectedRunnerID), "")
|
||||
|
||||
// Attempt to send task to runner via WebSocket
|
||||
if err := s.assignTaskToRunner(selectedRunnerID, task.TaskID); err != nil {
|
||||
log.Printf("Failed to send task %d to runner %d: %v", task.TaskID, selectedRunnerID, err)
|
||||
// Log assignment failure
|
||||
s.logTaskEvent(task.TaskID, nil, types.LogLevelError, fmt.Sprintf("Failed to send task to runner %d: %v", selectedRunnerID, err), "")
|
||||
// Rollback the assignment if WebSocket send fails
|
||||
s.db.Exec(
|
||||
`UPDATE tasks SET runner_id = NULL, status = ?, started_at = NULL
|
||||
WHERE id = ?`,
|
||||
types.TaskStatusPending, task.TaskID,
|
||||
)
|
||||
// Log rollback
|
||||
s.logTaskEvent(task.TaskID, nil, types.LogLevelWarn, fmt.Sprintf("Task assignment rolled back - runner %d connection failed", selectedRunnerID), "")
|
||||
// Rollback the assignment if WebSocket send fails using a new transaction
|
||||
rollbackTx, rollbackErr := s.db.Begin()
|
||||
if rollbackErr == nil {
|
||||
_, rollbackErr = rollbackTx.Exec(
|
||||
`UPDATE tasks SET runner_id = NULL, status = ?, started_at = NULL
|
||||
WHERE id = ? AND runner_id = ?`,
|
||||
types.TaskStatusPending, task.TaskID, selectedRunnerID,
|
||||
)
|
||||
if rollbackErr == nil {
|
||||
rollbackTx.Commit()
|
||||
// Log rollback
|
||||
s.logTaskEvent(task.TaskID, nil, types.LogLevelWarn, fmt.Sprintf("Task assignment rolled back - runner %d connection failed", selectedRunnerID), "")
|
||||
// Update job status after rollback
|
||||
s.updateJobStatusFromTasks(task.JobID)
|
||||
// Trigger redistribution
|
||||
s.triggerTaskDistribution()
|
||||
} else {
|
||||
rollbackTx.Rollback()
|
||||
log.Printf("Failed to rollback task %d assignment: %v", task.TaskID, rollbackErr)
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// WebSocket send succeeded, update job status
|
||||
s.updateJobStatusFromTasks(task.JobID)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1805,13 +1859,34 @@ func (s *Server) assignTaskToRunner(runnerID int64, taskID int64) error {
|
||||
return fmt.Errorf("task %d is not assigned to runner %d", taskID, runnerID)
|
||||
}
|
||||
|
||||
// Send task via WebSocket
|
||||
// Send task via WebSocket with write mutex protection
|
||||
msg := WSMessage{
|
||||
Type: "task_assignment",
|
||||
Timestamp: time.Now().Unix(),
|
||||
}
|
||||
msg.Data, _ = json.Marshal(task)
|
||||
return conn.WriteJSON(msg)
|
||||
|
||||
// Get write mutex for this connection
|
||||
s.runnerConnsWriteMuMu.RLock()
|
||||
writeMu, hasMu := s.runnerConnsWriteMu[runnerID]
|
||||
s.runnerConnsWriteMuMu.RUnlock()
|
||||
|
||||
if !hasMu || writeMu == nil {
|
||||
return fmt.Errorf("runner %d write mutex not found", runnerID)
|
||||
}
|
||||
|
||||
// Re-check connection is still valid before writing
|
||||
s.runnerConnsMu.RLock()
|
||||
_, stillExists := s.runnerConns[runnerID]
|
||||
s.runnerConnsMu.RUnlock()
|
||||
if !stillExists {
|
||||
return fmt.Errorf("runner %d disconnected", runnerID)
|
||||
}
|
||||
|
||||
writeMu.Lock()
|
||||
err = conn.WriteJSON(msg)
|
||||
writeMu.Unlock()
|
||||
return err
|
||||
}
|
||||
|
||||
// redistributeRunnerTasks resets tasks assigned to a disconnected/dead runner and redistributes them
|
||||
@@ -1883,7 +1958,7 @@ func (s *Server) redistributeRunnerTasks(runnerID int64) {
|
||||
}
|
||||
|
||||
// Immediately redistribute the reset tasks
|
||||
go s.distributeTasksToRunners()
|
||||
s.triggerTaskDistribution()
|
||||
}
|
||||
|
||||
// logTaskEvent logs an event to a task's log (manager-side logging)
|
||||
|
||||
Reference in New Issue
Block a user