Refactor runner and API components to remove IP address handling. Update client and server logic to streamline runner registration and task distribution. Introduce write mutexes for connection management to enhance concurrency control. Clean up whitespace and improve code readability across multiple files.
This commit is contained in:
@@ -28,7 +28,6 @@ func main() {
|
|||||||
managerURL = flag.String("manager", getEnv("MANAGER_URL", "http://localhost:8080"), "Manager URL")
|
managerURL = flag.String("manager", getEnv("MANAGER_URL", "http://localhost:8080"), "Manager URL")
|
||||||
name = flag.String("name", getEnv("RUNNER_NAME", ""), "Runner name")
|
name = flag.String("name", getEnv("RUNNER_NAME", ""), "Runner name")
|
||||||
hostname = flag.String("hostname", getEnv("RUNNER_HOSTNAME", ""), "Runner hostname")
|
hostname = flag.String("hostname", getEnv("RUNNER_HOSTNAME", ""), "Runner hostname")
|
||||||
ipAddress = flag.String("ip", getEnv("RUNNER_IP", ""), "Runner IP address")
|
|
||||||
token = flag.String("token", getEnv("REGISTRATION_TOKEN", ""), "Registration token")
|
token = flag.String("token", getEnv("REGISTRATION_TOKEN", ""), "Registration token")
|
||||||
secretsFile = flag.String("secrets-file", getEnv("SECRETS_FILE", ""), "Path to secrets file for persistent storage (default: ./runner-secrets.json, or ./runner-secrets-{id}.json if multiple runners)")
|
secretsFile = flag.String("secrets-file", getEnv("SECRETS_FILE", ""), "Path to secrets file for persistent storage (default: ./runner-secrets.json, or ./runner-secrets-{id}.json if multiple runners)")
|
||||||
runnerIDSuffix = flag.String("runner-id", getEnv("RUNNER_ID", ""), "Unique runner ID suffix (auto-generated if not provided)")
|
runnerIDSuffix = flag.String("runner-id", getEnv("RUNNER_ID", ""), "Unique runner ID suffix (auto-generated if not provided)")
|
||||||
@@ -42,9 +41,6 @@ func main() {
|
|||||||
if *hostname == "" {
|
if *hostname == "" {
|
||||||
*hostname, _ = os.Hostname()
|
*hostname, _ = os.Hostname()
|
||||||
}
|
}
|
||||||
if *ipAddress == "" {
|
|
||||||
*ipAddress = "127.0.0.1"
|
|
||||||
}
|
|
||||||
|
|
||||||
// Generate or use provided runner ID suffix
|
// Generate or use provided runner ID suffix
|
||||||
runnerIDStr := *runnerIDSuffix
|
runnerIDStr := *runnerIDSuffix
|
||||||
@@ -87,7 +83,7 @@ func main() {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
client := runner.NewClient(*managerURL, *name, *hostname, *ipAddress)
|
client := runner.NewClient(*managerURL, *name, *hostname)
|
||||||
|
|
||||||
// Probe capabilities once at startup (before any registration attempts)
|
// Probe capabilities once at startup (before any registration attempts)
|
||||||
log.Printf("Probing runner capabilities...")
|
log.Printf("Probing runner capabilities...")
|
||||||
|
|||||||
@@ -136,7 +136,7 @@ func (s *Server) handleDeleteRunner(w http.ResponseWriter, r *http.Request) {
|
|||||||
// handleListRunnersAdmin lists all runners with admin details
|
// handleListRunnersAdmin lists all runners with admin details
|
||||||
func (s *Server) handleListRunnersAdmin(w http.ResponseWriter, r *http.Request) {
|
func (s *Server) handleListRunnersAdmin(w http.ResponseWriter, r *http.Request) {
|
||||||
rows, err := s.db.Query(
|
rows, err := s.db.Query(
|
||||||
`SELECT id, name, hostname, ip_address, status, last_heartbeat, capabilities,
|
`SELECT id, name, hostname, status, last_heartbeat, capabilities,
|
||||||
registration_token, verified, priority, created_at
|
registration_token, verified, priority, created_at
|
||||||
FROM runners ORDER BY created_at DESC`,
|
FROM runners ORDER BY created_at DESC`,
|
||||||
)
|
)
|
||||||
@@ -153,7 +153,7 @@ func (s *Server) handleListRunnersAdmin(w http.ResponseWriter, r *http.Request)
|
|||||||
var verified bool
|
var verified bool
|
||||||
|
|
||||||
err := rows.Scan(
|
err := rows.Scan(
|
||||||
&runner.ID, &runner.Name, &runner.Hostname, &runner.IPAddress,
|
&runner.ID, &runner.Name, &runner.Hostname,
|
||||||
&runner.Status, &runner.LastHeartbeat, &runner.Capabilities,
|
&runner.Status, &runner.LastHeartbeat, &runner.Capabilities,
|
||||||
&registrationToken, &verified, &runner.Priority, &runner.CreatedAt,
|
&registrationToken, &verified, &runner.Priority, &runner.CreatedAt,
|
||||||
)
|
)
|
||||||
@@ -166,7 +166,6 @@ func (s *Server) handleListRunnersAdmin(w http.ResponseWriter, r *http.Request)
|
|||||||
"id": runner.ID,
|
"id": runner.ID,
|
||||||
"name": runner.Name,
|
"name": runner.Name,
|
||||||
"hostname": runner.Hostname,
|
"hostname": runner.Hostname,
|
||||||
"ip_address": runner.IPAddress,
|
|
||||||
"status": runner.Status,
|
"status": runner.Status,
|
||||||
"last_heartbeat": runner.LastHeartbeat,
|
"last_heartbeat": runner.LastHeartbeat,
|
||||||
"capabilities": runner.Capabilities,
|
"capabilities": runner.Capabilities,
|
||||||
|
|||||||
@@ -346,7 +346,7 @@ func (s *Server) handleCreateJob(w http.ResponseWriter, r *http.Request) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Immediately try to distribute tasks to connected runners
|
// Immediately try to distribute tasks to connected runners
|
||||||
go s.distributeTasksToRunners()
|
s.triggerTaskDistribution()
|
||||||
|
|
||||||
s.respondJSON(w, http.StatusCreated, job)
|
s.respondJSON(w, http.StatusCreated, job)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -15,6 +15,7 @@ import (
|
|||||||
"sort"
|
"sort"
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
|
"sync"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
"jiggablend/pkg/types"
|
"jiggablend/pkg/types"
|
||||||
@@ -144,7 +145,7 @@ func (s *Server) handleRegisterRunner(w http.ResponseWriter, r *http.Request) {
|
|||||||
registration_token, runner_secret, manager_secret, verified, priority)
|
registration_token, runner_secret, manager_secret, verified, priority)
|
||||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
||||||
RETURNING id`,
|
RETURNING id`,
|
||||||
req.Name, req.Hostname, req.IPAddress, types.RunnerStatusOnline, time.Now(), req.Capabilities,
|
req.Name, req.Hostname, "", types.RunnerStatusOnline, time.Now(), req.Capabilities,
|
||||||
req.RegistrationToken, runnerSecret, managerSecret, true, priority,
|
req.RegistrationToken, runnerSecret, managerSecret, true, priority,
|
||||||
).Scan(&runnerID)
|
).Scan(&runnerID)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@@ -157,7 +158,6 @@ func (s *Server) handleRegisterRunner(w http.ResponseWriter, r *http.Request) {
|
|||||||
"id": runnerID,
|
"id": runnerID,
|
||||||
"name": req.Name,
|
"name": req.Name,
|
||||||
"hostname": req.Hostname,
|
"hostname": req.Hostname,
|
||||||
"ip_address": req.IPAddress,
|
|
||||||
"status": types.RunnerStatusOnline,
|
"status": types.RunnerStatusOnline,
|
||||||
"runner_secret": runnerSecret,
|
"runner_secret": runnerSecret,
|
||||||
"manager_secret": managerSecret,
|
"manager_secret": managerSecret,
|
||||||
@@ -683,14 +683,25 @@ func (s *Server) handleRunnerWebSocket(w http.ResponseWriter, r *http.Request) {
|
|||||||
defer conn.Close()
|
defer conn.Close()
|
||||||
|
|
||||||
// Register connection (must be done before any distribution checks)
|
// Register connection (must be done before any distribution checks)
|
||||||
|
// Close old connection outside lock to avoid blocking
|
||||||
|
var oldConn *websocket.Conn
|
||||||
s.runnerConnsMu.Lock()
|
s.runnerConnsMu.Lock()
|
||||||
// Remove old connection if exists
|
if existingConn, exists := s.runnerConns[runnerID]; exists {
|
||||||
if oldConn, exists := s.runnerConns[runnerID]; exists {
|
oldConn = existingConn
|
||||||
oldConn.Close()
|
|
||||||
}
|
}
|
||||||
s.runnerConns[runnerID] = conn
|
s.runnerConns[runnerID] = conn
|
||||||
s.runnerConnsMu.Unlock()
|
s.runnerConnsMu.Unlock()
|
||||||
|
|
||||||
|
// Close old connection outside lock (if it existed)
|
||||||
|
if oldConn != nil {
|
||||||
|
oldConn.Close()
|
||||||
|
}
|
||||||
|
|
||||||
|
// Create a write mutex for this connection
|
||||||
|
s.runnerConnsWriteMuMu.Lock()
|
||||||
|
s.runnerConnsWriteMu[runnerID] = &sync.Mutex{}
|
||||||
|
s.runnerConnsWriteMuMu.Unlock()
|
||||||
|
|
||||||
// Update runner status to online
|
// Update runner status to online
|
||||||
_, _ = s.db.Exec(
|
_, _ = s.db.Exec(
|
||||||
`UPDATE runners SET status = ?, last_heartbeat = ? WHERE id = ?`,
|
`UPDATE runners SET status = ?, last_heartbeat = ? WHERE id = ?`,
|
||||||
@@ -698,12 +709,8 @@ func (s *Server) handleRunnerWebSocket(w http.ResponseWriter, r *http.Request) {
|
|||||||
)
|
)
|
||||||
|
|
||||||
// Immediately try to distribute pending tasks to this newly connected runner
|
// Immediately try to distribute pending tasks to this newly connected runner
|
||||||
// Use a small delay to ensure connection registration is fully visible to other goroutines
|
|
||||||
log.Printf("Runner %d connected, distributing pending tasks", runnerID)
|
log.Printf("Runner %d connected, distributing pending tasks", runnerID)
|
||||||
go func() {
|
s.triggerTaskDistribution()
|
||||||
time.Sleep(50 * time.Millisecond) // Small delay to ensure map update is visible
|
|
||||||
s.distributeTasksToRunners()
|
|
||||||
}()
|
|
||||||
|
|
||||||
// Note: We don't log to task logs here because we don't know which tasks will be assigned yet
|
// Note: We don't log to task logs here because we don't know which tasks will be assigned yet
|
||||||
// Task assignment logging happens in distributeTasksToRunners
|
// Task assignment logging happens in distributeTasksToRunners
|
||||||
@@ -713,6 +720,9 @@ func (s *Server) handleRunnerWebSocket(w http.ResponseWriter, r *http.Request) {
|
|||||||
s.runnerConnsMu.Lock()
|
s.runnerConnsMu.Lock()
|
||||||
delete(s.runnerConns, runnerID)
|
delete(s.runnerConns, runnerID)
|
||||||
s.runnerConnsMu.Unlock()
|
s.runnerConnsMu.Unlock()
|
||||||
|
s.runnerConnsWriteMuMu.Lock()
|
||||||
|
delete(s.runnerConnsWriteMu, runnerID)
|
||||||
|
s.runnerConnsWriteMuMu.Unlock()
|
||||||
_, _ = s.db.Exec(
|
_, _ = s.db.Exec(
|
||||||
`UPDATE runners SET status = ? WHERE id = ?`,
|
`UPDATE runners SET status = ? WHERE id = ?`,
|
||||||
types.RunnerStatusOffline, runnerID,
|
types.RunnerStatusOffline, runnerID,
|
||||||
@@ -743,15 +753,28 @@ func (s *Server) handleRunnerWebSocket(w http.ResponseWriter, r *http.Request) {
|
|||||||
defer ticker.Stop()
|
defer ticker.Stop()
|
||||||
for range ticker.C {
|
for range ticker.C {
|
||||||
s.runnerConnsMu.RLock()
|
s.runnerConnsMu.RLock()
|
||||||
conn, exists := s.runnerConns[runnerID]
|
currentConn, exists := s.runnerConns[runnerID]
|
||||||
s.runnerConnsMu.RUnlock()
|
s.runnerConnsMu.RUnlock()
|
||||||
if !exists {
|
if !exists || currentConn != conn {
|
||||||
|
// Connection was replaced or removed
|
||||||
|
return
|
||||||
|
}
|
||||||
|
// Get write mutex for this connection
|
||||||
|
s.runnerConnsWriteMuMu.RLock()
|
||||||
|
writeMu, hasMu := s.runnerConnsWriteMu[runnerID]
|
||||||
|
s.runnerConnsWriteMuMu.RUnlock()
|
||||||
|
if !hasMu || writeMu == nil {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
// Send ping - runner should respond with pong automatically
|
// Send ping - runner should respond with pong automatically
|
||||||
// Reset read deadline before sending ping to ensure we can receive pong
|
// Reset read deadline before sending ping to ensure we can receive pong
|
||||||
conn.SetReadDeadline(time.Now().Add(90 * time.Second)) // Increased to 90 seconds
|
conn.SetReadDeadline(time.Now().Add(90 * time.Second)) // Increased to 90 seconds
|
||||||
if err := conn.WriteControl(websocket.PingMessage, []byte{}, time.Now().Add(10*time.Second)); err != nil {
|
writeMu.Lock()
|
||||||
|
err := conn.WriteControl(websocket.PingMessage, []byte{}, time.Now().Add(10*time.Second))
|
||||||
|
writeMu.Unlock()
|
||||||
|
if err != nil {
|
||||||
|
// Write failed - connection is likely dead, read loop will detect and cleanup
|
||||||
|
log.Printf("Failed to send ping to runner %d: %v", runnerID, err)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -1191,7 +1214,7 @@ func (s *Server) updateJobStatusFromTasks(jobID int64) {
|
|||||||
// Update job status to ensure it's marked as running (has pending video task)
|
// Update job status to ensure it's marked as running (has pending video task)
|
||||||
s.updateJobStatusFromTasks(jobID)
|
s.updateJobStatusFromTasks(jobID)
|
||||||
// Try to distribute the task immediately
|
// Try to distribute the task immediately
|
||||||
go s.distributeTasksToRunners()
|
s.triggerTaskDistribution()
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
log.Printf("Skipping video generation task creation for job %d (video task already exists)", jobID)
|
log.Printf("Skipping video generation task creation for job %d (video task already exists)", jobID)
|
||||||
@@ -1284,7 +1307,20 @@ func (s *Server) broadcastLogToFrontend(taskID int64, logEntry WSLogEntry) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// triggerTaskDistribution triggers task distribution in a serialized manner
|
||||||
|
func (s *Server) triggerTaskDistribution() {
|
||||||
|
go func() {
|
||||||
|
// Try to acquire lock - if already running, skip
|
||||||
|
if !s.taskDistMu.TryLock() {
|
||||||
|
return // Distribution already in progress
|
||||||
|
}
|
||||||
|
defer s.taskDistMu.Unlock()
|
||||||
|
s.distributeTasksToRunners()
|
||||||
|
}()
|
||||||
|
}
|
||||||
|
|
||||||
// distributeTasksToRunners pushes available tasks to connected runners
|
// distributeTasksToRunners pushes available tasks to connected runners
|
||||||
|
// This function should only be called while holding taskDistMu lock
|
||||||
func (s *Server) distributeTasksToRunners() {
|
func (s *Server) distributeTasksToRunners() {
|
||||||
// Quick check: if there are no pending tasks, skip the expensive query
|
// Quick check: if there are no pending tasks, skip the expensive query
|
||||||
var pendingCount int
|
var pendingCount int
|
||||||
@@ -1677,21 +1713,25 @@ func (s *Server) distributeTasksToRunners() {
|
|||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
err = tx.Commit()
|
|
||||||
if err != nil {
|
|
||||||
log.Printf("Failed to commit transaction for task %d: %v", task.TaskID, err)
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
|
|
||||||
// Check if the update actually affected a row (task was successfully assigned)
|
// Check if the update actually affected a row (task was successfully assigned)
|
||||||
rowsAffected, err := result.RowsAffected()
|
rowsAffected, err := result.RowsAffected()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
tx.Rollback()
|
||||||
log.Printf("Failed to get rows affected for task %d: %v", task.TaskID, err)
|
log.Printf("Failed to get rows affected for task %d: %v", task.TaskID, err)
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
if rowsAffected == 0 {
|
if rowsAffected == 0 {
|
||||||
// Task was already assigned by another goroutine, skip
|
// Task was already assigned by another goroutine, skip
|
||||||
|
tx.Rollback()
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
// Commit the assignment before attempting WebSocket send
|
||||||
|
// If send fails, we'll rollback in a separate transaction
|
||||||
|
err = tx.Commit()
|
||||||
|
if err != nil {
|
||||||
|
log.Printf("Failed to commit transaction for task %d: %v", task.TaskID, err)
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1702,27 +1742,41 @@ func (s *Server) distributeTasksToRunners() {
|
|||||||
"started_at": now,
|
"started_at": now,
|
||||||
})
|
})
|
||||||
|
|
||||||
// Task was successfully assigned, send via WebSocket
|
// Task was successfully assigned in database, now send via WebSocket
|
||||||
log.Printf("Assigned task %d (type: %s, job: %d) to runner %d", task.TaskID, task.TaskType, task.JobID, selectedRunnerID)
|
log.Printf("Assigned task %d (type: %s, job: %d) to runner %d", task.TaskID, task.TaskType, task.JobID, selectedRunnerID)
|
||||||
|
|
||||||
// Update job status to running if this is the first task starting
|
|
||||||
s.updateJobStatusFromTasks(task.JobID)
|
|
||||||
|
|
||||||
// Log runner assignment to task logs
|
// Log runner assignment to task logs
|
||||||
s.logTaskEvent(task.TaskID, nil, types.LogLevelInfo, fmt.Sprintf("Task assigned to runner %d", selectedRunnerID), "")
|
s.logTaskEvent(task.TaskID, nil, types.LogLevelInfo, fmt.Sprintf("Task assigned to runner %d", selectedRunnerID), "")
|
||||||
|
|
||||||
|
// Attempt to send task to runner via WebSocket
|
||||||
if err := s.assignTaskToRunner(selectedRunnerID, task.TaskID); err != nil {
|
if err := s.assignTaskToRunner(selectedRunnerID, task.TaskID); err != nil {
|
||||||
log.Printf("Failed to send task %d to runner %d: %v", task.TaskID, selectedRunnerID, err)
|
log.Printf("Failed to send task %d to runner %d: %v", task.TaskID, selectedRunnerID, err)
|
||||||
// Log assignment failure
|
// Log assignment failure
|
||||||
s.logTaskEvent(task.TaskID, nil, types.LogLevelError, fmt.Sprintf("Failed to send task to runner %d: %v", selectedRunnerID, err), "")
|
s.logTaskEvent(task.TaskID, nil, types.LogLevelError, fmt.Sprintf("Failed to send task to runner %d: %v", selectedRunnerID, err), "")
|
||||||
// Rollback the assignment if WebSocket send fails
|
// Rollback the assignment if WebSocket send fails using a new transaction
|
||||||
s.db.Exec(
|
rollbackTx, rollbackErr := s.db.Begin()
|
||||||
`UPDATE tasks SET runner_id = NULL, status = ?, started_at = NULL
|
if rollbackErr == nil {
|
||||||
WHERE id = ?`,
|
_, rollbackErr = rollbackTx.Exec(
|
||||||
types.TaskStatusPending, task.TaskID,
|
`UPDATE tasks SET runner_id = NULL, status = ?, started_at = NULL
|
||||||
)
|
WHERE id = ? AND runner_id = ?`,
|
||||||
// Log rollback
|
types.TaskStatusPending, task.TaskID, selectedRunnerID,
|
||||||
s.logTaskEvent(task.TaskID, nil, types.LogLevelWarn, fmt.Sprintf("Task assignment rolled back - runner %d connection failed", selectedRunnerID), "")
|
)
|
||||||
|
if rollbackErr == nil {
|
||||||
|
rollbackTx.Commit()
|
||||||
|
// Log rollback
|
||||||
|
s.logTaskEvent(task.TaskID, nil, types.LogLevelWarn, fmt.Sprintf("Task assignment rolled back - runner %d connection failed", selectedRunnerID), "")
|
||||||
|
// Update job status after rollback
|
||||||
|
s.updateJobStatusFromTasks(task.JobID)
|
||||||
|
// Trigger redistribution
|
||||||
|
s.triggerTaskDistribution()
|
||||||
|
} else {
|
||||||
|
rollbackTx.Rollback()
|
||||||
|
log.Printf("Failed to rollback task %d assignment: %v", task.TaskID, rollbackErr)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// WebSocket send succeeded, update job status
|
||||||
|
s.updateJobStatusFromTasks(task.JobID)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -1805,13 +1859,34 @@ func (s *Server) assignTaskToRunner(runnerID int64, taskID int64) error {
|
|||||||
return fmt.Errorf("task %d is not assigned to runner %d", taskID, runnerID)
|
return fmt.Errorf("task %d is not assigned to runner %d", taskID, runnerID)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Send task via WebSocket
|
// Send task via WebSocket with write mutex protection
|
||||||
msg := WSMessage{
|
msg := WSMessage{
|
||||||
Type: "task_assignment",
|
Type: "task_assignment",
|
||||||
Timestamp: time.Now().Unix(),
|
Timestamp: time.Now().Unix(),
|
||||||
}
|
}
|
||||||
msg.Data, _ = json.Marshal(task)
|
msg.Data, _ = json.Marshal(task)
|
||||||
return conn.WriteJSON(msg)
|
|
||||||
|
// Get write mutex for this connection
|
||||||
|
s.runnerConnsWriteMuMu.RLock()
|
||||||
|
writeMu, hasMu := s.runnerConnsWriteMu[runnerID]
|
||||||
|
s.runnerConnsWriteMuMu.RUnlock()
|
||||||
|
|
||||||
|
if !hasMu || writeMu == nil {
|
||||||
|
return fmt.Errorf("runner %d write mutex not found", runnerID)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Re-check connection is still valid before writing
|
||||||
|
s.runnerConnsMu.RLock()
|
||||||
|
_, stillExists := s.runnerConns[runnerID]
|
||||||
|
s.runnerConnsMu.RUnlock()
|
||||||
|
if !stillExists {
|
||||||
|
return fmt.Errorf("runner %d disconnected", runnerID)
|
||||||
|
}
|
||||||
|
|
||||||
|
writeMu.Lock()
|
||||||
|
err = conn.WriteJSON(msg)
|
||||||
|
writeMu.Unlock()
|
||||||
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
// redistributeRunnerTasks resets tasks assigned to a disconnected/dead runner and redistributes them
|
// redistributeRunnerTasks resets tasks assigned to a disconnected/dead runner and redistributes them
|
||||||
@@ -1883,7 +1958,7 @@ func (s *Server) redistributeRunnerTasks(runnerID int64) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Immediately redistribute the reset tasks
|
// Immediately redistribute the reset tasks
|
||||||
go s.distributeTasksToRunners()
|
s.triggerTaskDistribution()
|
||||||
}
|
}
|
||||||
|
|
||||||
// logTaskEvent logs an event to a task's log (manager-side logging)
|
// logTaskEvent logs an event to a task's log (manager-side logging)
|
||||||
|
|||||||
@@ -38,6 +38,9 @@ type Server struct {
|
|||||||
wsUpgrader websocket.Upgrader
|
wsUpgrader websocket.Upgrader
|
||||||
runnerConns map[int64]*websocket.Conn
|
runnerConns map[int64]*websocket.Conn
|
||||||
runnerConnsMu sync.RWMutex
|
runnerConnsMu sync.RWMutex
|
||||||
|
// Mutexes for each runner connection to serialize writes
|
||||||
|
runnerConnsWriteMu map[int64]*sync.Mutex
|
||||||
|
runnerConnsWriteMuMu sync.RWMutex
|
||||||
frontendConns map[string]*websocket.Conn // key: "jobId:taskId"
|
frontendConns map[string]*websocket.Conn // key: "jobId:taskId"
|
||||||
frontendConnsMu sync.RWMutex
|
frontendConnsMu sync.RWMutex
|
||||||
// Mutexes for each frontend connection to serialize writes
|
// Mutexes for each frontend connection to serialize writes
|
||||||
@@ -55,6 +58,8 @@ type Server struct {
|
|||||||
// Throttling for progress updates (per job)
|
// Throttling for progress updates (per job)
|
||||||
progressUpdateTimes map[int64]time.Time // key: jobID
|
progressUpdateTimes map[int64]time.Time // key: jobID
|
||||||
progressUpdateTimesMu sync.RWMutex
|
progressUpdateTimesMu sync.RWMutex
|
||||||
|
// Task distribution serialization
|
||||||
|
taskDistMu sync.Mutex // Mutex to prevent concurrent distribution
|
||||||
}
|
}
|
||||||
|
|
||||||
// NewServer creates a new API server
|
// NewServer creates a new API server
|
||||||
@@ -78,6 +83,7 @@ func NewServer(db *database.DB, auth *authpkg.Auth, storage *storage.Storage) (*
|
|||||||
WriteBufferSize: 1024,
|
WriteBufferSize: 1024,
|
||||||
},
|
},
|
||||||
runnerConns: make(map[int64]*websocket.Conn),
|
runnerConns: make(map[int64]*websocket.Conn),
|
||||||
|
runnerConnsWriteMu: make(map[int64]*sync.Mutex),
|
||||||
frontendConns: make(map[string]*websocket.Conn),
|
frontendConns: make(map[string]*websocket.Conn),
|
||||||
frontendConnsWriteMu: make(map[string]*sync.Mutex),
|
frontendConnsWriteMu: make(map[string]*sync.Mutex),
|
||||||
jobListConns: make(map[int64]*websocket.Conn),
|
jobListConns: make(map[int64]*websocket.Conn),
|
||||||
@@ -611,7 +617,7 @@ func (s *Server) recoverStuckTasks() {
|
|||||||
|
|
||||||
go func() {
|
go func() {
|
||||||
for range distributeTicker.C {
|
for range distributeTicker.C {
|
||||||
s.distributeTasksToRunners()
|
s.triggerTaskDistribution()
|
||||||
}
|
}
|
||||||
}()
|
}()
|
||||||
|
|
||||||
@@ -675,7 +681,7 @@ func (s *Server) recoverStuckTasks() {
|
|||||||
s.recoverTaskTimeouts()
|
s.recoverTaskTimeouts()
|
||||||
|
|
||||||
// Distribute newly recovered tasks
|
// Distribute newly recovered tasks
|
||||||
s.distributeTasksToRunners()
|
s.triggerTaskDistribution()
|
||||||
}()
|
}()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,10 +1,10 @@
|
|||||||
package runner
|
package runner
|
||||||
|
|
||||||
import (
|
import (
|
||||||
_ "embed"
|
|
||||||
"archive/tar"
|
"archive/tar"
|
||||||
"bufio"
|
"bufio"
|
||||||
"bytes"
|
"bytes"
|
||||||
|
_ "embed"
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"errors"
|
"errors"
|
||||||
"fmt"
|
"fmt"
|
||||||
@@ -33,7 +33,6 @@ type Client struct {
|
|||||||
managerURL string
|
managerURL string
|
||||||
name string
|
name string
|
||||||
hostname string
|
hostname string
|
||||||
ipAddress string
|
|
||||||
httpClient *http.Client
|
httpClient *http.Client
|
||||||
runnerID int64
|
runnerID int64
|
||||||
runnerSecret string
|
runnerSecret string
|
||||||
@@ -58,12 +57,11 @@ type Client struct {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// NewClient creates a new runner client
|
// NewClient creates a new runner client
|
||||||
func NewClient(managerURL, name, hostname, ipAddress string) *Client {
|
func NewClient(managerURL, name, hostname string) *Client {
|
||||||
return &Client{
|
return &Client{
|
||||||
managerURL: managerURL,
|
managerURL: managerURL,
|
||||||
name: name,
|
name: name,
|
||||||
hostname: hostname,
|
hostname: hostname,
|
||||||
ipAddress: ipAddress,
|
|
||||||
httpClient: &http.Client{Timeout: 30 * time.Second},
|
httpClient: &http.Client{Timeout: 30 * time.Second},
|
||||||
longRunningClient: &http.Client{Timeout: 0}, // No timeout for long-running operations (context downloads, file uploads/downloads)
|
longRunningClient: &http.Client{Timeout: 0}, // No timeout for long-running operations (context downloads, file uploads/downloads)
|
||||||
stopChan: make(chan struct{}),
|
stopChan: make(chan struct{}),
|
||||||
@@ -412,7 +410,6 @@ func (c *Client) Register(registrationToken string) (int64, string, string, erro
|
|||||||
req := map[string]interface{}{
|
req := map[string]interface{}{
|
||||||
"name": c.name,
|
"name": c.name,
|
||||||
"hostname": c.hostname,
|
"hostname": c.hostname,
|
||||||
"ip_address": c.ipAddress,
|
|
||||||
"capabilities": string(capabilitiesJSON),
|
"capabilities": string(capabilitiesJSON),
|
||||||
"registration_token": registrationToken,
|
"registration_token": registrationToken,
|
||||||
}
|
}
|
||||||
@@ -983,8 +980,8 @@ func (c *Client) processTask(task map[string]interface{}, jobName string, output
|
|||||||
// Clean up expired cache entries periodically
|
// Clean up expired cache entries periodically
|
||||||
c.cleanupExpiredContextCache()
|
c.cleanupExpiredContextCache()
|
||||||
|
|
||||||
// Download context tar
|
// Download context tar
|
||||||
contextPath := filepath.Join(workDir, "context.tar")
|
contextPath := filepath.Join(workDir, "context.tar")
|
||||||
if err := c.downloadJobContext(jobID, contextPath); err != nil {
|
if err := c.downloadJobContext(jobID, contextPath); err != nil {
|
||||||
c.sendStepUpdate(taskID, "download", types.StepStatusFailed, err.Error())
|
c.sendStepUpdate(taskID, "download", types.StepStatusFailed, err.Error())
|
||||||
return fmt.Errorf("failed to download context: %w", err)
|
return fmt.Errorf("failed to download context: %w", err)
|
||||||
@@ -3016,8 +3013,8 @@ func (c *Client) processMetadataTask(task map[string]interface{}, jobID int64, i
|
|||||||
c.sendStepUpdate(taskID, "download", types.StepStatusRunning, "")
|
c.sendStepUpdate(taskID, "download", types.StepStatusRunning, "")
|
||||||
c.sendLog(taskID, types.LogLevelInfo, "Downloading job context...", "download")
|
c.sendLog(taskID, types.LogLevelInfo, "Downloading job context...", "download")
|
||||||
|
|
||||||
// Download context tar
|
// Download context tar
|
||||||
contextPath := filepath.Join(workDir, "context.tar")
|
contextPath := filepath.Join(workDir, "context.tar")
|
||||||
if err := c.downloadJobContext(jobID, contextPath); err != nil {
|
if err := c.downloadJobContext(jobID, contextPath); err != nil {
|
||||||
c.sendStepUpdate(taskID, "download", types.StepStatusFailed, err.Error())
|
c.sendStepUpdate(taskID, "download", types.StepStatusFailed, err.Error())
|
||||||
return fmt.Errorf("failed to download context: %w", err)
|
return fmt.Errorf("failed to download context: %w", err)
|
||||||
|
|||||||
@@ -153,7 +153,7 @@ type UpdateJobProgressRequest struct {
|
|||||||
type RegisterRunnerRequest struct {
|
type RegisterRunnerRequest struct {
|
||||||
Name string `json:"name"`
|
Name string `json:"name"`
|
||||||
Hostname string `json:"hostname"`
|
Hostname string `json:"hostname"`
|
||||||
IPAddress string `json:"ip_address"`
|
IPAddress string `json:"ip_address,omitempty"` // Optional, extracted from request by manager
|
||||||
Capabilities string `json:"capabilities"`
|
Capabilities string `json:"capabilities"`
|
||||||
Priority *int `json:"priority,omitempty"` // Optional, defaults to 100 if not provided
|
Priority *int `json:"priority,omitempty"` // Optional, defaults to 100 if not provided
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -241,7 +241,6 @@ function displayRunners(runners) {
|
|||||||
<h3>${escapeHtml(runner.name)}</h3>
|
<h3>${escapeHtml(runner.name)}</h3>
|
||||||
<div class="runner-info">
|
<div class="runner-info">
|
||||||
<span>Hostname: ${escapeHtml(runner.hostname)}</span>
|
<span>Hostname: ${escapeHtml(runner.hostname)}</span>
|
||||||
<span>IP: ${escapeHtml(runner.ip_address)}</span>
|
|
||||||
<span>Last heartbeat: ${lastHeartbeat.toLocaleString()}</span>
|
<span>Last heartbeat: ${lastHeartbeat.toLocaleString()}</span>
|
||||||
</div>
|
</div>
|
||||||
<div class="runner-status ${isOnline ? 'online' : 'offline'}">
|
<div class="runner-status ${isOnline ? 'online' : 'offline'}">
|
||||||
|
|||||||
@@ -369,9 +369,6 @@ export default function AdminPanel() {
|
|||||||
<th className="px-6 py-3 text-left text-xs font-medium text-gray-400 uppercase tracking-wider">
|
<th className="px-6 py-3 text-left text-xs font-medium text-gray-400 uppercase tracking-wider">
|
||||||
Hostname
|
Hostname
|
||||||
</th>
|
</th>
|
||||||
<th className="px-6 py-3 text-left text-xs font-medium text-gray-400 uppercase tracking-wider">
|
|
||||||
IP Address
|
|
||||||
</th>
|
|
||||||
<th className="px-6 py-3 text-left text-xs font-medium text-gray-400 uppercase tracking-wider">
|
<th className="px-6 py-3 text-left text-xs font-medium text-gray-400 uppercase tracking-wider">
|
||||||
Status
|
Status
|
||||||
</th>
|
</th>
|
||||||
@@ -403,9 +400,6 @@ export default function AdminPanel() {
|
|||||||
<td className="px-6 py-4 whitespace-nowrap text-sm text-gray-400">
|
<td className="px-6 py-4 whitespace-nowrap text-sm text-gray-400">
|
||||||
{runner.hostname}
|
{runner.hostname}
|
||||||
</td>
|
</td>
|
||||||
<td className="px-6 py-4 whitespace-nowrap text-sm text-gray-400">
|
|
||||||
{runner.ip_address}
|
|
||||||
</td>
|
|
||||||
<td className="px-6 py-4 whitespace-nowrap">
|
<td className="px-6 py-4 whitespace-nowrap">
|
||||||
<span
|
<span
|
||||||
className={`px-2 py-1 text-xs font-medium rounded-full ${
|
className={`px-2 py-1 text-xs font-medium rounded-full ${
|
||||||
|
|||||||
Reference in New Issue
Block a user