redo
This commit is contained in:
@@ -11,12 +11,17 @@ import (
|
||||
"log"
|
||||
"mime/multipart"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/gorilla/websocket"
|
||||
"fuego/pkg/types"
|
||||
)
|
||||
|
||||
// Client represents a runner client
|
||||
@@ -29,6 +34,9 @@ type Client struct {
|
||||
runnerID int64
|
||||
runnerSecret string
|
||||
managerSecret string
|
||||
wsConn *websocket.Conn
|
||||
wsConnMu sync.Mutex
|
||||
stopChan chan struct{}
|
||||
}
|
||||
|
||||
// NewClient creates a new runner client
|
||||
@@ -39,6 +47,7 @@ func NewClient(managerURL, name, hostname, ipAddress string) *Client {
|
||||
hostname: hostname,
|
||||
ipAddress: ipAddress,
|
||||
httpClient: &http.Client{Timeout: 30 * time.Second},
|
||||
stopChan: make(chan struct{}),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -121,81 +130,219 @@ func (c *Client) doSignedRequest(method, path string, body []byte) (*http.Respon
|
||||
return c.httpClient.Do(req)
|
||||
}
|
||||
|
||||
// HeartbeatLoop sends periodic heartbeats to the manager
|
||||
// ConnectWebSocket establishes a WebSocket connection to the manager
|
||||
func (c *Client) ConnectWebSocket() error {
|
||||
if c.runnerID == 0 || c.runnerSecret == "" {
|
||||
return fmt.Errorf("runner not authenticated")
|
||||
}
|
||||
|
||||
// Build WebSocket URL with authentication
|
||||
timestamp := time.Now().Unix()
|
||||
path := "/api/runner/ws"
|
||||
// Sign the request
|
||||
message := fmt.Sprintf("GET\n%s\n\n%d", path, timestamp)
|
||||
h := hmac.New(sha256.New, []byte(c.runnerSecret))
|
||||
h.Write([]byte(message))
|
||||
signature := hex.EncodeToString(h.Sum(nil))
|
||||
|
||||
// Convert HTTP URL to WebSocket URL
|
||||
wsURL := strings.Replace(c.managerURL, "http://", "ws://", 1)
|
||||
wsURL = strings.Replace(wsURL, "https://", "wss://", 1)
|
||||
wsURL = fmt.Sprintf("%s%s?runner_id=%d&signature=%s×tamp=%d",
|
||||
wsURL, path, c.runnerID, signature, timestamp)
|
||||
|
||||
// Parse URL
|
||||
u, err := url.Parse(wsURL)
|
||||
if err != nil {
|
||||
return fmt.Errorf("invalid WebSocket URL: %w", err)
|
||||
}
|
||||
|
||||
// Connect
|
||||
dialer := websocket.Dialer{
|
||||
HandshakeTimeout: 10 * time.Second,
|
||||
}
|
||||
conn, _, err := dialer.Dial(u.String(), nil)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to connect WebSocket: %w", err)
|
||||
}
|
||||
|
||||
c.wsConnMu.Lock()
|
||||
if c.wsConn != nil {
|
||||
c.wsConn.Close()
|
||||
}
|
||||
c.wsConn = conn
|
||||
c.wsConnMu.Unlock()
|
||||
|
||||
log.Printf("WebSocket connected to manager")
|
||||
return nil
|
||||
}
|
||||
|
||||
// ConnectWebSocketWithReconnect connects with automatic reconnection
|
||||
func (c *Client) ConnectWebSocketWithReconnect() {
|
||||
backoff := 1 * time.Second
|
||||
maxBackoff := 60 * time.Second
|
||||
|
||||
for {
|
||||
err := c.ConnectWebSocket()
|
||||
if err == nil {
|
||||
backoff = 1 * time.Second // Reset on success
|
||||
c.HandleWebSocketMessages()
|
||||
} else {
|
||||
log.Printf("WebSocket connection failed: %v, retrying in %v", err, backoff)
|
||||
time.Sleep(backoff)
|
||||
backoff *= 2
|
||||
if backoff > maxBackoff {
|
||||
backoff = maxBackoff
|
||||
}
|
||||
}
|
||||
|
||||
// Check if we should stop
|
||||
select {
|
||||
case <-c.stopChan:
|
||||
return
|
||||
default:
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// HandleWebSocketMessages handles incoming WebSocket messages
|
||||
func (c *Client) HandleWebSocketMessages() {
|
||||
c.wsConnMu.Lock()
|
||||
conn := c.wsConn
|
||||
c.wsConnMu.Unlock()
|
||||
|
||||
if conn == nil {
|
||||
return
|
||||
}
|
||||
|
||||
// Set pong handler
|
||||
conn.SetPongHandler(func(string) error {
|
||||
return nil
|
||||
})
|
||||
|
||||
// Handle messages
|
||||
for {
|
||||
var msg map[string]interface{}
|
||||
err := conn.ReadJSON(&msg)
|
||||
if err != nil {
|
||||
if websocket.IsUnexpectedCloseError(err, websocket.CloseGoingAway, websocket.CloseAbnormalClosure) {
|
||||
log.Printf("WebSocket error: %v", err)
|
||||
}
|
||||
c.wsConnMu.Lock()
|
||||
c.wsConn = nil
|
||||
c.wsConnMu.Unlock()
|
||||
return
|
||||
}
|
||||
|
||||
msgType, _ := msg["type"].(string)
|
||||
switch msgType {
|
||||
case "task_assignment":
|
||||
c.handleTaskAssignment(msg)
|
||||
case "ping":
|
||||
// Respond to ping with pong (automatic)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// handleTaskAssignment handles a task assignment message
|
||||
func (c *Client) handleTaskAssignment(msg map[string]interface{}) {
|
||||
data, ok := msg["data"].(map[string]interface{})
|
||||
if !ok {
|
||||
log.Printf("Invalid task assignment message")
|
||||
return
|
||||
}
|
||||
|
||||
taskID, _ := data["task_id"].(float64)
|
||||
jobID, _ := data["job_id"].(float64)
|
||||
jobName, _ := data["job_name"].(string)
|
||||
outputFormat, _ := data["output_format"].(string)
|
||||
frameStart, _ := data["frame_start"].(float64)
|
||||
frameEnd, _ := data["frame_end"].(float64)
|
||||
inputFilesRaw, _ := data["input_files"].([]interface{})
|
||||
|
||||
if len(inputFilesRaw) == 0 {
|
||||
log.Printf("No input files for task %v", taskID)
|
||||
c.sendTaskComplete(int64(taskID), "", false, "No input files")
|
||||
return
|
||||
}
|
||||
|
||||
// Convert to task map format
|
||||
taskMap := map[string]interface{}{
|
||||
"id": taskID,
|
||||
"job_id": jobID,
|
||||
"frame_start": frameStart,
|
||||
"frame_end": frameEnd,
|
||||
}
|
||||
|
||||
// Process the task
|
||||
go func() {
|
||||
if err := c.processTask(taskMap, jobName, outputFormat, inputFilesRaw); err != nil {
|
||||
log.Printf("Failed to process task %v: %v", taskID, err)
|
||||
c.sendTaskComplete(int64(taskID), "", false, err.Error())
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
// HeartbeatLoop sends periodic heartbeats via WebSocket
|
||||
func (c *Client) HeartbeatLoop() {
|
||||
ticker := time.NewTicker(30 * time.Second)
|
||||
defer ticker.Stop()
|
||||
|
||||
for range ticker.C {
|
||||
req := map[string]interface{}{}
|
||||
body, _ := json.Marshal(req)
|
||||
c.wsConnMu.RLock()
|
||||
conn := c.wsConn
|
||||
c.wsConnMu.RUnlock()
|
||||
|
||||
resp, err := c.doSignedRequest("POST", "/api/runner/heartbeat?runner_id="+fmt.Sprintf("%d", c.runnerID), body)
|
||||
if err != nil {
|
||||
log.Printf("Heartbeat failed: %v", err)
|
||||
continue
|
||||
}
|
||||
resp.Body.Close()
|
||||
}
|
||||
}
|
||||
|
||||
// ProcessTasks polls for tasks and processes them
|
||||
func (c *Client) ProcessTasks() {
|
||||
ticker := time.NewTicker(5 * time.Second)
|
||||
defer ticker.Stop()
|
||||
|
||||
for range ticker.C {
|
||||
tasks, err := c.getTasks()
|
||||
if err != nil {
|
||||
log.Printf("Failed to get tasks: %v", err)
|
||||
continue
|
||||
}
|
||||
|
||||
for _, taskData := range tasks {
|
||||
taskMap, ok := taskData["task"].(map[string]interface{})
|
||||
if !ok {
|
||||
continue
|
||||
if conn != nil {
|
||||
// Send heartbeat via WebSocket
|
||||
msg := map[string]interface{}{
|
||||
"type": "heartbeat",
|
||||
"timestamp": time.Now().Unix(),
|
||||
}
|
||||
|
||||
jobName, _ := taskData["job_name"].(string)
|
||||
outputFormat, _ := taskData["output_format"].(string)
|
||||
inputFilesRaw, _ := taskData["input_files"].([]interface{})
|
||||
|
||||
if len(inputFilesRaw) == 0 {
|
||||
log.Printf("No input files for task %v", taskMap["id"])
|
||||
continue
|
||||
}
|
||||
|
||||
// Process the task
|
||||
if err := c.processTask(taskMap, jobName, outputFormat, inputFilesRaw); err != nil {
|
||||
taskID, _ := taskMap["id"].(float64)
|
||||
log.Printf("Failed to process task %v: %v", taskID, err)
|
||||
c.completeTask(int64(taskID), "", false, err.Error())
|
||||
if err := conn.WriteJSON(msg); err != nil {
|
||||
log.Printf("Failed to send heartbeat: %v", err)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// getTasks fetches tasks from the manager
|
||||
func (c *Client) getTasks() ([]map[string]interface{}, error) {
|
||||
path := fmt.Sprintf("/api/runner/tasks?runner_id=%d", c.runnerID)
|
||||
resp, err := c.doSignedRequest("GET", path, nil)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
// sendLog sends a log entry to the manager via WebSocket
|
||||
func (c *Client) sendLog(taskID int64, logLevel types.LogLevel, message, stepName string) {
|
||||
c.wsConnMu.RLock()
|
||||
conn := c.wsConn
|
||||
c.wsConnMu.RUnlock()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
body, _ := io.ReadAll(resp.Body)
|
||||
return nil, fmt.Errorf("failed to get tasks: %s", string(body))
|
||||
if conn != nil {
|
||||
msg := map[string]interface{}{
|
||||
"type": "log_entry",
|
||||
"data": map[string]interface{}{
|
||||
"task_id": taskID,
|
||||
"log_level": string(logLevel),
|
||||
"message": message,
|
||||
"step_name": stepName,
|
||||
},
|
||||
"timestamp": time.Now().Unix(),
|
||||
}
|
||||
if err := conn.WriteJSON(msg); err != nil {
|
||||
log.Printf("Failed to send log: %v", err)
|
||||
}
|
||||
} else {
|
||||
log.Printf("WebSocket not connected, cannot send log")
|
||||
}
|
||||
}
|
||||
|
||||
var tasks []map[string]interface{}
|
||||
if err := json.NewDecoder(resp.Body).Decode(&tasks); err != nil {
|
||||
return nil, err
|
||||
// sendStepUpdate sends a step start/complete event to the manager
|
||||
func (c *Client) sendStepUpdate(taskID int64, stepName string, status types.StepStatus, errorMsg string) {
|
||||
// This would ideally be a separate endpoint, but for now we'll use logs
|
||||
msg := fmt.Sprintf("Step %s: %s", stepName, status)
|
||||
if errorMsg != "" {
|
||||
msg += " - " + errorMsg
|
||||
}
|
||||
|
||||
return tasks, nil
|
||||
logLevel := types.LogLevelInfo
|
||||
if status == types.StepStatusFailed {
|
||||
logLevel = types.LogLevelError
|
||||
}
|
||||
c.sendLog(taskID, logLevel, msg, stepName)
|
||||
}
|
||||
|
||||
// processTask processes a single task
|
||||
@@ -205,6 +352,7 @@ func (c *Client) processTask(task map[string]interface{}, jobName, outputFormat
|
||||
frameStart := int(task["frame_start"].(float64))
|
||||
frameEnd := int(task["frame_end"].(float64))
|
||||
|
||||
c.sendLog(taskID, types.LogLevelInfo, fmt.Sprintf("Starting task: job %d, frames %d-%d, format: %s", jobID, frameStart, frameEnd, outputFormat), "")
|
||||
log.Printf("Processing task %d: job %d, frames %d-%d, format: %s", taskID, jobID, frameStart, frameEnd, outputFormat)
|
||||
|
||||
// Create work directory
|
||||
@@ -214,11 +362,14 @@ func (c *Client) processTask(task map[string]interface{}, jobName, outputFormat
|
||||
}
|
||||
defer os.RemoveAll(workDir)
|
||||
|
||||
// Download input files
|
||||
// Step: download
|
||||
c.sendStepUpdate(taskID, "download", types.StepStatusRunning, "")
|
||||
c.sendLog(taskID, types.LogLevelInfo, "Downloading input files...", "download")
|
||||
blendFile := ""
|
||||
for _, filePath := range inputFiles {
|
||||
filePathStr := filePath.(string)
|
||||
if err := c.downloadFile(filePathStr, workDir); err != nil {
|
||||
c.sendStepUpdate(taskID, "download", types.StepStatusFailed, err.Error())
|
||||
return fmt.Errorf("failed to download file %s: %w", filePathStr, err)
|
||||
}
|
||||
if filepath.Ext(filePathStr) == ".blend" {
|
||||
@@ -227,8 +378,12 @@ func (c *Client) processTask(task map[string]interface{}, jobName, outputFormat
|
||||
}
|
||||
|
||||
if blendFile == "" {
|
||||
return fmt.Errorf("no .blend file found in input files")
|
||||
err := fmt.Errorf("no .blend file found in input files")
|
||||
c.sendStepUpdate(taskID, "download", types.StepStatusFailed, err.Error())
|
||||
return err
|
||||
}
|
||||
c.sendStepUpdate(taskID, "download", types.StepStatusCompleted, "")
|
||||
c.sendLog(taskID, types.LogLevelInfo, "Input files downloaded successfully", "download")
|
||||
|
||||
// Render frames
|
||||
outputDir := filepath.Join(workDir, "output")
|
||||
@@ -244,30 +399,60 @@ func (c *Client) processTask(task map[string]interface{}, jobName, outputFormat
|
||||
|
||||
outputPattern := filepath.Join(outputDir, fmt.Sprintf("frame_%%04d.%s", strings.ToLower(renderFormat)))
|
||||
|
||||
// Step: render_blender
|
||||
c.sendStepUpdate(taskID, "render_blender", types.StepStatusRunning, "")
|
||||
c.sendLog(taskID, types.LogLevelInfo, fmt.Sprintf("Starting Blender render for frame %d...", frameStart), "render_blender")
|
||||
|
||||
// Execute Blender
|
||||
cmd := exec.Command("blender", "-b", blendFile, "-o", outputPattern, "-f", fmt.Sprintf("%d", frameStart))
|
||||
cmd.Dir = workDir
|
||||
output, err := cmd.CombinedOutput()
|
||||
if err != nil {
|
||||
return fmt.Errorf("blender failed: %w\nOutput: %s", err, string(output))
|
||||
errMsg := fmt.Sprintf("blender failed: %w\nOutput: %s", err, string(output))
|
||||
c.sendLog(taskID, types.LogLevelError, errMsg, "render_blender")
|
||||
c.sendStepUpdate(taskID, "render_blender", types.StepStatusFailed, errMsg)
|
||||
return fmt.Errorf(errMsg)
|
||||
}
|
||||
|
||||
// Find rendered output file
|
||||
outputFile := filepath.Join(outputDir, fmt.Sprintf("frame_%04d.%s", frameStart, strings.ToLower(renderFormat)))
|
||||
if _, err := os.Stat(outputFile); os.IsNotExist(err) {
|
||||
return fmt.Errorf("output file not found: %s", outputFile)
|
||||
errMsg := fmt.Sprintf("output file not found: %s", outputFile)
|
||||
c.sendLog(taskID, types.LogLevelError, errMsg, "render_blender")
|
||||
c.sendStepUpdate(taskID, "render_blender", types.StepStatusFailed, errMsg)
|
||||
return fmt.Errorf(errMsg)
|
||||
}
|
||||
c.sendLog(taskID, types.LogLevelInfo, fmt.Sprintf("Blender render completed for frame %d", frameStart), "render_blender")
|
||||
c.sendStepUpdate(taskID, "render_blender", types.StepStatusCompleted, "")
|
||||
|
||||
// Upload frame file
|
||||
// Step: upload or upload_frames
|
||||
uploadStepName := "upload"
|
||||
if outputFormat == "MP4" {
|
||||
uploadStepName = "upload_frames"
|
||||
}
|
||||
c.sendStepUpdate(taskID, uploadStepName, types.StepStatusRunning, "")
|
||||
c.sendLog(taskID, types.LogLevelInfo, "Uploading output file...", uploadStepName)
|
||||
|
||||
outputPath, err := c.uploadFile(jobID, outputFile)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to upload output: %w", err)
|
||||
errMsg := fmt.Sprintf("failed to upload output: %w", err)
|
||||
c.sendLog(taskID, types.LogLevelError, errMsg, uploadStepName)
|
||||
c.sendStepUpdate(taskID, uploadStepName, types.StepStatusFailed, errMsg)
|
||||
return fmt.Errorf(errMsg)
|
||||
}
|
||||
c.sendLog(taskID, types.LogLevelInfo, "Output file uploaded successfully", uploadStepName)
|
||||
c.sendStepUpdate(taskID, uploadStepName, types.StepStatusCompleted, "")
|
||||
|
||||
// Step: complete
|
||||
c.sendStepUpdate(taskID, "complete", types.StepStatusRunning, "")
|
||||
c.sendLog(taskID, types.LogLevelInfo, "Task completed successfully", "complete")
|
||||
|
||||
// Mark task as complete
|
||||
if err := c.completeTask(taskID, outputPath, true, ""); err != nil {
|
||||
c.sendStepUpdate(taskID, "complete", types.StepStatusFailed, err.Error())
|
||||
return err
|
||||
}
|
||||
c.sendStepUpdate(taskID, "complete", types.StepStatusCompleted, "")
|
||||
|
||||
// For MP4 format, check if all frames are done and generate video
|
||||
if outputFormat == "MP4" {
|
||||
@@ -599,29 +784,33 @@ func (c *Client) uploadFile(jobID int64, filePath string) (string, error) {
|
||||
return result.FilePath, nil
|
||||
}
|
||||
|
||||
// completeTask marks a task as complete
|
||||
// completeTask marks a task as complete via WebSocket (or HTTP fallback)
|
||||
func (c *Client) completeTask(taskID int64, outputPath string, success bool, errorMsg string) error {
|
||||
req := map[string]interface{}{
|
||||
"output_path": outputPath,
|
||||
"success": success,
|
||||
}
|
||||
if !success {
|
||||
req["error"] = errorMsg
|
||||
}
|
||||
|
||||
body, _ := json.Marshal(req)
|
||||
path := fmt.Sprintf("/api/runner/tasks/%d/complete?runner_id=%d", taskID, c.runnerID)
|
||||
resp, err := c.doSignedRequest("POST", path, body)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
body, _ := io.ReadAll(resp.Body)
|
||||
return fmt.Errorf("failed to complete task: %s", string(body))
|
||||
}
|
||||
|
||||
return nil
|
||||
return c.sendTaskComplete(taskID, outputPath, success, errorMsg)
|
||||
}
|
||||
|
||||
// sendTaskComplete sends task completion via WebSocket
|
||||
func (c *Client) sendTaskComplete(taskID int64, outputPath string, success bool, errorMsg string) error {
|
||||
c.wsConnMu.RLock()
|
||||
conn := c.wsConn
|
||||
c.wsConnMu.RUnlock()
|
||||
|
||||
if conn != nil {
|
||||
msg := map[string]interface{}{
|
||||
"type": "task_complete",
|
||||
"data": map[string]interface{}{
|
||||
"task_id": taskID,
|
||||
"output_path": outputPath,
|
||||
"success": success,
|
||||
"error": errorMsg,
|
||||
},
|
||||
"timestamp": time.Now().Unix(),
|
||||
}
|
||||
if err := conn.WriteJSON(msg); err != nil {
|
||||
return fmt.Errorf("failed to send task completion: %w", err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
return fmt.Errorf("WebSocket not connected, cannot complete task")
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user