Some checks failed
PR Check / check-and-test (pull_request) Failing after 9s
- Added logic to delete files after successful uploads in both runner and encode tasks to prevent duplicate uploads. - Included logging for any errors encountered during file deletion to ensure visibility of issues.
366 lines
9.5 KiB
Go
366 lines
9.5 KiB
Go
// Package runner provides the Jiggablend render runner.
|
|
package runner
|
|
|
|
import (
|
|
"crypto/sha256"
|
|
"encoding/hex"
|
|
"fmt"
|
|
"log"
|
|
"net"
|
|
"os"
|
|
"os/exec"
|
|
"strings"
|
|
"sync"
|
|
"time"
|
|
|
|
"jiggablend/internal/runner/api"
|
|
"jiggablend/internal/runner/blender"
|
|
"jiggablend/internal/runner/encoding"
|
|
"jiggablend/internal/runner/tasks"
|
|
"jiggablend/internal/runner/workspace"
|
|
"jiggablend/pkg/executils"
|
|
"jiggablend/pkg/types"
|
|
)
|
|
|
|
// Runner is the main render runner.
|
|
type Runner struct {
|
|
id int64
|
|
name string
|
|
hostname string
|
|
|
|
manager *api.ManagerClient
|
|
workspace *workspace.Manager
|
|
blender *blender.Manager
|
|
encoder *encoding.Selector
|
|
processes *executils.ProcessTracker
|
|
|
|
processors map[string]tasks.Processor
|
|
stopChan chan struct{}
|
|
|
|
fingerprint string
|
|
fingerprintMu sync.RWMutex
|
|
}
|
|
|
|
// New creates a new runner.
|
|
func New(managerURL, name, hostname string) *Runner {
|
|
manager := api.NewManagerClient(managerURL)
|
|
|
|
r := &Runner{
|
|
name: name,
|
|
hostname: hostname,
|
|
manager: manager,
|
|
processes: executils.NewProcessTracker(),
|
|
stopChan: make(chan struct{}),
|
|
processors: make(map[string]tasks.Processor),
|
|
}
|
|
|
|
// Generate fingerprint
|
|
r.generateFingerprint()
|
|
|
|
return r
|
|
}
|
|
|
|
// CheckRequiredTools verifies that required external tools are available.
|
|
func (r *Runner) CheckRequiredTools() error {
|
|
if err := exec.Command("zstd", "--version").Run(); err != nil {
|
|
return fmt.Errorf("zstd not found - required for compressed blend file support. Install with: apt install zstd")
|
|
}
|
|
log.Printf("Found zstd for compressed blend file support")
|
|
|
|
if err := exec.Command("xvfb-run", "--help").Run(); err != nil {
|
|
return fmt.Errorf("xvfb-run not found - required for headless Blender rendering. Install with: apt install xvfb")
|
|
}
|
|
log.Printf("Found xvfb-run for headless rendering without -b option")
|
|
return nil
|
|
}
|
|
|
|
var cachedCapabilities map[string]interface{} = nil
|
|
|
|
// ProbeCapabilities detects hardware capabilities.
|
|
func (r *Runner) ProbeCapabilities() map[string]interface{} {
|
|
if cachedCapabilities != nil {
|
|
return cachedCapabilities
|
|
}
|
|
|
|
caps := make(map[string]interface{})
|
|
|
|
// Check for ffmpeg and probe encoding capabilities
|
|
if err := exec.Command("ffmpeg", "-version").Run(); err == nil {
|
|
caps["ffmpeg"] = true
|
|
} else {
|
|
caps["ffmpeg"] = false
|
|
}
|
|
|
|
cachedCapabilities = caps
|
|
return caps
|
|
}
|
|
|
|
// Register registers the runner with the manager.
|
|
func (r *Runner) Register(apiKey string) (int64, error) {
|
|
caps := r.ProbeCapabilities()
|
|
|
|
id, err := r.manager.Register(r.name, r.hostname, caps, apiKey, r.GetFingerprint())
|
|
if err != nil {
|
|
return 0, err
|
|
}
|
|
|
|
r.id = id
|
|
|
|
// Initialize workspace after registration
|
|
r.workspace = workspace.NewManager(r.name)
|
|
|
|
// Initialize blender manager
|
|
r.blender = blender.NewManager(r.manager, r.workspace.BaseDir())
|
|
|
|
// Initialize encoder selector
|
|
r.encoder = encoding.NewSelector()
|
|
|
|
// Register task processors
|
|
r.processors["render"] = tasks.NewRenderProcessor()
|
|
r.processors["encode"] = tasks.NewEncodeProcessor()
|
|
|
|
return id, nil
|
|
}
|
|
|
|
// Start starts the job polling loop.
|
|
func (r *Runner) Start(pollInterval time.Duration) {
|
|
log.Printf("Starting job polling loop (interval: %v)", pollInterval)
|
|
|
|
for {
|
|
select {
|
|
case <-r.stopChan:
|
|
log.Printf("Stopping job polling loop")
|
|
return
|
|
default:
|
|
}
|
|
|
|
log.Printf("Polling for next job (runner ID: %d)", r.id)
|
|
job, err := r.manager.PollNextJob()
|
|
if err != nil {
|
|
log.Printf("Error polling for job: %v", err)
|
|
time.Sleep(pollInterval)
|
|
continue
|
|
}
|
|
|
|
if job == nil {
|
|
log.Printf("No job available, sleeping for %v", pollInterval)
|
|
time.Sleep(pollInterval)
|
|
continue
|
|
}
|
|
|
|
log.Printf("Received job assignment: task=%d, job=%d, type=%s",
|
|
job.Task.TaskID, job.Task.JobID, job.Task.TaskType)
|
|
|
|
if err := r.executeJob(job); err != nil {
|
|
log.Printf("Error processing job: %v", err)
|
|
}
|
|
}
|
|
}
|
|
|
|
// Stop stops the runner.
|
|
func (r *Runner) Stop() {
|
|
close(r.stopChan)
|
|
}
|
|
|
|
// KillAllProcesses kills all running processes.
|
|
func (r *Runner) KillAllProcesses() {
|
|
log.Printf("Killing all running processes...")
|
|
killedCount := r.processes.KillAll()
|
|
|
|
// Release all allocated devices
|
|
if r.encoder != nil {
|
|
// Device pool cleanup is handled internally
|
|
}
|
|
|
|
log.Printf("Killed %d process(es)", killedCount)
|
|
}
|
|
|
|
// Cleanup removes the workspace directory.
|
|
func (r *Runner) Cleanup() {
|
|
if r.workspace != nil {
|
|
r.workspace.Cleanup()
|
|
}
|
|
}
|
|
|
|
// executeJob handles a job using per-job WebSocket connection.
|
|
func (r *Runner) executeJob(job *api.NextJobResponse) (err error) {
|
|
// Recover from panics to prevent runner process crashes during task execution
|
|
defer func() {
|
|
if rec := recover(); rec != nil {
|
|
log.Printf("Task execution panicked: %v", rec)
|
|
err = fmt.Errorf("task execution panicked: %v", rec)
|
|
}
|
|
}()
|
|
|
|
// Connect to job WebSocket (no runnerID needed - authentication handles it)
|
|
jobConn := api.NewJobConnection()
|
|
if err := jobConn.Connect(r.manager.GetBaseURL(), job.JobPath, job.JobToken); err != nil {
|
|
return fmt.Errorf("failed to connect job WebSocket: %w", err)
|
|
}
|
|
defer jobConn.Close()
|
|
|
|
log.Printf("Job WebSocket authenticated for task %d", job.Task.TaskID)
|
|
|
|
// Create task context
|
|
workDir := r.workspace.JobDir(job.Task.JobID)
|
|
ctx := tasks.NewContext(
|
|
job.Task.TaskID,
|
|
job.Task.JobID,
|
|
job.Task.JobName,
|
|
job.Task.Frame,
|
|
job.Task.TaskType,
|
|
workDir,
|
|
job.JobToken,
|
|
job.Task.Metadata,
|
|
r.manager,
|
|
jobConn,
|
|
r.workspace,
|
|
r.blender,
|
|
r.encoder,
|
|
r.processes,
|
|
)
|
|
|
|
ctx.Info(fmt.Sprintf("Task assignment received (job: %d, type: %s)",
|
|
job.Task.JobID, job.Task.TaskType))
|
|
|
|
// Get processor for task type
|
|
processor, ok := r.processors[job.Task.TaskType]
|
|
if !ok {
|
|
return fmt.Errorf("unknown task type: %s", job.Task.TaskType)
|
|
}
|
|
|
|
// Process the task
|
|
var processErr error
|
|
switch job.Task.TaskType {
|
|
case "render": // this task has a upload outputs step because the frames are not uploaded by the render task directly we have to do it manually here TODO: maybe we should make it work like the encode task
|
|
// Download context
|
|
contextPath := job.JobPath + "/context.tar"
|
|
if err := r.downloadContext(job.Task.JobID, contextPath, job.JobToken); err != nil {
|
|
jobConn.Log(job.Task.TaskID, types.LogLevelError, fmt.Sprintf("Failed to download context: %v", err))
|
|
jobConn.Complete(job.Task.TaskID, false, fmt.Errorf("failed to download context: %v", err))
|
|
return fmt.Errorf("failed to download context: %w", err)
|
|
}
|
|
processErr = processor.Process(ctx)
|
|
if processErr == nil {
|
|
processErr = r.uploadOutputs(ctx, job)
|
|
}
|
|
case "encode": // this task doesn't have a upload outputs step because the video is already uploaded by the encode task
|
|
processErr = processor.Process(ctx)
|
|
default:
|
|
return fmt.Errorf("unknown task type: %s", job.Task.TaskType)
|
|
}
|
|
|
|
if processErr != nil {
|
|
ctx.Error(fmt.Sprintf("Task failed: %v", processErr))
|
|
ctx.Complete(false, processErr)
|
|
return processErr
|
|
}
|
|
|
|
ctx.Complete(true, nil)
|
|
return nil
|
|
}
|
|
|
|
func (r *Runner) downloadContext(jobID int64, contextPath, jobToken string) error {
|
|
reader, err := r.manager.DownloadContext(contextPath, jobToken)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
defer reader.Close()
|
|
|
|
jobDir := r.workspace.JobDir(jobID)
|
|
return workspace.ExtractTar(reader, jobDir)
|
|
}
|
|
|
|
func (r *Runner) uploadOutputs(ctx *tasks.Context, job *api.NextJobResponse) error {
|
|
outputDir := ctx.WorkDir + "/output"
|
|
uploadPath := fmt.Sprintf("/api/runner/jobs/%d/upload", job.Task.JobID)
|
|
|
|
entries, err := os.ReadDir(outputDir)
|
|
if err != nil {
|
|
return fmt.Errorf("failed to read output directory: %w", err)
|
|
}
|
|
|
|
for _, entry := range entries {
|
|
if entry.IsDir() {
|
|
continue
|
|
}
|
|
filePath := outputDir + "/" + entry.Name()
|
|
if err := r.manager.UploadFile(uploadPath, job.JobToken, filePath); err != nil {
|
|
log.Printf("Failed to upload %s: %v", filePath, err)
|
|
} else {
|
|
ctx.OutputUploaded(entry.Name())
|
|
// Delete file after successful upload to prevent duplicate uploads
|
|
if err := os.Remove(filePath); err != nil {
|
|
log.Printf("Warning: Failed to delete file %s after upload: %v", filePath, err)
|
|
}
|
|
}
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// generateFingerprint creates a unique hardware fingerprint.
|
|
func (r *Runner) generateFingerprint() {
|
|
r.fingerprintMu.Lock()
|
|
defer r.fingerprintMu.Unlock()
|
|
|
|
var components []string
|
|
components = append(components, r.hostname)
|
|
|
|
if machineID, err := os.ReadFile("/etc/machine-id"); err == nil {
|
|
components = append(components, strings.TrimSpace(string(machineID)))
|
|
}
|
|
|
|
if productUUID, err := os.ReadFile("/sys/class/dmi/id/product_uuid"); err == nil {
|
|
components = append(components, strings.TrimSpace(string(productUUID)))
|
|
}
|
|
|
|
if macAddr, err := r.getMACAddress(); err == nil {
|
|
components = append(components, macAddr)
|
|
}
|
|
|
|
if len(components) <= 1 {
|
|
components = append(components, fmt.Sprintf("%d", os.Getpid()))
|
|
components = append(components, fmt.Sprintf("%d", time.Now().Unix()))
|
|
}
|
|
|
|
h := sha256.New()
|
|
for _, comp := range components {
|
|
h.Write([]byte(comp))
|
|
h.Write([]byte{0})
|
|
}
|
|
|
|
r.fingerprint = hex.EncodeToString(h.Sum(nil))
|
|
}
|
|
|
|
func (r *Runner) getMACAddress() (string, error) {
|
|
interfaces, err := net.Interfaces()
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
|
|
for _, iface := range interfaces {
|
|
if iface.Flags&net.FlagLoopback != 0 || iface.Flags&net.FlagUp == 0 {
|
|
continue
|
|
}
|
|
if len(iface.HardwareAddr) == 0 {
|
|
continue
|
|
}
|
|
return iface.HardwareAddr.String(), nil
|
|
}
|
|
|
|
return "", fmt.Errorf("no suitable network interface found")
|
|
}
|
|
|
|
// GetFingerprint returns the runner's hardware fingerprint.
|
|
func (r *Runner) GetFingerprint() string {
|
|
r.fingerprintMu.RLock()
|
|
defer r.fingerprintMu.RUnlock()
|
|
return r.fingerprint
|
|
}
|
|
|
|
// GetID returns the runner ID.
|
|
func (r *Runner) GetID() int64 {
|
|
return r.id
|
|
}
|