// jiggablend/internal/runner/runner.go

// Package runner provides the Jiggablend render runner.
package runner

import (
	"crypto/sha256"
	"encoding/hex"
	"fmt"
	"log"
	"net"
	"os"
	"os/exec"
	"strings"
	"sync"
	"time"

	"jiggablend/internal/runner/api"
	"jiggablend/internal/runner/blender"
	"jiggablend/internal/runner/encoding"
	"jiggablend/internal/runner/tasks"
	"jiggablend/internal/runner/workspace"
	"jiggablend/pkg/executils"
	"jiggablend/pkg/types"
)

// Runner is the main render runner. It registers with the manager, polls it
// for jobs, and hands each assigned task to the processor registered for the
// task's type.
type Runner struct {
	// id is the runner ID returned by the manager; it is zero until Register
	// succeeds.
	id       int64
	// name and hostname are the values passed to New. hostname is also one of
	// the inputs to the hardware fingerprint.
	name     string
	hostname string

	// manager is the manager API client created in New.
	manager   *api.ManagerClient
	// workspace, blender and encoder are nil until Register succeeds.
	workspace *workspace.Manager
	blender   *blender.Manager
	encoder   *encoding.Selector
	// processes tracks spawned child processes so KillAllProcesses can stop them.
	processes *executils.ProcessTracker

	// processors maps a task type (e.g. "render", "encode") to its processor;
	// it is populated in Register.
	processors map[string]tasks.Processor
	// stopChan is closed by Stop to make the polling loop in Start exit.
	stopChan   chan struct{}

	// fingerprint is the hex-encoded hardware fingerprint; access is guarded
	// by fingerprintMu.
	fingerprint   string
	fingerprintMu sync.RWMutex
}

// New creates a new runner.
// New builds a Runner that talks to the manager at managerURL and identifies
// itself with the given name and hostname.
//
// The hardware fingerprint is computed before the runner is returned. The
// workspace, Blender manager, encoder selector and task processors are not
// set up here; Register does that.
func New(managerURL, name, hostname string) *Runner {
	r := new(Runner)
	r.name = name
	r.hostname = hostname
	r.manager = api.NewManagerClient(managerURL)
	r.processes = executils.NewProcessTracker()
	r.stopChan = make(chan struct{})
	r.processors = make(map[string]tasks.Processor)

	// The fingerprint depends on r.hostname, so it must come after the
	// assignments above.
	r.generateFingerprint()
	return r
}

// CheckRequiredTools verifies that the external tools the runner depends on
// (zstd and xvfb-run) can be executed.
//
// Each tool is probed by running it with a harmless flag. On the first tool
// that fails, an error is returned that carries an install hint and wraps the
// underlying exec error, so callers can tell "not on PATH" apart from other
// failures (permission denied, non-zero exit, ...). It returns nil when every
// tool ran successfully.
func (r *Runner) CheckRequiredTools() error {
	required := []struct {
		name    string // executable to run
		arg     string // harmless flag used to probe it
		missing string // message returned when the probe fails
		found   string // message logged when the probe succeeds
	}{
		{
			name:    "zstd",
			arg:     "--version",
			missing: "zstd not found - required for compressed blend file support. Install with: apt install zstd",
			found:   "Found zstd for compressed blend file support",
		},
		{
			name:    "xvfb-run",
			arg:     "--help",
			missing: "xvfb-run not found - required for headless Blender rendering. Install with: apt install xvfb",
			found:   "Found xvfb-run for headless rendering without -b option",
		},
	}

	for _, tool := range required {
		if err := exec.Command(tool.name, tool.arg).Run(); err != nil {
			// Keep the cause: the probe can fail for reasons other than the
			// binary being absent.
			return fmt.Errorf("%s (%w)", tool.missing, err)
		}
		log.Print(tool.found)
	}
	return nil
}

// cachedCapabilities holds the result of the first ProbeCapabilities call so
// later calls do not spawn ffmpeg again. It is package-level state and is not
// guarded by any lock.
var cachedCapabilities map[string]interface{}

// ProbeCapabilities detects hardware capabilities.
//
// Currently the only capability reported is "ffmpeg": true when
// `ffmpeg -version` runs successfully, false otherwise. The result is computed
// once and the same map is returned on every subsequent call, so callers
// share (and must not mutate) the returned map.
func (r *Runner) ProbeCapabilities() map[string]interface{} {
	if cachedCapabilities == nil {
		ffmpegErr := exec.Command("ffmpeg", "-version").Run()
		cachedCapabilities = map[string]interface{}{
			"ffmpeg": ffmpegErr == nil,
		}
	}
	return cachedCapabilities
}

// Register announces this runner to the manager and, on success, finishes
// initialising it.
//
// The runner's name, hostname, probed capabilities, the supplied apiKey and
// the hardware fingerprint are sent to the manager. If the manager rejects the
// registration the error is returned unchanged with ID 0 and the runner is
// left untouched. Otherwise the returned ID is stored, the workspace, Blender
// manager and encoder selector are created, and the "render" and "encode"
// task processors are installed.
func (r *Runner) Register(apiKey string) (int64, error) {
	runnerID, err := r.manager.Register(r.name, r.hostname, r.ProbeCapabilities(), apiKey, r.GetFingerprint())
	if err != nil {
		return 0, err
	}
	r.id = runnerID

	// The Blender manager is rooted in the workspace's base directory, so the
	// workspace has to exist first.
	ws := workspace.NewManager(r.name)
	r.workspace = ws
	r.blender = blender.NewManager(r.manager, ws.BaseDir())
	r.encoder = encoding.NewSelector()

	// One processor per task type the runner knows how to execute.
	r.processors["render"] = tasks.NewRenderProcessor()
	r.processors["encode"] = tasks.NewEncodeProcessor()

	return runnerID, nil
}

// Start runs the job polling loop and blocks until Stop is called.
//
// Each iteration asks the manager for the next job. When polling fails or no
// job is available, the loop idles for pollInterval before trying again; when
// a job is assigned it is executed synchronously and any error is logged.
//
// The idle wait is interruptible: a Stop call wakes the loop immediately
// instead of being delayed by up to a full pollInterval. A job that is already
// executing is not interrupted by Stop; the loop exits once it returns.
func (r *Runner) Start(pollInterval time.Duration) {
	log.Printf("Starting job polling loop (interval: %v)", pollInterval)

	// idle waits for pollInterval or until the runner is stopped, whichever
	// comes first. The stop itself is handled (and logged) at the top of the
	// loop.
	idle := func() {
		timer := time.NewTimer(pollInterval)
		defer timer.Stop()
		select {
		case <-r.stopChan:
		case <-timer.C:
		}
	}

	for {
		select {
		case <-r.stopChan:
			log.Printf("Stopping job polling loop")
			return
		default:
		}

		log.Printf("Polling for next job (runner ID: %d)", r.id)
		job, err := r.manager.PollNextJob()
		if err != nil {
			log.Printf("Error polling for job: %v", err)
			idle()
			continue
		}

		// A nil job with a nil error means the manager has nothing for us.
		if job == nil {
			log.Printf("No job available, sleeping for %v", pollInterval)
			idle()
			continue
		}

		log.Printf("Received job assignment: task=%d, job=%d, type=%s",
			job.Task.TaskID, job.Task.JobID, job.Task.TaskType)

		if err := r.executeJob(job); err != nil {
			log.Printf("Error processing job: %v", err)
		}
	}
}

// Stop signals the polling loop started by Start to exit by closing the stop
// channel.
//
// Calling Stop again after it has already run is a no-op rather than a
// "close of closed channel" panic. The check-then-close is not atomic, so
// Stop must not be called from several goroutines at the same time.
func (r *Runner) Stop() {
	select {
	case <-r.stopChan:
		// Already closed by an earlier Stop.
	default:
		close(r.stopChan)
	}
}

// KillAllProcesses asks the process tracker to kill every process it is
// tracking and logs how many were killed.
//
// No encoder or device release is performed here; presumably the encoder's
// device pool cleans up after itself — verify against the encoding package.
func (r *Runner) KillAllProcesses() {
	log.Printf("Killing all running processes...")
	log.Printf("Killed %d process(es)", r.processes.KillAll())
}

// Cleanup delegates to the workspace manager's Cleanup (presumably removing
// the workspace directory — see the workspace package). It does nothing when
// the workspace has not been created yet, i.e. before Register has succeeded.
func (r *Runner) Cleanup() {
	if r.workspace == nil {
		return
	}
	r.workspace.Cleanup()
}

// executeJob runs one assigned task over its own per-job WebSocket connection
// and reports the outcome to the manager.
//
// Flow: connect the job WebSocket, build the task context, then run the
// pipeline for the task type:
//   - "render": download and extract the job context, run the processor, then
//     upload the rendered outputs (the render processor does not upload them
//     itself).
//   - "encode": run the processor only; per the original author's note the
//     encode task uploads its video itself.
//
// Every failure after the task context exists is reported to the manager as a
// failed completion, including an unknown or unsupported task type, so the
// manager is not left waiting on a task this runner cannot execute.
//
// A panic during execution is recovered, logged and returned as an error so a
// single task cannot crash the runner process. Note that the panic path does
// not send a completion message: the job connection's deferred Close runs
// before the recover handler.
func (r *Runner) executeJob(job *api.NextJobResponse) (err error) {
	// Recover from panics to prevent runner process crashes during task execution
	defer func() {
		if rec := recover(); rec != nil {
			log.Printf("Task execution panicked: %v", rec)
			err = fmt.Errorf("task execution panicked: %v", rec)
		}
	}()

	// Connect to job WebSocket (no runnerID needed - authentication handles it)
	jobConn := api.NewJobConnection()
	if err := jobConn.Connect(r.manager.GetBaseURL(), job.JobPath, job.JobToken); err != nil {
		return fmt.Errorf("failed to connect job WebSocket: %w", err)
	}
	defer jobConn.Close()

	log.Printf("Job WebSocket authenticated for task %d", job.Task.TaskID)

	// Create task context
	workDir := r.workspace.JobDir(job.Task.JobID)
	ctx := tasks.NewContext(
		job.Task.TaskID,
		job.Task.JobID,
		job.Task.JobName,
		job.Task.Frame,
		job.Task.TaskType,
		workDir,
		job.JobToken,
		job.Task.Metadata,
		r.manager,
		jobConn,
		r.workspace,
		r.blender,
		r.encoder,
		r.processes,
	)

	ctx.Info(fmt.Sprintf("Task assignment received (job: %d, type: %s)",
		job.Task.JobID, job.Task.TaskType))

	// fail logs the failure to the job log, reports a failed completion to the
	// manager and hands the error back for returning.
	fail := func(taskErr error) error {
		ctx.Error(fmt.Sprintf("Task failed: %v", taskErr))
		ctx.Complete(false, taskErr)
		return taskErr
	}

	// Get processor for task type
	processor, ok := r.processors[job.Task.TaskType]
	if !ok {
		return fail(fmt.Errorf("unknown task type: %s", job.Task.TaskType))
	}

	// Process the task
	var processErr error
	switch job.Task.TaskType {
	case "render":
		// Rendered frames are not uploaded by the render processor, so the
		// upload happens here after a successful render.
		// TODO: maybe make render upload its own outputs like the encode task.

		// Download context
		contextPath := job.JobPath + "/context.tar"
		if err := r.downloadContext(job.Task.JobID, contextPath, job.JobToken); err != nil {
			jobConn.Log(job.Task.TaskID, types.LogLevelError, fmt.Sprintf("Failed to download context: %v", err))
			jobConn.Complete(job.Task.TaskID, false, fmt.Errorf("failed to download context: %v", err))
			return fmt.Errorf("failed to download context: %w", err)
		}
		processErr = processor.Process(ctx)
		if processErr == nil {
			processErr = r.uploadOutputs(ctx, job)
		}
	case "encode":
		// No upload step here: the video is already uploaded by the encode task.
		processErr = processor.Process(ctx)
	default:
		// A processor is registered for this type but no pipeline is defined
		// for it here.
		return fail(fmt.Errorf("unknown task type: %s", job.Task.TaskType))
	}

	if processErr != nil {
		return fail(processErr)
	}

	ctx.Complete(true, nil)
	return nil
}

// downloadContext fetches the job context archive at contextPath from the
// manager (authenticating with jobToken) and extracts it as a tar stream into
// the workspace directory of job jobID. It returns the download error or the
// extraction error, whichever occurs first.
func (r *Runner) downloadContext(jobID int64, contextPath, jobToken string) error {
	archive, err := r.manager.DownloadContext(contextPath, jobToken)
	if err != nil {
		return err
	}
	defer archive.Close()

	return workspace.ExtractTar(archive, r.workspace.JobDir(jobID))
}

// uploadOutputs uploads every regular file in the task's "output" directory
// to the manager's upload endpoint for the job.
//
// Subdirectories are skipped. Each successfully uploaded file is announced
// via ctx.OutputUploaded and then deleted locally so it is not uploaded again;
// a failed delete is only logged.
//
// A failed upload does not stop the loop — the remaining files are still
// attempted — but it is no longer swallowed: if any upload failed, an error
// wrapping the first failure is returned so the caller does not report the
// task as successful while outputs are missing on the manager. Files that
// failed to upload are left in place.
//
// It also returns an error if the output directory cannot be read.
func (r *Runner) uploadOutputs(ctx *tasks.Context, job *api.NextJobResponse) error {
	outputDir := ctx.WorkDir + "/output"
	uploadPath := fmt.Sprintf("/api/runner/jobs/%d/upload", job.Task.JobID)

	entries, err := os.ReadDir(outputDir)
	if err != nil {
		return fmt.Errorf("failed to read output directory: %w", err)
	}

	var (
		failed   int   // number of files whose upload failed
		firstErr error // first upload error, wrapped into the returned error
	)

	for _, entry := range entries {
		if entry.IsDir() {
			continue
		}
		filePath := outputDir + "/" + entry.Name()
		if err := r.manager.UploadFile(uploadPath, job.JobToken, filePath); err != nil {
			log.Printf("Failed to upload %s: %v", filePath, err)
			failed++
			if firstErr == nil {
				firstErr = err
			}
			continue
		}

		ctx.OutputUploaded(entry.Name())
		// Delete file after successful upload to prevent duplicate uploads
		if err := os.Remove(filePath); err != nil {
			log.Printf("Warning: Failed to delete file %s after upload: %v", filePath, err)
		}
	}

	if failed > 0 {
		return fmt.Errorf("failed to upload %d output file(s), first error: %w", failed, firstErr)
	}
	return nil
}

// generateFingerprint computes the runner's hardware fingerprint and stores
// it under the fingerprint write lock.
//
// The fingerprint is the hex-encoded SHA-256 of these components, each
// followed by a NUL byte: the hostname, the trimmed contents of
// /etc/machine-id and /sys/class/dmi/id/product_uuid (each only if readable),
// and the MAC address returned by getMACAddress (if any). When nothing beyond
// the hostname could be collected, the current PID and Unix time are added,
// which makes the fingerprint differ between runs in that case.
func (r *Runner) generateFingerprint() {
	r.fingerprintMu.Lock()
	defer r.fingerprintMu.Unlock()

	parts := []string{r.hostname}

	// Machine identity files; unreadable ones are simply skipped.
	for _, idFile := range []string{"/etc/machine-id", "/sys/class/dmi/id/product_uuid"} {
		raw, err := os.ReadFile(idFile)
		if err != nil {
			continue
		}
		parts = append(parts, strings.TrimSpace(string(raw)))
	}

	if mac, err := r.getMACAddress(); err == nil {
		parts = append(parts, mac)
	}

	// Only the hostname was available: fall back to per-process values.
	if len(parts) <= 1 {
		parts = append(parts,
			fmt.Sprintf("%d", os.Getpid()),
			fmt.Sprintf("%d", time.Now().Unix()),
		)
	}

	// Joining with NUL and appending a trailing NUL yields exactly
	// "part, 0, part, 0, ...", i.e. every component terminated by a zero byte.
	digest := sha256.Sum256([]byte(strings.Join(parts, "\x00") + "\x00"))
	r.fingerprint = hex.EncodeToString(digest[:])
}

// getMACAddress returns the hardware address of the first network interface
// (in the order reported by net.Interfaces) that is up, is not a loopback
// device and has a non-empty hardware address. It returns the net.Interfaces
// error if enumeration fails, or an error when no interface qualifies.
func (r *Runner) getMACAddress() (string, error) {
	ifaces, err := net.Interfaces()
	if err != nil {
		return "", err
	}

	for i := range ifaces {
		candidate := &ifaces[i]
		isUp := candidate.Flags&net.FlagUp != 0
		isLoopback := candidate.Flags&net.FlagLoopback != 0
		if isUp && !isLoopback && len(candidate.HardwareAddr) > 0 {
			return candidate.HardwareAddr.String(), nil
		}
	}

	return "", fmt.Errorf("no suitable network interface found")
}

// GetFingerprint returns the hex-encoded hardware fingerprint computed by
// generateFingerprint, reading it under the fingerprint read lock.
func (r *Runner) GetFingerprint() string {
	r.fingerprintMu.RLock()
	fp := r.fingerprint
	r.fingerprintMu.RUnlock()
	return fp
}

// GetID returns the runner ID assigned by the manager. The ID is stored by
// Register, so this returns 0 until a registration has succeeded. The field
// is read without any locking.
func (r *Runner) GetID() int64 {
	return r.id
}