something

This commit is contained in:
2025-11-27 00:46:48 -06:00
parent 11e7552b5b
commit edc8ea160c
43 changed files with 9990 additions and 3059 deletions


@@ -25,6 +25,7 @@ import (
"sync"
"time"
"jiggablend/pkg/executils"
"jiggablend/pkg/scripts"
"jiggablend/pkg/types"
@@ -45,19 +46,19 @@ type Client struct {
stopChan chan struct{}
stepStartTimes map[string]time.Time // key: "taskID:stepName"
stepTimesMu sync.RWMutex
workspaceDir string // Persistent workspace directory for this runner
runningProcs sync.Map // map[int64]*exec.Cmd - tracks running processes by task ID
capabilities map[string]interface{} // Cached capabilities from initial probe (includes bools and numbers)
capabilitiesMu sync.RWMutex // Protects capabilities
hwAccelCache map[string]bool // Cached hardware acceleration detection results
hwAccelCacheMu sync.RWMutex // Protects hwAccelCache
vaapiDevices []string // Cached VAAPI device paths (all available devices)
vaapiDevicesMu sync.RWMutex // Protects vaapiDevices
allocatedDevices map[int64]string // map[taskID]device - tracks which device is allocated to which task
allocatedDevicesMu sync.RWMutex // Protects allocatedDevices
longRunningClient *http.Client // HTTP client for long-running operations (no timeout)
fingerprint string // Unique hardware fingerprint for this runner
fingerprintMu sync.RWMutex // Protects fingerprint
workspaceDir string // Persistent workspace directory for this runner
processTracker *executils.ProcessTracker // Tracks running processes for cleanup
capabilities map[string]interface{} // Cached capabilities from initial probe (includes bools and numbers)
capabilitiesMu sync.RWMutex // Protects capabilities
hwAccelCache map[string]bool // Cached hardware acceleration detection results
hwAccelCacheMu sync.RWMutex // Protects hwAccelCache
vaapiDevices []string // Cached VAAPI device paths (all available devices)
vaapiDevicesMu sync.RWMutex // Protects vaapiDevices
allocatedDevices map[int64]string // map[taskID]device - tracks which device is allocated to which task
allocatedDevicesMu sync.RWMutex // Protects allocatedDevices
longRunningClient *http.Client // HTTP client for long-running operations (no timeout)
fingerprint string // Unique hardware fingerprint for this runner
fingerprintMu sync.RWMutex // Protects fingerprint
}
// NewClient creates a new runner client
@@ -70,6 +71,7 @@ func NewClient(managerURL, name, hostname string) *Client {
longRunningClient: &http.Client{Timeout: 0}, // No timeout for long-running operations (context downloads, file uploads/downloads)
stopChan: make(chan struct{}),
stepStartTimes: make(map[string]time.Time),
processTracker: executils.NewProcessTracker(),
}
// Generate fingerprint immediately
client.generateFingerprint()
@@ -226,12 +228,6 @@ func (c *Client) probeCapabilities() map[string]interface{} {
c.probeGPUCapabilities(capabilities)
} else {
capabilities["ffmpeg"] = false
// Set defaults when ffmpeg is not available
capabilities["vaapi"] = false
capabilities["vaapi_gpu_count"] = 0
capabilities["nvenc"] = false
capabilities["nvenc_gpu_count"] = 0
capabilities["video_gpu_count"] = 0
}
return capabilities
@@ -256,60 +252,6 @@ func (c *Client) probeGPUCapabilities(capabilities map[string]interface{}) {
log.Printf("Available hardware encoders: %v", getKeys(hwEncoders))
}
// Check for VAAPI devices and count them
log.Printf("Checking for VAAPI hardware acceleration...")
// First check if encoder is listed (more reliable than testing)
cmd := exec.Command("ffmpeg", "-hide_banner", "-encoders")
output, err := cmd.CombinedOutput()
hasVAAPIEncoder := false
if err == nil {
encoderOutput := string(output)
if strings.Contains(encoderOutput, "h264_vaapi") {
hasVAAPIEncoder = true
log.Printf("VAAPI encoder (h264_vaapi) found in ffmpeg encoders list")
}
}
if hasVAAPIEncoder {
// Try to find and test devices
vaapiDevices := c.findVAAPIDevices()
capabilities["vaapi_gpu_count"] = len(vaapiDevices)
if len(vaapiDevices) > 0 {
capabilities["vaapi"] = true
log.Printf("VAAPI detected: %d GPU device(s) available: %v", len(vaapiDevices), vaapiDevices)
} else {
capabilities["vaapi"] = false
log.Printf("VAAPI encoder available but no working devices found")
log.Printf(" This might indicate:")
log.Printf(" - Missing or incorrect GPU drivers")
log.Printf(" - Missing libva or mesa-va-drivers packages")
log.Printf(" - Permission issues accessing /dev/dri devices")
log.Printf(" - GPU not properly initialized")
}
} else {
capabilities["vaapi"] = false
capabilities["vaapi_gpu_count"] = 0
log.Printf("VAAPI encoder not available in ffmpeg")
log.Printf(" This might indicate:")
log.Printf(" - FFmpeg was not compiled with VAAPI support")
log.Printf(" - Missing libva development libraries during FFmpeg compilation")
}
// Check for NVENC (NVIDIA) - try to detect multiple GPUs
log.Printf("Checking for NVENC hardware acceleration...")
if c.checkEncoderAvailable("h264_nvenc") {
capabilities["nvenc"] = true
// Try to detect actual GPU count using nvidia-smi if available
nvencCount := c.detectNVENCCount()
capabilities["nvenc_gpu_count"] = nvencCount
log.Printf("NVENC detected: %d GPU(s)", nvencCount)
} else {
capabilities["nvenc"] = false
capabilities["nvenc_gpu_count"] = 0
log.Printf("NVENC encoder not available")
}
// Check for other hardware encoders (for completeness)
log.Printf("Checking for other hardware encoders...")
if c.checkEncoderAvailable("h264_qsv") {
@@ -368,73 +310,6 @@ func (c *Client) probeGPUCapabilities(capabilities map[string]interface{}) {
capabilities["mediacodec"] = false
capabilities["mediacodec_gpu_count"] = 0
}
// Calculate total GPU count for video encoding
// Priority: VAAPI > NVENC > QSV > VideoToolbox > AMF > others
vaapiCount := 0
if count, ok := capabilities["vaapi_gpu_count"].(int); ok {
vaapiCount = count
}
nvencCount := 0
if count, ok := capabilities["nvenc_gpu_count"].(int); ok {
nvencCount = count
}
qsvCount := 0
if count, ok := capabilities["qsv_gpu_count"].(int); ok {
qsvCount = count
}
videotoolboxCount := 0
if count, ok := capabilities["videotoolbox_gpu_count"].(int); ok {
videotoolboxCount = count
}
amfCount := 0
if count, ok := capabilities["amf_gpu_count"].(int); ok {
amfCount = count
}
// Total GPU count - use the best available (they can't be used simultaneously)
totalGPUs := vaapiCount
if totalGPUs == 0 {
totalGPUs = nvencCount
}
if totalGPUs == 0 {
totalGPUs = qsvCount
}
if totalGPUs == 0 {
totalGPUs = videotoolboxCount
}
if totalGPUs == 0 {
totalGPUs = amfCount
}
capabilities["video_gpu_count"] = totalGPUs
if totalGPUs > 0 {
log.Printf("Total video GPU count: %d", totalGPUs)
} else {
log.Printf("No hardware-accelerated video encoding GPUs detected (will use software encoding)")
}
}
// detectNVENCCount tries to detect the actual number of NVIDIA GPUs using nvidia-smi
func (c *Client) detectNVENCCount() int {
// Try to use nvidia-smi to count GPUs
cmd := exec.Command("nvidia-smi", "--list-gpus")
output, err := cmd.CombinedOutput()
if err == nil {
// Count lines that contain "GPU" (each GPU is listed on a separate line)
lines := strings.Split(string(output), "\n")
count := 0
for _, line := range lines {
if strings.Contains(line, "GPU") {
count++
}
}
if count > 0 {
return count
}
}
// Fall back to 1 if nvidia-smi is unavailable or reports no GPUs (the NVENC encoder was already detected above)
return 1
}
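// Illustrative only, not part of this commit: nvidia-smi --list-gpus prints one line per device, e.g.
//
//   GPU 0: NVIDIA GeForce RTX 3080 (UUID: GPU-xxxxxxxx-xxxx-...)
//   GPU 1: NVIDIA GeForce RTX 3080 (UUID: GPU-xxxxxxxx-xxxx-...)
//
// so counting the lines containing "GPU" above yields the device count.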
// getKeys returns all keys from a map as a slice (helper function)
@@ -926,29 +801,13 @@ func (c *Client) sendLog(taskID int64, logLevel types.LogLevel, message, stepNam
// KillAllProcesses kills all running processes tracked by this client
func (c *Client) KillAllProcesses() {
log.Printf("Killing all running processes...")
var killedCount int
c.runningProcs.Range(func(key, value interface{}) bool {
taskID := key.(int64)
cmd := value.(*exec.Cmd)
if cmd.Process != nil {
log.Printf("Killing process for task %d (PID: %d)", taskID, cmd.Process.Pid)
// Try graceful kill first (SIGINT)
if err := cmd.Process.Signal(os.Interrupt); err != nil {
log.Printf("Failed to send SIGINT to process %d: %v", cmd.Process.Pid, err)
}
// Give it a moment to clean up
time.Sleep(100 * time.Millisecond)
// Force kill if still running
if err := cmd.Process.Kill(); err != nil {
log.Printf("Failed to kill process %d: %v", cmd.Process.Pid, err)
} else {
killedCount++
}
}
// Release any allocated device for this task
c.releaseVAAPIDevice(taskID)
return true
})
killedCount := c.processTracker.KillAll()
// Release all allocated VAAPI devices
c.allocatedDevicesMu.Lock()
for taskID := range c.allocatedDevices {
delete(c.allocatedDevices, taskID)
}
c.allocatedDevicesMu.Unlock()
log.Printf("Killed %d process(es)", killedCount)
}
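// For reference, a minimal standalone sketch of what the executils.ProcessTracker used
// above could look like, inferred only from the Track/Untrack/KillAll calls visible in
// this diff; the real pkg/executils implementation is not shown here and may differ
// (signal choice, grace period, process-group handling).
package executils

import (
	"log"
	"os"
	"os/exec"
	"sync"
	"time"
)

// ProcessTracker maps task IDs to running commands so they can all be killed on shutdown.
type ProcessTracker struct {
	mu    sync.Mutex
	procs map[int64]*exec.Cmd
}

func NewProcessTracker() *ProcessTracker {
	return &ProcessTracker{procs: make(map[int64]*exec.Cmd)}
}

// Track registers a running command under the given task ID.
func (t *ProcessTracker) Track(taskID int64, cmd *exec.Cmd) {
	t.mu.Lock()
	defer t.mu.Unlock()
	t.procs[taskID] = cmd
}

// Untrack removes a task's entry, typically deferred right after Track.
func (t *ProcessTracker) Untrack(taskID int64) {
	t.mu.Lock()
	defer t.mu.Unlock()
	delete(t.procs, taskID)
}

// KillAll interrupts, then force-kills, every tracked process and returns how many were killed.
func (t *ProcessTracker) KillAll() int {
	t.mu.Lock()
	defer t.mu.Unlock()
	killed := 0
	for taskID, cmd := range t.procs {
		if cmd.Process == nil {
			continue
		}
		log.Printf("Killing process for task %d (PID: %d)", taskID, cmd.Process.Pid)
		_ = cmd.Process.Signal(os.Interrupt) // graceful first (SIGINT)
		time.Sleep(100 * time.Millisecond)   // brief window to clean up
		if err := cmd.Process.Kill(); err == nil {
			killed++
		}
		delete(t.procs, taskID)
	}
	return killed
}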
@@ -1272,55 +1131,33 @@ func (c *Client) processTask(task map[string]interface{}, jobName string, output
// Run Blender with GPU enabled via Python script
// Use -s (start) and -e (end) for frame ranges, or -f for single frame
// Use Blender's automatic frame numbering with #### pattern
var cmd *exec.Cmd
args := []string{"-b", blendFile, "--python", scriptPath}
if enableExecution {
args = append(args, "--enable-autoexec")
}
// Always render frames individually for precise control over file naming
// This avoids Blender's automatic frame numbering quirks
for frame := frameStart; frame <= frameEnd; frame++ {
// Create temp output pattern for this frame
tempPattern := filepath.Join(outputDir, fmt.Sprintf("temp_frame.%s", strings.ToLower(renderFormat)))
tempAbsPattern, _ := filepath.Abs(tempPattern)
// Build args for this specific frame
frameArgs := []string{"-b", blendFile, "--python", scriptPath}
if enableExecution {
frameArgs = append(frameArgs, "--enable-autoexec")
}
frameArgs = append(frameArgs, "-o", tempAbsPattern, "-f", fmt.Sprintf("%d", frame))
// Output pattern uses #### which Blender will replace with frame numbers
outputPattern := filepath.Join(outputDir, fmt.Sprintf("frame_####.%s", strings.ToLower(renderFormat)))
outputAbsPattern, _ := filepath.Abs(outputPattern)
args = append(args, "-o", outputAbsPattern)
c.sendLog(taskID, types.LogLevelInfo, fmt.Sprintf("Rendering frame %d...", frame), "render_blender")
frameCmd := exec.Command("blender", frameArgs...)
frameCmd.Dir = workDir
frameCmd.Env = os.Environ()
// Run this frame
if output, err := frameCmd.CombinedOutput(); err != nil {
errMsg := fmt.Sprintf("blender failed on frame %d: %v (output: %s)", frame, err, string(output))
c.sendLog(taskID, types.LogLevelError, errMsg, "render_blender")
return errors.New(errMsg)
}
// Immediately rename the temp file to the proper frame-numbered name
finalName := fmt.Sprintf("frame_%04d.%s", frame, strings.ToLower(renderFormat))
finalPath := filepath.Join(outputDir, finalName)
tempPath := filepath.Join(outputDir, fmt.Sprintf("temp_frame.%s", strings.ToLower(renderFormat)))
if err := os.Rename(tempPath, finalPath); err != nil {
errMsg := fmt.Sprintf("failed to rename temp file for frame %d: %v", frame, err)
c.sendLog(taskID, types.LogLevelError, errMsg, "render_blender")
return errors.New(errMsg)
}
c.sendLog(taskID, types.LogLevelInfo, fmt.Sprintf("Completed frame %d -> %s", frame, finalName), "render_blender")
if frameStart == frameEnd {
// Single frame
args = append(args, "-f", fmt.Sprintf("%d", frameStart))
c.sendLog(taskID, types.LogLevelInfo, fmt.Sprintf("Rendering frame %d...", frameStart), "render_blender")
} else {
// Frame range
args = append(args, "-s", fmt.Sprintf("%d", frameStart), "-e", fmt.Sprintf("%d", frameEnd))
c.sendLog(taskID, types.LogLevelInfo, fmt.Sprintf("Rendering frames %d-%d...", frameStart, frameEnd), "render_blender")
}
// Skip the rest of the function since we handled all frames above
c.sendStepUpdate(taskID, "render_blender", types.StepStatusCompleted, "")
return nil
// Create and run Blender command
cmd = exec.Command("blender", args...)
cmd.Dir = workDir
cmd.Env = os.Environ()
// Blender will handle headless rendering automatically
// We preserve the environment to allow GPU access if available
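// Illustrative only (the .blend path and script name are hypothetical, not from this
// commit): the argv assembled above resolves to a command roughly like
//
//   blender -b scene.blend --python render_setup.py -o /work/out/frame_####.png -f 12
//
// for a single frame, or
//
//   blender -b scene.blend --python render_setup.py -o /work/out/frame_####.png -s 1 -e 24
//
// for a range. Blender expands #### to the zero-padded frame number; a -s/-e range is
// normally rendered with a trailing -a flag, which, if used, is appended outside the
// lines shown in this hunk.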
@@ -1350,8 +1187,8 @@ func (c *Client) processTask(task map[string]interface{}, jobName string, output
}
// Register process for cleanup on shutdown
c.runningProcs.Store(taskID, cmd)
defer c.runningProcs.Delete(taskID)
c.processTracker.Track(taskID, cmd)
defer c.processTracker.Untrack(taskID)
// Stream stdout line by line
stdoutDone := make(chan bool)
@@ -1396,15 +1233,23 @@ func (c *Client) processTask(task map[string]interface{}, jobName string, output
<-stdoutDone
<-stderrDone
if err != nil {
errMsg := fmt.Sprintf("blender failed: %v", err)
var errMsg string
if exitErr, ok := err.(*exec.ExitError); ok {
if exitErr.ExitCode() == 137 {
errMsg = "Blender was killed due to excessive memory usage (OOM)"
} else {
errMsg = fmt.Sprintf("blender failed: %v", err)
}
} else {
errMsg = fmt.Sprintf("blender failed: %v", err)
}
c.sendLog(taskID, types.LogLevelError, errMsg, "render_blender")
c.sendStepUpdate(taskID, "render_blender", types.StepStatusFailed, errMsg)
return errors.New(errMsg)
}
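// A hypothetical helper (not part of this commit) that could factor out the three
// near-identical exit-status checks in this file; exit code 137 is 128 + 9 (SIGKILL),
// which on Linux almost always means the kernel OOM killer terminated the process.
func classifyOOMError(err error, what string) string {
	if exitErr, ok := err.(*exec.ExitError); ok && exitErr.ExitCode() == 137 {
		return fmt.Sprintf("%s was killed due to excessive memory usage (OOM)", what)
	}
	return fmt.Sprintf("%s failed: %v", what, err)
}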
// For frame ranges, we rendered each frame individually with temp naming
// The files are already properly named during the individual frame rendering
// No additional renaming needed
// Blender has rendered frames with automatic numbering using the #### pattern
// Files will be named like frame_0001.png, frame_0002.png, etc.
// Find rendered output file(s)
// For frame ranges, we'll find all frames in the upload step
@@ -1748,8 +1593,8 @@ func (c *Client) processVideoGenerationTask(task map[string]interface{}, jobID i
// Extract frame number pattern (e.g., frame_2470.exr -> frame_%04d.exr)
baseName := filepath.Base(firstFrame)
// Find the numeric part and replace it with %04d pattern
// Use regex to find digits (including negative) after underscore and before extension
re := regexp.MustCompile(`_(-?\d+)\.`)
// Use regex to find digits (positive only, negative frames not supported) after underscore and before extension
re := regexp.MustCompile(`_(\d+)\.`)
var pattern string
var startNumber int
frameNumStr := re.FindStringSubmatch(baseName)
@@ -1763,6 +1608,7 @@ func (c *Client) processVideoGenerationTask(task map[string]interface{}, jobID i
startNumber = extractFrameNumber(baseName)
pattern = strings.Replace(baseName, fmt.Sprintf("%d", startNumber), "%04d", 1)
}
// Pattern path should be in workDir where the frame files are downloaded
patternPath := filepath.Join(workDir, pattern)
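// Illustrative sketch, not part of this commit: how the positive-only regex maps a
// concrete frame name onto the %04d pattern handed to ffmpeg. The matched branch is
// outside this hunk, so the exact substitution below is an assumption (requires the
// regexp, strconv, and fmt imports).
reExample := regexp.MustCompile(`_(\d+)\.`)
m := reExample.FindStringSubmatch("frame_2470.exr")                      // ["_2470.", "2470"]
exampleStart, _ := strconv.Atoi(m[1])                                    // 2470
examplePattern := reExample.ReplaceAllString("frame_2470.exr", "_%04d.") // "frame_%04d.exr"
fmt.Println(examplePattern, exampleStart)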
// Allocate a VAAPI device for this task (if available)
@@ -1891,8 +1737,8 @@ func (c *Client) processVideoGenerationTask(task map[string]interface{}, jobID i
}
// Register process for cleanup on shutdown
c.runningProcs.Store(taskID, cmd)
defer c.runningProcs.Delete(taskID)
c.processTracker.Track(taskID, cmd)
defer c.processTracker.Untrack(taskID)
// Stream stdout line by line
stdoutDone := make(chan bool)
@@ -1959,26 +1805,25 @@ func (c *Client) processVideoGenerationTask(task map[string]interface{}, jobID i
<-stderrDone
if err != nil {
var errMsg string
if exitErr, ok := err.(*exec.ExitError); ok {
if exitErr.ExitCode() == 137 {
errMsg = "FFmpeg was killed due to excessive memory usage (OOM)"
} else {
errMsg = fmt.Sprintf("ffmpeg encoding failed: %v", err)
}
} else {
errMsg = fmt.Sprintf("ffmpeg encoding failed: %v", err)
}
// Check for size-related errors and provide helpful messages
if sizeErr := c.checkFFmpegSizeError("ffmpeg encoding failed"); sizeErr != nil {
if sizeErr := c.checkFFmpegSizeError(errMsg); sizeErr != nil {
c.sendLog(taskID, types.LogLevelError, sizeErr.Error(), "generate_video")
c.sendStepUpdate(taskID, "generate_video", types.StepStatusFailed, sizeErr.Error())
return sizeErr
}
// Try alternative method with concat demuxer
c.sendLog(taskID, types.LogLevelWarn, "Primary ffmpeg encoding failed, trying concat method...", "generate_video")
err = c.generateMP4WithConcat(frameFiles, outputMP4, workDir, allocatedDevice, outputFormat, codec, pixFmt, useAlpha, useHardware, frameRate)
if err != nil {
// Check for size errors in concat method too
if sizeErr := c.checkFFmpegSizeError(err.Error()); sizeErr != nil {
c.sendLog(taskID, types.LogLevelError, sizeErr.Error(), "generate_video")
c.sendStepUpdate(taskID, "generate_video", types.StepStatusFailed, sizeErr.Error())
return sizeErr
}
c.sendStepUpdate(taskID, "generate_video", types.StepStatusFailed, err.Error())
return err
}
c.sendLog(taskID, types.LogLevelError, errMsg, "generate_video")
c.sendStepUpdate(taskID, "generate_video", types.StepStatusFailed, errMsg)
return errors.New(errMsg)
}
// Check if MP4 was created
@@ -2771,7 +2616,7 @@ func (c *Client) testGenericEncoder(encoder string) bool {
// generateMP4WithConcat uses ffmpeg concat demuxer as fallback
// device parameter is optional - if provided, it will be used for VAAPI encoding
func (c *Client) generateMP4WithConcat(frameFiles []string, outputMP4, workDir string, device string, outputFormat string, codec string, pixFmt string, useAlpha bool, useHardware bool, frameRate float64) error {
func (c *Client) generateMP4WithConcat(taskID int, frameFiles []string, outputMP4, workDir string, device string, outputFormat string, codec string, pixFmt string, useAlpha bool, useHardware bool, frameRate float64) error {
// Create file list for ffmpeg concat demuxer
listFile := filepath.Join(workDir, "frames.txt")
listFileHandle, err := os.Create(listFile)
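// Illustrative only (frame names and codec flags are hypothetical): the concat demuxer
// reads one "file" directive per frame from frames.txt and is invoked roughly like
//
//   frames.txt:
//     file 'frame_0001.png'
//     file 'frame_0002.png'
//
//   ffmpeg -f concat -safe 0 -i frames.txt -c:v libx264 -pix_fmt yuv420p output.mp4
//
// The actual arguments (frame durations, codec, pixel format, optional VAAPI device)
// are assembled further down in this function and may differ.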
@@ -2907,11 +2752,23 @@ func (c *Client) generateMP4WithConcat(frameFiles []string, outputMP4, workDir s
<-stderrDone
if err != nil {
var errMsg string
if exitErr, ok := err.(*exec.ExitError); ok {
if exitErr.ExitCode() == 137 {
errMsg = "FFmpeg was killed due to excessive memory usage (OOM)"
} else {
errMsg = fmt.Sprintf("ffmpeg concat failed: %v", err)
}
} else {
errMsg = fmt.Sprintf("ffmpeg concat failed: %v", err)
}
// Check for size-related errors
if sizeErr := c.checkFFmpegSizeError("ffmpeg concat failed"); sizeErr != nil {
if sizeErr := c.checkFFmpegSizeError(errMsg); sizeErr != nil {
return sizeErr
}
return fmt.Errorf("ffmpeg concat failed: %w", err)
c.sendLog(int64(taskID), types.LogLevelError, errMsg, "generate_video")
c.sendStepUpdate(int64(taskID), "generate_video", types.StepStatusFailed, errMsg)
return errors.New(errMsg)
}
if _, err := os.Stat(outputMP4); os.IsNotExist(err) {
@@ -3695,8 +3552,8 @@ sys.stdout.flush()
}
// Register process for cleanup on shutdown
c.runningProcs.Store(taskID, cmd)
defer c.runningProcs.Delete(taskID)
c.processTracker.Track(taskID, cmd)
defer c.processTracker.Untrack(taskID)
// Stream stdout line by line and collect for JSON parsing
stdoutDone := make(chan bool)
@@ -3743,7 +3600,16 @@ sys.stdout.flush()
<-stdoutDone
<-stderrDone
if err != nil {
errMsg := fmt.Sprintf("blender metadata extraction failed: %v", err)
var errMsg string
if exitErr, ok := err.(*exec.ExitError); ok {
if exitErr.ExitCode() == 137 {
errMsg = "Blender metadata extraction was killed due to excessive memory usage (OOM)"
} else {
errMsg = fmt.Sprintf("blender metadata extraction failed: %v", err)
}
} else {
errMsg = fmt.Sprintf("blender metadata extraction failed: %v", err)
}
c.sendLog(taskID, types.LogLevelError, errMsg, "extract_metadata")
c.sendStepUpdate(taskID, "extract_metadata", types.StepStatusFailed, errMsg)
return errors.New(errMsg)