something

This commit is contained in:
2025-11-27 00:46:48 -06:00
parent 11e7552b5b
commit edc8ea160c
43 changed files with 9990 additions and 3059 deletions


@@ -25,6 +25,7 @@ import (
"sync"
"time"
"jiggablend/pkg/executils"
"jiggablend/pkg/scripts"
"jiggablend/pkg/types"
@@ -45,19 +46,19 @@ type Client struct {
stopChan chan struct{}
stepStartTimes map[string]time.Time // key: "taskID:stepName"
stepTimesMu sync.RWMutex
workspaceDir string // Persistent workspace directory for this runner
runningProcs sync.Map // map[int64]*exec.Cmd - tracks running processes by task ID
capabilities map[string]interface{} // Cached capabilities from initial probe (includes bools and numbers)
capabilitiesMu sync.RWMutex // Protects capabilities
hwAccelCache map[string]bool // Cached hardware acceleration detection results
hwAccelCacheMu sync.RWMutex // Protects hwAccelCache
vaapiDevices []string // Cached VAAPI device paths (all available devices)
vaapiDevicesMu sync.RWMutex // Protects vaapiDevices
allocatedDevices map[int64]string // map[taskID]device - tracks which device is allocated to which task
allocatedDevicesMu sync.RWMutex // Protects allocatedDevices
longRunningClient *http.Client // HTTP client for long-running operations (no timeout)
fingerprint string // Unique hardware fingerprint for this runner
fingerprintMu sync.RWMutex // Protects fingerprint
workspaceDir string // Persistent workspace directory for this runner
processTracker *executils.ProcessTracker // Tracks running processes for cleanup
capabilities map[string]interface{} // Cached capabilities from initial probe (includes bools and numbers)
capabilitiesMu sync.RWMutex // Protects capabilities
hwAccelCache map[string]bool // Cached hardware acceleration detection results
hwAccelCacheMu sync.RWMutex // Protects hwAccelCache
vaapiDevices []string // Cached VAAPI device paths (all available devices)
vaapiDevicesMu sync.RWMutex // Protects vaapiDevices
allocatedDevices map[int64]string // map[taskID]device - tracks which device is allocated to which task
allocatedDevicesMu sync.RWMutex // Protects allocatedDevices
longRunningClient *http.Client // HTTP client for long-running operations (no timeout)
fingerprint string // Unique hardware fingerprint for this runner
fingerprintMu sync.RWMutex // Protects fingerprint
}
// NewClient creates a new runner client
@@ -70,6 +71,7 @@ func NewClient(managerURL, name, hostname string) *Client {
longRunningClient: &http.Client{Timeout: 0}, // No timeout for long-running operations (context downloads, file uploads/downloads)
stopChan: make(chan struct{}),
stepStartTimes: make(map[string]time.Time),
processTracker: executils.NewProcessTracker(),
}
// Generate fingerprint immediately
client.generateFingerprint()
@@ -226,12 +228,6 @@ func (c *Client) probeCapabilities() map[string]interface{} {
c.probeGPUCapabilities(capabilities)
} else {
capabilities["ffmpeg"] = false
// Set defaults when ffmpeg is not available
capabilities["vaapi"] = false
capabilities["vaapi_gpu_count"] = 0
capabilities["nvenc"] = false
capabilities["nvenc_gpu_count"] = 0
capabilities["video_gpu_count"] = 0
}
return capabilities
@@ -256,60 +252,6 @@ func (c *Client) probeGPUCapabilities(capabilities map[string]interface{}) {
log.Printf("Available hardware encoders: %v", getKeys(hwEncoders))
}
// Check for VAAPI devices and count them
log.Printf("Checking for VAAPI hardware acceleration...")
// First check if encoder is listed (more reliable than testing)
cmd := exec.Command("ffmpeg", "-hide_banner", "-encoders")
output, err := cmd.CombinedOutput()
hasVAAPIEncoder := false
if err == nil {
encoderOutput := string(output)
if strings.Contains(encoderOutput, "h264_vaapi") {
hasVAAPIEncoder = true
log.Printf("VAAPI encoder (h264_vaapi) found in ffmpeg encoders list")
}
}
if hasVAAPIEncoder {
// Try to find and test devices
vaapiDevices := c.findVAAPIDevices()
capabilities["vaapi_gpu_count"] = len(vaapiDevices)
if len(vaapiDevices) > 0 {
capabilities["vaapi"] = true
log.Printf("VAAPI detected: %d GPU device(s) available: %v", len(vaapiDevices), vaapiDevices)
} else {
capabilities["vaapi"] = false
log.Printf("VAAPI encoder available but no working devices found")
log.Printf(" This might indicate:")
log.Printf(" - Missing or incorrect GPU drivers")
log.Printf(" - Missing libva or mesa-va-drivers packages")
log.Printf(" - Permission issues accessing /dev/dri devices")
log.Printf(" - GPU not properly initialized")
}
} else {
capabilities["vaapi"] = false
capabilities["vaapi_gpu_count"] = 0
log.Printf("VAAPI encoder not available in ffmpeg")
log.Printf(" This might indicate:")
log.Printf(" - FFmpeg was not compiled with VAAPI support")
log.Printf(" - Missing libva development libraries during FFmpeg compilation")
}
// Check for NVENC (NVIDIA) - try to detect multiple GPUs
log.Printf("Checking for NVENC hardware acceleration...")
if c.checkEncoderAvailable("h264_nvenc") {
capabilities["nvenc"] = true
// Try to detect actual GPU count using nvidia-smi if available
nvencCount := c.detectNVENCCount()
capabilities["nvenc_gpu_count"] = nvencCount
log.Printf("NVENC detected: %d GPU(s)", nvencCount)
} else {
capabilities["nvenc"] = false
capabilities["nvenc_gpu_count"] = 0
log.Printf("NVENC encoder not available")
}
// Check for other hardware encoders (for completeness)
log.Printf("Checking for other hardware encoders...")
if c.checkEncoderAvailable("h264_qsv") {
@@ -368,73 +310,6 @@ func (c *Client) probeGPUCapabilities(capabilities map[string]interface{}) {
capabilities["mediacodec"] = false
capabilities["mediacodec_gpu_count"] = 0
}
// Calculate total GPU count for video encoding
// Priority: VAAPI > NVENC > QSV > VideoToolbox > AMF > others
vaapiCount := 0
if count, ok := capabilities["vaapi_gpu_count"].(int); ok {
vaapiCount = count
}
nvencCount := 0
if count, ok := capabilities["nvenc_gpu_count"].(int); ok {
nvencCount = count
}
qsvCount := 0
if count, ok := capabilities["qsv_gpu_count"].(int); ok {
qsvCount = count
}
videotoolboxCount := 0
if count, ok := capabilities["videotoolbox_gpu_count"].(int); ok {
videotoolboxCount = count
}
amfCount := 0
if count, ok := capabilities["amf_gpu_count"].(int); ok {
amfCount = count
}
// Total GPU count - use the best available (they can't be used simultaneously)
totalGPUs := vaapiCount
if totalGPUs == 0 {
totalGPUs = nvencCount
}
if totalGPUs == 0 {
totalGPUs = qsvCount
}
if totalGPUs == 0 {
totalGPUs = videotoolboxCount
}
if totalGPUs == 0 {
totalGPUs = amfCount
}
capabilities["video_gpu_count"] = totalGPUs
if totalGPUs > 0 {
log.Printf("Total video GPU count: %d", totalGPUs)
} else {
log.Printf("No hardware-accelerated video encoding GPUs detected (will use software encoding)")
}
}
// detectNVENCCount tries to detect the actual number of NVIDIA GPUs using nvidia-smi
func (c *Client) detectNVENCCount() int {
// Try to use nvidia-smi to count GPUs
cmd := exec.Command("nvidia-smi", "--list-gpus")
output, err := cmd.CombinedOutput()
if err == nil {
// Count lines that contain "GPU" (each GPU is listed on a separate line)
lines := strings.Split(string(output), "\n")
count := 0
for _, line := range lines {
if strings.Contains(line, "GPU") {
count++
}
}
if count > 0 {
return count
}
}
// Fall back to 1 if nvidia-smi is unavailable or reports no GPUs (the NVENC encoder was already detected above)
return 1
}
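// Illustrative only, not part of this commit: nvidia-smi --list-gpus prints one line per device, e.g.
//
//   GPU 0: NVIDIA GeForce RTX 3080 (UUID: GPU-xxxxxxxx-xxxx-...)
//   GPU 1: NVIDIA GeForce RTX 3080 (UUID: GPU-xxxxxxxx-xxxx-...)
//
// so counting the lines containing "GPU" above yields the device count.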
// getKeys returns all keys from a map as a slice (helper function)
@@ -926,29 +801,13 @@ func (c *Client) sendLog(taskID int64, logLevel types.LogLevel, message, stepNam
// KillAllProcesses kills all running processes tracked by this client
func (c *Client) KillAllProcesses() {
log.Printf("Killing all running processes...")
var killedCount int
c.runningProcs.Range(func(key, value interface{}) bool {
taskID := key.(int64)
cmd := value.(*exec.Cmd)
if cmd.Process != nil {
log.Printf("Killing process for task %d (PID: %d)", taskID, cmd.Process.Pid)
// Try graceful kill first (SIGINT)
if err := cmd.Process.Signal(os.Interrupt); err != nil {
log.Printf("Failed to send SIGINT to process %d: %v", cmd.Process.Pid, err)
}
// Give it a moment to clean up
time.Sleep(100 * time.Millisecond)
// Force kill if still running
if err := cmd.Process.Kill(); err != nil {
log.Printf("Failed to kill process %d: %v", cmd.Process.Pid, err)
} else {
killedCount++
}
}
// Release any allocated device for this task
c.releaseVAAPIDevice(taskID)
return true
})
killedCount := c.processTracker.KillAll()
// Release all allocated VAAPI devices
c.allocatedDevicesMu.Lock()
for taskID := range c.allocatedDevices {
delete(c.allocatedDevices, taskID)
}
c.allocatedDevicesMu.Unlock()
log.Printf("Killed %d process(es)", killedCount)
}
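// For reference, a minimal standalone sketch of what the executils.ProcessTracker used
// above could look like, inferred only from the Track/Untrack/KillAll calls visible in
// this diff; the real pkg/executils implementation is not shown here and may differ
// (signal choice, grace period, process-group handling).
package executils

import (
	"log"
	"os"
	"os/exec"
	"sync"
	"time"
)

// ProcessTracker maps task IDs to running commands so they can all be killed on shutdown.
type ProcessTracker struct {
	mu    sync.Mutex
	procs map[int64]*exec.Cmd
}

func NewProcessTracker() *ProcessTracker {
	return &ProcessTracker{procs: make(map[int64]*exec.Cmd)}
}

// Track registers a running command under the given task ID.
func (t *ProcessTracker) Track(taskID int64, cmd *exec.Cmd) {
	t.mu.Lock()
	defer t.mu.Unlock()
	t.procs[taskID] = cmd
}

// Untrack removes a task's entry, typically deferred right after Track.
func (t *ProcessTracker) Untrack(taskID int64) {
	t.mu.Lock()
	defer t.mu.Unlock()
	delete(t.procs, taskID)
}

// KillAll interrupts, then force-kills, every tracked process and returns how many were killed.
func (t *ProcessTracker) KillAll() int {
	t.mu.Lock()
	defer t.mu.Unlock()
	killed := 0
	for taskID, cmd := range t.procs {
		if cmd.Process == nil {
			continue
		}
		log.Printf("Killing process for task %d (PID: %d)", taskID, cmd.Process.Pid)
		_ = cmd.Process.Signal(os.Interrupt) // graceful first (SIGINT)
		time.Sleep(100 * time.Millisecond)   // brief window to clean up
		if err := cmd.Process.Kill(); err == nil {
			killed++
		}
		delete(t.procs, taskID)
	}
	return killed
}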
@@ -1272,55 +1131,33 @@ func (c *Client) processTask(task map[string]interface{}, jobName string, output
// Run Blender with GPU enabled via Python script
// Use -s (start) and -e (end) for frame ranges, or -f for single frame
// Use Blender's automatic frame numbering with #### pattern
var cmd *exec.Cmd
args := []string{"-b", blendFile, "--python", scriptPath}
if enableExecution {
args = append(args, "--enable-autoexec")
}
// Always render frames individually for precise control over file naming
// This avoids Blender's automatic frame numbering quirks
for frame := frameStart; frame <= frameEnd; frame++ {
// Create temp output pattern for this frame
tempPattern := filepath.Join(outputDir, fmt.Sprintf("temp_frame.%s", strings.ToLower(renderFormat)))
tempAbsPattern, _ := filepath.Abs(tempPattern)
// Build args for this specific frame
frameArgs := []string{"-b", blendFile, "--python", scriptPath}
if enableExecution {
frameArgs = append(frameArgs, "--enable-autoexec")
}
frameArgs = append(frameArgs, "-o", tempAbsPattern, "-f", fmt.Sprintf("%d", frame))
// Output pattern uses #### which Blender will replace with frame numbers
outputPattern := filepath.Join(outputDir, fmt.Sprintf("frame_####.%s", strings.ToLower(renderFormat)))
outputAbsPattern, _ := filepath.Abs(outputPattern)
args = append(args, "-o", outputAbsPattern)
c.sendLog(taskID, types.LogLevelInfo, fmt.Sprintf("Rendering frame %d...", frame), "render_blender")
frameCmd := exec.Command("blender", frameArgs...)
frameCmd.Dir = workDir
frameCmd.Env = os.Environ()
// Run this frame
if output, err := frameCmd.CombinedOutput(); err != nil {
errMsg := fmt.Sprintf("blender failed on frame %d: %v (output: %s)", frame, err, string(output))
c.sendLog(taskID, types.LogLevelError, errMsg, "render_blender")
return errors.New(errMsg)
}
// Immediately rename the temp file to the proper frame-numbered name
finalName := fmt.Sprintf("frame_%04d.%s", frame, strings.ToLower(renderFormat))
finalPath := filepath.Join(outputDir, finalName)
tempPath := filepath.Join(outputDir, fmt.Sprintf("temp_frame.%s", strings.ToLower(renderFormat)))
if err := os.Rename(tempPath, finalPath); err != nil {
errMsg := fmt.Sprintf("failed to rename temp file for frame %d: %v", frame, err)
c.sendLog(taskID, types.LogLevelError, errMsg, "render_blender")
return errors.New(errMsg)
}
c.sendLog(taskID, types.LogLevelInfo, fmt.Sprintf("Completed frame %d -> %s", frame, finalName), "render_blender")
if frameStart == frameEnd {
// Single frame
args = append(args, "-f", fmt.Sprintf("%d", frameStart))
c.sendLog(taskID, types.LogLevelInfo, fmt.Sprintf("Rendering frame %d...", frameStart), "render_blender")
} else {
// Frame range
args = append(args, "-s", fmt.Sprintf("%d", frameStart), "-e", fmt.Sprintf("%d", frameEnd))
c.sendLog(taskID, types.LogLevelInfo, fmt.Sprintf("Rendering frames %d-%d...", frameStart, frameEnd), "render_blender")
}
// Skip the rest of the function since we handled all frames above
c.sendStepUpdate(taskID, "render_blender", types.StepStatusCompleted, "")
return nil
// Create and run Blender command
cmd = exec.Command("blender", args...)
cmd.Dir = workDir
cmd.Env = os.Environ()
// Blender will handle headless rendering automatically
// We preserve the environment to allow GPU access if available
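// Illustrative only (the .blend path and script name are hypothetical, not from this
// commit): the argv assembled above resolves to a command roughly like
//
//   blender -b scene.blend --python render_setup.py -o /work/out/frame_####.png -f 12
//
// for a single frame, or
//
//   blender -b scene.blend --python render_setup.py -o /work/out/frame_####.png -s 1 -e 24
//
// for a range. Blender expands #### to the zero-padded frame number; a -s/-e range is
// normally rendered with a trailing -a flag, which, if used, is appended outside the
// lines shown in this hunk.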
@@ -1350,8 +1187,8 @@ func (c *Client) processTask(task map[string]interface{}, jobName string, output
}
// Register process for cleanup on shutdown
c.runningProcs.Store(taskID, cmd)
defer c.runningProcs.Delete(taskID)
c.processTracker.Track(taskID, cmd)
defer c.processTracker.Untrack(taskID)
// Stream stdout line by line
stdoutDone := make(chan bool)
@@ -1396,15 +1233,23 @@ func (c *Client) processTask(task map[string]interface{}, jobName string, output
<-stdoutDone
<-stderrDone
if err != nil {
errMsg := fmt.Sprintf("blender failed: %v", err)
var errMsg string
if exitErr, ok := err.(*exec.ExitError); ok {
if exitErr.ExitCode() == 137 {
errMsg = "Blender was killed due to excessive memory usage (OOM)"
} else {
errMsg = fmt.Sprintf("blender failed: %v", err)
}
} else {
errMsg = fmt.Sprintf("blender failed: %v", err)
}
c.sendLog(taskID, types.LogLevelError, errMsg, "render_blender")
c.sendStepUpdate(taskID, "render_blender", types.StepStatusFailed, errMsg)
return errors.New(errMsg)
}
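// A hypothetical helper (not part of this commit) that could factor out the three
// near-identical exit-status checks in this file; exit code 137 is 128 + 9 (SIGKILL),
// which on Linux almost always means the kernel OOM killer terminated the process.
func classifyOOMError(err error, what string) string {
	if exitErr, ok := err.(*exec.ExitError); ok && exitErr.ExitCode() == 137 {
		return fmt.Sprintf("%s was killed due to excessive memory usage (OOM)", what)
	}
	return fmt.Sprintf("%s failed: %v", what, err)
}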
// For frame ranges, we rendered each frame individually with temp naming
// The files are already properly named during the individual frame rendering
// No additional renaming needed
// Blender has rendered frames with automatic numbering using the #### pattern
// Files will be named like frame_0001.png, frame_0002.png, etc.
// Find rendered output file(s)
// For frame ranges, we'll find all frames in the upload step
@@ -1748,8 +1593,8 @@ func (c *Client) processVideoGenerationTask(task map[string]interface{}, jobID i
// Extract frame number pattern (e.g., frame_2470.exr -> frame_%04d.exr)
baseName := filepath.Base(firstFrame)
// Find the numeric part and replace it with %04d pattern
// Use regex to find digits (including negative) after underscore and before extension
re := regexp.MustCompile(`_(-?\d+)\.`)
// Use regex to find digits (positive only, negative frames not supported) after underscore and before extension
re := regexp.MustCompile(`_(\d+)\.`)
var pattern string
var startNumber int
frameNumStr := re.FindStringSubmatch(baseName)
@@ -1763,6 +1608,7 @@ func (c *Client) processVideoGenerationTask(task map[string]interface{}, jobID i
startNumber = extractFrameNumber(baseName)
pattern = strings.Replace(baseName, fmt.Sprintf("%d", startNumber), "%04d", 1)
}
// Pattern path should be in workDir where the frame files are downloaded
patternPath := filepath.Join(workDir, pattern)
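// Illustrative sketch, not part of this commit: how the positive-only regex maps a
// concrete frame name onto the %04d pattern handed to ffmpeg. The matched branch is
// outside this hunk, so the exact substitution below is an assumption (requires the
// regexp, strconv, and fmt imports).
reExample := regexp.MustCompile(`_(\d+)\.`)
m := reExample.FindStringSubmatch("frame_2470.exr")                      // ["_2470.", "2470"]
exampleStart, _ := strconv.Atoi(m[1])                                    // 2470
examplePattern := reExample.ReplaceAllString("frame_2470.exr", "_%04d.") // "frame_%04d.exr"
fmt.Println(examplePattern, exampleStart)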
// Allocate a VAAPI device for this task (if available)
@@ -1891,8 +1737,8 @@ func (c *Client) processVideoGenerationTask(task map[string]interface{}, jobID i
}
// Register process for cleanup on shutdown
c.runningProcs.Store(taskID, cmd)
defer c.runningProcs.Delete(taskID)
c.processTracker.Track(taskID, cmd)
defer c.processTracker.Untrack(taskID)
// Stream stdout line by line
stdoutDone := make(chan bool)
@@ -1959,26 +1805,25 @@ func (c *Client) processVideoGenerationTask(task map[string]interface{}, jobID i
<-stderrDone
if err != nil {
var errMsg string
if exitErr, ok := err.(*exec.ExitError); ok {
if exitErr.ExitCode() == 137 {
errMsg = "FFmpeg was killed due to excessive memory usage (OOM)"
} else {
errMsg = fmt.Sprintf("ffmpeg encoding failed: %v", err)
}
} else {
errMsg = fmt.Sprintf("ffmpeg encoding failed: %v", err)
}
// Check for size-related errors and provide helpful messages
if sizeErr := c.checkFFmpegSizeError("ffmpeg encoding failed"); sizeErr != nil {
if sizeErr := c.checkFFmpegSizeError(errMsg); sizeErr != nil {
c.sendLog(taskID, types.LogLevelError, sizeErr.Error(), "generate_video")
c.sendStepUpdate(taskID, "generate_video", types.StepStatusFailed, sizeErr.Error())
return sizeErr
}
// Try alternative method with concat demuxer
c.sendLog(taskID, types.LogLevelWarn, "Primary ffmpeg encoding failed, trying concat method...", "generate_video")
err = c.generateMP4WithConcat(frameFiles, outputMP4, workDir, allocatedDevice, outputFormat, codec, pixFmt, useAlpha, useHardware, frameRate)
if err != nil {
// Check for size errors in concat method too
if sizeErr := c.checkFFmpegSizeError(err.Error()); sizeErr != nil {
c.sendLog(taskID, types.LogLevelError, sizeErr.Error(), "generate_video")
c.sendStepUpdate(taskID, "generate_video", types.StepStatusFailed, sizeErr.Error())
return sizeErr
}
c.sendStepUpdate(taskID, "generate_video", types.StepStatusFailed, err.Error())
return err
}
c.sendLog(taskID, types.LogLevelError, errMsg, "generate_video")
c.sendStepUpdate(taskID, "generate_video", types.StepStatusFailed, errMsg)
return errors.New(errMsg)
}
// Check if MP4 was created
@@ -2771,7 +2616,7 @@ func (c *Client) testGenericEncoder(encoder string) bool {
// generateMP4WithConcat uses ffmpeg concat demuxer as fallback
// device parameter is optional - if provided, it will be used for VAAPI encoding
func (c *Client) generateMP4WithConcat(frameFiles []string, outputMP4, workDir string, device string, outputFormat string, codec string, pixFmt string, useAlpha bool, useHardware bool, frameRate float64) error {
func (c *Client) generateMP4WithConcat(taskID int, frameFiles []string, outputMP4, workDir string, device string, outputFormat string, codec string, pixFmt string, useAlpha bool, useHardware bool, frameRate float64) error {
// Create file list for ffmpeg concat demuxer
listFile := filepath.Join(workDir, "frames.txt")
listFileHandle, err := os.Create(listFile)
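// Illustrative only (frame names and codec flags are hypothetical): the concat demuxer
// reads one "file" directive per frame from frames.txt and is invoked roughly like
//
//   frames.txt:
//     file 'frame_0001.png'
//     file 'frame_0002.png'
//
//   ffmpeg -f concat -safe 0 -i frames.txt -c:v libx264 -pix_fmt yuv420p output.mp4
//
// The actual arguments (frame durations, codec, pixel format, optional VAAPI device)
// are assembled further down in this function and may differ.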
@@ -2907,11 +2752,23 @@ func (c *Client) generateMP4WithConcat(frameFiles []string, outputMP4, workDir s
<-stderrDone
if err != nil {
var errMsg string
if exitErr, ok := err.(*exec.ExitError); ok {
if exitErr.ExitCode() == 137 {
errMsg = "FFmpeg was killed due to excessive memory usage (OOM)"
} else {
errMsg = fmt.Sprintf("ffmpeg concat failed: %v", err)
}
} else {
errMsg = fmt.Sprintf("ffmpeg concat failed: %v", err)
}
// Check for size-related errors
if sizeErr := c.checkFFmpegSizeError("ffmpeg concat failed"); sizeErr != nil {
if sizeErr := c.checkFFmpegSizeError(errMsg); sizeErr != nil {
return sizeErr
}
return fmt.Errorf("ffmpeg concat failed: %w", err)
c.sendLog(int64(taskID), types.LogLevelError, errMsg, "generate_video")
c.sendStepUpdate(int64(taskID), "generate_video", types.StepStatusFailed, errMsg)
return errors.New(errMsg)
}
if _, err := os.Stat(outputMP4); os.IsNotExist(err) {
@@ -3695,8 +3552,8 @@ sys.stdout.flush()
}
// Register process for cleanup on shutdown
c.runningProcs.Store(taskID, cmd)
defer c.runningProcs.Delete(taskID)
c.processTracker.Track(taskID, cmd)
defer c.processTracker.Untrack(taskID)
// Stream stdout line by line and collect for JSON parsing
stdoutDone := make(chan bool)
@@ -3743,7 +3600,16 @@ sys.stdout.flush()
<-stdoutDone
<-stderrDone
if err != nil {
errMsg := fmt.Sprintf("blender metadata extraction failed: %v", err)
var errMsg string
if exitErr, ok := err.(*exec.ExitError); ok {
if exitErr.ExitCode() == 137 {
errMsg = "Blender metadata extraction was killed due to excessive memory usage (OOM)"
} else {
errMsg = fmt.Sprintf("blender metadata extraction failed: %v", err)
}
} else {
errMsg = fmt.Sprintf("blender metadata extraction failed: %v", err)
}
c.sendLog(taskID, types.LogLevelError, errMsg, "extract_metadata")
c.sendStepUpdate(taskID, "extract_metadata", types.StepStatusFailed, errMsg)
return errors.New(errMsg)