Add GPU error handling and lockout mechanism in Runner

- Introduced gpuLockedOut state in Runner to manage GPU rendering based on detected errors.
- Implemented SetGPULockedOut and IsGPULockedOut methods for controlling GPU usage.
- Enhanced Context to include GPULockedOut and OnGPUError for better error handling.
- Updated RenderProcessor to check for GPU errors in logs and trigger lockout as needed.
- Modified rendering logic to force CPU rendering when GPU lockout is active, improving stability during errors.
This commit is contained in:
2026-03-13 10:01:39 -05:00
parent f9111ebac4
commit 6833bb4013
3 changed files with 106 additions and 22 deletions

View File

@@ -40,6 +40,11 @@ type Runner struct {
fingerprint string
fingerprintMu sync.RWMutex
// gpuLockedOut is set when logs indicate a GPU error (e.g. HIP "Illegal address");
// when true, the runner forces CPU rendering for all subsequent jobs.
gpuLockedOut bool
gpuLockedOutMu sync.RWMutex
}
// New creates a new runner.
@@ -238,6 +243,8 @@ func (r *Runner) executeJob(job *api.NextJobResponse) (err error) {
r.blender,
r.encoder,
r.processes,
r.IsGPULockedOut(),
func() { r.SetGPULockedOut(true) },
)
ctx.Info(fmt.Sprintf("Task assignment received (job: %d, type: %s)",
@@ -388,3 +395,21 @@ func (r *Runner) GetFingerprint() string {
// GetID returns the value of the runner's id field.
func (r *Runner) GetID() int64 {
return r.id
}
// SetGPULockedOut records whether GPU use is locked out due to a detected GPU
// error. The flag is stored under gpuLockedOutMu and is what IsGPULockedOut
// reports; while it is true the runner is expected to force CPU rendering.
//
// The "lockout enabled" message is logged only when the state actually changes
// from unlocked to locked, so repeated error reports for an already locked-out
// runner do not flood the log. Clearing the lockout (locked == false) is silent.
func (r *Runner) SetGPULockedOut(locked bool) {
	r.gpuLockedOutMu.Lock()
	newlyLocked := locked && !r.gpuLockedOut
	r.gpuLockedOut = locked
	// Release the mutex before logging so readers are never blocked on log I/O.
	r.gpuLockedOutMu.Unlock()

	if newlyLocked {
		log.Printf("GPU lockout enabled: GPU rendering disabled for subsequent jobs (CPU only)")
	}
}
// IsGPULockedOut reports the current value of the GPU lockout flag.
// The flag is read under the read lock of gpuLockedOutMu.
func (r *Runner) IsGPULockedOut() bool {
	r.gpuLockedOutMu.RLock()
	lockedOut := r.gpuLockedOut
	r.gpuLockedOutMu.RUnlock()
	return lockedOut
}