Add GPU error handling and lockout mechanism in Runner
- Introduced gpuLockedOut state in Runner to manage GPU rendering based on detected errors.
- Implemented SetGPULockedOut and IsGPULockedOut methods for controlling GPU usage.
- Enhanced Context to include GPULockedOut and OnGPUError for better error handling.
- Updated RenderProcessor to check for GPU errors in logs and trigger lockout as needed.
- Modified rendering logic to force CPU rendering when GPU lockout is active, improving stability during errors.
This commit is contained in:
@@ -40,6 +40,11 @@ type Runner struct {
|
||||
|
||||
fingerprint string
|
||||
fingerprintMu sync.RWMutex
|
||||
|
||||
// gpuLockedOut is set when logs indicate a GPU error (e.g. HIP "Illegal address");
|
||||
// when true, the runner forces CPU rendering for all subsequent jobs.
|
||||
gpuLockedOut bool
|
||||
gpuLockedOutMu sync.RWMutex
|
||||
}
|
||||
|
||||
// New creates a new runner.
|
||||
@@ -238,6 +243,8 @@ func (r *Runner) executeJob(job *api.NextJobResponse) (err error) {
|
||||
r.blender,
|
||||
r.encoder,
|
||||
r.processes,
|
||||
r.IsGPULockedOut(),
|
||||
func() { r.SetGPULockedOut(true) },
|
||||
)
|
||||
|
||||
ctx.Info(fmt.Sprintf("Task assignment received (job: %d, type: %s)",
|
||||
@@ -388,3 +395,21 @@ func (r *Runner) GetFingerprint() string {
|
||||
func (r *Runner) GetID() int64 {
|
||||
return r.id
|
||||
}
|
||||
|
||||
// SetGPULockedOut sets whether GPU use is locked out due to a detected GPU error.
|
||||
// When true, the runner will force CPU rendering for all jobs.
|
||||
func (r *Runner) SetGPULockedOut(locked bool) {
|
||||
r.gpuLockedOutMu.Lock()
|
||||
defer r.gpuLockedOutMu.Unlock()
|
||||
r.gpuLockedOut = locked
|
||||
if locked {
|
||||
log.Printf("GPU lockout enabled: GPU rendering disabled for subsequent jobs (CPU only)")
|
||||
}
|
||||
}
|
||||
|
||||
// IsGPULockedOut returns whether GPU use is currently locked out.
|
||||
func (r *Runner) IsGPULockedOut() bool {
|
||||
r.gpuLockedOutMu.RLock()
|
||||
defer r.gpuLockedOutMu.RUnlock()
|
||||
return r.gpuLockedOut
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user