Add GPU error handling and lockout mechanism in Runner
- Introduced gpuLockedOut state in Runner to manage GPU rendering based on detected errors.
- Implemented SetGPULockedOut and IsGPULockedOut methods for controlling GPU usage.
- Enhanced Context to include GPULockedOut and OnGPUError for better error handling.
- Updated RenderProcessor to check for GPU errors in logs and trigger lockout as needed.
- Modified rendering logic to force CPU rendering when GPU lockout is active, improving stability during errors.
This commit is contained in:
@@ -38,12 +38,18 @@ type Context struct {
|
||||
Blender *blender.Manager
|
||||
Encoder *encoding.Selector
|
||||
Processes *executils.ProcessTracker
|
||||
|
||||
// GPULockedOut is set when the runner has detected a GPU error (e.g. HIP) and disables GPU for all jobs.
|
||||
GPULockedOut bool
|
||||
// OnGPUError is called when a GPU error line is seen in render logs; typically sets runner GPU lockout.
|
||||
OnGPUError func()
|
||||
}
|
||||
|
||||
// ErrJobCancelled indicates the manager-side job was cancelled during execution.
|
||||
var ErrJobCancelled = errors.New("job cancelled")
|
||||
|
||||
// NewContext creates a new task context. frameEnd should be >= frameStart; if it is less than frameStart (e.g. left at 0), the task is treated as single-frame (frameEnd = frameStart).
|
||||
// gpuLockedOut is the runner's current GPU lockout state; onGPUError is called when a GPU error is detected in logs (may be nil).
|
||||
func NewContext(
|
||||
taskID, jobID int64,
|
||||
jobName string,
|
||||
@@ -58,26 +64,30 @@ func NewContext(
|
||||
blenderMgr *blender.Manager,
|
||||
encoder *encoding.Selector,
|
||||
processes *executils.ProcessTracker,
|
||||
gpuLockedOut bool,
|
||||
onGPUError func(),
|
||||
) *Context {
|
||||
if frameEnd < frameStart {
|
||||
frameEnd = frameStart
|
||||
}
|
||||
return &Context{
|
||||
TaskID: taskID,
|
||||
JobID: jobID,
|
||||
JobName: jobName,
|
||||
Frame: frameStart,
|
||||
FrameEnd: frameEnd,
|
||||
TaskType: taskType,
|
||||
WorkDir: workDir,
|
||||
JobToken: jobToken,
|
||||
Metadata: metadata,
|
||||
Manager: manager,
|
||||
JobConn: jobConn,
|
||||
Workspace: ws,
|
||||
Blender: blenderMgr,
|
||||
Encoder: encoder,
|
||||
Processes: processes,
|
||||
TaskID: taskID,
|
||||
JobID: jobID,
|
||||
JobName: jobName,
|
||||
Frame: frameStart,
|
||||
FrameEnd: frameEnd,
|
||||
TaskType: taskType,
|
||||
WorkDir: workDir,
|
||||
JobToken: jobToken,
|
||||
Metadata: metadata,
|
||||
Manager: manager,
|
||||
JobConn: jobConn,
|
||||
Workspace: ws,
|
||||
Blender: blenderMgr,
|
||||
Encoder: encoder,
|
||||
Processes: processes,
|
||||
GPULockedOut: gpuLockedOut,
|
||||
OnGPUError: onGPUError,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -158,6 +168,22 @@ func (c *Context) ShouldEnableExecution() bool {
|
||||
return c.Metadata != nil && c.Metadata.EnableExecution != nil && *c.Metadata.EnableExecution
|
||||
}
|
||||
|
||||
// ShouldForceCPU returns true if GPU should be disabled and CPU rendering forced
|
||||
// (runner GPU lockout or metadata force_cpu in engine_settings).
|
||||
func (c *Context) ShouldForceCPU() bool {
|
||||
if c.GPULockedOut {
|
||||
return true
|
||||
}
|
||||
if c.Metadata != nil && c.Metadata.RenderSettings.EngineSettings != nil {
|
||||
if v, ok := c.Metadata.RenderSettings.EngineSettings["force_cpu"]; ok {
|
||||
if b, ok := v.(bool); ok && b {
|
||||
return true
|
||||
}
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// IsJobCancelled checks whether the manager marked this job as cancelled.
|
||||
func (c *Context) IsJobCancelled() (bool, error) {
|
||||
if c.Manager == nil {
|
||||
|
||||
Reference in New Issue
Block a user