Add GPU error handling and lockout mechanism in Runner
- Introduced gpuLockedOut state in Runner to manage GPU rendering based on detected errors.
- Implemented SetGPULockedOut and IsGPULockedOut methods for controlling GPU usage.
- Enhanced Context to include GPULockedOut and OnGPUError for better error handling.
- Updated RenderProcessor to check for GPU errors in logs and trigger lockout as needed.
- Modified rendering logic to force CPU rendering when GPU lockout is active, improving stability during errors.
This commit is contained in:
@@ -25,6 +25,24 @@ func NewRenderProcessor() *RenderProcessor {
|
||||
return &RenderProcessor{}
|
||||
}
|
||||
|
||||
// gpuErrorSubstrings lists the log-line substrings that are treated as
// evidence of a GPU backend error. checkGPUErrorLine matches each entry with a
// case-sensitive strings.Contains, and a single match on any entry is enough to
// trigger the GPU-error handling there.
var gpuErrorSubstrings = []string{
	"Illegal address in hip", // HIP (AMD) backend
}
|
||||
|
||||
// checkGPUErrorLine checks a log line for GPU error indicators and triggers runner GPU lockout if found.
|
||||
func (p *RenderProcessor) checkGPUErrorLine(ctx *Context, line string) {
|
||||
for _, sub := range gpuErrorSubstrings {
|
||||
if strings.Contains(line, sub) {
|
||||
if ctx.OnGPUError != nil {
|
||||
ctx.OnGPUError()
|
||||
}
|
||||
ctx.Warn(fmt.Sprintf("GPU error detected in log (%q); GPU disabled for subsequent jobs", sub))
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Process executes a render task.
|
||||
func (p *RenderProcessor) Process(ctx *Context) error {
|
||||
if err := ctx.CheckCancelled(); err != nil {
|
||||
@@ -77,6 +95,10 @@ func (p *RenderProcessor) Process(ctx *Context) error {
|
||||
// We always render EXR (linear) for VFX accuracy; job output_format is the deliverable (EXR sequence or video).
|
||||
renderFormat := "EXR"
|
||||
|
||||
if ctx.ShouldForceCPU() {
|
||||
ctx.Info("GPU lockout active: using CPU rendering only")
|
||||
}
|
||||
|
||||
// Create render script
|
||||
if err := p.createRenderScript(ctx, renderFormat); err != nil {
|
||||
return err
|
||||
@@ -142,13 +164,22 @@ func (p *RenderProcessor) createRenderScript(ctx *Context, renderFormat string)
|
||||
return errors.New(errMsg)
|
||||
}
|
||||
|
||||
// Write render settings if available
|
||||
// Write render settings: merge job metadata with runner force_cpu (GPU lockout)
|
||||
var settingsMap map[string]interface{}
|
||||
if ctx.Metadata != nil && ctx.Metadata.RenderSettings.EngineSettings != nil {
|
||||
settingsJSON, err := json.Marshal(ctx.Metadata.RenderSettings)
|
||||
raw, err := json.Marshal(ctx.Metadata.RenderSettings)
|
||||
if err == nil {
|
||||
if err := os.WriteFile(renderSettingsFilePath, settingsJSON, 0644); err != nil {
|
||||
ctx.Warn(fmt.Sprintf("Failed to write render settings file: %v", err))
|
||||
}
|
||||
_ = json.Unmarshal(raw, &settingsMap)
|
||||
}
|
||||
}
|
||||
if settingsMap == nil {
|
||||
settingsMap = make(map[string]interface{})
|
||||
}
|
||||
settingsMap["force_cpu"] = ctx.ShouldForceCPU()
|
||||
settingsJSON, err := json.Marshal(settingsMap)
|
||||
if err == nil {
|
||||
if err := os.WriteFile(renderSettingsFilePath, settingsJSON, 0644); err != nil {
|
||||
ctx.Warn(fmt.Sprintf("Failed to write render settings file: %v", err))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -211,7 +242,7 @@ func (p *RenderProcessor) runBlender(ctx *Context, blenderBinary, blendFile, out
|
||||
ctx.Processes.Track(ctx.TaskID, cmd)
|
||||
defer ctx.Processes.Untrack(ctx.TaskID)
|
||||
|
||||
// Stream stdout
|
||||
// Stream stdout and watch for GPU error lines (lock out all GPU on any backend error)
|
||||
stdoutDone := make(chan bool)
|
||||
go func() {
|
||||
defer close(stdoutDone)
|
||||
@@ -219,6 +250,7 @@ func (p *RenderProcessor) runBlender(ctx *Context, blenderBinary, blendFile, out
|
||||
for scanner.Scan() {
|
||||
line := scanner.Text()
|
||||
if line != "" {
|
||||
p.checkGPUErrorLine(ctx, line)
|
||||
shouldFilter, logLevel := blender.FilterLog(line)
|
||||
if !shouldFilter {
|
||||
ctx.Log(logLevel, line)
|
||||
@@ -227,7 +259,7 @@ func (p *RenderProcessor) runBlender(ctx *Context, blenderBinary, blendFile, out
|
||||
}
|
||||
}()
|
||||
|
||||
// Stream stderr
|
||||
// Stream stderr and watch for GPU error lines
|
||||
stderrDone := make(chan bool)
|
||||
go func() {
|
||||
defer close(stderrDone)
|
||||
@@ -235,6 +267,7 @@ func (p *RenderProcessor) runBlender(ctx *Context, blenderBinary, blendFile, out
|
||||
for scanner.Scan() {
|
||||
line := scanner.Text()
|
||||
if line != "" {
|
||||
p.checkGPUErrorLine(ctx, line)
|
||||
shouldFilter, logLevel := blender.FilterLog(line)
|
||||
if !shouldFilter {
|
||||
if logLevel == types.LogLevelInfo {
|
||||
|
||||
Reference in New Issue
Block a user