208 lines
6.3 KiB
Go
208 lines
6.3 KiB
Go
package main
|
|
|
|
import (
	"crypto/rand"
	"encoding/hex"
	"flag"
	"fmt"
	"log"
	"os"
	"os/signal"
	"strconv"
	"strings"
	"syscall"
	"time"

	"jiggablend/internal/logger"
	"jiggablend/internal/runner"
)
|
|
|
|
// Removed SecretsFile - runners now generate ephemeral instance IDs
|
|
|
|
func main() {
|
|
log.Printf("Runner starting up...")
|
|
|
|
// Create client early so we can clean it up on panic
|
|
var client *runner.Client
|
|
|
|
defer func() {
|
|
if r := recover(); r != nil {
|
|
log.Printf("Runner panicked: %v", r)
|
|
// Clean up workspace even on panic
|
|
if client != nil {
|
|
client.CleanupWorkspace()
|
|
}
|
|
os.Exit(1)
|
|
}
|
|
}()
|
|
|
|
var (
|
|
managerURL = flag.String("manager", getEnv("MANAGER_URL", "http://localhost:8080"), "Manager URL")
|
|
name = flag.String("name", getEnv("RUNNER_NAME", ""), "Runner name")
|
|
hostname = flag.String("hostname", getEnv("RUNNER_HOSTNAME", ""), "Runner hostname")
|
|
apiKeyFlag = flag.String("api-key", getEnv("API_KEY", ""), "API key for authentication")
|
|
logDir = flag.String("log-dir", getEnv("LOG_DIR", "./logs"), "Log directory")
|
|
logMaxSize = flag.Int("log-max-size", getEnvInt("LOG_MAX_SIZE", 100), "Maximum log file size in MB before rotation")
|
|
logMaxBackups = flag.Int("log-max-backups", getEnvInt("LOG_MAX_BACKUPS", 5), "Maximum number of rotated log files to keep")
|
|
logMaxAge = flag.Int("log-max-age", getEnvInt("LOG_MAX_AGE", 30), "Maximum age in days for rotated log files")
|
|
)
|
|
flag.Parse()
|
|
log.Printf("Flags parsed, hostname: %s", *hostname)
|
|
|
|
if *hostname == "" {
|
|
*hostname, _ = os.Hostname()
|
|
}
|
|
|
|
// Always generate a random runner ID suffix on startup
|
|
// This ensures every runner has a unique local identifier
|
|
runnerIDStr := generateShortID()
|
|
log.Printf("Generated runner ID suffix: %s", runnerIDStr)
|
|
|
|
// Generate runner name with ID if not provided
|
|
if *name == "" {
|
|
*name = fmt.Sprintf("runner-%s-%s", *hostname, runnerIDStr)
|
|
} else {
|
|
// Append ID to provided name to ensure uniqueness
|
|
*name = fmt.Sprintf("%s-%s", *name, runnerIDStr)
|
|
}
|
|
|
|
// Initialize logger (writes to both stdout and log file with rotation)
|
|
// Use runner-specific log file name based on the final name
|
|
sanitizedName := strings.ReplaceAll(*name, "/", "_")
|
|
sanitizedName = strings.ReplaceAll(sanitizedName, "\\", "_")
|
|
logFileName := fmt.Sprintf("runner-%s.log", sanitizedName)
|
|
|
|
if err := logger.Init(*logDir, logFileName, *logMaxSize, *logMaxBackups, *logMaxAge); err != nil {
|
|
log.Fatalf("Failed to initialize logger: %v", err)
|
|
}
|
|
defer func() {
|
|
if l := logger.GetDefault(); l != nil {
|
|
l.Close()
|
|
}
|
|
}()
|
|
log.Printf("Logger initialized, continuing with startup...")
|
|
log.Printf("Log rotation configured: max_size=%dMB, max_backups=%d, max_age=%d days", *logMaxSize, *logMaxBackups, *logMaxAge)
|
|
|
|
log.Printf("About to create client...")
|
|
client = runner.NewClient(*managerURL, *name, *hostname)
|
|
log.Printf("Client created successfully")
|
|
|
|
// Clean up any orphaned workspace directories from previous runs
|
|
client.CleanupWorkspace()
|
|
|
|
// Probe capabilities once at startup (before any registration attempts)
|
|
log.Printf("Probing runner capabilities...")
|
|
client.ProbeCapabilities()
|
|
capabilities := client.GetCapabilities()
|
|
capList := []string{}
|
|
for cap, value := range capabilities {
|
|
// Only show boolean true capabilities and numeric GPU counts
|
|
if enabled, ok := value.(bool); ok && enabled {
|
|
capList = append(capList, cap)
|
|
} else if count, ok := value.(int); ok && count > 0 {
|
|
capList = append(capList, fmt.Sprintf("%s=%d", cap, count))
|
|
} else if count, ok := value.(float64); ok && count > 0 {
|
|
capList = append(capList, fmt.Sprintf("%s=%.0f", cap, count))
|
|
}
|
|
}
|
|
if len(capList) > 0 {
|
|
log.Printf("Detected capabilities: %s", strings.Join(capList, ", "))
|
|
} else {
|
|
log.Printf("Warning: No capabilities detected")
|
|
}
|
|
|
|
// Register with API key (with retry logic)
|
|
if *apiKeyFlag == "" {
|
|
log.Fatalf("API key required (use --api-key or set API_KEY env var)")
|
|
}
|
|
|
|
// Retry registration with exponential backoff
|
|
backoff := 1 * time.Second
|
|
maxBackoff := 30 * time.Second
|
|
maxRetries := 10
|
|
retryCount := 0
|
|
|
|
var runnerID int64
|
|
|
|
for {
|
|
var err error
|
|
runnerID, _, _, err = client.Register(*apiKeyFlag)
|
|
if err == nil {
|
|
log.Printf("Registered runner with ID: %d", runnerID)
|
|
break
|
|
}
|
|
|
|
// Check if it's a token error (invalid/expired/used token) - shutdown immediately
|
|
errMsg := err.Error()
|
|
if strings.Contains(errMsg, "token error:") {
|
|
log.Fatalf("Registration failed (token error): %v", err)
|
|
}
|
|
|
|
// Only retry on connection errors or other retryable errors
|
|
retryCount++
|
|
if retryCount >= maxRetries {
|
|
log.Fatalf("Failed to register runner after %d attempts: %v", maxRetries, err)
|
|
}
|
|
|
|
log.Printf("Registration failed (attempt %d/%d): %v, retrying in %v", retryCount, maxRetries, err, backoff)
|
|
time.Sleep(backoff)
|
|
backoff *= 2
|
|
if backoff > maxBackoff {
|
|
backoff = maxBackoff
|
|
}
|
|
}
|
|
|
|
// Start WebSocket connection with reconnection
|
|
go client.ConnectWebSocketWithReconnect()
|
|
|
|
// Start heartbeat loop (for WebSocket ping/pong and HTTP fallback)
|
|
go client.HeartbeatLoop()
|
|
|
|
// ProcessTasks is now handled via WebSocket, but kept for HTTP fallback
|
|
// WebSocket will handle task assignment automatically
|
|
log.Printf("Runner started, connecting to manager via WebSocket...")
|
|
|
|
// Set up signal handlers to kill processes on shutdown
|
|
sigChan := make(chan os.Signal, 1)
|
|
signal.Notify(sigChan, os.Interrupt, syscall.SIGTERM)
|
|
|
|
go func() {
|
|
sig := <-sigChan
|
|
log.Printf("Received signal: %v, killing all processes and cleaning up...", sig)
|
|
client.KillAllProcesses()
|
|
// Cleanup happens in defer, but also do it here for good measure
|
|
client.CleanupWorkspace()
|
|
os.Exit(0)
|
|
}()
|
|
|
|
// Block forever
|
|
select {}
|
|
}
|
|
|
|
|
|
// getEnv returns the value of the environment variable named key.
// When the variable is unset or set to the empty string, defaultValue is
// returned instead.
func getEnv(key, defaultValue string) string {
	fromEnv := os.Getenv(key)
	if fromEnv == "" {
		return defaultValue
	}
	return fromEnv
}
|
|
|
|
// getEnvInt returns the environment variable named key parsed as a base-10
// integer. defaultValue is returned when the variable is unset, empty, or
// not a valid integer.
//
// Surrounding whitespace is ignored. The entire remaining value must be
// numeric: something like "12abc" is rejected and defaultValue is returned,
// rather than the leading digits being silently accepted.
func getEnvInt(key string, defaultValue int) int {
	raw := strings.TrimSpace(os.Getenv(key))
	if raw == "" {
		return defaultValue
	}
	parsed, err := strconv.Atoi(raw)
	if err != nil {
		return defaultValue
	}
	return parsed
}
|
|
|
|
// generateShortID returns a short identifier that is always exactly 8
// lowercase hex characters.
//
// The ID is normally 4 bytes from crypto/rand. If the random source fails,
// it falls back to the process ID XORed with the current time in
// nanoseconds, folded to 32 bits and zero-padded so the result keeps the
// same fixed width. The fallback is not cryptographically random.
func generateShortID() string {
	var raw [4]byte
	if _, err := rand.Read(raw[:]); err != nil {
		seed := uint32(os.Getpid()) ^ uint32(time.Now().UnixNano())
		return fmt.Sprintf("%08x", seed)
	}
	return hex.EncodeToString(raw[:])
}
|