// This program starts a jiggablend runner: it registers with the manager using
// an API key, then keeps a WebSocket connection and a heartbeat loop running
// until it receives an interrupt or termination signal.
//
// Runners no longer read a secrets file; every start generates an ephemeral
// ID suffix instead.
package main

import (
	"crypto/rand"
	"encoding/hex"
	"flag"
	"fmt"
	"log"
	"os"
	"os/signal"
	"sort"
	"strconv"
	"strings"
	"syscall"
	"time"

	"jiggablend/internal/logger"
	"jiggablend/internal/runner"
)

// Registration retry policy used by registerWithRetry.
const (
	registerInitialBackoff = 1 * time.Second
	registerMaxBackoff     = 30 * time.Second
	registerMaxAttempts    = 10
)

// logNameSanitizer replaces path separators with underscores so the runner
// name can be embedded in a log file name.
var logNameSanitizer = strings.NewReplacer("/", "_", "\\", "_")

func main() {
	log.Printf("Runner starting up...")

	// Declared before the deferred recover so the workspace can still be
	// cleaned up when a panic happens after the client has been created.
	var client *runner.Client
	defer func() {
		if r := recover(); r != nil {
			log.Printf("Runner panicked: %v", r)
			// Clean up workspace even on panic
			if client != nil {
				client.CleanupWorkspace()
			}
			os.Exit(1)
		}
	}()

	var (
		managerURL    = flag.String("manager", getEnv("MANAGER_URL", "http://localhost:8080"), "Manager URL")
		name          = flag.String("name", getEnv("RUNNER_NAME", ""), "Runner name")
		hostname      = flag.String("hostname", getEnv("RUNNER_HOSTNAME", ""), "Runner hostname")
		apiKeyFlag    = flag.String("api-key", getEnv("API_KEY", ""), "API key for authentication")
		logDir        = flag.String("log-dir", getEnv("LOG_DIR", "./logs"), "Log directory")
		logMaxSize    = flag.Int("log-max-size", getEnvInt("LOG_MAX_SIZE", 100), "Maximum log file size in MB before rotation")
		logMaxBackups = flag.Int("log-max-backups", getEnvInt("LOG_MAX_BACKUPS", 5), "Maximum number of rotated log files to keep")
		logMaxAge     = flag.Int("log-max-age", getEnvInt("LOG_MAX_AGE", 30), "Maximum age in days for rotated log files")
	)
	flag.Parse()
	log.Printf("Flags parsed, hostname: %s", *hostname)

	// Fail fast: registration cannot succeed without an API key, so check it
	// before creating log files, touching the workspace or probing capabilities.
	if *apiKeyFlag == "" {
		log.Fatalf("API key required (use --api-key or set API_KEY env var)")
	}

	if *hostname == "" {
		detected, err := os.Hostname()
		if err != nil {
			// Keep going with a placeholder rather than an empty hostname.
			log.Printf("Warning: could not determine hostname: %v", err)
			detected = "unknown"
		}
		*hostname = detected
	}

	// Always generate a random runner ID suffix on startup so that every
	// runner process gets its own local identifier.
	runnerIDStr := generateShortID()
	log.Printf("Generated runner ID suffix: %s", runnerIDStr)

	if *name == "" {
		// No name provided: derive one from the hostname and the ID suffix.
		*name = fmt.Sprintf("runner-%s-%s", *hostname, runnerIDStr)
	} else {
		// Append the ID to the provided name to ensure uniqueness.
		*name = fmt.Sprintf("%s-%s", *name, runnerIDStr)
	}

	// Initialize the logger with a runner-specific log file name based on the
	// final runner name.
	logFileName := fmt.Sprintf("runner-%s.log", logNameSanitizer.Replace(*name))
	if err := logger.Init(*logDir, logFileName, *logMaxSize, *logMaxBackups, *logMaxAge); err != nil {
		log.Fatalf("Failed to initialize logger: %v", err)
	}
	defer closeLogger()
	log.Printf("Logger initialized, continuing with startup...")
	log.Printf("Log rotation configured: max_size=%dMB, max_backups=%d, max_age=%d days", *logMaxSize, *logMaxBackups, *logMaxAge)

	log.Printf("About to create client...")
	client = runner.NewClient(*managerURL, *name, *hostname)
	log.Printf("Client created successfully")

	// Clean up any orphaned workspace directories from previous runs
	client.CleanupWorkspace()

	// Probe capabilities once at startup (before any registration attempts)
	log.Printf("Probing runner capabilities...")
	client.ProbeCapabilities()

	// Only report boolean capabilities that are true and numeric ones that are
	// positive.
	var capList []string
	for capName, value := range client.GetCapabilities() {
		switch v := value.(type) {
		case bool:
			if v {
				capList = append(capList, capName)
			}
		case int:
			if v > 0 {
				capList = append(capList, fmt.Sprintf("%s=%d", capName, v))
			}
		case float64:
			if v > 0 {
				capList = append(capList, fmt.Sprintf("%s=%.0f", capName, v))
			}
		}
	}
	// Map iteration order is random; sort so the log line is stable.
	sort.Strings(capList)
	if len(capList) > 0 {
		log.Printf("Detected capabilities: %s", strings.Join(capList, ", "))
	} else {
		log.Printf("Warning: No capabilities detected")
	}

	// Register with API key (with retry logic); exits the process on failure.
	registerWithRetry(client, *apiKeyFlag)

	// Start WebSocket connection with reconnection
	go client.ConnectWebSocketWithReconnect()

	// Start heartbeat loop (for WebSocket ping/pong and HTTP fallback)
	go client.HeartbeatLoop()

	// ProcessTasks is now handled via WebSocket, but kept for HTTP fallback
	// WebSocket will handle task assignment automatically
	log.Printf("Runner started, connecting to manager via WebSocket...")

	// Set up signal handlers to kill processes on shutdown
	sigChan := make(chan os.Signal, 1)
	signal.Notify(sigChan, os.Interrupt, syscall.SIGTERM)
	go func() {
		sig := <-sigChan
		log.Printf("Received signal: %v, killing all processes and cleaning up...", sig)
		client.KillAllProcesses()
		// os.Exit does not run deferred functions, so the workspace cleanup
		// and the logger close have to happen here explicitly.
		client.CleanupWorkspace()
		closeLogger()
		os.Exit(0)
	}()

	// Block forever; the process ends through the signal handler above.
	select {}
}

// closeLogger closes the default logger if one has been set up.
func closeLogger() {
	if l := logger.GetDefault(); l != nil {
		l.Close()
	}
}

// registerWithRetry registers the runner with the manager using apiKey and
// returns the runner ID reported by the manager.
//
// Failed attempts are retried with exponential backoff (starting at
// registerInitialBackoff, capped at registerMaxBackoff). The process is
// terminated via log.Fatalf when the error message contains "token error:"
// or when registerMaxAttempts attempts have failed.
func registerWithRetry(client *runner.Client, apiKey string) int64 {
	backoff := registerInitialBackoff
	for attempt := 1; ; attempt++ {
		id, _, _, err := client.Register(apiKey)
		if err == nil {
			log.Printf("Registered runner with ID: %d", id)
			return id
		}

		// A token error (invalid/expired/used token) will not fix itself by
		// retrying, so shut down immediately.
		if strings.Contains(err.Error(), "token error:") {
			log.Fatalf("Registration failed (token error): %v", err)
		}

		if attempt >= registerMaxAttempts {
			log.Fatalf("Failed to register runner after %d attempts: %v", registerMaxAttempts, err)
		}

		log.Printf("Registration failed (attempt %d/%d): %v, retrying in %v", attempt, registerMaxAttempts, err, backoff)
		time.Sleep(backoff)
		backoff *= 2
		if backoff > registerMaxBackoff {
			backoff = registerMaxBackoff
		}
	}
}

// getEnv returns the value of the environment variable key, or defaultValue
// when the variable is unset or empty.
func getEnv(key, defaultValue string) string {
	if value := os.Getenv(key); value != "" {
		return value
	}
	return defaultValue
}

// getEnvInt returns the environment variable key parsed as a decimal integer.
// It returns defaultValue when the variable is unset or empty, and also (with
// a logged warning) when the value is not a valid integer; trailing garbage
// such as "10MB" is rejected rather than partially parsed.
func getEnvInt(key string, defaultValue int) int {
	raw := os.Getenv(key)
	if raw == "" {
		return defaultValue
	}
	parsed, err := strconv.Atoi(strings.TrimSpace(raw))
	if err != nil {
		log.Printf("Warning: ignoring invalid integer %q for %s, using default %d", raw, key, defaultValue)
		return defaultValue
	}
	return parsed
}

// generateShortID generates a short ID of 8 hex characters. It normally uses
// 4 bytes from crypto/rand; if that fails it falls back to a 32-bit mix of the
// process ID and the current time.
func generateShortID() string {
	buf := make([]byte, 4)
	if _, err := rand.Read(buf); err != nil {
		// Truncate to 32 bits and zero-pad so the fallback is also exactly
		// 8 hex characters. Nanosecond resolution makes collisions between
		// runners started at the same moment less likely than seconds would.
		fallback := uint32(os.Getpid()) ^ uint32(time.Now().UnixNano())
		return fmt.Sprintf("%08x", fallback)
	}
	return hex.EncodeToString(buf)
}