// Package main is the entry point of a runner process. It reads its
// configuration from flags and environment variables, initializes logging,
// loads previously saved credentials (or registers with the manager using a
// registration token), starts the client's background loops and then waits for
// a termination signal.
package main

import (
	"crypto/rand"
	"encoding/hex"
	"encoding/json"
	"flag"
	"fmt"
	"log"
	"os"
	"os/signal"
	"sort"
	"strconv"
	"strings"
	"syscall"
	"time"

	"jiggablend/internal/logger"
	"jiggablend/internal/runner"
)

// SecretsFile is the JSON document written to disk after a successful
// registration so that later starts can reuse the credentials instead of
// registering again.
type SecretsFile struct {
	RunnerID      int64  `json:"runner_id"`
	RunnerSecret  string `json:"runner_secret"`
	ManagerSecret string `json:"manager_secret"`
}

// main wires the runner together: configuration, logging, capability probing,
// credential loading/registration, background loops and signal-driven
// shutdown.
func main() {
	var (
		managerURL     = flag.String("manager", getEnv("MANAGER_URL", "http://localhost:8080"), "Manager URL")
		name           = flag.String("name", getEnv("RUNNER_NAME", ""), "Runner name")
		hostname       = flag.String("hostname", getEnv("RUNNER_HOSTNAME", ""), "Runner hostname")
		token          = flag.String("token", getEnv("REGISTRATION_TOKEN", ""), "Registration token")
		secretsFile    = flag.String("secrets-file", getEnv("SECRETS_FILE", ""), "Path to secrets file for persistent storage (default: ./runner-secrets.json, or ./runner-secrets-{id}.json if multiple runners)")
		runnerIDSuffix = flag.String("runner-id", getEnv("RUNNER_ID", ""), "Unique runner ID suffix (auto-generated if not provided)")
		logDir         = flag.String("log-dir", getEnv("LOG_DIR", "./logs"), "Log directory")
		logMaxSize     = flag.Int("log-max-size", getEnvInt("LOG_MAX_SIZE", 100), "Maximum log file size in MB before rotation")
		logMaxBackups  = flag.Int("log-max-backups", getEnvInt("LOG_MAX_BACKUPS", 5), "Maximum number of rotated log files to keep")
		logMaxAge      = flag.Int("log-max-age", getEnvInt("LOG_MAX_AGE", 30), "Maximum age in days for rotated log files")
	)
	flag.Parse()

	if *hostname == "" {
		host, err := os.Hostname()
		if err != nil {
			// Not fatal: startup continues with whatever os.Hostname returned,
			// but the failure is no longer silent.
			log.Printf("Warning: could not determine hostname: %v", err)
		}
		*hostname = host
	}

	// Generate or use provided runner ID suffix.
	runnerIDStr := *runnerIDSuffix
	if runnerIDStr == "" {
		runnerIDStr = generateShortID()
	}

	// The ID suffix is always part of the final name: either appended to the
	// provided name, or combined with the hostname when no name was given.
	if *name == "" {
		*name = fmt.Sprintf("runner-%s-%s", *hostname, runnerIDStr)
	} else {
		*name = fmt.Sprintf("%s-%s", *name, runnerIDStr)
	}

	// Initialize the logger with a runner-specific log file name derived from
	// the final name; path separators are replaced so the name stays a single
	// file name inside the log directory.
	sanitizedName := strings.ReplaceAll(*name, "/", "_")
	sanitizedName = strings.ReplaceAll(sanitizedName, "\\", "_")
	logFileName := fmt.Sprintf("runner-%s.log", sanitizedName)
	if err := logger.Init(*logDir, logFileName, *logMaxSize, *logMaxBackups, *logMaxAge); err != nil {
		log.Fatalf("Failed to initialize logger: %v", err)
	}
	defer func() {
		if l := logger.GetDefault(); l != nil {
			l.Close()
		}
	}()
	log.Printf("Log rotation configured: max_size=%dMB, max_backups=%d, max_age=%d days", *logMaxSize, *logMaxBackups, *logMaxAge)

	// Set default secrets file if not provided - always use current directory.
	if *secretsFile == "" {
		if *runnerIDSuffix != "" || getEnv("RUNNER_ID", "") != "" {
			// Multiple runners - use local file with ID.
			*secretsFile = fmt.Sprintf("./runner-secrets-%s.json", runnerIDStr)
		} else {
			// Single runner - use local file.
			*secretsFile = "./runner-secrets.json"
		}
	}

	client := runner.NewClient(*managerURL, *name, *hostname)

	// Probe capabilities once at startup (before any registration attempts).
	log.Printf("Probing runner capabilities...")
	client.ProbeCapabilities()
	var capList []string
	for capName, value := range client.GetCapabilities() {
		if entry, ok := formatCapability(capName, value); ok {
			capList = append(capList, entry)
		}
	}
	// Map iteration order is random; sort so the log line is stable between runs.
	sort.Strings(capList)
	if len(capList) > 0 {
		log.Printf("Detected capabilities: %s", strings.Join(capList, ", "))
	} else {
		log.Printf("Warning: No capabilities detected")
	}

	// Try to load secrets from file. A missing file is the normal first-start
	// case; any other failure (unreadable or malformed file) is reported
	// because it leads to a fresh registration below.
	var runnerID int64
	var runnerSecret, managerSecret string
	if secrets, err := loadSecrets(*secretsFile); err == nil {
		runnerID = secrets.RunnerID
		runnerSecret = secrets.RunnerSecret
		managerSecret = secrets.ManagerSecret
		client.SetSecrets(runnerID, runnerSecret, managerSecret)
		log.Printf("Loaded secrets from %s", *secretsFile)
	} else if !os.IsNotExist(err) {
		log.Printf("Warning: Failed to load secrets from %s: %v", *secretsFile, err)
	}

	// If no secrets loaded, register with token (with retry logic).
	if runnerID == 0 {
		if *token == "" {
			log.Fatalf("Registration token required (use --token or set REGISTRATION_TOKEN env var)")
		}

		// Retry registration with exponential backoff, capped at maxBackoff.
		const (
			maxBackoff = 30 * time.Second
			maxRetries = 10
		)
		backoff := 1 * time.Second
		for attempt := 1; ; attempt++ {
			var err error
			runnerID, runnerSecret, managerSecret, err = client.Register(*token)
			if err == nil {
				log.Printf("Registered runner with ID: %d", runnerID)
				// The secrets file path always has a value at this point, so
				// the credentials are always persisted.
				secrets := SecretsFile{
					RunnerID:      runnerID,
					RunnerSecret:  runnerSecret,
					ManagerSecret: managerSecret,
				}
				if err := saveSecrets(*secretsFile, secrets); err != nil {
					log.Printf("Warning: Failed to save secrets to %s: %v", *secretsFile, err)
				} else {
					log.Printf("Saved secrets to %s", *secretsFile)
				}
				break
			}

			// Errors whose message contains "token error:" are treated as
			// non-retryable: shut down immediately.
			if strings.Contains(err.Error(), "token error:") {
				log.Fatalf("Registration failed (token error): %v", err)
			}

			// Everything else is retried until the attempt budget is spent.
			if attempt >= maxRetries {
				log.Fatalf("Failed to register runner after %d attempts: %v", maxRetries, err)
			}
			log.Printf("Registration failed (attempt %d/%d): %v, retrying in %v", attempt, maxRetries, err, backoff)
			time.Sleep(backoff)
			backoff *= 2
			if backoff > maxBackoff {
				backoff = maxBackoff
			}
		}
	}

	// Start WebSocket connection with reconnection.
	go client.ConnectWebSocketWithReconnect()

	// Start heartbeat loop (for WebSocket ping/pong and HTTP fallback).
	go client.HeartbeatLoop()

	// ProcessTasks is now handled via WebSocket, but kept for HTTP fallback.
	// WebSocket will handle task assignment automatically.
	log.Printf("Runner started, connecting to manager via WebSocket...")

	// Wait for a termination signal on the main goroutine. Returning from main
	// (instead of calling os.Exit from a helper goroutine) lets the deferred
	// logger close above run before the process exits with status 0.
	sigChan := make(chan os.Signal, 1)
	signal.Notify(sigChan, os.Interrupt, syscall.SIGTERM)
	sig := <-sigChan
	log.Printf("Received signal: %v, killing all processes and shutting down...", sig)
	client.KillAllProcesses()
}

// formatCapability renders one capability entry for the startup log.
// It reports ok=false for entries that should not be shown: only boolean
// capabilities that are true and int/float64 capabilities greater than zero
// are listed (numeric ones as "name=count").
func formatCapability(name string, value interface{}) (entry string, ok bool) {
	switch v := value.(type) {
	case bool:
		if v {
			return name, true
		}
	case int:
		if v > 0 {
			return fmt.Sprintf("%s=%d", name, v), true
		}
	case float64:
		if v > 0 {
			return fmt.Sprintf("%s=%.0f", name, v), true
		}
	}
	return "", false
}

// loadSecrets reads and decodes the secrets file at path.
// The read error is returned unwrapped so callers can test it with
// os.IsNotExist; a decode failure is wrapped with the file path for context.
func loadSecrets(path string) (*SecretsFile, error) {
	data, err := os.ReadFile(path)
	if err != nil {
		return nil, err
	}
	var secrets SecretsFile
	if err := json.Unmarshal(data, &secrets); err != nil {
		return nil, fmt.Errorf("parsing secrets file %q: %w", path, err)
	}
	return &secrets, nil
}

// saveSecrets encodes secrets as indented JSON and writes it to path,
// requesting mode 0600 for the file.
func saveSecrets(path string, secrets SecretsFile) error {
	data, err := json.MarshalIndent(secrets, "", " ")
	if err != nil {
		return err
	}
	return os.WriteFile(path, data, 0600)
}

// getEnv returns the value of the environment variable key, or defaultValue
// when the variable is unset or empty.
func getEnv(key, defaultValue string) string {
	if value := os.Getenv(key); value != "" {
		return value
	}
	return defaultValue
}

// getEnvInt returns the environment variable key parsed as a decimal integer.
// It returns defaultValue when the variable is unset, empty, or not a valid
// integer; values with trailing garbage such as "100MB" are rejected rather
// than partially parsed.
func getEnvInt(key string, defaultValue int) int {
	raw := strings.TrimSpace(os.Getenv(key))
	if raw == "" {
		return defaultValue
	}
	n, err := strconv.Atoi(raw)
	if err != nil {
		return defaultValue
	}
	return n
}

// generateShortID generates a short random ID (8 hex characters).
func generateShortID() string {
	buf := make([]byte, 4)
	if _, err := rand.Read(buf); err != nil {
		// Fallback if crypto/rand fails: mix the PID with the current Unix
		// time, truncated to 32 bits and zero-padded so the result is still
		// exactly 8 hex characters.
		return fmt.Sprintf("%08x", uint32(os.Getpid())^uint32(time.Now().Unix()))
	}
	return hex.EncodeToString(buf)
}