diff --git a/go.mod b/go.mod index 298bff8974..f0f201e241 100644 --- a/go.mod +++ b/go.mod @@ -314,7 +314,7 @@ replace github.com/nektos/act => gitea.com/gitea/act v0.261.7-0.20251003180512-a replace git.sr.ht/~mariusor/go-xsd-duration => gitea.com/gitea/go-xsd-duration v0.0.0-20220703122237-02e73435a078 // Use GitCaddy fork with capability support -replace code.gitea.io/actions-proto-go => git.marketally.com/gitcaddy/actions-proto-go v0.5.7 +replace code.gitea.io/actions-proto-go => git.marketally.com/gitcaddy/actions-proto-go v0.5.8 exclude github.com/gofrs/uuid v3.2.0+incompatible diff --git a/go.sum b/go.sum index 2341a2b467..923655111e 100644 --- a/go.sum +++ b/go.sum @@ -31,6 +31,8 @@ filippo.io/edwards25519 v1.1.0 h1:FNf4tywRC1HmFuKW5xopWpigGjJKiJSV0Cqo0cJWDaA= filippo.io/edwards25519 v1.1.0/go.mod h1:BxyFTGdWcka3PhytdK4V28tE5sGfRvvvRV7EaN4VDT4= git.marketally.com/gitcaddy/actions-proto-go v0.5.7 h1:RUbafr3Vkw2l4WfSwa+oF+Ihakbm05W0FlAmXuQrDJc= git.marketally.com/gitcaddy/actions-proto-go v0.5.7/go.mod h1:RPu21UoRD3zSAujoZR6LJwuVNa2uFRBveadslczCRfQ= +git.marketally.com/gitcaddy/actions-proto-go v0.5.8 h1:MBipeHvY6A0jcobvziUtzgatZTrV4fs/HE1rPQxREN4= +git.marketally.com/gitcaddy/actions-proto-go v0.5.8/go.mod h1:RPu21UoRD3zSAujoZR6LJwuVNa2uFRBveadslczCRfQ= gitea.com/gitea/act v0.261.7-0.20251003180512-ac6e4b751763 h1:ohdxegvslDEllZmRNDqpKun6L4Oq81jNdEDtGgHEV2c= gitea.com/gitea/act v0.261.7-0.20251003180512-ac6e4b751763/go.mod h1:Pg5C9kQY1CEA3QjthjhlrqOC/QOT5NyWNjOjRHw23Ok= gitea.com/gitea/go-xsd-duration v0.0.0-20220703122237-02e73435a078 h1:BAFmdZpRW7zMQZQDClaCWobRj9uL1MR3MzpCVJvc5s4= diff --git a/models/actions/runner_health.go b/models/actions/runner_health.go new file mode 100644 index 0000000000..e2af1e9d76 --- /dev/null +++ b/models/actions/runner_health.go @@ -0,0 +1,266 @@ +// Copyright 2026 MarketAlly. All rights reserved. 
+// SPDX-License-Identifier: MIT
+
+package actions
+
+import (
+	"context"
+	"time"
+
+	"code.gitea.io/gitea/models/db"
+	"code.gitea.io/gitea/modules/json"
+	"code.gitea.io/gitea/modules/log"
+	"code.gitea.io/gitea/modules/setting"
+	"code.gitea.io/gitea/modules/timeutil"
+)
+
+// RunnerCapabilities represents the parsed capabilities from CapabilitiesJSON.
+// Disk, CPU and Bandwidth are pointers so that a section missing from the
+// JSON stays nil; GetHealthStatus skips the corresponding check in that case.
+type RunnerCapabilities struct {
+	OS        string         `json:"os"`
+	Arch      string         `json:"arch"`
+	Disk      *DiskInfo      `json:"disk"`      // nil when the runner reported no disk section
+	CPU       *CPUInfo       `json:"cpu"`       // nil when the runner reported no CPU section
+	Bandwidth *BandwidthInfo `json:"bandwidth"` // nil when the runner reported no bandwidth section
+}
+
+// DiskInfo contains disk usage information as reported by the runner.
+type DiskInfo struct {
+	TotalBytes  int64   `json:"total_bytes"`
+	FreeBytes   int64   `json:"free_bytes"`   // copied into RunnerHealthStatus.DiskFreeBytes
+	UsedBytes   int64   `json:"used_bytes"`
+	UsedPercent float64 `json:"used_percent"` // GetHealthStatus derives the free share as 100 - UsedPercent
+}
+
+// CPUInfo contains CPU load information as reported by the runner.
+// Only LoadPercent is read by GetHealthStatus.
+type CPUInfo struct {
+	NumCPU      int     `json:"num_cpu"`      // Number of logical CPUs
+	LoadAvg1m   float64 `json:"load_avg_1m"`  // 1-minute load average
+	LoadAvg5m   float64 `json:"load_avg_5m"`  // 5-minute load average
+	LoadAvg15m  float64 `json:"load_avg_15m"` // 15-minute load average
+	LoadPercent float64 `json:"load_percent"` // (load_avg_1m / num_cpu) * 100
+}
+
+// BandwidthInfo contains network performance information as reported by the
+// runner. Only LatencyMs is read by GetHealthStatus.
+type BandwidthInfo struct {
+	DownloadMbps float64   `json:"download_mbps"`
+	LatencyMs    float64   `json:"latency_ms"` // compared against MaxLatencyMs
+	TestedAt     time.Time `json:"tested_at"`  // NOTE(review): not checked for staleness by the health code — confirm that is intended
+}
+
+// RunnerHealthStatus represents the health status of a runner as computed by
+// GetHealthStatus from the reported capabilities and the configured limits.
+type RunnerHealthStatus struct {
+	Healthy         bool    `json:"healthy"`           // false as soon as any of the three checks below fails
+	DiskHealthy     bool    `json:"disk_healthy"`      // false when free disk is below MinDiskPercent
+	CPUHealthy      bool    `json:"cpu_healthy"`       // false when CPU load is above MaxCPULoadPercent
+	LatencyHealthy  bool    `json:"latency_healthy"`   // false when latency is above MaxLatencyMs
+	DiskUsedPercent float64 `json:"disk_used_percent"` // zero when no disk section was reported
+	DiskFreeBytes   int64   `json:"disk_free_bytes"`   // zero when no disk section was reported
+	CPULoadPercent  float64 `json:"cpu_load_percent"`  // zero when no CPU section was reported
+	LatencyMs       float64 `json:"latency_ms"`        // zero when no bandwidth section was reported
+	Reason          string  `json:"reason,omitempty"`  // "; "-joined failure reasons, or "no capabilities reported" while still healthy
+	NeedsCleanup    bool    `json:"needs_cleanup"`     // may be true while Healthy is still true (usage >= MaxDiskUsagePercent)
+}
+
+// GetCapabilities parses and returns the
runner's capabilities
+// decoded from r.CapabilitiesJSON. It returns nil when that field is empty or
+// cannot be parsed; a parse failure is logged.
+func (r *ActionRunner) GetCapabilities() *RunnerCapabilities {
+	raw := r.CapabilitiesJSON
+	if raw == "" {
+		return nil
+	}
+
+	parsed := new(RunnerCapabilities)
+	if err := json.Unmarshal([]byte(raw), parsed); err != nil {
+		log.Error("Failed to parse runner %s capabilities: %v", r.Name, err)
+		return nil
+	}
+	return parsed
+}
+
+// GetHealthStatus evaluates the runner's reported capabilities against the
+// limits in setting.Actions.RunnerHealthCheck and returns the outcome.
+//
+// A runner without usable capabilities is reported as healthy with the reason
+// "no capabilities reported". Each capability section (disk, CPU, bandwidth)
+// is only checked when it is present. This method does not look at the
+// Enabled flag; IsHealthy does.
+func (r *ActionRunner) GetHealthStatus() *RunnerHealthStatus {
+	result := &RunnerHealthStatus{
+		Healthy:        true,
+		DiskHealthy:    true,
+		CPUHealthy:     true,
+		LatencyHealthy: true,
+	}
+
+	reported := r.GetCapabilities()
+	if reported == nil {
+		// Nothing to judge: keep the runner healthy but say why no data is shown.
+		result.Reason = "no capabilities reported"
+		return result
+	}
+
+	// fail flags the runner as unhealthy and appends why to the "; "-joined reason.
+	fail := func(why string) {
+		result.Healthy = false
+		if result.Reason != "" {
+			result.Reason += "; "
+		}
+		result.Reason += why
+	}
+
+	limits := setting.Actions.RunnerHealthCheck
+
+	// Disk: too little free space makes the runner unhealthy; high usage alone
+	// only asks for a cleanup.
+	if disk := reported.Disk; disk != nil {
+		result.DiskUsedPercent = disk.UsedPercent
+		result.DiskFreeBytes = disk.FreeBytes
+
+		if 100.0-disk.UsedPercent < limits.MinDiskPercent {
+			result.DiskHealthy = false
+			result.NeedsCleanup = true
+			fail("insufficient disk space")
+		}
+		if disk.UsedPercent >= limits.MaxDiskUsagePercent {
+			result.NeedsCleanup = true
+		}
+	}
+
+	// CPU load.
+	if cpu := reported.CPU; cpu != nil {
+		result.CPULoadPercent = cpu.LoadPercent
+		if cpu.LoadPercent > limits.MaxCPULoadPercent {
+			result.CPUHealthy = false
+			fail("CPU overloaded")
+		}
+	}
+
+	// Network latency.
+	if net := reported.Bandwidth; net != nil {
+		result.LatencyMs = net.LatencyMs
+		if net.LatencyMs > limits.MaxLatencyMs {
+			result.LatencyHealthy = false
+			fail("high latency")
+		}
+	}
+
+	return result
+}
+
+// IsHealthy returns true if the
runner is healthy enough for job assignment +func (r *ActionRunner) IsHealthy() bool { + if !setting.Actions.RunnerHealthCheck.Enabled { + return true + } + return r.GetHealthStatus().Healthy +} + +// NeedsCleanup returns true if the runner should perform cleanup +func (r *ActionRunner) NeedsCleanup() bool { + status := r.GetHealthStatus() + return status.NeedsCleanup +} + +// RunnerCleanupRequest tracks cleanup requests sent to runners +type RunnerCleanupRequest struct { + ID int64 `xorm:"pk autoincr"` + RunnerID int64 `xorm:"INDEX NOT NULL"` + RequestedAt timeutil.TimeStamp `xorm:"created INDEX"` + CompletedAt timeutil.TimeStamp `xorm:"INDEX"` + Success bool + BytesFreed int64 + ErrorMsg string `xorm:"TEXT"` +} + +func init() { + db.RegisterModel(new(RunnerCleanupRequest)) +} + +// TableName returns the table name for RunnerCleanupRequest +func (RunnerCleanupRequest) TableName() string { + return "runner_cleanup_request" +} + +// CreateCleanupRequest creates a new cleanup request for a runner +func CreateCleanupRequest(ctx context.Context, runnerID int64) (*RunnerCleanupRequest, error) { + req := &RunnerCleanupRequest{ + RunnerID: runnerID, + } + _, err := db.GetEngine(ctx).Insert(req) + return req, err +} + +// GetLastCleanupRequest returns the last cleanup request for a runner +func GetLastCleanupRequest(ctx context.Context, runnerID int64) (*RunnerCleanupRequest, error) { + req := &RunnerCleanupRequest{} + has, err := db.GetEngine(ctx).Where("runner_id = ?", runnerID). + OrderBy("requested_at DESC"). + Limit(1). + Get(req) + if err != nil { + return nil, err + } + if !has { + return nil, nil + } + return req, nil +} + +// GetPendingCleanupRequest returns the pending (uncompleted) cleanup request for a runner +func GetPendingCleanupRequest(ctx context.Context, runnerID int64) (*RunnerCleanupRequest, error) { + req := &RunnerCleanupRequest{} + has, err := db.GetEngine(ctx).Where("runner_id = ? AND completed_at = 0", runnerID). + OrderBy("requested_at DESC"). 
+ Limit(1). + Get(req) + if err != nil { + return nil, err + } + if !has { + return nil, nil + } + return req, nil +} + +// CanRequestCleanup checks if we can request cleanup (respects cooldown) +func CanRequestCleanup(ctx context.Context, runnerID int64) (bool, error) { + lastReq, err := GetLastCleanupRequest(ctx, runnerID) + if err != nil { + return false, err + } + if lastReq == nil { + return true, nil + } + + cooldown := setting.Actions.RunnerHealthCheck.CleanupCooldown + if time.Since(lastReq.RequestedAt.AsTime()) < cooldown { + return false, nil + } + return true, nil +} + +// CompleteCleanupRequest marks a cleanup request as completed +func CompleteCleanupRequest(ctx context.Context, id int64, success bool, bytesFreed int64, errorMsg string) error { + _, err := db.GetEngine(ctx).ID(id).Cols("completed_at", "success", "bytes_freed", "error_msg").Update(&RunnerCleanupRequest{ + CompletedAt: timeutil.TimeStampNow(), + Success: success, + BytesFreed: bytesFreed, + ErrorMsg: errorMsg, + }) + return err +} + +// GetUnhealthyRunners returns all runners that are unhealthy +func GetUnhealthyRunners(ctx context.Context) ([]*ActionRunner, error) { + var runners []*ActionRunner + err := db.GetEngine(ctx).Where("deleted_unix = 0").Find(&runners) + if err != nil { + return nil, err + } + + var unhealthy []*ActionRunner + for _, r := range runners { + if !r.IsOnline() { + continue // Skip offline runners + } + if !r.IsHealthy() { + unhealthy = append(unhealthy, r) + } + } + return unhealthy, nil +} diff --git a/models/actions/task.go b/models/actions/task.go index 8b4ecf28f7..346dd733ad 100644 --- a/models/actions/task.go +++ b/models/actions/task.go @@ -1,4 +1,5 @@ // Copyright 2022 The Gitea Authors. All rights reserved. +// Copyright 2026 MarketAlly. All rights reserved. 
// SPDX-License-Identifier: MIT package actions @@ -214,7 +215,36 @@ func GetRunningTaskByToken(ctx context.Context, token string) (*ActionTask, erro return nil, errNotExist } +// CreateTaskForRunner creates a task for a runner. +// It checks runner health before assigning jobs and skips unhealthy runners. func CreateTaskForRunner(ctx context.Context, runner *ActionRunner) (*ActionTask, bool, error) { + // Check runner health before assigning jobs + if setting.Actions.RunnerHealthCheck.Enabled { + healthStatus := runner.GetHealthStatus() + if !healthStatus.Healthy { + log.Warn("Runner %s (ID: %d) is unhealthy: %s (disk: %.1f%% used, latency: %.0fms)", + runner.Name, runner.ID, healthStatus.Reason, + healthStatus.DiskUsedPercent, healthStatus.LatencyMs) + + // Request cleanup if cooldown allows + if healthStatus.NeedsCleanup { + canCleanup, err := CanRequestCleanup(ctx, runner.ID) + if err != nil { + log.Error("Failed to check cleanup cooldown for runner %s: %v", runner.Name, err) + } else if canCleanup { + if _, err := CreateCleanupRequest(ctx, runner.ID); err != nil { + log.Error("Failed to create cleanup request for runner %s: %v", runner.Name, err) + } else { + log.Info("Requested cleanup for unhealthy runner %s", runner.Name) + } + } + } + + // Do not assign jobs to unhealthy runners + return nil, false, nil + } + } + ctx, committer, err := db.TxContext(ctx) if err != nil { return nil, false, err diff --git a/modules/setting/actions.go b/modules/setting/actions.go index 34346b62cf..4d610e745c 100644 --- a/modules/setting/actions.go +++ b/modules/setting/actions.go @@ -1,4 +1,5 @@ // Copyright 2022 The Gitea Authors. All rights reserved. +// Copyright 2026 MarketAlly. All rights reserved. 
// SPDX-License-Identifier: MIT
 
 package setting
@@ -25,6 +26,9 @@ var (
 		EndlessTaskTimeout    time.Duration     `ini:"ENDLESS_TASK_TIMEOUT"`
 		AbandonedJobTimeout   time.Duration     `ini:"ABANDONED_JOB_TIMEOUT"`
 		SkipWorkflowStrings   []string          `ini:"SKIP_WORKFLOW_STRINGS"`
+
+		// Runner health settings, mapped from the [actions.runner_health] section
+		RunnerHealthCheck RunnerHealthCheckSettings
 	}{
 		Enabled:               true,
 		DefaultActionsURL:     defaultActionsURLGitHub,
@@ -32,6 +36,18 @@ var (
 	}
 )
 
+// RunnerHealthCheckSettings configures runner health monitoring; non-positive numeric values are replaced by defaults on load.
+type RunnerHealthCheckSettings struct {
+	Enabled              bool          `ini:"ENABLED"`                // Turns the health checks on; treated as true when the key is absent
+	MinDiskPercent       float64       `ini:"MIN_DISK_PERCENT"`       // Runners with less than this % free disk are unhealthy (default 5)
+	MaxDiskUsagePercent  float64       `ini:"MAX_DISK_USAGE_PERCENT"` // Runners at or above this % used disk are flagged as needing cleanup (default 95)
+	MaxLatencyMs         float64       `ini:"MAX_LATENCY_MS"`         // Runners with latency above this many ms are unhealthy (default 500)
+	MaxCPULoadPercent    float64       `ini:"MAX_CPU_LOAD_PERCENT"`   // Runners with CPU load above this % are unhealthy (default 80)
+	AdminEmail           string        `ini:"ADMIN_EMAIL"`            // Recipient of unhealthy-runner alert mails; no mail is sent when empty
+	CleanupCooldown      time.Duration `ini:"CLEANUP_COOLDOWN"`       // Min time between cleanup requests for one runner (default 10m)
+	AutoCleanupThreshold float64       `ini:"AUTO_CLEANUP_THRESHOLD"` // Runner self-cleans at this disk % (default 90); NOTE(review): only defaulted, never read, in this change — confirm the consumer
+}
+
 type defaultActionsURL string
 
 func (url defaultActionsURL) URL() string {
@@ -49,10 +65,6 @@ func (url defaultActionsURL) URL() string {
 const (
 	defaultActionsURLGitHub = "github" // https://github.com
 	defaultActionsURLSelf   = "self"   // the root URL of the self-hosted Gitea instance
-	// DefaultActionsURL only supports GitHub and the self-hosted Gitea.
-	// It's intentionally not supported more, so please be cautious before adding more like "gitea" or "gitlab".
-	// If you get some trouble with `uses: username/action_name@version` in your workflow,
-	// please consider to use `uses: https://the_url_you_want_to_use/username/action_name@version` instead.
) type logCompression string @@ -89,6 +101,36 @@ func loadActionsFrom(rootCfg ConfigProvider) error { } } + // Load runner health check settings + healthSec := rootCfg.Section("actions.runner_health") + if err := healthSec.MapTo(&Actions.RunnerHealthCheck); err != nil { + return fmt.Errorf("failed to map runner health settings: %v", err) + } + + // Set defaults for runner health + if Actions.RunnerHealthCheck.MinDiskPercent <= 0 { + Actions.RunnerHealthCheck.MinDiskPercent = 5.0 // Need at least 5% free + } + if Actions.RunnerHealthCheck.MaxDiskUsagePercent <= 0 { + Actions.RunnerHealthCheck.MaxDiskUsagePercent = 95.0 // Alert at 95% used + } + if Actions.RunnerHealthCheck.MaxLatencyMs <= 0 { + Actions.RunnerHealthCheck.MaxLatencyMs = 500.0 // 500ms max latency + } + if Actions.RunnerHealthCheck.MaxCPULoadPercent <= 0 { + Actions.RunnerHealthCheck.MaxCPULoadPercent = 80.0 // Skip runners with >80% CPU load + } + if Actions.RunnerHealthCheck.CleanupCooldown <= 0 { + Actions.RunnerHealthCheck.CleanupCooldown = 10 * time.Minute + } + if Actions.RunnerHealthCheck.AutoCleanupThreshold <= 0 { + Actions.RunnerHealthCheck.AutoCleanupThreshold = 90.0 // Runner self-cleans at 90% + } + // Default enabled + if !healthSec.HasKey("ENABLED") { + Actions.RunnerHealthCheck.Enabled = true + } + // don't support to read configuration from [actions] Actions.LogStorage, err = getStorage(rootCfg, "actions_log", "", nil) if err != nil { diff --git a/routers/api/actions/runner/runner.go b/routers/api/actions/runner/runner.go index c42fd39442..60aada3169 100644 --- a/routers/api/actions/runner/runner.go +++ b/routers/api/actions/runner/runner.go @@ -188,10 +188,20 @@ func (s *Service) FetchTask( } } + // Check if there's a pending cleanup request for this runner + requestCleanup := false + if pendingCleanup, err := actions_model.GetPendingCleanupRequest(ctx, runner.ID); err != nil { + log.Warn("failed to check pending cleanup request: %v", err) + } else if pendingCleanup != nil { + 
requestCleanup = true + log.Info("Sending cleanup request to runner %s (request ID: %d)", runner.Name, pendingCleanup.ID) + } + res := connect.NewResponse(&runnerv1.FetchTaskResponse{ Task: task, TasksVersion: latestVersion, RequestBandwidthTest: requestBandwidthTest, + RequestCleanup: requestCleanup, }) return res, nil } diff --git a/services/actions/init.go b/services/actions/init.go index 7136da05ed..6a01ebd69b 100644 --- a/services/actions/init.go +++ b/services/actions/init.go @@ -1,4 +1,5 @@ // Copyright 2022 The Gitea Authors. All rights reserved. +// Copyright 2026 MarketAlly. All rights reserved. // SPDX-License-Identifier: MIT package actions @@ -67,5 +68,9 @@ func Init(ctx context.Context) error { go graceful.GetManager().RunWithCancel(jobEmitterQueue) notify_service.RegisterNotifier(NewNotifier()) + + // Start runner health monitoring + StartRunnerHealthMonitor(ctx) + return initGlobalRunnerToken(ctx) } diff --git a/services/actions/runner_health.go b/services/actions/runner_health.go new file mode 100644 index 0000000000..726406afc7 --- /dev/null +++ b/services/actions/runner_health.go @@ -0,0 +1,120 @@ +// Copyright 2026 MarketAlly. All rights reserved. 
+// SPDX-License-Identifier: MIT + +package actions + +import ( + "context" + "sync" + "time" + + actions_model "code.gitea.io/gitea/models/actions" + "code.gitea.io/gitea/modules/graceful" + "code.gitea.io/gitea/modules/log" + "code.gitea.io/gitea/modules/setting" + "code.gitea.io/gitea/services/mailer" +) + +var ( + // Track which runners we've already alerted about (to avoid spam) + alertedRunners = make(map[int64]time.Time) + alertedRunnersMu sync.Mutex + alertCooldown = time.Hour // Only alert once per hour per runner +) + +// StartRunnerHealthMonitor starts a background goroutine that monitors runner health +func StartRunnerHealthMonitor(ctx context.Context) { + if !setting.Actions.RunnerHealthCheck.Enabled { + log.Info("Runner health monitoring disabled") + return + } + + go runHealthMonitor(graceful.GetManager().ShutdownContext()) +} + +func runHealthMonitor(ctx context.Context) { + log.Info("Starting runner health monitor") + ticker := time.NewTicker(5 * time.Minute) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): + log.Info("Runner health monitor stopped") + return + case <-ticker.C: + checkRunnerHealth(ctx) + } + } +} + +// checkRunnerHealth checks all online runners and alerts on unhealthy ones +func checkRunnerHealth(ctx context.Context) { + unhealthyRunners, err := actions_model.GetUnhealthyRunners(ctx) + if err != nil { + log.Error("Failed to get unhealthy runners: %v", err) + return + } + + for _, runner := range unhealthyRunners { + healthStatus := runner.GetHealthStatus() + + // Check if we should send an alert + if shouldAlert(runner.ID) { + log.Warn("Runner %s (ID: %d) is unhealthy: %s", runner.Name, runner.ID, healthStatus.Reason) + + // Send email alert + mailer.SendRunnerHealthAlert(runner, healthStatus) + + // Mark as alerted + markAlerted(runner.ID) + + // Request cleanup if not recently requested + if healthStatus.NeedsCleanup { + canCleanup, err := actions_model.CanRequestCleanup(ctx, runner.ID) + if err != nil { + 
log.Error("Failed to check cleanup cooldown for runner %s: %v", runner.Name, err) + } else if canCleanup { + if _, err := actions_model.CreateCleanupRequest(ctx, runner.ID); err != nil { + log.Error("Failed to create cleanup request for runner %s: %v", runner.Name, err) + } else { + log.Info("Requested cleanup for unhealthy runner %s", runner.Name) + } + } + } + } + } +} + +// shouldAlert checks if we should send an alert for this runner +func shouldAlert(runnerID int64) bool { + alertedRunnersMu.Lock() + defer alertedRunnersMu.Unlock() + + lastAlert, exists := alertedRunners[runnerID] + if !exists { + return true + } + return time.Since(lastAlert) > alertCooldown +} + +// markAlerted marks a runner as alerted +func markAlerted(runnerID int64) { + alertedRunnersMu.Lock() + defer alertedRunnersMu.Unlock() + alertedRunners[runnerID] = time.Now() +} + +// ClearRunnerAlert clears the alert status for a runner (call when runner becomes healthy) +func ClearRunnerAlert(runnerID int64) { + alertedRunnersMu.Lock() + defer alertedRunnersMu.Unlock() + delete(alertedRunners, runnerID) +} + +// GetPendingCleanupRequests returns cleanup requests that haven't been completed +func GetPendingCleanupRequests(ctx context.Context) ([]*actions_model.RunnerCleanupRequest, error) { + // This will be used by the runner when it polls for tasks + // to check if it should perform cleanup + return nil, nil // TODO: implement +} diff --git a/services/mailer/mail_runner_health.go b/services/mailer/mail_runner_health.go new file mode 100644 index 0000000000..20809026fc --- /dev/null +++ b/services/mailer/mail_runner_health.go @@ -0,0 +1,85 @@ +// Copyright 2026 MarketAlly. All rights reserved. 
+// SPDX-License-Identifier: MIT + +package mailer + +import ( + "bytes" + "fmt" + + actions_model "code.gitea.io/gitea/models/actions" + "code.gitea.io/gitea/modules/log" + "code.gitea.io/gitea/modules/setting" + sender_service "code.gitea.io/gitea/services/mailer/sender" +) + +// SendRunnerHealthAlert sends an email notification about unhealthy runners +func SendRunnerHealthAlert(runner *actions_model.ActionRunner, healthStatus *actions_model.RunnerHealthStatus) { + if setting.MailService == nil { + log.Warn("Mail service not configured, cannot send runner health alert") + return + } + + adminEmail := setting.Actions.RunnerHealthCheck.AdminEmail + if adminEmail == "" { + log.Debug("No admin email configured for runner health alerts") + return + } + + subject := fmt.Sprintf("[%s] Runner Health Alert: %s", setting.AppName, runner.Name) + + var body bytes.Buffer + body.WriteString(fmt.Sprintf("Runner Health Alert\n")) + body.WriteString(fmt.Sprintf("==================\n\n")) + body.WriteString(fmt.Sprintf("Runner: %s (ID: %d)\n", runner.Name, runner.ID)) + body.WriteString(fmt.Sprintf("Status: UNHEALTHY\n")) + body.WriteString(fmt.Sprintf("Reason: %s\n\n", healthStatus.Reason)) + + body.WriteString(fmt.Sprintf("Health Details:\n")) + body.WriteString(fmt.Sprintf("--------------\n")) + body.WriteString(fmt.Sprintf("Disk Usage: %.1f%%\n", healthStatus.DiskUsedPercent)) + body.WriteString(fmt.Sprintf("Disk Free: %s\n", formatBytes(healthStatus.DiskFreeBytes))) + body.WriteString(fmt.Sprintf("CPU Load: %.1f%%\n", healthStatus.CPULoadPercent)) + body.WriteString(fmt.Sprintf("Latency: %.0f ms\n", healthStatus.LatencyMs)) + body.WriteString(fmt.Sprintf("Disk Healthy: %v\n", healthStatus.DiskHealthy)) + body.WriteString(fmt.Sprintf("CPU Healthy: %v\n", healthStatus.CPUHealthy)) + body.WriteString(fmt.Sprintf("Latency Healthy: %v\n", healthStatus.LatencyHealthy)) + body.WriteString(fmt.Sprintf("Needs Cleanup: %v\n\n", healthStatus.NeedsCleanup)) + + 
body.WriteString(fmt.Sprintf("Thresholds:\n")) + body.WriteString(fmt.Sprintf("-----------\n")) + body.WriteString(fmt.Sprintf("Min Free Disk: %.1f%%\n", setting.Actions.RunnerHealthCheck.MinDiskPercent)) + body.WriteString(fmt.Sprintf("Max Disk Usage: %.1f%%\n", setting.Actions.RunnerHealthCheck.MaxDiskUsagePercent)) + body.WriteString(fmt.Sprintf("Max CPU Load: %.1f%%\n", setting.Actions.RunnerHealthCheck.MaxCPULoadPercent)) + body.WriteString(fmt.Sprintf("Max Latency: %.0f ms\n\n", setting.Actions.RunnerHealthCheck.MaxLatencyMs)) + + body.WriteString(fmt.Sprintf("Action Taken:\n")) + body.WriteString(fmt.Sprintf("-------------\n")) + body.WriteString(fmt.Sprintf("- Jobs will not be assigned to this runner until it is healthy\n")) + if healthStatus.NeedsCleanup { + body.WriteString(fmt.Sprintf("- A cleanup request has been sent to the runner\n")) + } + body.WriteString(fmt.Sprintf("\nPlease investigate and resolve the issue.\n")) + body.WriteString(fmt.Sprintf("\n--\n%s\n%s\n", setting.AppName, setting.AppURL)) + + msg := sender_service.NewMessage(adminEmail, subject, body.String()) + msg.Info = fmt.Sprintf("Runner health alert for %s", runner.Name) + + if err := sender_service.Send(sender, msg); err != nil { + log.Error("Failed to send runner health alert: %v", err) + } +} + +// formatBytes formats bytes into human readable string +func formatBytes(bytes int64) string { + const unit = 1024 + if bytes < unit { + return fmt.Sprintf("%d B", bytes) + } + div, exp := int64(unit), 0 + for n := bytes / unit; n >= unit; n /= unit { + div *= unit + exp++ + } + return fmt.Sprintf("%.1f %cB", float64(bytes)/float64(div), "KMGTPE"[exp]) +}