From 9a52b150fd24eb6d4982c781d6aa225446e39841 Mon Sep 17 00:00:00 2001 From: logikonline Date: Tue, 20 Jan 2026 00:26:29 -0500 Subject: [PATCH] feat(actions): add bandwidth health checks for runners Adds bandwidth monitoring to runner health checks with critical threshold of 8 Mbps (1 MB/s). Runners below this threshold are blocked from job assignment and trigger automatic bandwidth rechecks. Also refines health check logic: disk/CPU now only block at 95%+ (critical), and latency is informational only. Includes new RunnerBandwidthCheckRequest model to track recheck requests. --- .notes/note-1768881142114-zwzrej9gc.json | 5 +- models/actions/runner_health.go | 165 ++++++++++++++++++----- models/actions/runner_routing.go | 52 ++++--- models/actions/task.go | 9 ++ 4 files changed, 174 insertions(+), 57 deletions(-) diff --git a/.notes/note-1768881142114-zwzrej9gc.json b/.notes/note-1768881142114-zwzrej9gc.json index 32194018b0..aa2594164a 100644 --- a/.notes/note-1768881142114-zwzrej9gc.json +++ b/.notes/note-1768881142114-zwzrej9gc.json @@ -1,8 +1,7 @@ { "id": "note-1768881142114-zwzrej9gc", "title": "Release Notes for Gallery", - "content": " New Feature: Repository Home Page Tabs\n\n The repository home page now features a tabbed interface replacing the single README display. When content is available, users will see tabs for:\n\n - Readme - Displays the README file (default tab, existing behavior)\n - License - Displays the LICENSE file content\n - Gallery - Displays project screenshots from the .gallery folder\n\n Tabs only appear when the corresponding content exists in the repository.\n\n ---\n New Feature: Repository Gallery\n\n Developers can now showcase screenshots and images of their applications directly on the repository home page.\n\n Gallery Tab Features:\n - Responsive image grid layout\n - Lightbox viewer for full-size images with keyboard navigation (← →)\n - Custom captions for each image\n - Lazy loading for better performance\n\n Gallery Settings Page (Repository > Settings > Gallery):\n - Upload images (PNG, JPG, JPEG, GIF, WebP, SVG, BMP, ICO)\n - 5MB maximum file size per image\n - Add/edit captions for each image\n - Delete images with confirmation\n\n Storage:\n - Images are stored in the .gallery folder at the repository root\n - Captions are stored in .gallery/gallery.json\n - All changes are committed to the repository automatically\n\n ---\n Localization\n\n Full translations added for gallery-related strings in 20+ languages:\n - English, German, French, Spanish, Italian, Dutch, Polish\n - Portuguese (Brazil & Portugal)\n - Russian, Ukrainian, Turkish, Czech, Greek, Hungarian\n - Finnish, Swedish\n - Japanese, Korean, Chinese (Simplified & Traditional)", + "content": " New Feature: Repository Home Page Tabs\n\n The repository home page now features a tabbed interface replacing the single README display. When content is available, users will see tabs for:\n\n - Readme - Displays the README file (default tab, existing behavior)\n - License - Displays the LICENSE file content\n - Gallery - Displays project screenshots from the .gallery folder\n\n Tabs only appear when the corresponding content exists in the repository.\n\n ---\n New Feature: Repository Gallery\n\n Developers can now showcase screenshots and images of their applications directly on the repository home page.\n\n Gallery Tab Features:\n - Responsive image grid layout\n - Lightbox viewer for full-size images with keyboard navigation (\u2190 \u2192)\n - Custom captions for each image\n - Lazy loading for better performance\n\n Gallery Settings Page (Repository \u003E Settings \u003E Gallery):\n - Upload images (PNG, JPG, JPEG, GIF, WebP, SVG, BMP, ICO)\n - 5MB maximum file size per image\n - Add/edit captions for each image\n - Delete images with confirmation\n\n Storage:\n - Images are stored in the .gallery folder at the repository root\n - Captions are stored in .gallery/gallery.json\n - All changes are committed to the repository automatically\n\n ---\n Localization\n\n Full translations added for gallery-related strings in 20\u002B languages:\n - English, German, French, Spanish, Italian, Dutch, Polish\n - Portuguese (Brazil \u0026 Portugal)\n - Russian, Ukrainian, Turkish, Czech, Greek, Hungarian\n - Finnish, Swedish\n - Japanese, Korean, Chinese (Simplified \u0026 Traditional)\n", "createdAt": 1768881142111, - "updatedAt": 1768881147863, - "tags": [] + "updatedAt": 1768883658811 } \ No newline at end of file diff --git a/models/actions/runner_health.go b/models/actions/runner_health.go index ef54dac4be..26bed8b91d 100644 --- a/models/actions/runner_health.go +++ b/models/actions/runner_health.go @@ -49,16 +49,19 @@ type BandwidthInfo struct { // RunnerHealthStatus represents the health status of a runner type RunnerHealthStatus struct { - Healthy bool `json:"healthy"` - DiskHealthy bool `json:"disk_healthy"` - CPUHealthy bool `json:"cpu_healthy"` - LatencyHealthy bool `json:"latency_healthy"` - DiskUsedPercent float64 `json:"disk_used_percent"` - DiskFreeBytes int64 `json:"disk_free_bytes"` - CPULoadPercent float64 `json:"cpu_load_percent"` - LatencyMs float64 `json:"latency_ms"` - Reason string `json:"reason,omitempty"` - NeedsCleanup bool `json:"needs_cleanup"` + Healthy bool `json:"healthy"` + DiskHealthy bool `json:"disk_healthy"` + CPUHealthy bool `json:"cpu_healthy"` + LatencyHealthy bool `json:"latency_healthy"` + BandwidthHealthy bool `json:"bandwidth_healthy"` + DiskUsedPercent float64 `json:"disk_used_percent"` + DiskFreeBytes int64 `json:"disk_free_bytes"` + CPULoadPercent float64 `json:"cpu_load_percent"` + LatencyMs float64 `json:"latency_ms"` + BandwidthMbps float64 `json:"bandwidth_mbps"` + Reason string `json:"reason,omitempty"` + NeedsCleanup bool `json:"needs_cleanup"` + NeedsBandwidthCheck bool `json:"needs_bandwidth_check"` } // GetCapabilities parses and returns the runner's capabilities @@ -76,12 +79,18 @@ func (r *ActionRunner) GetCapabilities() *RunnerCapabilities { } // GetHealthStatus returns detailed health status of the runner +// Note: Only critical resource exhaustion blocks job assignment: +// - Disk usage >= 95% +// - CPU load >= 95% +// - Bandwidth < 1 MB/s (8 Mbps) - triggers recheck +// Latency is informational only and doesn't block job assignment. func (r *ActionRunner) GetHealthStatus() *RunnerHealthStatus { status := &RunnerHealthStatus{ - Healthy: true, - DiskHealthy: true, - CPUHealthy: true, - LatencyHealthy: true, + Healthy: true, + DiskHealthy: true, + CPUHealthy: true, + LatencyHealthy: true, + BandwidthHealthy: true, } caps := r.GetCapabilities() @@ -93,49 +102,71 @@ func (r *ActionRunner) GetHealthStatus() *RunnerHealthStatus { healthSettings := setting.Actions.RunnerHealthCheck - // Check disk health + // Critical threshold for blocking job assignment + const criticalThreshold = 95.0 + // Minimum bandwidth in Mbps (1 MB/s = 8 Mbps) + const minBandwidthMbps = 8.0 + + // Check disk health - blocks at 95%+ if caps.Disk != nil { status.DiskUsedPercent = caps.Disk.UsedPercent status.DiskFreeBytes = caps.Disk.FreeBytes - freePercent := 100.0 - caps.Disk.UsedPercent - if freePercent < healthSettings.MinDiskPercent { + if caps.Disk.UsedPercent >= criticalThreshold { status.DiskHealthy = false status.Healthy = false - status.Reason = "insufficient disk space" + status.Reason = "critical disk space (>=95%)" status.NeedsCleanup = true - } - - if caps.Disk.UsedPercent >= healthSettings.MaxDiskUsagePercent { + } else if caps.Disk.UsedPercent >= healthSettings.MaxDiskUsagePercent { + // Warn but don't block status.NeedsCleanup = true } } - // Check CPU health + // Check CPU health - blocks at 95%+ if caps.CPU != nil { status.CPULoadPercent = caps.CPU.LoadPercent - if caps.CPU.LoadPercent > healthSettings.MaxCPULoadPercent { + if caps.CPU.LoadPercent >= criticalThreshold { status.CPUHealthy = false status.Healthy = false if status.Reason != "" { status.Reason += "; " } - status.Reason += "CPU overloaded" - } - } - - // Check latency health - if caps.Bandwidth != nil { - status.LatencyMs = caps.Bandwidth.LatencyMs - - if caps.Bandwidth.LatencyMs > healthSettings.MaxLatencyMs { - status.LatencyHealthy = false - status.Healthy = false + status.Reason += "critical CPU load (>=95%)" + } else if caps.CPU.LoadPercent > healthSettings.MaxCPULoadPercent { + // Warn but don't block + status.CPUHealthy = false if status.Reason != "" { status.Reason += "; " } - status.Reason += "high latency" + status.Reason += "high CPU (warning)" + } + } + + // Check bandwidth health - blocks below 1 MB/s and triggers recheck + if caps.Bandwidth != nil { + status.BandwidthMbps = caps.Bandwidth.DownloadMbps + status.LatencyMs = caps.Bandwidth.LatencyMs + + if caps.Bandwidth.DownloadMbps > 0 && caps.Bandwidth.DownloadMbps < minBandwidthMbps { + status.BandwidthHealthy = false + status.Healthy = false + status.NeedsBandwidthCheck = true // Trigger recheck since this is tested infrequently + if status.Reason != "" { + status.Reason += "; " + } + status.Reason += "critical bandwidth (<1 MB/s)" + } + + // Check latency - informational only, doesn't block + if caps.Bandwidth.LatencyMs > healthSettings.MaxLatencyMs { + status.LatencyHealthy = false + // Don't set Healthy = false - latency doesn't block assignment + if status.Reason != "" { + status.Reason += "; " + } + status.Reason += "high latency (warning)" } } @@ -169,6 +200,7 @@ type RunnerCleanupRequest struct { func init() { db.RegisterModel(new(RunnerCleanupRequest)) + db.RegisterModel(new(RunnerBandwidthCheckRequest)) } // TableName returns the table name for RunnerCleanupRequest @@ -264,3 +296,66 @@ func GetUnhealthyRunners(ctx context.Context) ([]*ActionRunner, error) { } return unhealthy, nil } + +// RunnerBandwidthCheckRequest tracks bandwidth recheck requests sent to runners +type RunnerBandwidthCheckRequest struct { + ID int64 `xorm:"pk autoincr"` + RunnerID int64 `xorm:"INDEX NOT NULL"` + RequestedAt timeutil.TimeStamp `xorm:"created INDEX"` + CompletedAt timeutil.TimeStamp `xorm:"INDEX"` + OldMbps float64 // Bandwidth before recheck + NewMbps float64 // Bandwidth after recheck +} + +// TableName returns the table name for RunnerBandwidthCheckRequest +func (RunnerBandwidthCheckRequest) TableName() string { + return "runner_bandwidth_check_request" +} + +// RequestBandwidthCheck creates a bandwidth recheck request for a runner +// This is used when bandwidth drops below threshold to trigger immediate recheck +func RequestBandwidthCheck(ctx context.Context, runnerID int64) error { + // Check if there's already a pending request (within last 5 minutes) + var existing RunnerBandwidthCheckRequest + fiveMinutesAgo := timeutil.TimeStampNow() - 300 + has, err := db.GetEngine(ctx).Where("runner_id = ? AND requested_at > ? AND completed_at = 0", runnerID, fiveMinutesAgo).Get(&existing) + if err != nil { + return err + } + if has { + // Already have a recent pending request + return nil + } + + req := &RunnerBandwidthCheckRequest{ + RunnerID: runnerID, + } + _, err = db.GetEngine(ctx).Insert(req) + return err +} + +// GetPendingBandwidthCheckRequest returns the pending bandwidth check request for a runner +func GetPendingBandwidthCheckRequest(ctx context.Context, runnerID int64) (*RunnerBandwidthCheckRequest, error) { + req := &RunnerBandwidthCheckRequest{} + has, err := db.GetEngine(ctx).Where("runner_id = ? AND completed_at = 0", runnerID). + OrderBy("requested_at DESC"). + Limit(1). + Get(req) + if err != nil { + return nil, err + } + if !has { + return nil, nil + } + return req, nil +} + +// CompleteBandwidthCheckRequest marks a bandwidth check request as completed +func CompleteBandwidthCheckRequest(ctx context.Context, id int64, oldMbps, newMbps float64) error { + _, err := db.GetEngine(ctx).ID(id).Cols("completed_at", "old_mbps", "new_mbps").Update(&RunnerBandwidthCheckRequest{ + CompletedAt: timeutil.TimeStampNow(), + OldMbps: oldMbps, + NewMbps: newMbps, + }) + return err +} diff --git a/models/actions/runner_routing.go b/models/actions/runner_routing.go index ab398db99d..1781437279 100644 --- a/models/actions/runner_routing.go +++ b/models/actions/runner_routing.go @@ -55,7 +55,10 @@ func RunnerBandwidthScore(runner *ActionRunner) float64 { } // ShouldAssignJobToRunner determines if a job should be assigned to this runner -// considering bandwidth-aware routing +// considering bandwidth-aware routing. +// +// IMPORTANT: This function should NEVER leave a valid runner idle when there are +// waiting jobs. Bandwidth routing is a preference, not a hard block. // Returns: (shouldAssign bool, reason string) func ShouldAssignJobToRunner(ctx context.Context, runner *ActionRunner, job *ActionRunJob) (bool, string) { if !setting.Actions.BandwidthAwareRouting { @@ -72,33 +75,44 @@ func ShouldAssignJobToRunner(ctx context.Context, runner *ActionRunner, job *Act // Calculate scores myScore := RunnerBandwidthScore(runner) - // Find the best competing score - bestCompetingScore := 0.0 - var bestCompetitor *ActionRunner + // Find the best competing score among IDLE runners only + bestIdleScore := 0.0 + var bestIdleCompetitor *ActionRunner + allCompetitorsBusy := true + for _, r := range competingRunners { - score := RunnerBandwidthScore(r) - if score > bestCompetingScore { - bestCompetingScore = score - bestCompetitor = r + if isRunnerIdle(ctx, r) { + allCompetitorsBusy = false + score := RunnerBandwidthScore(r) + if score > bestIdleScore { + bestIdleScore = score + bestIdleCompetitor = r + } } } - // If requesting runner is within threshold of best, allow assignment - // This prevents slow runners from being completely starved + // If all competing runners are busy, always assign to this runner + // We should never leave a valid idle runner sitting when jobs are waiting + if allCompetitorsBusy { + return true, "all competing runners busy" + } + + // If this runner is within threshold of best idle runner, allow assignment threshold := setting.Actions.BandwidthScoreThreshold // default 20 - if myScore >= bestCompetingScore-threshold { - return true, "within threshold of best runner" + if myScore >= bestIdleScore-threshold { + return true, "within threshold of best idle runner" } - // If the better runner is busy, allow this runner to take it - if bestCompetitor != nil && !isRunnerIdle(ctx, bestCompetitor) { - return true, "better runner is busy" + // Only defer if there's actually a better idle runner available + // Give the better runner a brief window to claim it + if bestIdleCompetitor != nil { + log.Debug("Runner %s (score: %.1f) deferred job to faster idle runner %s (score: %.1f)", + runner.Name, myScore, bestIdleCompetitor.Name, bestIdleScore) + return false, "faster idle runner available" } - log.Debug("Runner %s (score: %.1f) deferred job to faster runner %s (score: %.1f)", - runner.Name, myScore, bestCompetitor.Name, bestCompetingScore) - - return false, "faster runner available" + // Default: always assign rather than leave runner idle + return true, "default assignment" } // findCompetingRunners finds other online runners that could handle this job diff --git a/models/actions/task.go b/models/actions/task.go index 60451e1453..8b0aded093 100644 --- a/models/actions/task.go +++ b/models/actions/task.go @@ -243,6 +243,15 @@ func CreateTaskForRunner(ctx context.Context, runner *ActionRunner) (*ActionTask } } } + + // Request bandwidth recheck if needed (bandwidth is tested infrequently) + if healthStatus.NeedsBandwidthCheck { + if err := RequestBandwidthCheck(ctx, runner.ID); err != nil { + log.Error("Failed to request bandwidth check for runner %s: %v", runner.Name, err) + } else { + log.Info("Requested bandwidth recheck for runner %s (current: %.1f Mbps)", runner.Name, healthStatus.BandwidthMbps) + } + } } }