2
0

feat(actions): add bandwidth health checks for runners
Some checks failed
Build and Release / Unit Tests (push) Successful in 3m9s
Build and Release / Create Release (push) Successful in 0s
Build and Release / Integration Tests (PostgreSQL) (push) Successful in 5m1s
Build and Release / Lint (push) Successful in 5m18s
Build and Release / Build Binaries (amd64, linux, linux-latest) (push) Successful in 3m2s
Build and Release / Build Binaries (amd64, windows, windows-latest) (push) Successful in 9h4m47s
Build and Release / Build Binaries (amd64, darwin, macos) (push) Successful in 11m9s
Build and Release / Build Binary (linux/arm64) (push) Has been cancelled
Build and Release / Build Binaries (arm64, darwin, macos) (push) Has been cancelled

Adds bandwidth monitoring to runner health checks with critical threshold of 8 Mbps (1 MB/s). Runners below this threshold are blocked from job assignment and trigger automatic bandwidth rechecks. Also refines health check logic: disk/CPU now only block at 95%+ (critical), and latency is informational only. Includes new RunnerBandwidthCheckRequest model to track recheck requests.
This commit is contained in:
2026-01-20 00:26:29 -05:00
parent a91f00e26b
commit 9a52b150fd
4 changed files with 174 additions and 57 deletions

View File

@@ -49,16 +49,19 @@ type BandwidthInfo struct {
// RunnerHealthStatus represents the health status of a runner
type RunnerHealthStatus struct {
Healthy bool `json:"healthy"`
DiskHealthy bool `json:"disk_healthy"`
CPUHealthy bool `json:"cpu_healthy"`
LatencyHealthy bool `json:"latency_healthy"`
DiskUsedPercent float64 `json:"disk_used_percent"`
DiskFreeBytes int64 `json:"disk_free_bytes"`
CPULoadPercent float64 `json:"cpu_load_percent"`
LatencyMs float64 `json:"latency_ms"`
Reason string `json:"reason,omitempty"`
NeedsCleanup bool `json:"needs_cleanup"`
Healthy bool `json:"healthy"`
DiskHealthy bool `json:"disk_healthy"`
CPUHealthy bool `json:"cpu_healthy"`
LatencyHealthy bool `json:"latency_healthy"`
BandwidthHealthy bool `json:"bandwidth_healthy"`
DiskUsedPercent float64 `json:"disk_used_percent"`
DiskFreeBytes int64 `json:"disk_free_bytes"`
CPULoadPercent float64 `json:"cpu_load_percent"`
LatencyMs float64 `json:"latency_ms"`
BandwidthMbps float64 `json:"bandwidth_mbps"`
Reason string `json:"reason,omitempty"`
NeedsCleanup bool `json:"needs_cleanup"`
NeedsBandwidthCheck bool `json:"needs_bandwidth_check"`
}
// GetCapabilities parses and returns the runner's capabilities
@@ -76,12 +79,18 @@ func (r *ActionRunner) GetCapabilities() *RunnerCapabilities {
}
// GetHealthStatus returns detailed health status of the runner
// Note: Only critical resource exhaustion blocks job assignment:
// - Disk usage >= 95%
// - CPU load >= 95%
// - Bandwidth < 1 MB/s (8 Mbps) - triggers recheck
// Latency is informational only and doesn't block job assignment.
func (r *ActionRunner) GetHealthStatus() *RunnerHealthStatus {
status := &RunnerHealthStatus{
Healthy: true,
DiskHealthy: true,
CPUHealthy: true,
LatencyHealthy: true,
Healthy: true,
DiskHealthy: true,
CPUHealthy: true,
LatencyHealthy: true,
BandwidthHealthy: true,
}
caps := r.GetCapabilities()
@@ -93,49 +102,71 @@ func (r *ActionRunner) GetHealthStatus() *RunnerHealthStatus {
healthSettings := setting.Actions.RunnerHealthCheck
// Check disk health
// Critical threshold for blocking job assignment
const criticalThreshold = 95.0
// Minimum bandwidth in Mbps (1 MB/s = 8 Mbps)
const minBandwidthMbps = 8.0
// Check disk health - blocks at 95%+
if caps.Disk != nil {
status.DiskUsedPercent = caps.Disk.UsedPercent
status.DiskFreeBytes = caps.Disk.FreeBytes
freePercent := 100.0 - caps.Disk.UsedPercent
if freePercent < healthSettings.MinDiskPercent {
if caps.Disk.UsedPercent >= criticalThreshold {
status.DiskHealthy = false
status.Healthy = false
status.Reason = "insufficient disk space"
status.Reason = "critical disk space (>=95%)"
status.NeedsCleanup = true
}
if caps.Disk.UsedPercent >= healthSettings.MaxDiskUsagePercent {
} else if caps.Disk.UsedPercent >= healthSettings.MaxDiskUsagePercent {
// Warn but don't block
status.NeedsCleanup = true
}
}
// Check CPU health
// Check CPU health - blocks at 95%+
if caps.CPU != nil {
status.CPULoadPercent = caps.CPU.LoadPercent
if caps.CPU.LoadPercent > healthSettings.MaxCPULoadPercent {
if caps.CPU.LoadPercent >= criticalThreshold {
status.CPUHealthy = false
status.Healthy = false
if status.Reason != "" {
status.Reason += "; "
}
status.Reason += "CPU overloaded"
}
}
// Check latency health
if caps.Bandwidth != nil {
status.LatencyMs = caps.Bandwidth.LatencyMs
if caps.Bandwidth.LatencyMs > healthSettings.MaxLatencyMs {
status.LatencyHealthy = false
status.Healthy = false
status.Reason += "critical CPU load (>=95%)"
} else if caps.CPU.LoadPercent > healthSettings.MaxCPULoadPercent {
// Warn but don't block
status.CPUHealthy = false
if status.Reason != "" {
status.Reason += "; "
}
status.Reason += "high latency"
status.Reason += "high CPU (warning)"
}
}
// Check bandwidth health - blocks below 1 MB/s and triggers recheck
if caps.Bandwidth != nil {
status.BandwidthMbps = caps.Bandwidth.DownloadMbps
status.LatencyMs = caps.Bandwidth.LatencyMs
if caps.Bandwidth.DownloadMbps > 0 && caps.Bandwidth.DownloadMbps < minBandwidthMbps {
status.BandwidthHealthy = false
status.Healthy = false
status.NeedsBandwidthCheck = true // Trigger recheck since this is tested infrequently
if status.Reason != "" {
status.Reason += "; "
}
status.Reason += "critical bandwidth (<1 MB/s)"
}
// Check latency - informational only, doesn't block
if caps.Bandwidth.LatencyMs > healthSettings.MaxLatencyMs {
status.LatencyHealthy = false
// Don't set Healthy = false - latency doesn't block assignment
if status.Reason != "" {
status.Reason += "; "
}
status.Reason += "high latency (warning)"
}
}
@@ -169,6 +200,7 @@ type RunnerCleanupRequest struct {
func init() {
db.RegisterModel(new(RunnerCleanupRequest))
db.RegisterModel(new(RunnerBandwidthCheckRequest))
}
// TableName returns the table name for RunnerCleanupRequest
@@ -264,3 +296,66 @@ func GetUnhealthyRunners(ctx context.Context) ([]*ActionRunner, error) {
}
return unhealthy, nil
}
// RunnerBandwidthCheckRequest tracks bandwidth recheck requests sent to runners.
// A row is created when a runner's measured bandwidth falls below the critical
// threshold; CompletedAt stays zero until the runner reports the recheck result.
type RunnerBandwidthCheckRequest struct {
	ID          int64              `xorm:"pk autoincr"`
	RunnerID    int64              `xorm:"INDEX NOT NULL"` // runner the recheck was requested for
	RequestedAt timeutil.TimeStamp `xorm:"created INDEX"`  // filled automatically by xorm on insert
	CompletedAt timeutil.TimeStamp `xorm:"INDEX"`          // zero while the request is still pending
	OldMbps     float64            // Bandwidth before recheck
	NewMbps     float64            // Bandwidth after recheck
}
// TableName returns the database table name for RunnerBandwidthCheckRequest,
// implementing xorm's TableName convention to pin the name explicitly.
func (RunnerBandwidthCheckRequest) TableName() string {
	return "runner_bandwidth_check_request"
}
// RequestBandwidthCheck creates a bandwidth recheck request for a runner
// This is used when bandwidth drops below threshold to trigger immediate recheck
func RequestBandwidthCheck(ctx context.Context, runnerID int64) error {
// Check if there's already a pending request (within last 5 minutes)
var existing RunnerBandwidthCheckRequest
fiveMinutesAgo := timeutil.TimeStampNow() - 300
has, err := db.GetEngine(ctx).Where("runner_id = ? AND requested_at > ? AND completed_at = 0", runnerID, fiveMinutesAgo).Get(&existing)
if err != nil {
return err
}
if has {
// Already have a recent pending request
return nil
}
req := &RunnerBandwidthCheckRequest{
RunnerID: runnerID,
}
_, err = db.GetEngine(ctx).Insert(req)
return err
}
// GetPendingBandwidthCheckRequest returns the most recent pending bandwidth
// check request for the given runner, or (nil, nil) when none is pending.
func GetPendingBandwidthCheckRequest(ctx context.Context, runnerID int64) (*RunnerBandwidthCheckRequest, error) {
	var pending RunnerBandwidthCheckRequest
	// Newest pending request wins; completed requests have a non-zero completed_at.
	found, err := db.GetEngine(ctx).
		Where("runner_id = ? AND completed_at = 0", runnerID).
		OrderBy("requested_at DESC").
		Limit(1).
		Get(&pending)
	switch {
	case err != nil:
		return nil, err
	case !found:
		return nil, nil
	default:
		return &pending, nil
	}
}
// CompleteBandwidthCheckRequest marks a bandwidth check request as completed
func CompleteBandwidthCheckRequest(ctx context.Context, id int64, oldMbps, newMbps float64) error {
_, err := db.GetEngine(ctx).ID(id).Cols("completed_at", "old_mbps", "new_mbps").Update(&RunnerBandwidthCheckRequest{
CompletedAt: timeutil.TimeStampNow(),
OldMbps: oldMbps,
NewMbps: newMbps,
})
return err
}

View File

@@ -55,7 +55,10 @@ func RunnerBandwidthScore(runner *ActionRunner) float64 {
}
// ShouldAssignJobToRunner determines if a job should be assigned to this runner
// considering bandwidth-aware routing
// considering bandwidth-aware routing.
//
// IMPORTANT: This function should NEVER leave a valid runner idle when there are
// waiting jobs. Bandwidth routing is a preference, not a hard block.
// Returns: (shouldAssign bool, reason string)
func ShouldAssignJobToRunner(ctx context.Context, runner *ActionRunner, job *ActionRunJob) (bool, string) {
if !setting.Actions.BandwidthAwareRouting {
@@ -72,33 +75,44 @@ func ShouldAssignJobToRunner(ctx context.Context, runner *ActionRunner, job *Act
// Calculate scores
myScore := RunnerBandwidthScore(runner)
// Find the best competing score
bestCompetingScore := 0.0
var bestCompetitor *ActionRunner
// Find the best competing score among IDLE runners only
bestIdleScore := 0.0
var bestIdleCompetitor *ActionRunner
allCompetitorsBusy := true
for _, r := range competingRunners {
score := RunnerBandwidthScore(r)
if score > bestCompetingScore {
bestCompetingScore = score
bestCompetitor = r
if isRunnerIdle(ctx, r) {
allCompetitorsBusy = false
score := RunnerBandwidthScore(r)
if score > bestIdleScore {
bestIdleScore = score
bestIdleCompetitor = r
}
}
}
// If requesting runner is within threshold of best, allow assignment
// This prevents slow runners from being completely starved
// If all competing runners are busy, always assign to this runner
// We should never leave a valid idle runner sitting when jobs are waiting
if allCompetitorsBusy {
return true, "all competing runners busy"
}
// If this runner is within threshold of best idle runner, allow assignment
threshold := setting.Actions.BandwidthScoreThreshold // default 20
if myScore >= bestCompetingScore-threshold {
return true, "within threshold of best runner"
if myScore >= bestIdleScore-threshold {
return true, "within threshold of best idle runner"
}
// If the better runner is busy, allow this runner to take it
if bestCompetitor != nil && !isRunnerIdle(ctx, bestCompetitor) {
return true, "better runner is busy"
// Only defer if there's actually a better idle runner available
// Give the better runner a brief window to claim it
if bestIdleCompetitor != nil {
log.Debug("Runner %s (score: %.1f) deferred job to faster idle runner %s (score: %.1f)",
runner.Name, myScore, bestIdleCompetitor.Name, bestIdleScore)
return false, "faster idle runner available"
}
log.Debug("Runner %s (score: %.1f) deferred job to faster runner %s (score: %.1f)",
runner.Name, myScore, bestCompetitor.Name, bestCompetingScore)
return false, "faster runner available"
// Default: always assign rather than leave runner idle
return true, "default assignment"
}
// findCompetingRunners finds other online runners that could handle this job

View File

@@ -243,6 +243,15 @@ func CreateTaskForRunner(ctx context.Context, runner *ActionRunner) (*ActionTask
}
}
}
// Request bandwidth recheck if needed (bandwidth is tested infrequently)
if healthStatus.NeedsBandwidthCheck {
if err := RequestBandwidthCheck(ctx, runner.ID); err != nil {
log.Error("Failed to request bandwidth check for runner %s: %v", runner.Name, err)
} else {
log.Info("Requested bandwidth recheck for runner %s (current: %.1f Mbps)", runner.Name, healthStatus.BandwidthMbps)
}
}
}
}