feat(actions): add bandwidth health checks for runners
Some checks failed
Build and Release / Unit Tests (push) Successful in 3m9s
Build and Release / Create Release (push) Successful in 0s
Build and Release / Integration Tests (PostgreSQL) (push) Successful in 5m1s
Build and Release / Lint (push) Successful in 5m18s
Build and Release / Build Binaries (amd64, linux, linux-latest) (push) Successful in 3m2s
Build and Release / Build Binaries (amd64, windows, windows-latest) (push) Successful in 9h4m47s
Build and Release / Build Binaries (amd64, darwin, macos) (push) Successful in 11m9s
Build and Release / Build Binary (linux/arm64) (push) Has been cancelled
Build and Release / Build Binaries (arm64, darwin, macos) (push) Has been cancelled
Some checks failed
Build and Release / Unit Tests (push) Successful in 3m9s
Build and Release / Create Release (push) Successful in 0s
Build and Release / Integration Tests (PostgreSQL) (push) Successful in 5m1s
Build and Release / Lint (push) Successful in 5m18s
Build and Release / Build Binaries (amd64, linux, linux-latest) (push) Successful in 3m2s
Build and Release / Build Binaries (amd64, windows, windows-latest) (push) Successful in 9h4m47s
Build and Release / Build Binaries (amd64, darwin, macos) (push) Successful in 11m9s
Build and Release / Build Binary (linux/arm64) (push) Has been cancelled
Build and Release / Build Binaries (arm64, darwin, macos) (push) Has been cancelled
Adds bandwidth monitoring to runner health checks with a critical threshold of 8 Mbps (1 MB/s). Runners below this threshold are blocked from job assignment and trigger automatic bandwidth rechecks. Also refines health check logic: disk/CPU now only block at 95%+ (critical), and latency is informational only. Includes new RunnerBandwidthCheckRequest model to track recheck requests.
This commit is contained in:
@@ -49,16 +49,19 @@ type BandwidthInfo struct {
|
||||
|
||||
// RunnerHealthStatus represents the health status of a runner
|
||||
type RunnerHealthStatus struct {
|
||||
Healthy bool `json:"healthy"`
|
||||
DiskHealthy bool `json:"disk_healthy"`
|
||||
CPUHealthy bool `json:"cpu_healthy"`
|
||||
LatencyHealthy bool `json:"latency_healthy"`
|
||||
DiskUsedPercent float64 `json:"disk_used_percent"`
|
||||
DiskFreeBytes int64 `json:"disk_free_bytes"`
|
||||
CPULoadPercent float64 `json:"cpu_load_percent"`
|
||||
LatencyMs float64 `json:"latency_ms"`
|
||||
Reason string `json:"reason,omitempty"`
|
||||
NeedsCleanup bool `json:"needs_cleanup"`
|
||||
Healthy bool `json:"healthy"`
|
||||
DiskHealthy bool `json:"disk_healthy"`
|
||||
CPUHealthy bool `json:"cpu_healthy"`
|
||||
LatencyHealthy bool `json:"latency_healthy"`
|
||||
BandwidthHealthy bool `json:"bandwidth_healthy"`
|
||||
DiskUsedPercent float64 `json:"disk_used_percent"`
|
||||
DiskFreeBytes int64 `json:"disk_free_bytes"`
|
||||
CPULoadPercent float64 `json:"cpu_load_percent"`
|
||||
LatencyMs float64 `json:"latency_ms"`
|
||||
BandwidthMbps float64 `json:"bandwidth_mbps"`
|
||||
Reason string `json:"reason,omitempty"`
|
||||
NeedsCleanup bool `json:"needs_cleanup"`
|
||||
NeedsBandwidthCheck bool `json:"needs_bandwidth_check"`
|
||||
}
|
||||
|
||||
// GetCapabilities parses and returns the runner's capabilities
|
||||
@@ -76,12 +79,18 @@ func (r *ActionRunner) GetCapabilities() *RunnerCapabilities {
|
||||
}
|
||||
|
||||
// GetHealthStatus returns detailed health status of the runner
|
||||
// Note: Only critical resource exhaustion blocks job assignment:
|
||||
// - Disk usage >= 95%
|
||||
// - CPU load >= 95%
|
||||
// - Bandwidth < 1 MB/s (8 Mbps) - triggers recheck
|
||||
// Latency is informational only and doesn't block job assignment.
|
||||
func (r *ActionRunner) GetHealthStatus() *RunnerHealthStatus {
|
||||
status := &RunnerHealthStatus{
|
||||
Healthy: true,
|
||||
DiskHealthy: true,
|
||||
CPUHealthy: true,
|
||||
LatencyHealthy: true,
|
||||
Healthy: true,
|
||||
DiskHealthy: true,
|
||||
CPUHealthy: true,
|
||||
LatencyHealthy: true,
|
||||
BandwidthHealthy: true,
|
||||
}
|
||||
|
||||
caps := r.GetCapabilities()
|
||||
@@ -93,49 +102,71 @@ func (r *ActionRunner) GetHealthStatus() *RunnerHealthStatus {
|
||||
|
||||
healthSettings := setting.Actions.RunnerHealthCheck
|
||||
|
||||
// Check disk health
|
||||
// Critical threshold for blocking job assignment
|
||||
const criticalThreshold = 95.0
|
||||
// Minimum bandwidth in Mbps (1 MB/s = 8 Mbps)
|
||||
const minBandwidthMbps = 8.0
|
||||
|
||||
// Check disk health - blocks at 95%+
|
||||
if caps.Disk != nil {
|
||||
status.DiskUsedPercent = caps.Disk.UsedPercent
|
||||
status.DiskFreeBytes = caps.Disk.FreeBytes
|
||||
|
||||
freePercent := 100.0 - caps.Disk.UsedPercent
|
||||
if freePercent < healthSettings.MinDiskPercent {
|
||||
if caps.Disk.UsedPercent >= criticalThreshold {
|
||||
status.DiskHealthy = false
|
||||
status.Healthy = false
|
||||
status.Reason = "insufficient disk space"
|
||||
status.Reason = "critical disk space (>=95%)"
|
||||
status.NeedsCleanup = true
|
||||
}
|
||||
|
||||
if caps.Disk.UsedPercent >= healthSettings.MaxDiskUsagePercent {
|
||||
} else if caps.Disk.UsedPercent >= healthSettings.MaxDiskUsagePercent {
|
||||
// Warn but don't block
|
||||
status.NeedsCleanup = true
|
||||
}
|
||||
}
|
||||
|
||||
// Check CPU health
|
||||
// Check CPU health - blocks at 95%+
|
||||
if caps.CPU != nil {
|
||||
status.CPULoadPercent = caps.CPU.LoadPercent
|
||||
|
||||
if caps.CPU.LoadPercent > healthSettings.MaxCPULoadPercent {
|
||||
if caps.CPU.LoadPercent >= criticalThreshold {
|
||||
status.CPUHealthy = false
|
||||
status.Healthy = false
|
||||
if status.Reason != "" {
|
||||
status.Reason += "; "
|
||||
}
|
||||
status.Reason += "CPU overloaded"
|
||||
}
|
||||
}
|
||||
|
||||
// Check latency health
|
||||
if caps.Bandwidth != nil {
|
||||
status.LatencyMs = caps.Bandwidth.LatencyMs
|
||||
|
||||
if caps.Bandwidth.LatencyMs > healthSettings.MaxLatencyMs {
|
||||
status.LatencyHealthy = false
|
||||
status.Healthy = false
|
||||
status.Reason += "critical CPU load (>=95%)"
|
||||
} else if caps.CPU.LoadPercent > healthSettings.MaxCPULoadPercent {
|
||||
// Warn but don't block
|
||||
status.CPUHealthy = false
|
||||
if status.Reason != "" {
|
||||
status.Reason += "; "
|
||||
}
|
||||
status.Reason += "high latency"
|
||||
status.Reason += "high CPU (warning)"
|
||||
}
|
||||
}
|
||||
|
||||
// Check bandwidth health - blocks below 1 MB/s and triggers recheck
|
||||
if caps.Bandwidth != nil {
|
||||
status.BandwidthMbps = caps.Bandwidth.DownloadMbps
|
||||
status.LatencyMs = caps.Bandwidth.LatencyMs
|
||||
|
||||
if caps.Bandwidth.DownloadMbps > 0 && caps.Bandwidth.DownloadMbps < minBandwidthMbps {
|
||||
status.BandwidthHealthy = false
|
||||
status.Healthy = false
|
||||
status.NeedsBandwidthCheck = true // Trigger recheck since this is tested infrequently
|
||||
if status.Reason != "" {
|
||||
status.Reason += "; "
|
||||
}
|
||||
status.Reason += "critical bandwidth (<1 MB/s)"
|
||||
}
|
||||
|
||||
// Check latency - informational only, doesn't block
|
||||
if caps.Bandwidth.LatencyMs > healthSettings.MaxLatencyMs {
|
||||
status.LatencyHealthy = false
|
||||
// Don't set Healthy = false - latency doesn't block assignment
|
||||
if status.Reason != "" {
|
||||
status.Reason += "; "
|
||||
}
|
||||
status.Reason += "high latency (warning)"
|
||||
}
|
||||
}
|
||||
|
||||
@@ -169,6 +200,7 @@ type RunnerCleanupRequest struct {
|
||||
|
||||
func init() {
|
||||
db.RegisterModel(new(RunnerCleanupRequest))
|
||||
db.RegisterModel(new(RunnerBandwidthCheckRequest))
|
||||
}
|
||||
|
||||
// TableName returns the table name for RunnerCleanupRequest
|
||||
@@ -264,3 +296,66 @@ func GetUnhealthyRunners(ctx context.Context) ([]*ActionRunner, error) {
|
||||
}
|
||||
return unhealthy, nil
|
||||
}
|
||||
|
||||
// RunnerBandwidthCheckRequest tracks bandwidth recheck requests sent to runners
|
||||
type RunnerBandwidthCheckRequest struct {
|
||||
ID int64 `xorm:"pk autoincr"`
|
||||
RunnerID int64 `xorm:"INDEX NOT NULL"`
|
||||
RequestedAt timeutil.TimeStamp `xorm:"created INDEX"`
|
||||
CompletedAt timeutil.TimeStamp `xorm:"INDEX"`
|
||||
OldMbps float64 // Bandwidth before recheck
|
||||
NewMbps float64 // Bandwidth after recheck
|
||||
}
|
||||
|
||||
// TableName returns the table name for RunnerBandwidthCheckRequest
|
||||
func (RunnerBandwidthCheckRequest) TableName() string {
|
||||
return "runner_bandwidth_check_request"
|
||||
}
|
||||
|
||||
// RequestBandwidthCheck creates a bandwidth recheck request for a runner
|
||||
// This is used when bandwidth drops below threshold to trigger immediate recheck
|
||||
func RequestBandwidthCheck(ctx context.Context, runnerID int64) error {
|
||||
// Check if there's already a pending request (within last 5 minutes)
|
||||
var existing RunnerBandwidthCheckRequest
|
||||
fiveMinutesAgo := timeutil.TimeStampNow() - 300
|
||||
has, err := db.GetEngine(ctx).Where("runner_id = ? AND requested_at > ? AND completed_at = 0", runnerID, fiveMinutesAgo).Get(&existing)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if has {
|
||||
// Already have a recent pending request
|
||||
return nil
|
||||
}
|
||||
|
||||
req := &RunnerBandwidthCheckRequest{
|
||||
RunnerID: runnerID,
|
||||
}
|
||||
_, err = db.GetEngine(ctx).Insert(req)
|
||||
return err
|
||||
}
|
||||
|
||||
// GetPendingBandwidthCheckRequest returns the pending bandwidth check request for a runner
|
||||
func GetPendingBandwidthCheckRequest(ctx context.Context, runnerID int64) (*RunnerBandwidthCheckRequest, error) {
|
||||
req := &RunnerBandwidthCheckRequest{}
|
||||
has, err := db.GetEngine(ctx).Where("runner_id = ? AND completed_at = 0", runnerID).
|
||||
OrderBy("requested_at DESC").
|
||||
Limit(1).
|
||||
Get(req)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if !has {
|
||||
return nil, nil
|
||||
}
|
||||
return req, nil
|
||||
}
|
||||
|
||||
// CompleteBandwidthCheckRequest marks a bandwidth check request as completed
|
||||
func CompleteBandwidthCheckRequest(ctx context.Context, id int64, oldMbps, newMbps float64) error {
|
||||
_, err := db.GetEngine(ctx).ID(id).Cols("completed_at", "old_mbps", "new_mbps").Update(&RunnerBandwidthCheckRequest{
|
||||
CompletedAt: timeutil.TimeStampNow(),
|
||||
OldMbps: oldMbps,
|
||||
NewMbps: newMbps,
|
||||
})
|
||||
return err
|
||||
}
|
||||
|
||||
@@ -55,7 +55,10 @@ func RunnerBandwidthScore(runner *ActionRunner) float64 {
|
||||
}
|
||||
|
||||
// ShouldAssignJobToRunner determines if a job should be assigned to this runner
|
||||
// considering bandwidth-aware routing
|
||||
// considering bandwidth-aware routing.
|
||||
//
|
||||
// IMPORTANT: This function should NEVER leave a valid runner idle when there are
|
||||
// waiting jobs. Bandwidth routing is a preference, not a hard block.
|
||||
// Returns: (shouldAssign bool, reason string)
|
||||
func ShouldAssignJobToRunner(ctx context.Context, runner *ActionRunner, job *ActionRunJob) (bool, string) {
|
||||
if !setting.Actions.BandwidthAwareRouting {
|
||||
@@ -72,33 +75,44 @@ func ShouldAssignJobToRunner(ctx context.Context, runner *ActionRunner, job *Act
|
||||
// Calculate scores
|
||||
myScore := RunnerBandwidthScore(runner)
|
||||
|
||||
// Find the best competing score
|
||||
bestCompetingScore := 0.0
|
||||
var bestCompetitor *ActionRunner
|
||||
// Find the best competing score among IDLE runners only
|
||||
bestIdleScore := 0.0
|
||||
var bestIdleCompetitor *ActionRunner
|
||||
allCompetitorsBusy := true
|
||||
|
||||
for _, r := range competingRunners {
|
||||
score := RunnerBandwidthScore(r)
|
||||
if score > bestCompetingScore {
|
||||
bestCompetingScore = score
|
||||
bestCompetitor = r
|
||||
if isRunnerIdle(ctx, r) {
|
||||
allCompetitorsBusy = false
|
||||
score := RunnerBandwidthScore(r)
|
||||
if score > bestIdleScore {
|
||||
bestIdleScore = score
|
||||
bestIdleCompetitor = r
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// If requesting runner is within threshold of best, allow assignment
|
||||
// This prevents slow runners from being completely starved
|
||||
// If all competing runners are busy, always assign to this runner
|
||||
// We should never leave a valid idle runner sitting when jobs are waiting
|
||||
if allCompetitorsBusy {
|
||||
return true, "all competing runners busy"
|
||||
}
|
||||
|
||||
// If this runner is within threshold of best idle runner, allow assignment
|
||||
threshold := setting.Actions.BandwidthScoreThreshold // default 20
|
||||
if myScore >= bestCompetingScore-threshold {
|
||||
return true, "within threshold of best runner"
|
||||
if myScore >= bestIdleScore-threshold {
|
||||
return true, "within threshold of best idle runner"
|
||||
}
|
||||
|
||||
// If the better runner is busy, allow this runner to take it
|
||||
if bestCompetitor != nil && !isRunnerIdle(ctx, bestCompetitor) {
|
||||
return true, "better runner is busy"
|
||||
// Only defer if there's actually a better idle runner available
|
||||
// Give the better runner a brief window to claim it
|
||||
if bestIdleCompetitor != nil {
|
||||
log.Debug("Runner %s (score: %.1f) deferred job to faster idle runner %s (score: %.1f)",
|
||||
runner.Name, myScore, bestIdleCompetitor.Name, bestIdleScore)
|
||||
return false, "faster idle runner available"
|
||||
}
|
||||
|
||||
log.Debug("Runner %s (score: %.1f) deferred job to faster runner %s (score: %.1f)",
|
||||
runner.Name, myScore, bestCompetitor.Name, bestCompetingScore)
|
||||
|
||||
return false, "faster runner available"
|
||||
// Default: always assign rather than leave runner idle
|
||||
return true, "default assignment"
|
||||
}
|
||||
|
||||
// findCompetingRunners finds other online runners that could handle this job
|
||||
|
||||
@@ -243,6 +243,15 @@ func CreateTaskForRunner(ctx context.Context, runner *ActionRunner) (*ActionTask
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Request bandwidth recheck if needed (bandwidth is tested infrequently)
|
||||
if healthStatus.NeedsBandwidthCheck {
|
||||
if err := RequestBandwidthCheck(ctx, runner.ID); err != nil {
|
||||
log.Error("Failed to request bandwidth check for runner %s: %v", runner.Name, err)
|
||||
} else {
|
||||
log.Info("Requested bandwidth recheck for runner %s (current: %.1f Mbps)", runner.Name, healthStatus.BandwidthMbps)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user