2
0
Files
logikonline 9a52b150fd
Some checks failed
Build and Release / Unit Tests (push) Successful in 3m9s
Build and Release / Create Release (push) Successful in 0s
Build and Release / Integration Tests (PostgreSQL) (push) Successful in 5m1s
Build and Release / Lint (push) Successful in 5m18s
Build and Release / Build Binaries (amd64, linux, linux-latest) (push) Successful in 3m2s
Build and Release / Build Binaries (amd64, windows, windows-latest) (push) Successful in 9h4m47s
Build and Release / Build Binaries (amd64, darwin, macos) (push) Successful in 11m9s
Build and Release / Build Binary (linux/arm64) (push) Has been cancelled
Build and Release / Build Binaries (arm64, darwin, macos) (push) Has been cancelled
feat(actions): add bandwidth health checks for runners
Adds bandwidth monitoring to runner health checks with critical threshold of 8 Mbps (1 MB/s). Runners below this threshold are blocked from job assignment and trigger automatic bandwidth rechecks. Also refines health check logic: disk/CPU now only block at 95%+ (critical), and latency is informational only. Includes new RunnerBandwidthCheckRequest model to track recheck requests.
2026-01-20 00:26:29 -05:00

183 lines
5.3 KiB
Go

// Copyright 2026 MarketAlly. All rights reserved.
// SPDX-License-Identifier: MIT
package actions
import (
"context"
"math"
"sort"
"code.gitcaddy.com/server/v3/models/db"
"code.gitcaddy.com/server/v3/modules/log"
"code.gitcaddy.com/server/v3/modules/optional"
"code.gitcaddy.com/server/v3/modules/setting"
)
// RunnerBandwidthScore computes the routing score used to rank a runner for
// job assignment. A higher score means the runner is preferred.
//
// The score is a log-scaled download-bandwidth component (33.3 points per
// factor of ten above 1 Mbps: about 33 at 10 Mbps, 67 at 100 Mbps, 100 at
// 1000 Mbps) minus a latency penalty of at most 20 points. A runner that
// reports no capabilities, no bandwidth data, or a non-positive download
// bandwidth receives a neutral score of 50.
func RunnerBandwidthScore(runner *ActionRunner) float64 {
	// Neutral score handed to runners that cannot be measured.
	const neutralScore = 50.0

	caps := runner.GetCapabilities()
	if caps == nil || caps.Bandwidth == nil || caps.Bandwidth.DownloadMbps <= 0 {
		return neutralScore
	}
	info := caps.Bandwidth

	// Bandwidth component on a log10 scale so the gap between fast and very
	// fast runners stays moderate. Anything below 1 Mbps is floored at zero.
	// The positivity check is not redundant: a NaN reading passes the guard
	// above (NaN <= 0 is false) and must contribute zero points here.
	var speedPoints float64
	if mbps := info.DownloadMbps; mbps > 0 {
		speedPoints = math.Max(math.Log10(mbps), 0) * 33.3
	}

	// Latency component: free up to 10 ms, then one point per additional
	// 10 ms, capped at 20 points.
	var latencyPoints float64
	if ms := info.LatencyMs; ms > 10 {
		latencyPoints = (ms - 10) / 10.0
		if latencyPoints > 20 {
			latencyPoints = 20
		}
	}

	return speedPoints - latencyPoints
}
// ShouldAssignJobToRunner decides whether job should be handed to runner once
// bandwidth-aware routing is taken into account.
//
// IMPORTANT: bandwidth routing is a preference, not a hard block. A valid
// runner must never sit idle while jobs are waiting, so the job is deferred
// only when another matching runner is idle right now AND its score exceeds
// this runner's score by more than setting.Actions.BandwidthScoreThreshold.
//
// The job is assigned immediately when:
//   - bandwidth-aware routing is disabled,
//   - no other online runner matches the job,
//   - every competing runner is busy, or
//   - this runner scores within the threshold of the best idle competitor.
//
// Returns (shouldAssign, reason), where reason is a short description of the
// rule that produced the decision.
func ShouldAssignJobToRunner(ctx context.Context, runner *ActionRunner, job *ActionRunJob) (bool, string) {
	if !setting.Actions.BandwidthAwareRouting {
		return true, "bandwidth routing disabled"
	}

	// Always assign if this runner is the only option
	// (e.g., macos-only jobs when only a mac runner is available).
	competingRunners := findCompetingRunners(ctx, runner, job)
	if len(competingRunners) == 0 {
		return true, "only matching runner"
	}

	// Find the best-scoring competitor among IDLE runners only. The first idle
	// competitor always seeds the comparison: RunnerBandwidthScore can be zero
	// or negative (low bandwidth combined with a latency penalty), so seeding
	// with a literal 0 would ignore such competitors and compare against a
	// score no runner actually has.
	var bestIdleCompetitor *ActionRunner
	bestIdleScore := 0.0
	for _, r := range competingRunners {
		if !isRunnerIdle(ctx, r) {
			continue
		}
		if score := RunnerBandwidthScore(r); bestIdleCompetitor == nil || score > bestIdleScore {
			bestIdleScore = score
			bestIdleCompetitor = r
		}
	}

	// No idle competitor means nobody else could pick the job up right now.
	if bestIdleCompetitor == nil {
		return true, "all competing runners busy"
	}

	// Close enough to the best idle runner: take the job rather than wait.
	myScore := RunnerBandwidthScore(runner)
	threshold := setting.Actions.BandwidthScoreThreshold
	if myScore >= bestIdleScore-threshold {
		return true, "within threshold of best idle runner"
	}

	// A clearly better idle runner exists; give it a brief window to claim
	// the job.
	log.Debug("Runner %s (score: %.1f) deferred job to faster idle runner %s (score: %.1f)",
		runner.Name, myScore, bestIdleCompetitor.Name, bestIdleScore)
	return false, "faster idle runner available"
}
// findCompetingRunners returns every online runner, other than excludeRunner,
// whose labels can match the job's RunsOn requirement.
//
// A failed lookup is logged and reported as nil, which callers cannot
// distinguish from "no competitors".
func findCompetingRunners(ctx context.Context, excludeRunner *ActionRunner, job *ActionRunJob) []*ActionRunner {
	online, err := db.Find[ActionRunner](ctx, FindRunnerOptions{IsOnline: optional.Some(true)})
	if err != nil {
		log.Error("Failed to find competing runners: %v", err)
		return nil
	}

	var rivals []*ActionRunner
	for _, candidate := range online {
		// Evaluated left to right: not the requesting runner, still online
		// according to the runner itself, and able to handle the job.
		if candidate.ID != excludeRunner.ID && candidate.IsOnline() && candidate.CanMatchLabels(job.RunsOn) {
			rivals = append(rivals, candidate)
		}
	}
	return rivals
}
// isRunnerIdle reports whether the runner has no ActionTask rows in
// StatusRunning. If the count query fails the error is logged and the runner
// is reported as not idle.
func isRunnerIdle(ctx context.Context, runner *ActionRunner) bool {
	activeTasks := db.GetEngine(ctx).
		Where("runner_id = ? AND status = ?", runner.ID, StatusRunning)

	running, err := activeTasks.Count(new(ActionTask))
	if err != nil {
		log.Error("Failed to check if runner %s is idle: %v", runner.Name, err)
		return false
	}
	return running == 0
}
// GetRunnersForJobByBandwidth returns the online runners whose labels can
// match the job's RunsOn requirement, ordered by RunnerBandwidthScore with the
// best score first.
//
// Each runner's score is computed exactly once before sorting rather than on
// every comparison, and the sort is stable, so runners with equal scores keep
// the order in which the lookup returned them.
//
// It returns nil when the lookup fails (the error is logged) or when no
// runner matches.
func GetRunnersForJobByBandwidth(ctx context.Context, job *ActionRunJob) []*ActionRunner {
	runners, err := db.Find[ActionRunner](ctx, FindRunnerOptions{
		IsOnline: optional.Some(true),
	})
	if err != nil {
		log.Error("Failed to find runners for job: %v", err)
		return nil
	}

	// Pair every matching runner with its score up front.
	type scoredRunner struct {
		runner *ActionRunner
		score  float64
	}
	var scored []scoredRunner
	for _, r := range runners {
		if r.CanMatchLabels(job.RunsOn) {
			scored = append(scored, scoredRunner{runner: r, score: RunnerBandwidthScore(r)})
		}
	}
	if len(scored) == 0 {
		return nil
	}

	// Sort by bandwidth score (highest first).
	sort.SliceStable(scored, func(i, j int) bool {
		return scored[i].score > scored[j].score
	})

	matching := make([]*ActionRunner, len(scored))
	for i := range scored {
		matching[i] = scored[i].runner
	}
	return matching
}