fix(actions): improve unhealthy runner job assignment logic
All checks were successful
Build and Release / Create Release (push) Successful in 0s
Build and Release / Unit Tests (push) Successful in 3m10s
Build and Release / Integration Tests (PostgreSQL) (push) Successful in 5m8s
Build and Release / Lint (push) Successful in 5m16s
Build and Release / Build Binaries (amd64, linux, linux-latest) (push) Successful in 3m1s
Build and Release / Build Binaries (amd64, darwin, macos) (push) Successful in 4m12s
Build and Release / Build Binaries (arm64, darwin, macos) (push) Successful in 5m2s
Build and Release / Build Binaries (amd64, windows, windows-latest) (push) Successful in 9h4m59s
Build and Release / Build Binary (linux/arm64) (push) Successful in 7m19s
All checks were successful
Build and Release / Create Release (push) Successful in 0s
Build and Release / Unit Tests (push) Successful in 3m10s
Build and Release / Integration Tests (PostgreSQL) (push) Successful in 5m8s
Build and Release / Lint (push) Successful in 5m16s
Build and Release / Build Binaries (amd64, linux, linux-latest) (push) Successful in 3m1s
Build and Release / Build Binaries (amd64, darwin, macos) (push) Successful in 4m12s
Build and Release / Build Binaries (arm64, darwin, macos) (push) Successful in 5m2s
Build and Release / Build Binaries (amd64, windows, windows-latest) (push) Successful in 9h4m59s
Build and Release / Build Binary (linux/arm64) (push) Successful in 7m19s
Refine unhealthy-runner behavior so that an unhealthy runner skips a job only when a healthy AND idle alternative exists. Previously, unhealthy runners would skip jobs if any healthy runner existed, even if it was busy. Now unhealthy runners take jobs when all alternatives are either unhealthy or busy. Rename isOnlyMatchingRunner to hasHealthyIdleAlternative to better reflect the actual check being performed. Also add cleanup for leftover vault clones in the Windows build step and ensure the HOME directory exists in the release job.
This commit is contained in:
@@ -299,6 +299,9 @@ jobs:
|
||||
env:
|
||||
VAULT_TOKEN: ${{ secrets.RELEASE_TOKEN }}
|
||||
run: |
|
||||
# Clean up any leftover vault clone from previous runs
|
||||
Remove-Item -Recurse -Force "$env:TEMP\gitcaddy-vault" -ErrorAction SilentlyContinue
|
||||
|
||||
# Clone vault repo
|
||||
git -c "http.extraheader=Authorization: token $($env:VAULT_TOKEN)" clone --depth 1 https://direct.git.marketally.com/gitcaddy/gitcaddy-vault.git "$env:TEMP\gitcaddy-vault"
|
||||
|
||||
@@ -601,6 +604,7 @@ jobs:
|
||||
env:
|
||||
RELEASE_TOKEN: ${{ secrets.RELEASE_TOKEN }}
|
||||
run: |
|
||||
mkdir -p "$HOME"
|
||||
git config --global http.https://git.marketally.com/.extraheader "Authorization: token ${RELEASE_TOKEN}"
|
||||
git config --global http.https://direct.git.marketally.com/.extraheader "Authorization: token ${RELEASE_TOKEN}"
|
||||
|
||||
|
||||
@@ -285,15 +285,15 @@ func CreateTaskForRunner(ctx context.Context, runner *ActionRunner) (*ActionTask
|
||||
log.Trace("runner labels: %v", runner.AgentLabels)
|
||||
for _, v := range jobs {
|
||||
if runner.CanMatchLabels(v.RunsOn) {
|
||||
// If runner is unhealthy, only assign if it's the only matching runner
|
||||
// If runner is unhealthy, only skip if a healthy idle alternative exists
|
||||
if runnerUnhealthy {
|
||||
if isOnlyMatchingRunner(ctx, runner, v) {
|
||||
log.Info("Assigning job %d to unhealthy runner %s (only matching runner)", v.ID, runner.Name)
|
||||
job = v
|
||||
break
|
||||
if hasHealthyIdleAlternative(ctx, runner, v) {
|
||||
log.Trace("Skipping job %d for unhealthy runner %s: healthy idle alternative available", v.ID, runner.Name)
|
||||
continue
|
||||
}
|
||||
log.Trace("Skipping job %d for unhealthy runner %s: other healthy runners available", v.ID, runner.Name)
|
||||
continue
|
||||
log.Info("Assigning job %d to unhealthy runner %s (no healthy idle alternative)", v.ID, runner.Name)
|
||||
job = v
|
||||
break
|
||||
}
|
||||
|
||||
// Check if this runner should get this job based on bandwidth
|
||||
@@ -595,32 +595,31 @@ func GetCurrentTasksForRunners(ctx context.Context, runnerIDs []int64) (map[int6
|
||||
return result, nil
|
||||
}
|
||||
|
||||
// isOnlyMatchingRunner checks if this runner is the only one that can handle the job.
|
||||
// Used to ensure unhealthy runners still get assigned jobs if they're the only option.
|
||||
func isOnlyMatchingRunner(ctx context.Context, runner *ActionRunner, job *ActionRunJob) bool {
|
||||
// hasHealthyIdleAlternative checks if there is another online, healthy, and idle
|
||||
// runner that can handle this job. Used to decide whether an unhealthy runner
|
||||
// should skip a job: only skip if a healthy idle alternative actually exists.
|
||||
// If all alternatives are also unhealthy or busy, the unhealthy runner takes the job.
|
||||
func hasHealthyIdleAlternative(ctx context.Context, runner *ActionRunner, job *ActionRunJob) bool {
|
||||
runners, err := db.Find[ActionRunner](ctx, FindRunnerOptions{
|
||||
IsOnline: optional.Some(true),
|
||||
})
|
||||
if err != nil {
|
||||
log.Error("Failed to find runners for job %d: %v", job.ID, err)
|
||||
return true // Assume we're the only one if we can't check
|
||||
return false // Runner lookup failed; assume no alternative so the current runner takes the job
|
||||
}
|
||||
|
||||
for _, r := range runners {
|
||||
// Skip self
|
||||
if r.ID == runner.ID {
|
||||
continue
|
||||
}
|
||||
// Skip offline runners
|
||||
if !r.IsOnline() {
|
||||
continue
|
||||
}
|
||||
// Check if this runner can handle the job
|
||||
if r.CanMatchLabels(job.RunsOn) {
|
||||
// Found another runner that can handle it
|
||||
return false
|
||||
// Only count as an alternative if it can match labels, is healthy, and is idle
|
||||
if r.CanMatchLabels(job.RunsOn) && r.IsHealthy() && isRunnerIdle(ctx, r) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
|
||||
return true
|
||||
return false
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user