2
0
Files
logikonline 43adbaeffe
Some checks failed
Build and Release / Build Binaries (amd64, darwin, macos) (push) Has been cancelled
Build and Release / Build Binaries (amd64, linux, linux-latest) (push) Has been cancelled
Build and Release / Build Binaries (amd64, windows, windows-latest) (push) Has been cancelled
Build and Release / Build Binaries (arm64, darwin, macos) (push) Has been cancelled
Build and Release / Build Binary (linux/arm64) (push) Has been cancelled
Build and Release / Integration Tests (PostgreSQL) (push) Has been cancelled
Build and Release / Create Release (push) Has been cancelled
Build and Release / Unit Tests (push) Has been cancelled
Build and Release / Lint (push) Has been cancelled
feat(actions): add stuck job rescue mechanism
Introduce a cron task that rescues waiting jobs stuck due to version-sync issues by bumping the task version for affected scopes. Also bump version after each successful job pick to ensure runners re-poll for remaining waiting jobs. Configurable via STUCK_JOB_TIMEOUT (default: 5 minutes).
2026-01-27 09:00:36 -05:00

177 lines
6.2 KiB
Go

// Copyright 2022 The Gitea Authors. All rights reserved.
// Copyright 2026 MarketAlly. All rights reserved.
// SPDX-License-Identifier: MIT
package setting
import (
"fmt"
"strings"
"time"
"code.gitcaddy.com/server/v3/modules/log"
)
// Actions holds the settings for the Actions feature. It is populated by
// loadActionsFrom from the [actions] configuration section (RunnerHealthCheck
// comes from [actions.runner_health]); the literal at the bottom supplies the
// values in effect before any configuration has been mapped onto it.
var (
	Actions = struct {
		// Enabled is initialised to true by the literal below.
		Enabled bool
		// LogStorage describes how the created logs should be stored; loadActionsFrom
		// resolves it via getStorage under the name "actions_log".
		LogStorage *Storage
		// LogRetentionDays is replaced with 365 by loadActionsFrom when it is <= 0.
		LogRetentionDays int64 `ini:"LOG_RETENTION_DAYS"`
		// LogCompression must satisfy logCompression.IsValid ("none", "zstd", or empty),
		// otherwise loadActionsFrom returns an error.
		LogCompression logCompression `ini:"LOG_COMPRESSION"`
		// ArtifactStorage describes how the created artifacts should be stored; loadActionsFrom
		// resolves it via getStorage under the name "actions_artifacts".
		ArtifactStorage *Storage
		// ArtifactRetentionDays is replaced with 90 by loadActionsFrom when it is <= 0.
		ArtifactRetentionDays int64 `ini:"ARTIFACT_RETENTION_DAYS"`
		// DefaultActionsURL is either "github" or "self"; see defaultActionsURL.URL.
		DefaultActionsURL defaultActionsURL `ini:"DEFAULT_ACTIONS_URL"`
		// The four timeouts below are read by loadActionsFrom with these fallbacks:
		// ZOMBIE_TASK_TIMEOUT 10 minutes, ENDLESS_TASK_TIMEOUT 3 hours,
		// ABANDONED_JOB_TIMEOUT 24 hours, STUCK_JOB_TIMEOUT 5 minutes.
		ZombieTaskTimeout time.Duration `ini:"ZOMBIE_TASK_TIMEOUT"`
		EndlessTaskTimeout time.Duration `ini:"ENDLESS_TASK_TIMEOUT"`
		AbandonedJobTimeout time.Duration `ini:"ABANDONED_JOB_TIMEOUT"`
		StuckJobTimeout time.Duration `ini:"STUCK_JOB_TIMEOUT"`
		// SkipWorkflowStrings defaults to the "[skip ci]"-style markers in the literal below.
		// NOTE(review): presumably matched against commit messages/titles to skip workflow
		// runs — confirm against the consumer of this setting.
		SkipWorkflowStrings []string `ini:"SKIP_WORKFLOW_STRINGS"`
		// Runner health settings, mapped from the [actions.runner_health] section.
		RunnerHealthCheck RunnerHealthCheckSettings
		// Bandwidth-aware job routing. BandwidthScoreThreshold is replaced with 20.0 by
		// loadActionsFrom when it is <= 0.
		BandwidthAwareRouting bool `ini:"BANDWIDTH_AWARE_ROUTING"`
		BandwidthScoreThreshold float64 `ini:"BANDWIDTH_SCORE_THRESHOLD"`
	}{
		Enabled: true,
		DefaultActionsURL: defaultActionsURLGitHub,
		SkipWorkflowStrings: []string{"[skip ci]", "[ci skip]", "[no ci]", "[skip actions]", "[actions skip]"},
	}
)
// RunnerHealthCheckSettings configures runner health monitoring.
//
// loadActionsFrom maps it from the [actions.runner_health] section and then
// replaces every non-positive numeric/duration field with the default noted
// on that field, so a value of 0 cannot be configured explicitly.
type RunnerHealthCheckSettings struct {
	// Enabled is forced to true by loadActionsFrom when the ENABLED key is absent.
	Enabled bool `ini:"ENABLED"`
	MinDiskPercent float64 `ini:"MIN_DISK_PERCENT"` // Skip runners with less than this % free disk (default 5.0)
	MaxDiskUsagePercent float64 `ini:"MAX_DISK_USAGE_PERCENT"` // Alert when disk usage exceeds this % (default 85.0)
	MaxLatencyMs float64 `ini:"MAX_LATENCY_MS"` // Skip runners with latency above this, in milliseconds (default 500.0)
	MaxCPULoadPercent float64 `ini:"MAX_CPU_LOAD_PERCENT"` // Skip runners with CPU load above this % (default 80.0)
	AdminEmail string `ini:"ADMIN_EMAIL"` // Email to notify about unhealthy runners
	CleanupCooldown time.Duration `ini:"CLEANUP_COOLDOWN"` // Min time between cleanup requests (default 10 minutes)
	AutoCleanupThreshold float64 `ini:"AUTO_CLEANUP_THRESHOLD"` // Runner self-cleans at this disk % (default 90.0)
}
// defaultActionsURL is the value of the [actions] DEFAULT_ACTIONS_URL setting.
// It selects which host URL() resolves to.
type defaultActionsURL string

// Recognised DEFAULT_ACTIONS_URL values. They are deliberately untyped string
// constants so they can be compared with plain strings as well as with
// defaultActionsURL values.
const (
	defaultActionsURLGitHub = "github" // https://github.com
	defaultActionsURLSelf   = "self"   // the root URL of the self-hosted Gitea instance
)

// URL returns the base URL (without a trailing slash) that u stands for:
// AppURL for "self", and https://github.com for "github".
func (u defaultActionsURL) URL() string {
	if u == defaultActionsURLSelf {
		return strings.TrimSuffix(AppURL, "/")
	}
	// Covers defaultActionsURLGitHub. Any other value should never happen,
	// but just in case, GitHub is used as the fallback for it as well.
	return "https://github.com"
}
// logCompression is the value of the [actions] LOG_COMPRESSION setting.
type logCompression string

// IsValid reports whether c is a supported setting, i.e. it selects either
// no compression or zstd.
func (c logCompression) IsValid() bool {
	if c.IsNone() {
		return true
	}
	return c.IsZstd()
}

// IsNone reports whether c is exactly "none".
func (c logCompression) IsNone() bool {
	return c == "none"
}

// IsZstd reports whether c selects zstd. The empty value counts as zstd,
// which makes zstd the choice when the setting is left unset.
func (c logCompression) IsZstd() bool {
	switch c {
	case "", "zstd":
		return true
	default:
		return false
	}
}
// loadActionsFrom populates the package-level Actions settings from rootCfg.
//
// Steps, in order:
//  1. map the [actions] section onto Actions;
//  2. validate DEFAULT_ACTIONS_URL (legacy http(s) URLs are logged and replaced
//     by "github", anything else unknown is rejected);
//  3. map [actions.runner_health] and fill in defaults for non-positive values;
//  4. resolve log and artifact storage and their retention defaults;
//  5. read the task/job timeouts;
//  6. validate LOG_COMPRESSION.
//
// It returns an error if a section cannot be mapped (the underlying error is
// wrapped and can be inspected with errors.Is / errors.As), if
// DEFAULT_ACTIONS_URL or LOG_COMPRESSION is unsupported, or if getStorage fails.
// Actions may already be partially updated when an error is returned.
func loadActionsFrom(rootCfg ConfigProvider) error {
	sec := rootCfg.Section("actions")
	err := sec.MapTo(&Actions)
	if err != nil {
		return fmt.Errorf("failed to map Actions settings: %w", err)
	}

	if urls := string(Actions.DefaultActionsURL); urls != defaultActionsURLGitHub && urls != defaultActionsURLSelf {
		// Checking the whole value is equivalent to checking its first
		// comma-separated entry, because neither scheme prefix contains a comma.
		if !strings.HasPrefix(urls, "https://") && !strings.HasPrefix(urls, "http://") {
			return fmt.Errorf("unsupported [actions] DEFAULT_ACTIONS_URL: %q", urls)
		}
		// Custom URLs used to be accepted; keep such configurations working
		// by falling back to GitHub instead of refusing to start.
		log.Error("[actions] DEFAULT_ACTIONS_URL does not support %q as custom URL any longer, fallback to %q",
			urls,
			defaultActionsURLGitHub,
		)
		Actions.DefaultActionsURL = defaultActionsURLGitHub
	}

	// Load runner health check settings
	healthSec := rootCfg.Section("actions.runner_health")
	if err := healthSec.MapTo(&Actions.RunnerHealthCheck); err != nil {
		return fmt.Errorf("failed to map runner health settings: %w", err)
	}

	// Set defaults for runner health. Every non-positive value is treated as
	// "not configured" and replaced.
	health := &Actions.RunnerHealthCheck
	if health.MinDiskPercent <= 0 {
		health.MinDiskPercent = 5.0 // Need at least 5% free
	}
	if health.MaxDiskUsagePercent <= 0 {
		health.MaxDiskUsagePercent = 85.0 // Alert at 85% used
	}
	if health.MaxLatencyMs <= 0 {
		health.MaxLatencyMs = 500.0 // 500ms max latency
	}
	if health.MaxCPULoadPercent <= 0 {
		health.MaxCPULoadPercent = 80.0 // Skip runners with >80% CPU load
	}
	if health.CleanupCooldown <= 0 {
		health.CleanupCooldown = 10 * time.Minute
	}
	if health.AutoCleanupThreshold <= 0 {
		health.AutoCleanupThreshold = 90.0 // Runner self-cleans at 90%
	}
	// Health checks are on unless the ENABLED key is present; a bool cannot
	// distinguish "unset" from "false", hence the explicit key lookup.
	if !healthSec.HasKey("ENABLED") {
		health.Enabled = true
	}

	// Bandwidth-aware routing defaults
	if Actions.BandwidthScoreThreshold <= 0 {
		Actions.BandwidthScoreThreshold = 20.0 // Allow runners within 20 points of best
	}

	// don't support to read configuration from [actions]
	Actions.LogStorage, err = getStorage(rootCfg, "actions_log", "", nil)
	if err != nil {
		return err
	}
	// default to 1 year
	if Actions.LogRetentionDays <= 0 {
		Actions.LogRetentionDays = 365
	}

	// NOTE(review): the error from GetSection is deliberately discarded —
	// presumably a missing [actions.artifacts] section is acceptable and
	// getStorage copes with whatever section value is returned; confirm.
	artifactsSec, _ := rootCfg.GetSection("actions.artifacts")
	Actions.ArtifactStorage, err = getStorage(rootCfg, "actions_artifacts", "", artifactsSec)
	if err != nil {
		return err
	}
	// default to 90 days in Github Actions
	if Actions.ArtifactRetentionDays <= 0 {
		Actions.ArtifactRetentionDays = 90
	}

	// The timeouts are read explicitly (rather than relying on MapTo alone)
	// so that each one gets the fallback passed to MustDuration here.
	Actions.ZombieTaskTimeout = sec.Key("ZOMBIE_TASK_TIMEOUT").MustDuration(10 * time.Minute)
	Actions.EndlessTaskTimeout = sec.Key("ENDLESS_TASK_TIMEOUT").MustDuration(3 * time.Hour)
	Actions.AbandonedJobTimeout = sec.Key("ABANDONED_JOB_TIMEOUT").MustDuration(24 * time.Hour)
	Actions.StuckJobTimeout = sec.Key("STUCK_JOB_TIMEOUT").MustDuration(5 * time.Minute)

	if !Actions.LogCompression.IsValid() {
		return fmt.Errorf("invalid [actions] LOG_COMPRESSION: %q", Actions.LogCompression)
	}
	return nil
}