fix: recover from phantom 'running' state after crash/restart
The run-cancel registry is in-memory, so a container restart mid-run leaves accounts/tasks persisted as 'running' with no goroutine behind them. This wedges cancel (the run is not in the map -> 409) and blocks remove/re-run.

- startup: ResetRunningOnStartup clears stale 'running' -> 'idle' on boot
- cancel handler: when there is no live goroutine, ClearStuckAccount + ReconcileTaskStatus reset the stuck account (and its task) instead of returning 409

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_01MMHQTtnQtQqL8muAXHr9kd
This commit is contained in:
@@ -33,6 +33,14 @@ func main() {
|
||||
slog.Error("store", "err", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
// Clear phantom "running" left by a prior crash/restart — no goroutines
|
||||
// survive a restart, so any persisted "running" is stale.
|
||||
if t, a, err := st.ResetRunningOnStartup(context.Background()); err != nil {
|
||||
slog.Error("reset stale running", "err", err)
|
||||
os.Exit(1)
|
||||
} else if t > 0 || a > 0 {
|
||||
slog.Warn("reset stale running statuses on startup", "tasks", t, "accounts", a)
|
||||
}
|
||||
hub := wshub.New()
|
||||
orch := orchestrator.New(st, hub, cfg.EncKey, cfg.WorkerConcurrency)
|
||||
srv := httpapi.NewServer(cfg, st, orch, hub)
|
||||
|
||||
Reference in New Issue
Block a user