fix: recover from phantom 'running' state after crash/restart

The run-cancel registry is in-memory only; a container restart mid-run leaves
accounts/tasks persisted as 'running' with no goroutine behind them. That wedges
cancel (the account is not in the registry map, so the handler returns 409) and
blocks remove/re-run.

- startup: ResetRunningOnStartup clears stale 'running' -> 'idle' on boot
- cancel handler: when no live goroutine, ClearStuckAccount + ReconcileTaskStatus
  reset the stuck account (and its task); 409 is now returned only when there
  was nothing stuck to clear

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_01MMHQTtnQtQqL8muAXHr9kd
This commit is contained in:
2026-07-02 12:57:39 +07:00
parent 6a10697548
commit fa72f1b323
4 changed files with 136 additions and 1 deletions
+22 -1
View File
@@ -86,15 +86,36 @@ func (s *Server) handleRun(w http.ResponseWriter, r *http.Request) {
}
// handleCancelAccount handles a request to cancel one account of a task.
//
// The task ID is parsed with pathID(r, "id") and the account ID with
// pathID(r, "accountId"). Responses:
//   - 400 if either ID fails to parse;
//   - 202 if s.orch.CancelAccount reports true, or if ClearStuckAccount
//     reports the account was cleared and ReconcileTaskStatus succeeds;
//   - 409 if ClearStuckAccount reports that nothing was cleared;
//   - 500 if either store call returns an error (the error text is the body).
//
// NOTE(review): taskID is only passed to ReconcileTaskStatus; nothing visible
// here checks that accID belongs to taskID — confirm the store enforces that.
func (s *Server) handleCancelAccount(w http.ResponseWriter, r *http.Request) {
taskID, err := pathID(r, "id")
if err != nil {
http.Error(w, "bad id", http.StatusBadRequest)
return
}
accID, err := pathID(r, "accountId")
if err != nil {
http.Error(w, "bad account id", http.StatusBadRequest)
return
}
// NOTE(review): the next line looks like the pre-change condition kept by the
// diff view (the +/- markers were stripped); it has no closing brace of its own
// here and is superseded by the positive check below — confirm against the file.
if !s.orch.CancelAccount(accID) {
// Live in-flight copy: signal it to stop.
if s.orch.CancelAccount(accID) {
w.WriteHeader(http.StatusAccepted)
return
}
// No live goroutine but the DB may still say "running" (stale state left by
// a crash/restart): clear it so the account/task become usable again.
cleared, err := s.store.ClearStuckAccount(r.Context(), accID)
if err != nil {
http.Error(w, err.Error(), http.StatusInternalServerError)
return
}
// Nothing was cleared, so there is nothing to cancel: report a conflict.
if !cleared {
http.Error(w, "account is not running", http.StatusConflict)
return
}
// The account was cleared; now reconcile the owning task's status.
// NOTE(review): if this call fails the account is already cleared, so a retry
// presumably lands in the 409 branch above and never reaches this call again —
// confirm the task cannot be left stale.
if err := s.store.ReconcileTaskStatus(r.Context(), taskID); err != nil {
http.Error(w, err.Error(), http.StatusInternalServerError)
return
}
w.WriteHeader(http.StatusAccepted)
}