fix(orchestrator): prevent concurrent double-run duplicating messages; reflect errors in status

2026-07-01 19:32:04 +07:00
parent 1373aa0a77
commit 2429c786e4
6 changed files with 72 additions and 4 deletions
@@ -74,6 +74,10 @@ func (s *Server) handleRun(w http.ResponseWriter, r *http.Request) {
 		http.Error(w, "accounts must pass connection tests first", http.StatusConflict)
 		return
 	}
+	if errors.Is(err, orchestrator.ErrAlreadyRunning) {
+		http.Error(w, "task is already running", http.StatusConflict)
+		return
+	}
 	if err != nil {
 		http.Error(w, err.Error(), http.StatusInternalServerError)
 		return
@@ -13,6 +13,7 @@ import (
 )

 var ErrNotTested = errors.New("accounts not fully tested")
+var ErrAlreadyRunning = errors.New("task already running")

 type Orchestrator struct {
 	store       *store.Store
@@ -99,15 +100,23 @@ func (o *Orchestrator) Run(ctx context.Context, taskID int64) (int64, error) {
 	if !gateOK(accs) {
 		return 0, ErrNotTested
 	}
+	acquired, err := o.store.TryMarkTaskRunning(ctx, taskID)
+	if err != nil {
+		return 0, err
+	}
+	if !acquired {
+		return 0, ErrAlreadyRunning
+	}
 	srcEP, dstEP, err := o.endpoints(ctx, task)
 	if err != nil {
+		_ = o.store.SetTaskStatus(ctx, taskID, "error")
 		return 0, err
 	}
 	runID, err := o.store.CreateRun(ctx, taskID)
 	if err != nil {
+		_ = o.store.SetTaskStatus(ctx, taskID, "error")
 		return 0, err
 	}
-	_ = o.store.SetTaskStatus(ctx, taskID, "running")
 	o.hub.Publish(wshub.Event{Type: "run_started", TaskID: taskID, Data: map[string]any{"run_id": runID}})

 	go o.runAll(context.WithoutCancel(ctx), task, runID, accs, srcEP, dstEP)
@@ -138,8 +147,12 @@ func (o *Orchestrator) runAll(ctx context.Context, task store.Task, runID int64,
 	}
 	wg.Wait()

-	_ = o.store.FinishRun(ctx, runID, "done", totCopied, totSkipped, totErr)
-	_ = o.store.SetTaskStatus(ctx, task.ID, "done")
+	status := "done"
+	if totErr > 0 {
+		status = "done_with_errors"
+	}
+	_ = o.store.FinishRun(ctx, runID, status, totCopied, totSkipped, totErr)
+	_ = o.store.SetTaskStatus(ctx, task.ID, status)
 	o.hub.Publish(wshub.Event{Type: "run_done", TaskID: task.ID,
 		Data: map[string]any{"run_id": runID, "copied": totCopied, "skipped": totSkipped, "errors": totErr}})
 }
@@ -55,3 +55,14 @@ func (s *Store) SetTaskStatus(ctx context.Context, id int64, status string) erro
 	_, err := s.Pool.Exec(ctx, `UPDATE tasks SET status=$2 WHERE id=$1`, id, status)
 	return err
 }
+
+// TryMarkTaskRunning atomically sets status='running' only if the task is not already running.
+// Returns true if this call acquired the run (status was not 'running' before), false otherwise.
+func (s *Store) TryMarkTaskRunning(ctx context.Context, id int64) (bool, error) {
+	ct, err := s.Pool.Exec(ctx,
+		`UPDATE tasks SET status='running' WHERE id=$1 AND status<>'running'`, id)
+	if err != nil {
+		return false, err
+	}
+	return ct.RowsAffected() == 1, nil
+}
@@ -0,0 +1,32 @@
+package store
+
+import (
+	"context"
+	"testing"
+)
+
+func TestTryMarkTaskRunningIsExclusive(t *testing.T) {
+	s := testStore(t)
+	ctx := context.Background()
+	epSrc, _ := s.CreateEndpoint(ctx, Endpoint{RoleLabel: "src", Host: "a", Port: 993, TLSMode: "ssl"})
+	epDst, _ := s.CreateEndpoint(ctx, Endpoint{RoleLabel: "dst", Host: "b", Port: 993, TLSMode: "ssl"})
+	taskID, _ := s.CreateTask(ctx, Task{Name: "t", SrcEndpointID: epSrc, DstEndpointID: epDst})
+
+	first, err := s.TryMarkTaskRunning(ctx, taskID)
+	if err != nil || !first {
+		t.Fatalf("first acquire must succeed: ok=%v err=%v", first, err)
+	}
+	second, err := s.TryMarkTaskRunning(ctx, taskID)
+	if err != nil {
+		t.Fatalf("err: %v", err)
+	}
+	if second {
+		t.Fatal("second acquire must fail while running")
+	}
+	// after completion, a re-run may acquire again
+	_ = s.SetTaskStatus(ctx, taskID, "done")
+	third, _ := s.TryMarkTaskRunning(ctx, taskID)
+	if !third {
+		t.Fatal("acquire after completion must succeed")
+	}
+}