fix(scheduler): убрать двойной SaveCheckRun (Checker персистит), SetDrift через CountDriftDomains, resolved после error

2026-07-04 14:03:49 +07:00
parent 23e02d6804
commit 9475af441e
5 changed files with 60 additions and 32 deletions
@@ -28,14 +28,16 @@ const (
 )

 // SchedStore is the narrow store dependency the scheduler needs: due
-// schedules, their domains, and per-domain status bookkeeping.
+// schedules, their domains, and per-domain status bookkeeping. Persisting
+// the check result itself (check_runs) is the Checker's job — see Checker
+// below — not the scheduler's.
 type SchedStore interface {
 	ListDueSchedules(ctx context.Context, now time.Time) ([]store.Schedule, error)
 	TouchScheduleRun(ctx context.Context, projectID uuid.UUID, at time.Time) error
 	ListDomains(ctx context.Context, projectID uuid.UUID) ([]store.Domain, error)
 	GetDomainStatus(ctx context.Context, domainID uuid.UUID) (string, error)
 	SetDomainStatus(ctx context.Context, domainID uuid.UUID, status string) error
-	SaveCheckRun(ctx context.Context, domainID uuid.UUID, cs diff.Changeset) error
+	CountDriftDomains(ctx context.Context) (int, error)
 }

 // Checker computes the diff between a domain's desired template and its
@@ -92,8 +94,6 @@ func (s *Scheduler) RunOnce(ctx context.Context, now time.Time) error {
 		return fmt.Errorf("list due schedules: %w", err)
 	}

-	driftCount := 0
-
 	for _, sch := range due {
 		domains, err := s.store.ListDomains(ctx, sch.ProjectID)
 		if err != nil {
@@ -102,9 +102,7 @@ func (s *Scheduler) RunOnce(ctx context.Context, now time.Time) error {
 		}

 		for _, d := range domains {
-			if s.checkDomain(ctx, sch.ProjectID, d, now) == StatusDrift {
-				driftCount++
-			}
+			s.checkDomain(ctx, sch.ProjectID, d, now)
 		}

 		if err := s.store.TouchScheduleRun(ctx, sch.ProjectID, now); err != nil {
@@ -112,7 +110,15 @@ func (s *Scheduler) RunOnce(ctx context.Context, now time.Time) error {
 		}
 	}

-	s.metrics.SetDrift(driftCount)
+	// Set the gauge from the store's system-wide drift count, not a tally of
+	// this tick's due projects, so it stays correct across ticks with different
+	// due projects. On a count error the gauge keeps its previous value.
+	count, err := s.store.CountDriftDomains(ctx)
+	if err != nil {
+		log.Printf("scheduler: count drift domains failed: %v", err)
+	} else {
+		s.metrics.SetDrift(count)
+	}
 	return nil
 }

@@ -139,12 +145,10 @@ func (s *Scheduler) checkDomain(ctx context.Context, projectID uuid.UUID, d stor
 		prev = StatusUnknown
 	}

-	// A failed Check has no changeset worth recording; a successful one does.
-	if checkErr == nil {
-		if err := s.store.SaveCheckRun(ctx, d.ID, cs); err != nil {
-			log.Printf("scheduler: save check run for %s failed: %v", d.ID, err)
-		}
-	}
+	// Persisting the check_runs row is the Checker's job: DomainService.Check
+	// already calls Recorder.SaveCheckRun internally on every successful
+	// check (drift or in_sync). Calling it again here would double-write
+	// check_runs history for the same check.

 	if err := s.store.SetDomainStatus(ctx, d.ID, newStatus); err != nil {
 		log.Printf("scheduler: set domain status for %s failed: %v", d.ID, err)
@@ -170,15 +174,16 @@ func (s *Scheduler) checkDomain(ctx context.Context, projectID uuid.UUID, d stor
 // shouldNotify decides whether a prev -> new status transition is worth
 // alerting on:
 //   - entering drift or error from any other status is always notified;
-//   - recovering from drift back to in_sync ("resolved") is notified;
+//   - recovering from drift OR error back to in_sync ("resolved") is
+//     notified — including recovery after a provider/check failure;
 //   - the initial unknown -> in_sync transition (first successful check of a
-//     domain that never drifted) is NOT notified — it is not news, it is the
-//     expected steady state.
+//     domain that never drifted or errored) is NOT notified — it is not
+//     news, it is the expected steady state.
 func shouldNotify(prev, newStatus string) bool {
 	if (newStatus == StatusDrift || newStatus == StatusError) && newStatus != prev {
 		return true
 	}
-	if prev == StatusDrift && newStatus == StatusInSync {
+	if (prev == StatusDrift || prev == StatusError) && newStatus == StatusInSync {
 		return true
 	}
 	return false
@@ -24,8 +24,11 @@ type mockStore struct {
 	domains   map[uuid.UUID][]store.Domain
 	status    map[uuid.UUID]string

-	savedCheckRuns  []uuid.UUID
 	touchedProjects []uuid.UUID
+
+	// driftCount is what CountDriftDomains returns — a canned system-wide
+	// count, independent of what this RunOnce's due projects touched.
+	driftCount int
 }

 func newMockStore() *mockStore {
@@ -66,11 +69,10 @@ func (m *mockStore) SetDomainStatus(ctx context.Context, domainID uuid.UUID, sta
 	return nil
 }

-func (m *mockStore) SaveCheckRun(ctx context.Context, domainID uuid.UUID, cs diff.Changeset) error {
+func (m *mockStore) CountDriftDomains(ctx context.Context) (int, error) {
 	m.mu.Lock()
 	defer m.mu.Unlock()
-	m.savedCheckRuns = append(m.savedCheckRuns, domainID)
-	return nil
+	return m.driftCount, nil
 }

 // mockChecker returns a preset Changeset or error per domainID.
@@ -127,6 +129,12 @@ func TestRunOnce_NotifiesOnDriftNotOnFirstInSync(t *testing.T) {
 	notifier := &mockNotifier{}
 	m := metrics.New()

+	// CountDriftDomains is the real system-wide count, independent of what
+	// this tick touched — set it to something that would NOT match a local
+	// per-tick accumulator (only 1 of 2 domains here drifted) to prove the
+	// gauge comes from the store call, not a local tally.
+	st.driftCount = 7
+
 	sched := New(st, checker, notifier, m)

 	if err := sched.RunOnce(context.Background(), time.Now()); err != nil {
@@ -150,9 +158,6 @@ func TestRunOnce_NotifiesOnDriftNotOnFirstInSync(t *testing.T) {
 		t.Fatalf("notified status = %q, want drift", notifier.events[0].Status)
 	}

-	if len(st.savedCheckRuns) != 2 {
-		t.Fatalf("SaveCheckRun calls = %d, want 2", len(st.savedCheckRuns))
-	}
 	if len(st.touchedProjects) != 1 || st.touchedProjects[0] != projectID {
 		t.Fatalf("TouchScheduleRun calls = %v, want [%s]", st.touchedProjects, projectID)
 	}
@@ -163,8 +168,8 @@ func TestRunOnce_NotifiesOnDriftNotOnFirstInSync(t *testing.T) {
 	if got := testutil.ToFloat64(m.ChecksTotal.WithLabelValues(StatusInSync)); got != 1 {
 		t.Fatalf("ChecksTotal{in_sync} = %v, want 1", got)
 	}
-	if got := testutil.ToFloat64(m.DriftDomains); got != 1 {
-		t.Fatalf("DriftDomains gauge = %v, want 1", got)
+	if got := testutil.ToFloat64(m.DriftDomains); got != float64(st.driftCount) {
+		t.Fatalf("DriftDomains gauge = %v, want %d (from CountDriftDomains)", got, st.driftCount)
 	}
 }

@@ -229,10 +234,6 @@ func TestRunOnce_CheckError_StatusErrorAndNotify(t *testing.T) {
 	if got := testutil.ToFloat64(m.ChecksTotal.WithLabelValues(StatusError)); got != 1 {
 		t.Fatalf("ChecksTotal{error} = %v, want 1", got)
 	}
-	// A failed Check has no changeset worth recording.
-	if len(st.savedCheckRuns) != 0 {
-		t.Fatalf("SaveCheckRun calls on error = %d, want 0", len(st.savedCheckRuns))
-	}
 }

 func TestShouldNotify(t *testing.T) {
@@ -252,7 +253,7 @@ func TestShouldNotify(t *testing.T) {
 		{"in_sync->error notifies", StatusInSync, StatusError, true},
 		{"in_sync->in_sync is silent", StatusInSync, StatusInSync, false},
 		{"error->drift notifies (still bad, different bad)", StatusError, StatusDrift, true},
-		{"error->in_sync is not the 'resolved' case, per spec", StatusError, StatusInSync, false},
+		{"error->in_sync notifies (resolved after failure)", StatusError, StatusInSync, true},
 	}

 	for _, tc := range cases {