From 9475af441ee1ba62192be2c51c53be0c211783b8 Mon Sep 17 00:00:00 2001 From: Vassiliy Yegorov Date: Sat, 4 Jul 2026 14:03:49 +0700 Subject: [PATCH] =?UTF-8?q?fix(scheduler):=20=D1=83=D0=B1=D1=80=D0=B0?= =?UTF-8?q?=D1=82=D1=8C=20=D0=B4=D0=B2=D0=BE=D0=B9=D0=BD=D0=BE=D0=B9=20Sav?= =?UTF-8?q?eCheckRun=20(Checker=20=D0=BF=D0=B5=D1=80=D1=81=D0=B8=D1=81?= =?UTF-8?q?=D1=82=D0=B8=D1=82),=20SetDrift=20=D1=87=D0=B5=D1=80=D0=B5?= =?UTF-8?q?=D0=B7=20CountDriftDomains,=20resolved=20=D0=BF=D0=BE=D1=81?= =?UTF-8?q?=D0=BB=D0=B5=20error?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- internal/scheduler/scheduler.go | 41 ++++++++++++++++------------ internal/scheduler/scheduler_test.go | 29 ++++++++++---------- internal/store/db/domains.sql.go | 11 ++++++++ internal/store/queries/domains.sql | 3 ++ internal/store/tenant.go | 8 ++++++ 5 files changed, 60 insertions(+), 32 deletions(-) diff --git a/internal/scheduler/scheduler.go b/internal/scheduler/scheduler.go index 8dba3b4..c470d09 100644 --- a/internal/scheduler/scheduler.go +++ b/internal/scheduler/scheduler.go @@ -28,14 +28,16 @@ const ( ) // SchedStore is the narrow store dependency the scheduler needs: due -// schedules, their domains, and per-domain status bookkeeping. +// schedules, their domains, and per-domain status bookkeeping. Persisting +// the check result itself (check_runs) is the Checker's job — see Checker +// below — not the scheduler's. type SchedStore interface { ListDueSchedules(ctx context.Context, now time.Time) ([]store.Schedule, error) TouchScheduleRun(ctx context.Context, projectID uuid.UUID, at time.Time) error ListDomains(ctx context.Context, projectID uuid.UUID) ([]store.Domain, error) GetDomainStatus(ctx context.Context, domainID uuid.UUID) (string, error) SetDomainStatus(ctx context.Context, domainID uuid.UUID, status string) error - SaveCheckRun(ctx context.Context, domainID uuid.UUID, cs diff.Changeset) error + CountDriftDomains(ctx context.Context) (int, error) } // Checker computes the diff between a domain's desired template and its @@ -92,8 +94,6 @@ func (s *Scheduler) RunOnce(ctx context.Context, now time.Time) error { return fmt.Errorf("list due schedules: %w", err) } - driftCount := 0 - for _, sch := range due { domains, err := s.store.ListDomains(ctx, sch.ProjectID) if err != nil { @@ -102,9 +102,7 @@ func (s *Scheduler) RunOnce(ctx context.Context, now time.Time) error { } for _, d := range domains { - if s.checkDomain(ctx, sch.ProjectID, d, now) == StatusDrift { - driftCount++ - } + s.checkDomain(ctx, sch.ProjectID, d, now) } if err := s.store.TouchScheduleRun(ctx, sch.ProjectID, now); err != nil { @@ -112,7 +110,15 @@ func (s *Scheduler) RunOnce(ctx context.Context, now time.Time) error { } } - s.metrics.SetDrift(driftCount) + // The real, system-wide count of drift domains — not a local + // accumulator scoped to this tick's due projects — so the gauge + // reflects reality even across ticks where different projects are due. + count, err := s.store.CountDriftDomains(ctx) + if err != nil { + log.Printf("scheduler: count drift domains failed: %v", err) + } else { + s.metrics.SetDrift(count) + } return nil } @@ -139,12 +145,10 @@ func (s *Scheduler) checkDomain(ctx context.Context, projectID uuid.UUID, d stor prev = StatusUnknown } - // A failed Check has no changeset worth recording; a successful one does. - if checkErr == nil { - if err := s.store.SaveCheckRun(ctx, d.ID, cs); err != nil { - log.Printf("scheduler: save check run for %s failed: %v", d.ID, err) - } - } + // Persisting the check_runs row is the Checker's job: DomainService.Check + // already calls Recorder.SaveCheckRun internally on every successful + // check (drift or in_sync). Calling it again here would double-write + // check_runs history for the same check. if err := s.store.SetDomainStatus(ctx, d.ID, newStatus); err != nil { log.Printf("scheduler: set domain status for %s failed: %v", d.ID, err) @@ -170,15 +174,16 @@ func (s *Scheduler) checkDomain(ctx context.Context, projectID uuid.UUID, d stor // shouldNotify decides whether a prev -> new status transition is worth // alerting on: // - entering drift or error from any other status is always notified; -// - recovering from drift back to in_sync ("resolved") is notified; +// - recovering from drift OR error back to in_sync ("resolved") is +// notified — including recovery after a provider/check failure; // - the initial unknown -> in_sync transition (first successful check of a -// domain that never drifted) is NOT notified — it is not news, it is the -// expected steady state. +// domain that never drifted or errored) is NOT notified — it is not +// news, it is the expected steady state. func shouldNotify(prev, newStatus string) bool { if (newStatus == StatusDrift || newStatus == StatusError) && newStatus != prev { return true } - if prev == StatusDrift && newStatus == StatusInSync { + if (prev == StatusDrift || prev == StatusError) && newStatus == StatusInSync { return true } return false diff --git a/internal/scheduler/scheduler_test.go b/internal/scheduler/scheduler_test.go index ecc0ef0..dda06ee 100644 --- a/internal/scheduler/scheduler_test.go +++ b/internal/scheduler/scheduler_test.go @@ -24,8 +24,11 @@ type mockStore struct { domains map[uuid.UUID][]store.Domain status map[uuid.UUID]string - savedCheckRuns []uuid.UUID touchedProjects []uuid.UUID + + // driftCount is what CountDriftDomains returns — a canned system-wide + // count, independent of what this RunOnce's due projects touched. + driftCount int } func newMockStore() *mockStore { @@ -66,11 +69,10 @@ func (m *mockStore) SetDomainStatus(ctx context.Context, domainID uuid.UUID, sta return nil } -func (m *mockStore) SaveCheckRun(ctx context.Context, domainID uuid.UUID, cs diff.Changeset) error { +func (m *mockStore) CountDriftDomains(ctx context.Context) (int, error) { m.mu.Lock() defer m.mu.Unlock() - m.savedCheckRuns = append(m.savedCheckRuns, domainID) - return nil + return m.driftCount, nil } // mockChecker returns a preset Changeset or error per domainID. @@ -127,6 +129,12 @@ func TestRunOnce_NotifiesOnDriftNotOnFirstInSync(t *testing.T) { notifier := &mockNotifier{} m := metrics.New() + // CountDriftDomains is the real system-wide count, independent of what + // this tick touched — set it to something that would NOT match a local + // per-tick accumulator (only 1 of 2 domains here drifted) to prove the + // gauge comes from the store call, not a local tally. + st.driftCount = 7 + sched := New(st, checker, notifier, m) if err := sched.RunOnce(context.Background(), time.Now()); err != nil { @@ -150,9 +158,6 @@ func TestRunOnce_NotifiesOnDriftNotOnFirstInSync(t *testing.T) { t.Fatalf("notified status = %q, want drift", notifier.events[0].Status) } - if len(st.savedCheckRuns) != 2 { - t.Fatalf("SaveCheckRun calls = %d, want 2", len(st.savedCheckRuns)) - } if len(st.touchedProjects) != 1 || st.touchedProjects[0] != projectID { t.Fatalf("TouchScheduleRun calls = %v, want [%s]", st.touchedProjects, projectID) } @@ -163,8 +168,8 @@ func TestRunOnce_NotifiesOnDriftNotOnFirstInSync(t *testing.T) { if got := testutil.ToFloat64(m.ChecksTotal.WithLabelValues(StatusInSync)); got != 1 { t.Fatalf("ChecksTotal{in_sync} = %v, want 1", got) } - if got := testutil.ToFloat64(m.DriftDomains); got != 1 { - t.Fatalf("DriftDomains gauge = %v, want 1", got) + if got := testutil.ToFloat64(m.DriftDomains); got != float64(st.driftCount) { + t.Fatalf("DriftDomains gauge = %v, want %d (from CountDriftDomains)", got, st.driftCount) } } @@ -229,10 +234,6 @@ func TestRunOnce_CheckError_StatusErrorAndNotify(t *testing.T) { if got := testutil.ToFloat64(m.ChecksTotal.WithLabelValues(StatusError)); got != 1 { t.Fatalf("ChecksTotal{error} = %v, want 1", got) } - // A failed Check has no changeset worth recording. - if len(st.savedCheckRuns) != 0 { - t.Fatalf("SaveCheckRun calls on error = %d, want 0", len(st.savedCheckRuns)) - } } func TestShouldNotify(t *testing.T) { @@ -252,7 +253,7 @@ func TestShouldNotify(t *testing.T) { {"in_sync->error notifies", StatusInSync, StatusError, true}, {"in_sync->in_sync is silent", StatusInSync, StatusInSync, false}, {"error->drift notifies (still bad, different bad)", StatusError, StatusDrift, true}, - {"error->in_sync is not the 'resolved' case, per spec", StatusError, StatusInSync, false}, + {"error->in_sync notifies (resolved after failure)", StatusError, StatusInSync, true}, } for _, tc := range cases { diff --git a/internal/store/db/domains.sql.go b/internal/store/db/domains.sql.go index dfa3255..805bd7d 100644 --- a/internal/store/db/domains.sql.go +++ b/internal/store/db/domains.sql.go @@ -12,6 +12,17 @@ import ( dto "github.com/vasyakrg/dns-autoresolver/internal/store/dto" ) +const countDriftDomains = `-- name: CountDriftDomains :one +SELECT count(*) FROM domains WHERE last_check_status = 'drift' +` + +func (q *Queries) CountDriftDomains(ctx context.Context) (int64, error) { + row := q.db.QueryRow(ctx, countDriftDomains) + var count int64 + err := row.Scan(&count) + return count, err +} + const createDomain = `-- name: CreateDomain :one INSERT INTO domains (id, project_id, provider_account_id, zone_name, zone_id, template_id) VALUES ($1, $2, $3, $4, $5, $6) diff --git a/internal/store/queries/domains.sql b/internal/store/queries/domains.sql index 72a8392..6cd3fcf 100644 --- a/internal/store/queries/domains.sql +++ b/internal/store/queries/domains.sql @@ -34,3 +34,6 @@ SELECT last_check_status FROM domains WHERE id = $1; -- name: SetDomainStatus :exec UPDATE domains SET last_check_status = $2 WHERE id = $1; + +-- name: CountDriftDomains :one +SELECT count(*) FROM domains WHERE last_check_status = 'drift'; diff --git a/internal/store/tenant.go b/internal/store/tenant.go index 7296f08..d5dec6f 100644 --- a/internal/store/tenant.go +++ b/internal/store/tenant.go @@ -258,6 +258,14 @@ func (s *Store) SetDomainStatus(ctx context.Context, domainID uuid.UUID, status return s.q.SetDomainStatus(ctx, db.SetDomainStatusParams{ID: domainID, LastCheckStatus: status}) } +// CountDriftDomains returns the current number of domains system-wide whose +// last check status is "drift". This is a global count (not per-project) — +// it backs the dns_ar_drift_domains gauge, which is a system-level metric. +func (s *Store) CountDriftDomains(ctx context.Context) (int, error) { + n, err := s.q.CountDriftDomains(ctx) + return int(n), err +} + // User and Project are provider-neutral domain structs for the auth/tenant // layer (Фаза 2), mirroring the Account/Template/Domain wrappers above so // callers never need to import internal/store/db directly.