fix(scheduler): убрать двойной SaveCheckRun (Checker персистит), SetDrift через CountDriftDomains, resolved после error
This commit is contained in:
@@ -28,14 +28,16 @@ const (
|
||||
)
|
||||
|
||||
// SchedStore is the narrow store dependency the scheduler needs: due
|
||||
// schedules, their domains, and per-domain status bookkeeping.
|
||||
// schedules, their domains, and per-domain status bookkeeping. Persisting
|
||||
// the check result itself (check_runs) is the Checker's job — see Checker
|
||||
// below — not the scheduler's.
|
||||
type SchedStore interface {
|
||||
ListDueSchedules(ctx context.Context, now time.Time) ([]store.Schedule, error)
|
||||
TouchScheduleRun(ctx context.Context, projectID uuid.UUID, at time.Time) error
|
||||
ListDomains(ctx context.Context, projectID uuid.UUID) ([]store.Domain, error)
|
||||
GetDomainStatus(ctx context.Context, domainID uuid.UUID) (string, error)
|
||||
SetDomainStatus(ctx context.Context, domainID uuid.UUID, status string) error
|
||||
SaveCheckRun(ctx context.Context, domainID uuid.UUID, cs diff.Changeset) error
|
||||
CountDriftDomains(ctx context.Context) (int, error)
|
||||
}
|
||||
|
||||
// Checker computes the diff between a domain's desired template and its
|
||||
@@ -92,8 +94,6 @@ func (s *Scheduler) RunOnce(ctx context.Context, now time.Time) error {
|
||||
return fmt.Errorf("list due schedules: %w", err)
|
||||
}
|
||||
|
||||
driftCount := 0
|
||||
|
||||
for _, sch := range due {
|
||||
domains, err := s.store.ListDomains(ctx, sch.ProjectID)
|
||||
if err != nil {
|
||||
@@ -102,9 +102,7 @@ func (s *Scheduler) RunOnce(ctx context.Context, now time.Time) error {
|
||||
}
|
||||
|
||||
for _, d := range domains {
|
||||
if s.checkDomain(ctx, sch.ProjectID, d, now) == StatusDrift {
|
||||
driftCount++
|
||||
}
|
||||
s.checkDomain(ctx, sch.ProjectID, d, now)
|
||||
}
|
||||
|
||||
if err := s.store.TouchScheduleRun(ctx, sch.ProjectID, now); err != nil {
|
||||
@@ -112,7 +110,15 @@ func (s *Scheduler) RunOnce(ctx context.Context, now time.Time) error {
|
||||
}
|
||||
}
|
||||
|
||||
s.metrics.SetDrift(driftCount)
|
||||
// The real, system-wide count of drift domains — not a local
|
||||
// accumulator scoped to this tick's due projects — so the gauge
|
||||
// reflects reality even across ticks where different projects are due.
|
||||
count, err := s.store.CountDriftDomains(ctx)
|
||||
if err != nil {
|
||||
log.Printf("scheduler: count drift domains failed: %v", err)
|
||||
} else {
|
||||
s.metrics.SetDrift(count)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -139,12 +145,10 @@ func (s *Scheduler) checkDomain(ctx context.Context, projectID uuid.UUID, d stor
|
||||
prev = StatusUnknown
|
||||
}
|
||||
|
||||
// A failed Check has no changeset worth recording; a successful one does.
|
||||
if checkErr == nil {
|
||||
if err := s.store.SaveCheckRun(ctx, d.ID, cs); err != nil {
|
||||
log.Printf("scheduler: save check run for %s failed: %v", d.ID, err)
|
||||
}
|
||||
}
|
||||
// Persisting the check_runs row is the Checker's job: DomainService.Check
|
||||
// already calls Recorder.SaveCheckRun internally on every successful
|
||||
// check (drift or in_sync). Calling it again here would double-write
|
||||
// check_runs history for the same check.
|
||||
|
||||
if err := s.store.SetDomainStatus(ctx, d.ID, newStatus); err != nil {
|
||||
log.Printf("scheduler: set domain status for %s failed: %v", d.ID, err)
|
||||
@@ -170,15 +174,16 @@ func (s *Scheduler) checkDomain(ctx context.Context, projectID uuid.UUID, d stor
|
||||
// shouldNotify decides whether a prev -> new status transition is worth
|
||||
// alerting on:
|
||||
// - entering drift or error from any other status is always notified;
|
||||
// - recovering from drift back to in_sync ("resolved") is notified;
|
||||
// - recovering from drift OR error back to in_sync ("resolved") is
|
||||
// notified — including recovery after a provider/check failure;
|
||||
// - the initial unknown -> in_sync transition (first successful check of a
|
||||
// domain that never drifted) is NOT notified — it is not news, it is the
|
||||
// expected steady state.
|
||||
// domain that never drifted or errored) is NOT notified — it is not
|
||||
// news, it is the expected steady state.
|
||||
func shouldNotify(prev, newStatus string) bool {
|
||||
if (newStatus == StatusDrift || newStatus == StatusError) && newStatus != prev {
|
||||
return true
|
||||
}
|
||||
if prev == StatusDrift && newStatus == StatusInSync {
|
||||
if (prev == StatusDrift || prev == StatusError) && newStatus == StatusInSync {
|
||||
return true
|
||||
}
|
||||
return false
|
||||
|
||||
Reference in New Issue
Block a user