feat(server): graceful scheduler shutdown, /healthz, healthcheck mode

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_01BwxdSt4reTm7Dj1oxRvpP3
This commit is contained in:
2026-07-04 15:46:56 +07:00
parent c265d36bdb
commit a27ddc79e8
2 changed files with 132 additions and 16 deletions
+62 -15
View File
@@ -8,6 +8,7 @@ import (
"os"
"os/signal"
"strings"
"sync"
"syscall"
"time"
@@ -49,7 +50,57 @@ func isAPIPath(path string) bool {
return path == "/api" || strings.HasPrefix(path, "/api/")
}
// buildMux assembles the top-level request router. Requests are matched on
// r.URL.Path in a fixed order:
//
//  1. "/healthz" - unauthenticated liveness probe; always answers 200 "ok"
//     for as long as the process is serving requests.
//  2. "/metrics" - delegated to metricsHandler, also without auth (the
//     handler is expected to expose only aggregate counters/gauges).
//  3. any API path (see isAPIPath) - delegated to apiRouter.
//  4. everything else - delegated to webHandler (the embedded SPA), or a
//     plain 404 when webHandler is nil.
func buildMux(metricsHandler http.Handler, apiRouter http.Handler, webHandler http.Handler) http.Handler {
	route := func(w http.ResponseWriter, r *http.Request) {
		path := r.URL.Path

		if path == "/healthz" {
			w.WriteHeader(http.StatusOK)
			_, _ = w.Write([]byte("ok"))
			return
		}
		if path == "/metrics" {
			metricsHandler.ServeHTTP(w, r)
			return
		}
		if isAPIPath(path) {
			apiRouter.ServeHTTP(w, r)
			return
		}
		// No SPA bundled: nothing left that could serve this path.
		if webHandler == nil {
			http.NotFound(w, r)
			return
		}
		webHandler.ServeHTTP(w, r)
	}
	return http.HandlerFunc(route)
}
// healthcheck performs a liveness probe against the running server and is
// meant to be used as the container HEALTHCHECK — distroless images have no
// curl/wget. It GETs /healthz on the listen address taken from the
// DNS_AR_LISTEN environment variable (default ":8080") and returns a process
// exit code: 0 when the response status is exactly 200, 1 for any other
// status or any transport error (connection refused, timeout, ...).
//
// The probe always talks to the server directly: proxy settings from the
// environment are ignored and redirects are not followed, so the result
// reflects only what the local listener itself answered.
func healthcheck() int {
	addr := os.Getenv("DNS_AR_LISTEN")
	if addr == "" {
		addr = ":8080"
	}
	// A host-less listen address (":8080") is probed over loopback:
	// ":8080" -> "127.0.0.1:8080".
	if strings.HasPrefix(addr, ":") {
		addr = "127.0.0.1" + addr
	}

	// The default transport honours HTTP_PROXY/http_proxy for every host that
	// is not localhost/loopback, so a listen address such as "0.0.0.0:8080"
	// would be probed through the proxy and reported unhealthy. A local
	// liveness probe must never leave the box, hence Proxy is nil.
	transport := &http.Transport{
		Proxy:             nil,
		DisableKeepAlives: true, // one-shot process; nothing to reuse
	}
	// Report the first response as-is: a 3xx is "not 200", not an invitation
	// to go and probe some other URL.
	noFollow := func(*http.Request, []*http.Request) error {
		return http.ErrUseLastResponse
	}
	c := &http.Client{
		Timeout:       3 * time.Second,
		Transport:     transport,
		CheckRedirect: noFollow,
	}

	resp, err := c.Get("http://" + addr + "/healthz")
	if err != nil {
		return 1
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return 1
	}
	return 0
}
func main() {
if len(os.Args) > 1 && os.Args[1] == "-healthcheck" {
os.Exit(healthcheck())
}
ctx, stop := signal.NotifyContext(context.Background(), os.Interrupt, syscall.SIGTERM)
defer stop()
@@ -97,22 +148,14 @@ func main() {
// internally and never stop the loop; ctx cancellation (signal) is the
// only thing that ends Run.
sched := scheduler.New(st, svc, dispatcher, m)
go sched.Run(ctx, schedulerTick)
var wg sync.WaitGroup
wg.Add(1)
go func() {
defer wg.Done()
sched.Run(ctx, schedulerTick)
}()
mux := http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
switch {
case r.URL.Path == "/metrics":
// Public by design (no auth) — Metrics.Handler only ever exposes
// aggregate counters/gauges, never per-domain or secret data.
m.Handler().ServeHTTP(w, r)
case isAPIPath(r.URL.Path):
apiRouter.ServeHTTP(w, r)
case webHandler != nil:
webHandler.ServeHTTP(w, r)
default:
http.NotFound(w, r)
}
})
mux := buildMux(m.Handler(), apiRouter, webHandler)
srv := &http.Server{Addr: cfg.ListenAddr, Handler: mux}
@@ -135,6 +178,10 @@ func main() {
log.Printf("server: graceful shutdown failed: %v", err)
}
<-serveErr
// Wait for the in-flight scheduler RunOnce (interrupted by the
// cancelled ctx passed into checker.Check) to finish before exiting,
// so we never kill the process mid-write of a check/notify status.
wg.Wait()
log.Printf("server stopped")
case err := <-serveErr:
if err != nil {