# zl-base-in-kube/4.monitoring/alerts.yaml

# PrometheusRule resource declaring the "zalando.rules" alert group.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: prometheus-zalando-rules
  namespace: zalando
  # NOTE(review): presumably these labels are what a Prometheus resource's
  # rule selector matches on — confirm against the Prometheus CR.
  labels:
    role: alert-rules
    prometheus: zalando
spec:
  groups:
    # Single group; every alert rule listed under it belongs to zalando.rules.
    - name: zalando.rules
      rules:
        # Fires once `up` for job "zalando-monitoring" has been 0 for 2 minutes.
        - alert: PG exporter
          expr: up{job="zalando-monitoring"} == 0
          for: 2m
          labels:
            severity: warning
          annotations:
            title: PG exporter is down
            # Folded scalar: the two lines below join into one sentence.
            description: >-
              Failed to scrape {{ $labels.job }} on {{ $labels.namespace }}
              for more than 2 minutes.
        # Fires once `pg_up` has been 0 for 3 minutes.
        - alert: PG InstanceDown
          expr: pg_up == 0
          for: 3m
          labels:
            severity: critical
          annotations:
            title: PG Instance is down
            description: >-
              Failed to scrape {{$labels.namespace}} for more than 3 minutes.
        # Fires once pg_replication_slots_pg_wal_lsn_diff has stayed above
        # 1024 for 3 minutes.
        # NOTE(review): the threshold's unit is presumably bytes of WAL —
        # confirm against the exporter query that defines this metric.
        - alert: PG Lag
          annotations:
            description: >-
              Failed replication on replica {{$labels.namespace}} for more than
              3 minutes.
            # Corrected typo: was "PG Peplication lag".
            title: PG Replication lag
          expr: pg_replication_slots_pg_wal_lsn_diff > 1024
          for: 3m
          labels:
            severity: critical