# Extracted from a file-viewer listing (originally shown as: 41 lines, 1.2 KiB, YAML).
---
# PrometheusRule defining PostgreSQL alerts in the "zalando.rules" group.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  # NOTE(review): these labels are presumably what the Prometheus custom
  # resource's ruleSelector matches on - confirm against that resource.
  labels:
    prometheus: zalando
    role: alert-rules
  name: prometheus-zalando-rules
  namespace: zalando
spec:
  groups:
    - name: zalando.rules
      rules:
        # Warning: the `up` series for job "zalando-monitoring" has been 0
        # for 2 minutes.
        - alert: PG exporter
          annotations:
            description: >-
              Failed to scrape {{ $labels.job }} on {{ $labels.namespace }} for
              more than 2 minutes.
            title: PG exporter is down
          expr: up{job="zalando-monitoring"} == 0
          for: 2m
          labels:
            severity: warning

        # Critical: `pg_up` has been 0 for 3 minutes.
        - alert: PG InstanceDown
          annotations:
            description: Failed to scrape {{$labels.namespace}} for more than 3 minutes.
            title: PG Instance is down
          expr: pg_up == 0
          for: 3m
          labels:
            severity: critical

        # Critical: the replication-slot WAL LSN diff has stayed above 1024
        # for 3 minutes.
        # NOTE(review): the threshold is presumably in bytes of WAL - confirm
        # against the exporter query that defines this metric.
        - alert: PG Lag
          annotations:
            description: >-
              Failed replication on replica {{$labels.namespace}} for more than
              3 minutes.
            title: PG Replication lag
          expr: pg_replication_slots_pg_wal_lsn_diff > 1024
          for: 3m
          labels:
            severity: critical