This commit is contained in:
2024-08-02 17:24:31 +07:00
parent 23fa8b4ff5
commit 89eec9691a
9 changed files with 146 additions and 64 deletions

41
4.monitoring/alerts.yaml Normal file
View File

@@ -0,0 +1,41 @@
# PrometheusRule defining the PostgreSQL alerts for the "zalando" namespace:
# exporter scrape failure, database instance down, and replication lag.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: prometheus-zalando-rules
  namespace: zalando
  # NOTE(review): presumably these labels are what the Prometheus resource's
  # ruleSelector matches on - confirm against the Prometheus CR.
  labels:
    prometheus: zalando
    role: alert-rules
  # metadata.generation is intentionally not set here: it is populated by the
  # API server and should not be carried in a checked-in manifest.
spec:
  groups:
    - name: zalando.rules
      rules:
        # Fires when the scrape target of job "zalando-monitoring" has been
        # unreachable (up == 0) for 2 minutes.
        # NOTE(review): confirm a scrape job with exactly this name exists;
        # the job label is set by whichever monitor produces the target.
        - alert: PG exporter
          expr: 'up{job="zalando-monitoring"} == 0'
          for: 2m
          labels:
            severity: warning
          annotations:
            title: PG exporter is down
            description: >-
              Failed to scrape {{ $labels.job }} on {{ $labels.namespace }}
              for more than 2 minutes.

        # Fires when pg_up has reported 0 for 3 minutes.
        - alert: PG InstanceDown
          expr: pg_up == 0
          for: 3m
          labels:
            severity: critical
          annotations:
            title: PG Instance is down
            description: >-
              Failed to scrape {{$labels.namespace}} for more than 3 minutes.

        # Fires when the replication-slot WAL LSN difference has stayed above
        # 1024 for 3 minutes.
        # NOTE(review): the unit is presumably bytes, which would make 1024 a
        # very low threshold - confirm against the exporter query definition.
        - alert: PG Lag
          expr: pg_replication_slots_pg_wal_lsn_diff > 1024
          for: 3m
          labels:
            severity: critical
          annotations:
            title: PG Replication lag
            description: >-
              Failed replication on replica {{$labels.namespace}} for more
              than 3 minutes.

View File

@@ -0,0 +1,19 @@
# PodMonitor that scrapes the metrics endpoint of pods labelled
# "application: spilo".
apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
  name: zalando-pg-exporter
  namespace: rm-pgsql
  labels:
    app: psql-pg-exporter
spec:
  # Pods are picked by label; no namespaceSelector is set in this manifest.
  selector:
    matchLabels:
      application: spilo
  podMetricsEndpoints:
    # Plain-HTTP scrape of /metrics on the container port named "exporter".
    - port: exporter
      path: /metrics
      scheme: http
      relabelings:
        # Stamp every target with a constant "cluster" label.
        - targetLabel: cluster
          replacement: rke-first-cluster
          action: replace