init
This commit is contained in:
126
02-validation/03-reporting/README.md
Normal file
126
02-validation/03-reporting/README.md
Normal file
@@ -0,0 +1,126 @@
|
||||
# Урок 2.3 — Обработка ошибок и отчётность
|
||||
|
||||
## Файлы
|
||||
|
||||
| Файл | Описание |
|
||||
|------|----------|
|
||||
| `prometheus-alert-rules.yaml` | PrometheusRule с алертами |
|
||||
| `service-monitor.yaml` | ServiceMonitor для Prometheus Operator |
|
||||
|
||||
## Работа с PolicyReport
|
||||
|
||||
```bash
|
||||
# Посмотреть все отчёты
|
||||
kubectl get policyreports -A
|
||||
kubectl get clusterpolicyreports
|
||||
|
||||
# Детальный отчёт по namespace
|
||||
kubectl describe policyreport -n production
|
||||
|
||||
# Все нарушения в кластере (форматированный вывод)
|
||||
# `.results[]?` skips reports that carry no results instead of aborting with
# "Cannot iterate over null"; the resource name falls back to the report-level
# scope when a result has no `resources` list.
kubectl get policyreports -A -o json | \
  jq -r '.items[] | .metadata.namespace as $ns | (.scope.name // "-") as $scope |
    .results[]? | select(.result == "fail") |
    "\($ns)/\(.resources[0].name // $scope)\t\(.policy)/\(.rule)\t\(.message)"' | \
  column -t -s $'\t'
|
||||
|
||||
# Топ-10 политик по нарушениям
|
||||
# `.results[]?` tolerates reports without a results list, so one empty report
# does not abort the whole aggregation.
kubectl get policyreports -A -o json | \
  jq -r '[.items[] | .results[]? | select(.result == "fail") | .policy] |
    group_by(.) | map({policy: .[0], count: length}) |
    sort_by(-.count)[:10][] | "\(.count)\t\(.policy)"'
|
||||
|
||||
# Нарушения по конкретной политике
|
||||
POLICY="require-resource-limits"
# The policy name is passed with --arg so it is never interpolated into the jq
# program. `.results[]?` skips reports without results; the resource name falls
# back to the report-level scope when a result has no `resources` list.
kubectl get policyreports -A -o json | \
  jq --arg p "$POLICY" \
    -r '.items[] | .metadata.namespace as $ns | (.scope.name // "-") as $scope |
      .results[]? | select(.policy == $p and .result == "fail") |
      "\($ns)/\(.resources[0].name // $scope): \(.message)"'
|
||||
|
||||
# Compliance rate по namespace
|
||||
# Prints one line per PolicyReport with its pass/fail counts. `.results[]?`
# yields an empty list (count 0) for reports that have no results instead of
# failing with "Cannot iterate over null".
kubectl get policyreports -A -o json | \
  jq -r '.items[] | .metadata.namespace as $ns |
    {ns: $ns, pass: [.results[]? | select(.result=="pass")] | length,
     fail: [.results[]? | select(.result=="fail")] | length} |
    "\(.ns): pass=\(.pass) fail=\(.fail)"'
|
||||
```
|
||||
|
||||
## Prometheus метрики
|
||||
|
||||
```bash
|
||||
# Проверить доступность метрик (port-forward если нет Ingress)
|
||||
kubectl port-forward -n kyverno svc/kyverno-svc-metrics 8000:8000 &
|
||||
|
||||
# Посмотреть все метрики
|
||||
curl -s http://localhost:8000/metrics | grep kyverno_
|
||||
|
||||
# Ключевые метрики
|
||||
curl -s http://localhost:8000/metrics | grep kyverno_policy_results_total
|
||||
curl -s http://localhost:8000/metrics | grep kyverno_admission_review_duration
|
||||
```
|
||||
|
||||
## Полезные PromQL запросы (для Grafana)
|
||||
|
||||
```promql
|
||||
# Compliance rate (цель: 1.0 = 100%)
|
||||
sum(rate(kyverno_policy_results_total{rule_result="pass"}[5m])) /
|
||||
sum(rate(kyverno_policy_results_total[5m]))
|
||||
|
||||
# Нарушений в час по политикам
|
||||
topk(10, sum by(policy_name)(
|
||||
increase(kyverno_policy_results_total{rule_result="fail"}[1h])
|
||||
))
|
||||
|
||||
# p95 латентность admission
|
||||
histogram_quantile(0.95,
|
||||
sum(rate(kyverno_admission_review_duration_seconds_bucket[5m])) by (le)
|
||||
)
|
||||
|
||||
# Нарушения по namespace за сутки
|
||||
sum by(resource_namespace)(
|
||||
increase(kyverno_policy_results_total{rule_result="fail"}[24h])
|
||||
)
|
||||
```
|
||||
|
||||
## Применение мониторинга
|
||||
|
||||
```bash
|
||||
# Применить ServiceMonitor (требует Prometheus Operator)
|
||||
kubectl apply -f service-monitor.yaml
|
||||
|
||||
# Применить правила алертов
|
||||
kubectl apply -f prometheus-alert-rules.yaml
|
||||
|
||||
# Проверить что правила подхватились
|
||||
kubectl get prometheusrule -n kyverno
|
||||
```
|
||||
|
||||
## Режим Audit для существующего кластера
|
||||
|
||||
```bash
|
||||
# Применить политику в Audit режиме
|
||||
kubectl apply -f - <<EOF
|
||||
apiVersion: kyverno.io/v1
|
||||
kind: ClusterPolicy
|
||||
metadata:
|
||||
name: audit-all-pods
|
||||
spec:
|
||||
validationFailureAction: Audit
|
||||
background: true
|
||||
rules:
|
||||
- name: check-labels
|
||||
match:
|
||||
resources:
|
||||
kinds: [Pod]
|
||||
validate:
|
||||
message: "Под должен иметь лейбл app"
|
||||
pattern:
|
||||
metadata:
|
||||
labels:
|
||||
app: "?*"
|
||||
EOF
|
||||
|
||||
# Подождать background scan (30-60 секунд), потом:
|
||||
# The table output has separate PASS/FAIL columns and never contains "0/0",
# so grepping for it filters nothing. Select reports by their summary instead.
kubectl get policyreports -A -o json | \
  jq -r '.items[] | select((.summary.fail // 0) > 0) |
    "\(.metadata.namespace)/\(.metadata.name)\tpass=\(.summary.pass // 0)\tfail=\(.summary.fail // 0)"' | \
  column -t -s $'\t'
|
||||
```
|
||||
99
02-validation/03-reporting/prometheus-alert-rules.yaml
Normal file
99
02-validation/03-reporting/prometheus-alert-rules.yaml
Normal file
@@ -0,0 +1,99 @@
|
||||
---
# PrometheusRule (Prometheus Operator) with alerting and recording rules
# built on Kyverno metrics and on container CPU throttling metrics.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: kyverno-alerts
  namespace: kyverno
  labels:
    # NOTE(review): presumably these must match the ruleSelector of the
    # Prometheus resource in the cluster — confirm.
    prometheus: kube-prometheus
    role: alert-rules
spec:
  groups:
    - name: kyverno.availability
      rules:
        - alert: KyvernoDown
          # `up == 0` only matches while the target is still being scraped;
          # absent() also fires when the series disappears entirely (for
          # example the Service or all pods are gone).
          expr: >
            up{job="kyverno-svc-metrics"} == 0
            or
            absent(up{job="kyverno-svc-metrics"})
          for: 1m
          labels:
            severity: critical
          annotations:
            summary: "Kyverno недоступен"
            description: >-
              Admission controller Kyverno не отвечает более 1 минуты.
              Проверьте поды: kubectl get pods -n kyverno

        - alert: KyvernoAdmissionLatencyHigh
          # p95 of the admission review duration over 5m, threshold 0.5s.
          expr: >
            histogram_quantile(0.95,
              sum(rate(kyverno_admission_review_duration_seconds_bucket[5m])) by (le)
            ) > 0.5
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Высокая латентность Kyverno admission (p95 > 500ms)"
            description: >-
              p95 латентность: {{ $value | humanizeDuration }}.
              Это замедляет деплойменты. Проверьте политики с apiCall в context.

        - alert: KyvernoAdmissionErrors
          # The previous selector used admission_request_type="error" on
          # kyverno_admission_requests_total; that label is not part of the
          # documented metric, so the alert could never fire. Rule evaluation
          # errors are exposed as rule_result="error" on
          # kyverno_policy_results_total.
          # NOTE(review): confirm label names against the deployed Kyverno
          # version.
          expr: >
            sum(rate(kyverno_policy_results_total{rule_result="error"}[5m])) > 0
          for: 2m
          labels:
            severity: critical
          annotations:
            summary: "Ошибки обработки запросов в Kyverno"
            description: "Kyverno возвращает ошибки. Проверьте логи: kubectl logs -n kyverno -l app.kubernetes.io/component=admission-controller"

    - name: kyverno.policy
      rules:
        - alert: KyvernoCriticalPolicyViolation
          # Fires immediately (for: 0m) on any failed result of a policy whose
          # name matches one of the listed prefixes.
          expr: >
            increase(kyverno_policy_results_total{
              rule_result="fail",
              policy_name=~"disallow-privileged.*|disallow-host.*|disallow-dangerous.*"
            }[5m]) > 0
          for: 0m
          labels:
            severity: critical
          annotations:
            summary: "Нарушение критической политики безопасности: {{ $labels.policy_name }}"
            description: >-
              Политика {{ $labels.policy_name }} была нарушена в namespace {{ $labels.resource_namespace }}.
              Немедленно проверьте: kubectl get policyreports -n {{ $labels.resource_namespace }}

        - alert: KyvernoHighViolationRate
          # Cluster-wide count of failed results over the last hour.
          expr: >
            sum(increase(kyverno_policy_results_total{rule_result="fail"}[1h])) > 50
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Высокое количество нарушений политик (> 50 за час)"
            description: >-
              За последний час: {{ $value }} нарушений.
              Проверьте отчёты: kubectl get policyreports -A

    - name: kyverno.performance
      rules:
        - alert: KyvernoCPUThrottling
          # More than 0.1s of CFS throttling per second, sustained for 10m, on
          # containers named kyverno* in the kyverno namespace.
          expr: >
            rate(container_cpu_cfs_throttled_seconds_total{
              namespace="kyverno",
              container=~"kyverno.*"
            }[5m]) > 0.1
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: "CPU throttling Kyverno — возможна деградация производительности"
            description: "Увеличьте CPU limit для Kyverno admission controller."

    - name: kyverno.recording
      rules:
        # Share of "pass" results among all policy results over 5m.
        - record: kyverno:compliance_rate:5m
          expr: >
            sum(rate(kyverno_policy_results_total{rule_result="pass"}[5m])) /
            sum(rate(kyverno_policy_results_total[5m]))
|
||||
17
02-validation/03-reporting/service-monitor.yaml
Normal file
17
02-validation/03-reporting/service-monitor.yaml
Normal file
@@ -0,0 +1,17 @@
|
||||
---
# ServiceMonitor (Prometheus Operator) describing how Kyverno metrics are
# scraped.
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  labels:
    # NOTE(review): presumably this must match the serviceMonitorSelector of
    # the Prometheus resource in the cluster — confirm.
    prometheus: kube-prometheus
  name: kyverno-metrics
  namespace: kyverno
spec:
  # Scrape the Service port named "metrics-port" at /metrics every 30 seconds.
  endpoints:
    - honorLabels: true
      interval: 30s
      path: /metrics
      port: metrics-port
  # Only Services carrying both labels below are selected.
  selector:
    matchLabels:
      app.kubernetes.io/component: admission-controller
      app.kubernetes.io/name: kyverno
|
||||
Reference in New Issue
Block a user