---
# Prometheus alerting rules for the homelab.
#
# A single rule group, evaluated every 30 seconds. Warning-level rules cover
# CPU, memory, disk space and I/O wait; critical-level rules cover scrape
# targets that report `up == 0`.
#
# NOTE(review): the metric names look like node_exporter metrics, and the
# `up` selectors assume scrape jobs named "node-exporter" and "docker" --
# confirm both against the Prometheus scrape configuration.
groups:
  - name: homelab_alerts
    interval: 30s
    rules:
      # CPU Usage Alert
      # Busy percentage per instance: 100 minus the idle share averaged
      # across all CPU series of that instance. rate() is used instead of
      # irate() because irate() only considers the last two samples in the
      # window, which makes an alert condition flap on short spikes.
      - alert: HighCPUUsage
        expr: >-
          100 - (avg by (instance)
          (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High CPU usage detected on {{ $labels.instance }}"
          description: "CPU usage is above 80% (current value: {{ $value }}%)"

      # Memory Usage Alert
      # Used-memory percentage, derived from available vs. total bytes.
      - alert: HighMemoryUsage
        expr: >-
          (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes))
          * 100 > 85
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage detected on {{ $labels.instance }}"
          description: "Memory usage is above 85% (current value: {{ $value }}%)"

      # Disk Usage Alert
      # Used-space percentage per filesystem series. Filesystems whose fstype
      # matches tmpfs or fuse.lxcfs are excluded on both sides of the division.
      - alert: HighDiskUsage
        expr: >-
          (1 - (
          node_filesystem_avail_bytes{fstype!~"tmpfs|fuse.lxcfs"}
          /
          node_filesystem_size_bytes{fstype!~"tmpfs|fuse.lxcfs"}
          )) * 100 > 80
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High disk usage detected on {{ $labels.instance }}"
          description: "Disk usage on {{ $labels.mountpoint }} is above 80% (current value: {{ $value }}%)"

      # Node Down Alert
      # Fires when a target of the "node-exporter" job has reported up == 0
      # for 2 minutes.
      - alert: NodeDown
        expr: up{job="node-exporter"} == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Node {{ $labels.instance }} is down"
          description: "Node exporter on {{ $labels.instance }} has been down for more than 2 minutes"

      # Container Down Alert
      # Fires when a target of the "docker" job has reported up == 0 for
      # 2 minutes.
      - alert: ContainerDown
        expr: up{job="docker"} == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Container {{ $labels.instance }} is down"
          description: "Docker container on {{ $labels.instance }} has been down for more than 2 minutes"

      # Disk I/O Alert (high wait time)
      # iowait share averaged per instance. Without the aggregation the
      # expression yields one series per CPU core, so one host could raise a
      # separate alert for every core; averaging matches HighCPUUsage and the
      # per-instance wording of the annotations.
      - alert: HighDiskIOWait
        expr: >-
          avg by (instance)
          (rate(node_cpu_seconds_total{mode="iowait"}[5m])) * 100 > 20
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High disk I/O wait on {{ $labels.instance }}"
          description: "Disk I/O wait time is above 20% (current value: {{ $value }}%)"