Initial commit: homelab configuration and documentation
This commit is contained in:
63
monitoring/grafana/alert_rules.yml
Normal file
63
monitoring/grafana/alert_rules.yml
Normal file
@@ -0,0 +1,63 @@
|
||||
---
# Prometheus-format alerting rules for the homelab.
# Every rule lives in the single `homelab_alerts` group, which is evaluated
# every 30 seconds. Each rule must hold for its `for:` duration before firing.
groups:
  - name: homelab_alerts
    interval: 30s
    rules:
      # CPU Usage Alert
      # Busy % = 100 - idle %, averaged across all cores of each instance.
      # rate() is used instead of irate(): irate() only considers the last two
      # samples in the window, which makes an alert flap on short spikes.
      - alert: HighCPUUsage
        expr: 100 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High CPU usage detected on {{ $labels.instance }}"
          description: "CPU usage is above 80% (current value: {{ $value }}%)"

      # Memory Usage Alert
      # Used % = (1 - MemAvailable / MemTotal) * 100.
      - alert: HighMemoryUsage
        expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 85
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage detected on {{ $labels.instance }}"
          description: "Memory usage is above 85% (current value: {{ $value }}%)"

      # Disk Usage Alert
      # Used % = (1 - avail / size) * 100, evaluated per filesystem series.
      # tmpfs and fuse.lxcfs filesystems are excluded on both sides of the
      # division so that the two selectors match the same series.
      - alert: HighDiskUsage
        expr: |-
          (
            1 - (
              node_filesystem_avail_bytes{fstype!~"tmpfs|fuse.lxcfs"}
              / node_filesystem_size_bytes{fstype!~"tmpfs|fuse.lxcfs"}
            )
          ) * 100 > 80
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High disk usage detected on {{ $labels.instance }}"
          description: "Disk usage on {{ $labels.mountpoint }} is above 80% (current value: {{ $value }}%)"

      # Node Down Alert
      # Fires when the `up` series of the `node-exporter` job is 0.
      - alert: NodeDown
        expr: up{job="node-exporter"} == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Node {{ $labels.instance }} is down"
          description: "Node exporter on {{ $labels.instance }} has been down for more than 2 minutes"

      # Container Down Alert
      # Fires when the `up` series of the `docker` job is 0.
      # NOTE(review): `up` reflects whether the scrape target of the `docker`
      # job is reachable, not the state of individual containers - confirm
      # that this is the intended signal.
      - alert: ContainerDown
        expr: up{job="docker"} == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Container {{ $labels.instance }} is down"
          description: "Docker container on {{ $labels.instance }} has been down for more than 2 minutes"

      # Disk I/O Alert (high wait time)
      # iowait % averaged across all cores of each instance, so one alert is
      # raised per host (matching HighCPUUsage) instead of one per CPU core.
      - alert: HighDiskIOWait
        expr: avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) * 100 > 20
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High disk I/O wait on {{ $labels.instance }}"
          description: "Disk I/O wait time is above 20% (current value: {{ $value }}%)"
|
||||
Reference in New Issue
Block a user