---
# Prometheus alerting rules for the homelab: host resource usage (CPU, memory,
# disk, I/O wait) and scrape-target availability.
groups:
  - name: homelab_alerts
    # Evaluate every rule in this group every 30 seconds.
    interval: 30s
    rules:
      # CPU Usage Alert
      # Busy percentage = 100 - idle percentage, averaged over all CPU series
      # of each instance. rate() is used instead of irate() because irate()
      # only considers the last two samples, so short dips could reset the
      # `for` timer and keep the alert from ever firing.
      - alert: HighCPUUsage
        expr: >-
          100 - (avg by (instance)
          (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High CPU usage detected on {{ $labels.instance }}"
          description: "CPU usage is above 80% (current value: {{ $value }}%)"

      # Memory Usage Alert
      # Used percentage is derived from MemAvailable relative to MemTotal.
      - alert: HighMemoryUsage
        expr: >-
          (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes))
          * 100 > 85
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage detected on {{ $labels.instance }}"
          description: "Memory usage is above 85% (current value: {{ $value }}%)"

      # Disk Usage Alert
      # Evaluated per filesystem series; filesystems whose fstype matches
      # tmpfs or fuse.lxcfs are excluded on both sides of the division.
      - alert: HighDiskUsage
        expr: >-
          (1 - (node_filesystem_avail_bytes{fstype!~"tmpfs|fuse.lxcfs"}
          / node_filesystem_size_bytes{fstype!~"tmpfs|fuse.lxcfs"})) * 100 > 80
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High disk usage detected on {{ $labels.instance }}"
          description: "Disk usage on {{ $labels.mountpoint }} is above 80% (current value: {{ $value }}%)"

      # Node Down Alert
      # Fires when a target of the node-exporter job has up == 0 for 2 minutes.
      - alert: NodeDown
        expr: up{job="node-exporter"} == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Node {{ $labels.instance }} is down"
          description: "Node exporter on {{ $labels.instance }} has been down for more than 2 minutes"

      # Container Down Alert
      # Fires when a target of the docker job has up == 0 for 2 minutes.
      # NOTE(review): assumes a scrape job named "docker" exists; confirm
      # against the Prometheus scrape configuration.
      - alert: ContainerDown
        expr: up{job="docker"} == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Container {{ $labels.instance }} is down"
          description: "Docker container on {{ $labels.instance }} has been down for more than 2 minutes"

      # Disk I/O Alert (high wait time)
      # Averaged per instance so that the value is one iowait percentage per
      # host; without the aggregation the rule would be evaluated (and would
      # fire) separately for every CPU series.
      - alert: HighDiskIOWait
        expr: >-
          avg by (instance)
          (rate(node_cpu_seconds_total{mode="iowait"}[5m])) * 100 > 20
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High disk I/O wait on {{ $labels.instance }}"
          description: "Disk I/O wait time is above 20% (current value: {{ $value }}%)"