Initial commit: homelab configuration and documentation

This commit is contained in:
2025-11-29 19:03:14 +00:00
commit 0769ca6888
72 changed files with 7806 additions and 0 deletions

View File

@@ -0,0 +1,63 @@
# Prometheus alerting rules for the homelab.
# Nesting matters: each rule must live under its group's `rules` list, and
# `labels` / `annotations` must be mappings nested inside the rule.
groups:
  - name: homelab_alerts
    # How often the rules in this group are evaluated.
    interval: 30s
    rules:
      # CPU Usage Alert
      # Busy percentage = 100 - idle percentage, averaged across all cores of
      # an instance. rate() is used instead of irate() because irate() only
      # considers the last two samples, so brief dips can reset the `for` timer.
      - alert: HighCPUUsage
        expr: 100 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High CPU usage detected on {{ $labels.instance }}"
          description: "CPU usage is above 80% (current value: {{ $value }}%)"

      # Memory Usage Alert
      # Used percentage is derived from MemAvailable relative to MemTotal.
      - alert: HighMemoryUsage
        expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 85
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage detected on {{ $labels.instance }}"
          description: "Memory usage is above 85% (current value: {{ $value }}%)"

      # Disk Usage Alert
      # Evaluated per filesystem series (one alert per mountpoint); tmpfs and
      # fuse.lxcfs filesystems are excluded on both sides of the division.
      # The expression is folded (>-) only to keep the line length readable.
      - alert: HighDiskUsage
        expr: >-
          (1 - (node_filesystem_avail_bytes{fstype!~"tmpfs|fuse.lxcfs"}
          / node_filesystem_size_bytes{fstype!~"tmpfs|fuse.lxcfs"})) * 100 > 80
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High disk usage detected on {{ $labels.instance }}"
          description: "Disk usage on {{ $labels.mountpoint }} is above 80% (current value: {{ $value }}%)"

      # Node Down Alert
      # `up` is 0 when the last scrape of a target in the "node-exporter" job failed.
      - alert: NodeDown
        expr: up{job="node-exporter"} == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Node {{ $labels.instance }} is down"
          description: "Node exporter on {{ $labels.instance }} has been down for more than 2 minutes"

      # Container Down Alert
      # Fires when a scrape target in the "docker" job is unreachable.
      # NOTE(review): presumably each target of that job maps to a container or
      # the Docker host exporter - confirm against the scrape configuration.
      - alert: ContainerDown
        expr: up{job="docker"} == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Container {{ $labels.instance }} is down"
          description: "Docker container on {{ $labels.instance }} has been down for more than 2 minutes"

      # Disk I/O Alert (high wait time)
      # Averaged across all cores per instance so a host raises a single alert
      # (consistent with HighCPUUsage) instead of one alert per CPU core.
      - alert: HighDiskIOWait
        expr: avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) * 100 > 20
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High disk I/O wait on {{ $labels.instance }}"
          description: "Disk I/O wait time is above 20% (current value: {{ $value }}%)"