Refactor: Reorganize services into standalone structure

This commit is contained in:
2026-01-25 15:19:53 -06:00
parent cf360234c1
commit 10521ee94d
52 changed files with 3253 additions and 11 deletions

View File

@@ -0,0 +1,248 @@
version: '3.8'
networks:
traefik-public:
external: true
monitoring:
driver: overlay
volumes:
prometheus_data:
grafana_data:
alertmanager_data:
secrets:
grafana_admin_password:
external: true
configs:
prometheus_config:
external: true
name: prometheus.yml
alertmanager_config:
external: true
name: alertmanager.yml
services:
prometheus:
image: prom/prometheus:latest
volumes:
- prometheus_data:/prometheus
configs:
- source: prometheus_config
target: /etc/prometheus/prometheus.yml
networks:
- monitoring
- traefik-public
healthcheck:
test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:9090/-/healthy"]
interval: 30s
timeout: 5s
retries: 3
start_period: 30s
deploy:
placement:
constraints:
- node.role == manager
resources:
limits:
memory: 2G
cpus: '1.0'
reservations:
memory: 512M
cpus: '0.25'
restart_policy:
condition: on-failure
delay: 5s
max_attempts: 3
update_config:
parallelism: 1
delay: 10s
failure_action: rollback
labels:
- "traefik.enable=true"
- "traefik.http.routers.prometheus.rule=Host(`prometheus.sterl.xyz`)"
- "traefik.http.routers.prometheus.entrypoints=websecure"
- "traefik.http.routers.prometheus.tls.certresolver=cfresolver"
- "traefik.http.services.prometheus.loadbalancer.server.port=9090"
- "traefik.swarm.network=traefik-public"
- "docktail.enable=true"
- "docktail.name=prometheus"
- "docktail.container_port=9090"
logging:
driver: "json-file"
options:
max-size: "10m"
max-file: "3"
grafana:
image: grafana/grafana:latest
volumes:
- grafana_data:/var/lib/grafana
environment:
- GF_SERVER_ROOT_URL=https://grafana.sterl.xyz
- GF_SECURITY_ADMIN_PASSWORD__FILE=/run/secrets/grafana_admin_password
secrets:
- grafana_admin_password
networks:
- monitoring
- traefik-public
healthcheck:
test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:3000/api/health"]
interval: 30s
timeout: 5s
retries: 3
start_period: 30s
deploy:
placement:
constraints:
- node.role == manager
resources:
limits:
memory: 1G
cpus: '1.0'
reservations:
memory: 256M
cpus: '0.25'
restart_policy:
condition: on-failure
delay: 5s
max_attempts: 3
update_config:
parallelism: 1
delay: 10s
failure_action: rollback
labels:
- "traefik.enable=true"
- "traefik.http.routers.grafana.rule=Host(`grafana.sterl.xyz`)"
- "traefik.http.routers.grafana.entrypoints=websecure"
- "traefik.http.routers.grafana.tls.certresolver=cfresolver"
- "traefik.http.services.grafana.loadbalancer.server.port=3000"
- "traefik.swarm.network=traefik-public"
- "docktail.enable=true"
- "docktail.name=grafana"
- "docktail.container_port=3000"
logging:
driver: "json-file"
options:
max-size: "10m"
max-file: "3"
alertmanager:
image: prom/alertmanager:latest
volumes:
- alertmanager_data:/alertmanager
configs:
- source: alertmanager_config
target: /etc/alertmanager/config.yml
command:
- '--config.file=/etc/alertmanager/config.yml'
- '--storage.path=/alertmanager'
networks:
- monitoring
- traefik-public
healthcheck:
test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:9093/-/healthy"]
interval: 30s
timeout: 5s
retries: 3
start_period: 15s
deploy:
placement:
constraints:
- node.role == manager
resources:
limits:
memory: 256M
cpus: '0.25'
reservations:
memory: 64M
cpus: '0.05'
restart_policy:
condition: on-failure
delay: 5s
max_attempts: 3
labels:
- "traefik.enable=true"
- "traefik.http.routers.alertmanager.rule=Host(`alertmanager.sterl.xyz`)"
- "traefik.http.routers.alertmanager.entrypoints=websecure"
- "traefik.http.routers.alertmanager.tls.certresolver=cfresolver"
- "traefik.http.services.alertmanager.loadbalancer.server.port=9093"
- "traefik.swarm.network=traefik-public"
- "docktail.enable=true"
- "docktail.name=alertmanager"
- "docktail.container_port=9093"
logging:
driver: "json-file"
options:
max-size: "10m"
max-file: "3"
node-exporter:
image: prom/node-exporter:latest
volumes:
- /proc:/host/proc:ro
- /sys:/host/sys:ro
- /:/rootfs:ro
command:
- '--path.procfs=/host/proc'
- '--path.rootfs=/rootfs'
- '--path.sysfs=/host/sys'
- '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
networks:
- monitoring
deploy:
mode: global
resources:
limits:
memory: 128M
cpus: '0.2'
reservations:
memory: 32M
cpus: '0.05'
restart_policy:
condition: on-failure
delay: 5s
max_attempts: 3
logging:
driver: "json-file"
options:
max-size: "5m"
max-file: "2"
cadvisor:
image: gcr.io/cadvisor/cadvisor:latest
volumes:
- /:/rootfs:ro
- /var/run:/var/run:ro
- /sys:/sys:ro
- /var/lib/docker/:/var/lib/docker:ro
- /dev/disk/:/dev/disk:ro
command:
- '--docker_only=true'
- '--housekeeping_interval=30s'
networks:
- monitoring
healthcheck:
test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:8080/healthz"]
interval: 30s
timeout: 5s
retries: 3
deploy:
mode: global
resources:
limits:
memory: 256M
cpus: '0.3'
reservations:
memory: 64M
cpus: '0.1'
restart_policy:
condition: on-failure
delay: 5s
max_attempts: 3
logging:
driver: "json-file"
options:
max-size: "5m"
max-file: "2"