#!/bin/bash
#
# validate_deployment.sh - Validation script to verify all homelab components.
# Run this after deployment to ensure everything is working correctly.
#
# Optional environment overrides (defaults match the original hard-coded values):
#   NET_IFACE    network interface whose link speed is checked (default: eth0)
#   GRAFANA_URL  URL probed for the Grafana UI (default: http://192.168.1.196:3000)
#
# Exit status: 0 when no check failed (warnings are allowed), 1 otherwise.

set -euo pipefail

# Colors
readonly RED='\033[0;31m'
readonly GREEN='\033[0;32m'
readonly YELLOW='\033[1;33m'
readonly NC='\033[0m'

readonly NET_IFACE="${NET_IFACE:-eth0}"
readonly GRAFANA_URL="${GRAFANA_URL:-http://192.168.1.196:3000}"
readonly NODE_EXPORTER_URL="http://localhost:9100/metrics"
# Upper bound (seconds) for each HTTP probe so an unreachable host cannot hang the run.
readonly CURL_TIMEOUT=5

PASSED=0
FAILED=0
WARNINGS=0

# NOTE: the counters are bumped with VAR=$((VAR + 1)) rather than ((VAR++)).
# ((VAR++)) returns status 1 when the pre-increment value is 0, which under
# `set -e` would terminate the script on the very first recorded result.

# Print a green success line for message $1 and count it as passed.
check_pass() {
  echo -e "${GREEN}✓ $1${NC}"
  PASSED=$((PASSED + 1))
}

# Print a red failure line for message $1 and count it as failed.
check_fail() {
  echo -e "${RED}✗ $1${NC}"
  FAILED=$((FAILED + 1))
}

# Print a yellow warning line for message $1 and count it as a warning.
check_warn() {
  echo -e "${YELLOW}⚠ $1${NC}"
  WARNINGS=$((WARNINGS + 1))
}

# Succeed when URL $1 answers without an HTTP error within CURL_TIMEOUT seconds.
# --fail makes HTTP 4xx/5xx responses count as "not accessible".
http_ok() {
  curl -sf --max-time "$CURL_TIMEOUT" "$1" >/dev/null 2>&1
}

echo "========================================="
echo "Home Lab Deployment Validation"
echo "Started at $(date)"
echo "========================================="

# ---------------------------------------------------------------------------
# Network Validation
# ---------------------------------------------------------------------------
echo -e "\n${YELLOW}[1/6] Network Configuration${NC}"

# Output is captured first and then searched: piping straight into `grep -q`
# under pipefail can report failure when grep exits early and the producer
# receives SIGPIPE. `|| true` keeps a missing/failing tool from aborting the run.
link_info=$(ip -d link show 2>/dev/null || true)
if grep -q "vlan" <<<"$link_info"; then
  check_pass "VLANs configured"
else
  check_warn "VLANs not detected (may not be configured yet)"
fi

if command -v ethtool >/dev/null 2>&1; then
  # `|| true`: ethtool fails when the interface does not exist; that must
  # produce a warning below, not an errexit abort.
  SPEED=$(ethtool "$NET_IFACE" 2>/dev/null | awk '/Speed:/ {print $2}' || true)
  # Leading digits of e.g. "2500Mb/s"; empty for values such as "Unknown!".
  speed_mbps=${SPEED%%[!0-9]*}
  if [[ -n "$speed_mbps" ]] && ((10#$speed_mbps >= 2500)); then
    check_pass "High-speed network detected: $SPEED"
  elif [[ -z "$SPEED" ]]; then
    check_warn "Network speed could not be determined for $NET_IFACE"
  else
    check_warn "Network speed: $SPEED (expected 2.5Gb or higher)"
  fi
else
  check_warn "ethtool not installed, cannot verify network speed"
fi

# ---------------------------------------------------------------------------
# Storage Validation
# ---------------------------------------------------------------------------
echo -e "\n${YELLOW}[2/6] Storage Configuration${NC}"

if command -v zpool >/dev/null 2>&1; then
  if zpool list tank >/dev/null 2>&1; then
    HEALTH=$(zpool list -H -o health tank 2>/dev/null || true)
    if [[ "$HEALTH" == "ONLINE" ]]; then
      check_pass "ZFS pool 'tank' is ONLINE"
    else
      check_fail "ZFS pool 'tank' health: $HEALTH"
    fi
  else
    check_warn "ZFS pool 'tank' not found (may not be on this node)"
  fi
else
  check_warn "ZFS not installed on this node"
fi

mount_table=$(mount 2>/dev/null || true)
if grep -qF "/mnt/nas" <<<"$mount_table"; then
  check_pass "NAS is mounted"
else
  check_warn "NAS not mounted at /mnt/nas"
fi

# `crontab -l` exits non-zero when the user has no crontab; treat as "no entries".
cron_entries=$(crontab -l 2>/dev/null || true)
if grep -qF "prune_ai_models.sh" <<<"$cron_entries"; then
  check_pass "AI model pruning cron job configured"
else
  check_warn "AI model pruning cron job not found"
fi

# ---------------------------------------------------------------------------
# Service Validation
# ---------------------------------------------------------------------------
echo -e "\n${YELLOW}[3/6] Docker Services${NC}"

if command -v docker >/dev/null 2>&1; then
  # Query the service list once and reuse it for every Swarm check.
  if swarm_services=$(docker service ls 2>/dev/null); then
    TRAEFIK_COUNT=$(grep -c traefik <<<"$swarm_services" || true)
    if [[ $TRAEFIK_COUNT -ge 1 ]]; then
      # Column 4 of `docker service ls` is REPLICAS.
      REPLICAS=$(awk '/traefik/ {print $4}' <<<"$swarm_services")
      check_pass "Traefik service running ($REPLICAS)"
    else
      check_warn "Traefik service not found in Swarm"
    fi

    if grep -q node-exporter <<<"$swarm_services"; then
      check_pass "node-exporter service running"
    else
      check_warn "node-exporter service not found"
    fi
  else
    check_warn "Not a Swarm manager node"
  fi

  # A failing `docker ps` (daemon down, no permission) is reported as a
  # warning instead of aborting the whole script through errexit.
  if unhealthy_names=$(docker ps --filter "health=unhealthy" \
    --format "{{.Names}}" 2>/dev/null); then
    if [[ -z "$unhealthy_names" ]]; then
      check_pass "No unhealthy containers"
    else
      mapfile -t unhealthy_list <<<"$unhealthy_names"
      UNHEALTHY=${#unhealthy_list[@]}
      check_fail "$UNHEALTHY unhealthy containers detected"
      printf ' - %s\n' "${unhealthy_list[@]}"
    fi
  else
    check_warn "Cannot query Docker containers (daemon unreachable or permission denied)"
  fi
else
  check_fail "Docker not installed"
fi

# ---------------------------------------------------------------------------
# Security Validation
# ---------------------------------------------------------------------------
echo -e "\n${YELLOW}[4/6] Security Configuration${NC}"

if systemctl is-active --quiet fail2ban 2>/dev/null; then
  check_pass "fail2ban service is active"
  # `|| true`: a missing sshd jail or refused sudo leaves BANNED empty rather
  # than killing the script.
  BANNED=$(sudo fail2ban-client status sshd 2>/dev/null \
    | awk '/Currently banned/ {print $4}' || true)
  if [[ -n "$BANNED" ]]; then
    check_pass "fail2ban protecting SSH ($BANNED IPs banned)"
  fi
else
  check_warn "fail2ban not installed or not running"
fi

# -n skips reverse DNS lookups; the ACCEPT/DROP line count is unaffected.
if firewall_rules=$(sudo iptables -L -n 2>/dev/null); then
  RULES=$(grep -c "ACCEPT\|DROP" <<<"$firewall_rules" || true)
  if [[ $RULES -gt 0 ]]; then
    check_pass "Firewall rules configured ($RULES rules)"
  else
    check_warn "No firewall rules detected"
  fi
else
  check_warn "Cannot check iptables (permission denied)"
fi

# ---------------------------------------------------------------------------
# Monitoring Validation
# ---------------------------------------------------------------------------
echo -e "\n${YELLOW}[5/6] Monitoring & Metrics${NC}"

if http_ok "$NODE_EXPORTER_URL"; then
  check_pass "node-exporter metrics accessible"
else
  check_warn "node-exporter not accessible on this node"
fi

if http_ok "$GRAFANA_URL"; then
  check_pass "Grafana UI accessible"
else
  check_warn "Grafana not accessible (may not be on this node)"
fi

# ---------------------------------------------------------------------------
# Backup Validation
# ---------------------------------------------------------------------------
echo -e "\n${YELLOW}[6/6] Backup Configuration${NC}"

timer_list=$(systemctl list-timers --all --no-pager 2>/dev/null || true)
if grep -qF "restic-backup.timer" <<<"$timer_list"; then
  if systemctl is-active --quiet restic-backup.timer; then
    check_pass "Restic backup timer is active"
    # First two fields of the timer row (start of the NEXT column).
    NEXT_RUN=$(awk '/restic-backup/ {print $1, $2}' <<<"$timer_list")
    echo " Next backup: $NEXT_RUN"
  else
    check_fail "Restic backup timer is not active"
  fi
else
  check_warn "Restic backup timer not found"
fi

if command -v restic >/dev/null 2>&1; then
  check_pass "Restic is installed"
else
  check_warn "Restic not installed"
fi

# ---------------------------------------------------------------------------
# Summary
# ---------------------------------------------------------------------------
echo -e "\n========================================="
echo "Validation Summary"
echo "========================================="
echo -e "${GREEN}Passed: $PASSED${NC}"
echo -e "${YELLOW}Warnings: $WARNINGS${NC}"
echo -e "${RED}Failed: $FAILED${NC}"

if [[ $FAILED -eq 0 ]]; then
  echo -e "\n${GREEN}✓ Deployment validation successful!${NC}"
  exit 0
else
  echo -e "\n${RED}✗ Some checks failed. Review above for details.${NC}"
  exit 1
fi