commit 7e4a302d9560fdd255e25a327e4db47ef2d72d89 Author: Seton Carmichael Date: Fri Mar 6 18:41:07 2026 -0500 Initial commit diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..08a9f97 --- /dev/null +++ b/.env.example @@ -0,0 +1,65 @@ +# ============================================================================== +# Metrics Stack — Environment Configuration +# ============================================================================== +# Copy this file to .env and fill in your values before starting the stack. +# cp .env.example .env +# ============================================================================== + +# ------------------------------------------------------------------------------ +# Client Identity +# Used for your own reference — update to match the client/site. +# ------------------------------------------------------------------------------ +CLIENT_NAME=ClientName + +# ------------------------------------------------------------------------------ +# Host Binding +# The LAN IP of the machine running this stack. +# Services bind to this IP so they are reachable over VPN. +# Use 0.0.0.0 to bind to all interfaces (less secure). +# ------------------------------------------------------------------------------ +BIND_HOST=192.168.X.X + +# ------------------------------------------------------------------------------ +# Timezone +# Used by Grafana for display. 
Use TZ database names: +# https://en.wikipedia.org/wiki/List_of_tz_database_time_zones +# ------------------------------------------------------------------------------ +TZ=America/New_York + +# ------------------------------------------------------------------------------ +# VictoriaMetrics +# VM_RETENTION_PERIOD: how many months of metrics to keep (default: 6) +# VM_PORT: port VictoriaMetrics listens on (default: 8428) +# ------------------------------------------------------------------------------ +VM_RETENTION_PERIOD=6 +VM_PORT=8428 + +# ------------------------------------------------------------------------------ +# vmagent +# The scrape agent. Manages all endpoint collection. +# See vmagent/config/scrape.yml to configure endpoints. +# VMAGENT_PORT: port for the vmagent web UI (default: 8429) +# ------------------------------------------------------------------------------ +VMAGENT_PORT=8429 + +# ------------------------------------------------------------------------------ +# Grafana +# GF_PORT: port Grafana listens on (default: 3000) +# GF_ADMIN_USER: admin username +# GF_ADMIN_PASSWORD: admin password — CHANGE THIS +# ------------------------------------------------------------------------------ +GF_PORT=3000 +GF_ADMIN_USER=admin +GF_ADMIN_PASSWORD=CHANGE_ME_STRONG_PASSWORD + +# ------------------------------------------------------------------------------ +# Uptime Kuma +# KUMA_PORT: port Uptime Kuma listens on (default: 3001) +# UPTIME_KUMA_WS_ORIGIN_CHECK: set to "bypass" if behind a reverse proxy +# KUMA_SCRAPE_USER / KUMA_SCRAPE_PASSWORD: credentials vmagent uses to +# scrape Uptime Kuma's metrics endpoint. Set these after initial Kuma setup. 
+# ------------------------------------------------------------------------------ +KUMA_PORT=3001 +UPTIME_KUMA_WS_ORIGIN_CHECK=bypass +KUMA_SCRAPE_USER=admin +KUMA_SCRAPE_PASSWORD=CHANGE_ME_KUMA_PASSWORD diff --git a/README.md b/README.md new file mode 100644 index 0000000..cb06757 --- /dev/null +++ b/README.md @@ -0,0 +1,158 @@ +# Metrics Stack + +Self-contained monitoring stack using VictoriaMetrics, vmagent, Grafana, and Uptime Kuma. +Deploy one instance per client site. Access remotely over VPN. + +## Stack Components + +| Service | Purpose | Default Port | +|---|---|---| +| VictoriaMetrics | Time-series metric storage | 8428 | +| vmagent | Prometheus-compatible scrape agent | 8429 | +| Grafana | Dashboards and visualization | 3000 | +| Uptime Kuma | Availability monitoring + alerting | 3001 | +| node_exporter | Host metrics (this machine) | internal only | +| snmp_exporter | SNMP metrics for network devices | 9116 (optional) | + +--- + +## Initial Setup + +### 1. Configure environment + +```bash +cp .env.example .env +``` + +Edit `.env`: +- Set `BIND_HOST` to this machine's LAN IP +- Set `CLIENT_NAME` to identify the client +- Set strong passwords for `GF_ADMIN_PASSWORD` +- Set `TZ` to the correct timezone + +### 2. Configure endpoints + +Edit `vmagent/config/scrape.yml`: +- Update the `linux-host` job with this machine's hostname and site name +- Add any other endpoints (see "Adding Endpoints" below) + +### 3. Start the stack + +```bash +podman-compose up -d +``` + +### 4. Finish Uptime Kuma setup + +1. Browse to `http://BIND_HOST:3001` and complete the initial setup wizard +2. Note the username/password you set +3. In `vmagent/config/scrape.yml`, uncomment the `uptime_kuma` job and fill in those credentials +4. Run `podman-compose restart vmagent` + +--- + +## Adding Endpoints + +Open `vmagent/config/scrape.yml`. 
The file has two sections: + +- **ACTIVE JOBS** — jobs that are currently running +- **TEMPLATES** — commented-out job blocks, one per endpoint type + +To add a new endpoint: + +1. Find the matching template at the bottom of `scrape.yml` +2. Copy the entire commented block (from `# - job_name:` to the end of the block) +3. Paste it into the **ACTIVE JOBS** section +4. Uncomment it (remove the leading `# ` from each line) +5. Fill in the IP addresses, hostnames, and site label +6. Restart vmagent: + +```bash +podman-compose restart vmagent +``` + +### Available templates + +| Template | Exporter needed on target | Port | +|---|---|---| +| Windows Domain Controller | windows_exporter | 9182 | +| Hyper-V Host | windows_exporter (with hyperv collector) | 9182 | +| Windows General Purpose Server | windows_exporter | 9182 | +| Linux Server | node_exporter | 9100 | +| SNMP Device | snmp_exporter (runs in this stack) | n/a | + +### Installing windows_exporter + +Download the latest `.msi` from: +https://github.com/prometheus-community/windows_exporter/releases + +For Hyper-V hosts, ensure the `hyperv` collector is enabled. You can set this +in the MSI installer or by modifying the service arguments post-install: + +``` +--collectors.enabled defaults,hyperv,cpu_info,physical_disk,process +``` + +### Enabling SNMP monitoring + +1. Uncomment the `snmp-exporter` service in `podman-compose.yml` +2. Download a pre-built `snmp.yml` from: + https://github.com/prometheus/snmp_exporter/releases +3. Place it at `snmp_exporter/snmp.yml` +4. Uncomment and configure the `snmp-devices` job template in `scrape.yml` +5. 
Restart the stack: `podman-compose up -d` + +--- + +## Useful Commands + +```bash +# Start the stack +podman-compose up -d + +# Stop the stack +podman-compose down + +# Restart a single service (e.g., after editing scrape.yml) +podman-compose restart vmagent + +# View logs for a service +podman-compose logs -f vmagent +podman-compose logs -f victoriametrics + +# Check running containers +podman-compose ps + +# Pull latest images and restart +podman-compose pull && podman-compose up -d +``` + +## Verify vmagent is scraping + +Browse to `http://BIND_HOST:8429/targets` to see all configured scrape targets +and their current status (up/down, last scrape time, errors). + +--- + +## Directory Structure + +``` +metrics/ +├── .env # Active config (do not commit) +├── .env.example # Config template +├── podman-compose.yml # Stack definition +├── vmagent/ +│ └── config/ +│ └── scrape.yml # Endpoint config — edit this to add endpoints +├── grafana/ +│ ├── data/ # Grafana database (auto-created) +│ └── provisioning/ +│ └── datasources/ +│ └── victoriametrics.yml # Auto-wires VictoriaMetrics as datasource +├── victoriametrics/ +│ └── data/ # Metric storage (auto-created) +├── uptime_kuma/ +│ └── data/ # Uptime Kuma database (auto-created) +└── snmp_exporter/ + └── snmp.yml # SNMP module config (download separately) +``` diff --git a/grafana/provisioning/datasources/victoriametrics.yml b/grafana/provisioning/datasources/victoriametrics.yml new file mode 100644 index 0000000..b943c12 --- /dev/null +++ b/grafana/provisioning/datasources/victoriametrics.yml @@ -0,0 +1,13 @@ +apiVersion: 1 + +datasources: + - name: VictoriaMetrics + type: prometheus + access: proxy + url: http://victoriametrics:8428 + isDefault: true + editable: true + jsonData: + prometheusType: Prometheus + prometheusVersion: "2.24.0" + timeInterval: "15s" diff --git a/podman-compose.yml b/podman-compose.yml new file mode 100644 index 0000000..aa68fd5 --- /dev/null +++ b/podman-compose.yml @@ -0,0 +1,162 @@ 
+networks: + monitoring: + driver: bridge + +volumes: + vm_data: + driver: local + driver_opts: + type: none + o: bind + device: ./victoriametrics/data + grafana_data: + driver: local + driver_opts: + type: none + o: bind + device: ./grafana/data + vmagent_data: + driver: local + driver_opts: + type: none + o: bind + device: ./vmagent/data + kuma_data: + driver: local + driver_opts: + type: none + o: bind + device: ./uptime_kuma/data + +services: + + # -------------------------------------------------------------------------- + # VictoriaMetrics — time-series database + # -------------------------------------------------------------------------- + victoriametrics: + image: victoriametrics/victoria-metrics:latest + container_name: victoriametrics + restart: unless-stopped + ports: + - "${BIND_HOST}:${VM_PORT}:8428" + volumes: + - vm_data:/storage + command: + - "--storageDataPath=/storage" + - "--retentionPeriod=${VM_RETENTION_PERIOD}" + - "--dedup.minScrapeInterval=60s" + healthcheck: + test: ["CMD", "wget", "-qO-", "http://localhost:8428/health"] + interval: 30s + timeout: 10s + retries: 3 + networks: + - monitoring + + # -------------------------------------------------------------------------- + # vmagent — Prometheus-compatible scrape agent + # See vmagent/config/scrape.yml to add endpoints + # -------------------------------------------------------------------------- + vmagent: + image: victoriametrics/vmagent:latest + container_name: vmagent + restart: unless-stopped + ports: + - "${BIND_HOST}:${VMAGENT_PORT}:8429" + volumes: + - ./vmagent/config/scrape.yml:/etc/vmagent/scrape.yml:ro + - vmagent_data:/vmagent_data + command: + - "--promscrape.config=/etc/vmagent/scrape.yml" + - "--remoteWrite.url=http://victoriametrics:8428/api/v1/write" + - "--promscrape.config.strictParse=false" + - "--remoteWrite.tmpDataPath=/vmagent_data" + depends_on: + victoriametrics: + condition: service_healthy + networks: + - monitoring + + # 
-------------------------------------------------------------------------- + # Grafana — dashboards and visualization + # -------------------------------------------------------------------------- + grafana: + image: grafana/grafana:latest + container_name: grafana + restart: unless-stopped + ports: + - "${BIND_HOST}:${GF_PORT}:3000" + volumes: + - grafana_data:/var/lib/grafana + - ./grafana/provisioning:/etc/grafana/provisioning:ro + environment: + - GF_SECURITY_ADMIN_USER=${GF_ADMIN_USER} + - GF_SECURITY_ADMIN_PASSWORD=${GF_ADMIN_PASSWORD} + - GF_ANALYTICS_REPORTING_ENABLED=false + - GF_ANALYTICS_CHECK_FOR_UPDATES=false + - GF_USERS_ALLOW_SIGN_UP=false + - TZ=${TZ} + networks: + - monitoring + + # -------------------------------------------------------------------------- + # Uptime Kuma — availability monitoring with alerting + # -------------------------------------------------------------------------- + uptime-kuma: + image: louislam/uptime-kuma:2 + container_name: uptime-kuma + restart: unless-stopped + ports: + - "${BIND_HOST}:${KUMA_PORT}:3001" + volumes: + - kuma_data:/app/data + environment: + - UPTIME_KUMA_WS_ORIGIN_CHECK=${UPTIME_KUMA_WS_ORIGIN_CHECK} + security_opt: + - no-new-privileges:true + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:3001"] + interval: 30s + timeout: 10s + retries: 3 + networks: + - monitoring + + # -------------------------------------------------------------------------- + # node_exporter — Linux host metrics (the machine running this stack) + # Provides CPU, memory, disk, network, and filesystem metrics for this host. 
+ # -------------------------------------------------------------------------- + node-exporter: + image: prom/node-exporter:latest + container_name: node-exporter + restart: unless-stopped + volumes: + - /proc:/host/proc:ro + - /sys:/host/sys:ro + - /:/rootfs:ro + command: + - "--path.procfs=/host/proc" + - "--path.rootfs=/rootfs" + - "--path.sysfs=/host/sys" + - "--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)" + networks: + - monitoring + + # -------------------------------------------------------------------------- + # snmp_exporter — SNMP metrics for network devices (switches, routers, APs) + # OPTIONAL: Uncomment this service if you need SNMP monitoring. + # You must also provide a valid snmp_exporter/snmp.yml config. + # Download a pre-built snmp.yml: https://github.com/prometheus/snmp_exporter/releases + # -------------------------------------------------------------------------- + # snmp-exporter: + # image: prom/snmp-exporter:latest + # container_name: snmp-exporter + # restart: unless-stopped + # ports: + # - "${BIND_HOST}:9116:9116" + # volumes: + # - ./snmp_exporter/snmp.yml:/etc/snmp_exporter/snmp.yml:ro + # command: + # - "--config.file=/etc/snmp_exporter/snmp.yml" + # networks: + # - monitoring diff --git a/vmagent/config/scrape.yml b/vmagent/config/scrape.yml new file mode 100644 index 0000000..b0c4fab --- /dev/null +++ b/vmagent/config/scrape.yml @@ -0,0 +1,229 @@ +# ============================================================================== +# vmagent Scrape Configuration +# ============================================================================== +# +# HOW TO ADD A NEW ENDPOINT: +# 1. Scroll to the TEMPLATES section at the bottom of this file +# 2. Find the template matching your endpoint type +# 3. Copy the entire block (everything between the dashes) +# 4. Paste it into the ACTIVE JOBS section below +# 5. Fill in the IP addresses, hostnames, and site label +# 6. 
Restart vmagent: podman-compose restart vmagent +# +# LABEL CONVENTIONS: +# site: Short name for the physical/logical site (e.g., "HQ", "Branch1") +# host_name: Friendly hostname of the monitored machine +# dc_name: Domain controller name +# +# ============================================================================== + +global: + scrape_interval: 15s + scrape_timeout: 10s + +scrape_configs: + +# ============================================================================== +# ACTIVE JOBS — your configured endpoints live here +# ============================================================================== + + # ---------------------------------------------------------------------------- + # vmagent self-monitoring — always keep this, do not remove + # ---------------------------------------------------------------------------- + - job_name: vmagent + scrape_interval: 30s + static_configs: + - targets: ["vmagent:8429"] + + # ---------------------------------------------------------------------------- + # Linux host — the machine running this container stack (node_exporter) + # node_exporter runs as part of the compose stack, no additional setup needed. + # ---------------------------------------------------------------------------- + - job_name: linux-host + scrape_interval: 30s + static_configs: + - targets: ["node-exporter:9100"] + labels: + host_name: "HOSTNAME" # REPLACE: short hostname of this machine + site: "SITE" # REPLACE: site name (e.g., "HQ") + + # ---------------------------------------------------------------------------- + # Uptime Kuma — availability monitoring metrics + # Set credentials in .env (KUMA_SCRAPE_USER / KUMA_SCRAPE_PASSWORD) + # then uncomment this job after completing initial Uptime Kuma setup. 
+ # ---------------------------------------------------------------------------- + # - job_name: uptime_kuma + # scrape_interval: 30s + # static_configs: + # - targets: ["uptime-kuma:3001"] + # basic_auth: + # username: "KUMA_SCRAPE_USER" # REPLACE with your Kuma username + # password: "KUMA_SCRAPE_PASSWORD" # REPLACE with your Kuma password + # relabel_configs: + # - target_label: job + # replacement: uptime_kuma + +# ============================================================================== +# TEMPLATES — copy a block into ACTIVE JOBS above and fill in your values +# ============================================================================== +# +# Each template includes: +# - What exporter is required on the target machine +# - Default port +# - Labels to fill in +# - Any special configuration notes +# +# ============================================================================== + +# ------------------------------------------------------------------------------ +# TEMPLATE: Windows Domain Controller +# ------------------------------------------------------------------------------ +# Exporter: windows_exporter (formerly wmi_exporter) +# Install: https://github.com/prometheus-community/windows_exporter/releases +# Port: 9182 (default) +# Notes: Default collectors are sufficient for DC monitoring. +# For additional collectors, see the windows_exporter README. 
+# ------------------------------------------------------------------------------ +# +# - job_name: domain-controllers +# scrape_interval: 30s +# scrape_timeout: 10s +# static_configs: +# - targets: ["192.168.X.X:9182"] +# labels: +# dc_name: "DC-NAME" # REPLACE: domain controller hostname (e.g., "DC01") +# site: "SITE" # REPLACE: site name (e.g., "HQ") +# # Add additional DCs below — copy the block above for each one +# # - targets: ["192.168.X.Y:9182"] +# # labels: +# # dc_name: "DC-NAME2" +# # site: "SITE" + +# ------------------------------------------------------------------------------ +# TEMPLATE: Hyper-V Host +# ------------------------------------------------------------------------------ +# Exporter: windows_exporter +# Install: https://github.com/prometheus-community/windows_exporter/releases +# Port: 9182 (default) +# Notes: Requires the hyperv collector enabled on the windows_exporter. +# Install with: windows_exporter.exe --collectors.enabled defaults,hyperv +# Or set via the windows_exporter service config. +# scrape_timeout is set high (25s) because hyperv metrics can be slow. 
+# ------------------------------------------------------------------------------ +# +# - job_name: hyperv-hosts +# scrape_interval: 30s +# scrape_timeout: 25s +# static_configs: +# - targets: ["192.168.X.X:9182"] +# labels: +# host_name: "HOST-NAME" # REPLACE: Hyper-V host hostname (e.g., "HV01") +# site: "SITE" # REPLACE: site name +# # Add additional Hyper-V hosts below +# # - targets: ["192.168.X.Y:9182"] +# # labels: +# # host_name: "HOST-NAME2" +# # site: "SITE" +# params: +# collect[]: +# - defaults +# - hyperv +# - cpu_info +# - physical_disk +# - process + +# ------------------------------------------------------------------------------ +# TEMPLATE: Windows General Purpose Server +# ------------------------------------------------------------------------------ +# Exporter: windows_exporter +# Install: https://github.com/prometheus-community/windows_exporter/releases +# Port: 9182 (default) +# Notes: Uses default collectors. Suitable for file servers, app servers, +# print servers, or any Windows server not classified as DC or Hyper-V. +# Add specific collectors to the params block if needed. 
+# ------------------------------------------------------------------------------ +# +# - job_name: windows-servers +# scrape_interval: 30s +# scrape_timeout: 15s +# static_configs: +# - targets: ["192.168.X.X:9182"] +# labels: +# host_name: "SERVER-NAME" # REPLACE: hostname (e.g., "FS01") +# site: "SITE" # REPLACE: site name +# role: "file-server" # OPTIONAL: add a role label to distinguish server types +# # Add additional servers below +# # - targets: ["192.168.X.Y:9182"] +# # labels: +# # host_name: "SERVER-NAME2" +# # site: "SITE" +# # role: "app-server" + +# ------------------------------------------------------------------------------ +# TEMPLATE: Linux Server +# ------------------------------------------------------------------------------ +# Exporter: node_exporter +# Install: https://github.com/prometheus/node_exporter/releases +# Or via package manager: apt install prometheus-node-exporter +# Or run as a container: docker run -d --net="host" --pid="host" +# -v "/:/host:ro,rslave" +# prom/node-exporter --path.rootfs=/host +# Port: 9100 (default) +# Notes: The node_exporter already running in this compose stack covers THIS +# host. Use this template for OTHER Linux machines on the network. +# ------------------------------------------------------------------------------ +# +# - job_name: linux-servers +# scrape_interval: 30s +# scrape_timeout: 10s +# static_configs: +# - targets: ["192.168.X.X:9100"] +# labels: +# host_name: "LINUX-HOST-NAME" # REPLACE: hostname +# site: "SITE" # REPLACE: site name +# # Add additional Linux servers below +# # - targets: ["192.168.X.Y:9100"] +# # labels: +# # host_name: "LINUX-HOST-NAME2" +# # site: "SITE" + +# ------------------------------------------------------------------------------ +# TEMPLATE: SNMP Device (switches, routers, APs, UPS, etc.) 
+# ------------------------------------------------------------------------------ +# Exporter: snmp_exporter (must be enabled in podman-compose.yml) +# Config: snmp_exporter/snmp.yml — download a pre-built config from: +# https://github.com/prometheus/snmp_exporter/releases +# The "snmp.yml" in that release covers most common network gear. +# Port: 9116 (snmp_exporter listens here; SNMP itself uses UDP 161 on targets) +# Modules: "if_mib" = interface stats (works on almost any device) +# Other modules depend on vendor — check the snmp.yml for available ones. +# Steps: +# 1. Uncomment snmp-exporter in podman-compose.yml +# 2. Place your snmp.yml in snmp_exporter/snmp.yml +# 3. Uncomment and fill in this job block +# 4. Restart the stack: podman-compose up -d +# Notes: Each target is passed as a URL parameter to snmp_exporter. +# The exporter itself must be reachable from vmagent (it's on the +# monitoring network), and it must reach the SNMP device via the host. +# ------------------------------------------------------------------------------ +# +# - job_name: snmp-devices +# scrape_interval: 60s +# scrape_timeout: 30s +# static_configs: +# - targets: +# - "192.168.X.X" # REPLACE: IP of SNMP device (switch, router, AP, etc.) +# # Add more SNMP device IPs here +# # - "192.168.X.Y" +# labels: +# site: "SITE" # REPLACE: site name +# params: +# module: [if_mib] # REPLACE: SNMP module to use (see snmp_exporter/snmp.yml) +# # Common: if_mib, cisco_wlc, apc_ups, pdu, printer_mib +# auth: [public_v2] # REQUIRED for snmp_exporter v0.23+: an auth name from the "auths" section of snmp.yml +# relabel_configs: +# - source_labels: [__address__] +# target_label: __param_target +# - source_labels: [__param_target] +# target_label: instance +# - target_label: __address__ +# replacement: snmp-exporter:9116 # points vmagent at the snmp_exporter container