diff --git a/.env.example b/.env.example index 4372fc9..e89da91 100644 --- a/.env.example +++ b/.env.example @@ -7,3 +7,6 @@ IMAGE_TAG=latest # Optional: check retention (limits DB growth) # CHECK_RETENTION_COUNT=5000 # keep last N checks per service (default 5000) # CHECK_RETENTION_DAYS=30 # also delete checks older than N days (0=disabled) + +# Rollup: aggregate checks older than N hours into hourly buckets for 90+ day reporting +# ROLLUP_AGE_HOURS=24 # default 24; raw checks kept for this long, then rolled up diff --git a/README.md b/README.md index 63a2c9a..b211548 100644 --- a/README.md +++ b/README.md @@ -42,16 +42,22 @@ docker run -p 8080:8080 -v $(pwd)/data:/app/data myapp:test Add services from the dashboard (e.g. `https://example.com`, `google.com:443` for TCP) and view reports. -### Check Retention +### Check Retention and Rollups -To limit database growth, the app prunes old checks every 15 minutes: +To limit database growth, the app **rolls up** old checks into hourly aggregates, then prunes raw data: + +1. **Rollup** (every 15 min): Checks older than `ROLLUP_AGE_HOURS` are aggregated into hourly buckets (total, success count, latency stats) and stored in `uptime_rollups`. Raw checks in those hours are deleted. +2. **Prune**: Keeps last `CHECK_RETENTION_COUNT` raw checks per service; optionally deletes by age. + +This lets you report accurate uptime over **90+ days** without storing millions of raw checks. Reports combine rollups (historical) + raw checks (recent). 
| Env var | Default | Description | |---------|---------|-------------| -| `CHECK_RETENTION_COUNT` | 5000 | Keep last N checks per service | +| `ROLLUP_AGE_HOURS` | 24 | Aggregate checks older than N hours into hourly buckets | +| `CHECK_RETENTION_COUNT` | 5000 | Keep last N raw checks per service | | `CHECK_RETENTION_DAYS` | 0 (disabled) | Also delete checks older than N days | -Example: keep 2000 checks per service and drop anything older than 30 days: +Example: keep 2000 raw checks per service and drop anything older than 30 days: ```bash docker run -e CHECK_RETENTION_COUNT=2000 -e CHECK_RETENTION_DAYS=30 ... diff --git a/app/main.py b/app/main.py index dbcd3bc..9746eb8 100644 --- a/app/main.py +++ b/app/main.py @@ -19,6 +19,9 @@ def _parse_report_dates(from_ts, to_ts, preset): elif preset == "30d": to_ts = now.isoformat() from_ts = (now - timedelta(days=30)).isoformat() + elif preset == "90d": + to_ts = now.isoformat() + from_ts = (now - timedelta(days=90)).isoformat() if from_ts and len(from_ts) == 10: from_ts = from_ts + "T00:00:00" if to_ts and len(to_ts) == 10: diff --git a/app/models.py b/app/models.py index f7931be..5a61b70 100644 --- a/app/models.py +++ b/app/models.py @@ -12,6 +12,9 @@ DB_PATH = Path(DATA_PATH) / "monitor.db" CHECK_RETENTION_COUNT = int(os.environ.get("CHECK_RETENTION_COUNT", "5000")) CHECK_RETENTION_DAYS = int(os.environ.get("CHECK_RETENTION_DAYS", "0")) or None +# Rollup: aggregate checks older than N hours into hourly buckets for long-term reporting +ROLLUP_AGE_HOURS = int(os.environ.get("ROLLUP_AGE_HOURS", "24")) + def _ensure_data_dir(): Path(DATA_PATH).mkdir(parents=True, exist_ok=True) @@ -26,6 +29,32 @@ def _migrate_add_status(conn): conn.execute("UPDATE checks SET status = CASE WHEN success = 1 THEN 'OK' ELSE 'ERROR' END") +def _migrate_add_rollups(conn): + """Create uptime_rollups table for aggregated hourly stats (long-term reporting).""" + conn.execute(""" + CREATE TABLE IF NOT EXISTS uptime_rollups ( + id INTEGER PRIMARY 
KEY AUTOINCREMENT, + service_id INTEGER NOT NULL, + period_start TEXT NOT NULL, + period_end TEXT NOT NULL, + total_checks INTEGER NOT NULL, + success_count INTEGER NOT NULL, + sum_response_ms REAL NOT NULL, + response_count INTEGER NOT NULL, + min_response_ms REAL, + max_response_ms REAL, + FOREIGN KEY (service_id) REFERENCES services(id), + UNIQUE(service_id, period_start) + ) + """) + conn.execute("CREATE INDEX IF NOT EXISTS idx_rollups_service ON uptime_rollups(service_id)") + conn.execute("CREATE INDEX IF NOT EXISTS idx_rollups_period ON uptime_rollups(period_start)") + try: + conn.execute("SELECT response_count FROM uptime_rollups LIMIT 1") + except sqlite3.OperationalError: + conn.execute("ALTER TABLE uptime_rollups ADD COLUMN response_count INTEGER NOT NULL DEFAULT 0") + + @contextmanager def get_db(): _ensure_data_dir() @@ -67,6 +96,7 @@ def init_db(): conn.execute("CREATE INDEX IF NOT EXISTS idx_checks_timestamp ON checks(timestamp)") _migrate_add_status(conn) conn.execute("CREATE INDEX IF NOT EXISTS idx_checks_status ON checks(status)") + _migrate_add_rollups(conn) def list_services(): @@ -179,30 +209,67 @@ def get_checks(service_id: int, limit: int = 50, offset: int = 0, from_ts: str = def get_report_stats(service_id: int, from_ts: str = None, to_ts: str = None): - """Compute uptime % and latency stats for a service, optionally over a time range.""" + """ + Compute uptime % and latency stats for a service over a time range. + Uses hourly rollups for old data + raw checks for recent data (last ROLLUP_AGE_HOURS). + Supports accurate reporting over 90+ days. + """ + now = datetime.now(timezone.utc) + raw_cutoff = (now - timedelta(hours=ROLLUP_AGE_HOURS)).isoformat() + to_ts = to_ts or now.isoformat() + from_ts = from_ts or "1970-01-01T00:00:00" + + total = 0 + success_count = 0 + sum_response_ms = 0.0 + count_with_response = 0 + min_ms = None + max_ms = None + with get_db() as conn: - q = "SELECT success, response_time_ms FROM checks WHERE service_id = ?" 
-        args = [service_id]
-        if from_ts:
-            q += " AND timestamp >= ?"
-            args.append(from_ts)
-        if to_ts:
-            q += " AND timestamp <= ?"
-            args.append(to_ts)
-        q += " ORDER BY timestamp DESC LIMIT 10000"
-        rows = conn.execute(q, args).fetchall()
-        if not rows:
+        # 1. Rollups: hourly buckets that start before the raw cutoff (capped at to_ts
+        #    so fully-historical ranges still read rollups instead of returning no data)
+        rollup_end = raw_cutoff if raw_cutoff < to_ts else to_ts
+        if from_ts < rollup_end:
+            q = """
+                SELECT total_checks, success_count, sum_response_ms, response_count, min_response_ms, max_response_ms
+                FROM uptime_rollups
+                WHERE service_id = ? AND period_start >= ? AND period_start < ?
+            """
+            rollup_rows = conn.execute(q, (service_id, from_ts, rollup_end)).fetchall()
+            for r in rollup_rows:
+                total += r["total_checks"]
+                success_count += r["success_count"]
+                sum_response_ms += r["sum_response_ms"] or 0
+                count_with_response += r["response_count"] or 0
+                if r["min_response_ms"] is not None:
+                    min_ms = r["min_response_ms"] if min_ms is None else min(min_ms, r["min_response_ms"])
+                if r["max_response_ms"] is not None:
+                    max_ms = r["max_response_ms"] if max_ms is None else max(max_ms, r["max_response_ms"])
+
+        # 2. Raw checks: rollup deletes the rows it aggregates, so scanning the full
+        #    requested range never double counts and also covers old checks the
+        #    15-minute rollup job has not aggregated yet
+        raw_from = from_ts
+        if raw_from <= to_ts:
+            q = "SELECT success, response_time_ms FROM checks WHERE service_id = ? AND timestamp >= ? AND timestamp <= ?"
+ raw_rows = conn.execute(q, (service_id, raw_from, to_ts)).fetchall() + for r in raw_rows: + total += 1 + success_count += 1 if r["success"] else 0 + if r["response_time_ms"] is not None: + sum_response_ms += r["response_time_ms"] + count_with_response += 1 + min_ms = r["response_time_ms"] if min_ms is None else min(min_ms, r["response_time_ms"]) + max_ms = r["response_time_ms"] if max_ms is None else max(max_ms, r["response_time_ms"]) + + if total == 0: return {"total": 0, "uptime_pct": 0, "avg_ms": None, "min_ms": None, "max_ms": None} - total = len(rows) - success_count = sum(1 for r in rows if r["success"]) - uptime_pct = (success_count / total) * 100 if total else 0 - response_times = [r["response_time_ms"] for r in rows if r["response_time_ms"] is not None] + uptime_pct = (success_count / total) * 100 + avg_ms = round(sum_response_ms / count_with_response, 2) if count_with_response else None return { "total": total, "uptime_pct": round(uptime_pct, 2), - "avg_ms": round(sum(response_times) / len(response_times), 2) if response_times else None, - "min_ms": min(response_times) if response_times else None, - "max_ms": max(response_times) if response_times else None, + "avg_ms": avg_ms, + "min_ms": round(min_ms, 2) if min_ms is not None else None, + "max_ms": round(max_ms, 2) if max_ms is not None else None, } @@ -227,6 +294,7 @@ def delete_service(service_id: int) -> bool: """Delete a service and its check history. Returns True if deleted.""" with get_db() as conn: conn.execute("DELETE FROM checks WHERE service_id = ?", (service_id,)) + conn.execute("DELETE FROM uptime_rollups WHERE service_id = ?", (service_id,)) cur = conn.execute("DELETE FROM services WHERE id = ?", (service_id,)) return cur.rowcount > 0 @@ -238,6 +306,76 @@ def get_all_services_for_scheduler(): return [dict(r) for r in rows] +def _hour_start(ts: str) -> str: + """Return ISO timestamp truncated to hour boundary (e.g. 
2026-03-10T14:00:00).""" + dt = datetime.fromisoformat(ts.replace("Z", "+00:00")) + return dt.replace(minute=0, second=0, microsecond=0).isoformat() + + +def rollup_old_checks() -> int: + """ + Aggregate checks older than ROLLUP_AGE_HOURS into hourly buckets. + Returns number of raw checks that were rolled up and deleted. + """ + cutoff = datetime.now(timezone.utc) - timedelta(hours=ROLLUP_AGE_HOURS) + cutoff_ts = cutoff.isoformat() + with get_db() as conn: + # Get checks older than cutoff, grouped by service and hour + rows = conn.execute( + """ + SELECT service_id, + strftime('%Y-%m-%dT%H:00:00', timestamp) as period_start, + COUNT(*) as total_checks, + SUM(success) as success_count, + SUM(CASE WHEN response_time_ms IS NOT NULL THEN response_time_ms ELSE 0 END) as sum_response_ms, + SUM(CASE WHEN response_time_ms IS NOT NULL THEN 1 ELSE 0 END) as response_count, + MIN(CASE WHEN response_time_ms IS NOT NULL THEN response_time_ms END) as min_response_ms, + MAX(response_time_ms) as max_response_ms, + GROUP_CONCAT(id) as check_ids + FROM checks + WHERE timestamp < ? + GROUP BY service_id, period_start + """, + (cutoff_ts,), + ).fetchall() + if not rows: + return 0 + deleted = 0 + for r in rows: + period_end = datetime.fromisoformat(r["period_start"].replace("Z", "+00:00")) + timedelta(hours=1) + period_end_ts = period_end.isoformat() + conn.execute( + """ + INSERT INTO uptime_rollups (service_id, period_start, period_end, total_checks, success_count, sum_response_ms, response_count, min_response_ms, max_response_ms) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?) 
+                    ON CONFLICT(service_id, period_start) DO UPDATE SET
+                        total_checks = total_checks + excluded.total_checks,
+                        success_count = success_count + excluded.success_count,
+                        sum_response_ms = sum_response_ms + excluded.sum_response_ms,
+                        response_count = response_count + excluded.response_count,
+                        min_response_ms = COALESCE(MIN(min_response_ms, excluded.min_response_ms), min_response_ms, excluded.min_response_ms),
+                        max_response_ms = COALESCE(MAX(max_response_ms, excluded.max_response_ms), max_response_ms, excluded.max_response_ms)
+                    """,
+                    (
+                        r["service_id"],
+                        r["period_start"],
+                        period_end_ts,
+                        r["total_checks"],
+                        r["success_count"],
+                        r["sum_response_ms"] or 0,
+                        r["response_count"] or 0,
+                        r["min_response_ms"],
+                        r["max_response_ms"],
+                    ),
+                )
+                ids = [int(x) for x in (r["check_ids"] or "").split(",") if x]
+                if ids:
+                    placeholders = ",".join("?" * len(ids))
+                    cur = conn.execute(f"DELETE FROM checks WHERE id IN ({placeholders})", ids)
+                    deleted += cur.rowcount
+        return deleted
+
+
 def prune_checks_retention() -> int:
     """
     Remove old checks to limit storage. Keeps last CHECK_RETENTION_COUNT per service.
diff --git a/app/scheduler.py b/app/scheduler.py index f8a5f7e..303d69a 100644 --- a/app/scheduler.py +++ b/app/scheduler.py @@ -2,7 +2,7 @@ from apscheduler.schedulers.background import BackgroundScheduler from app.checker import run_check -from app.models import get_all_services_for_scheduler, prune_checks_retention +from app.models import get_all_services_for_scheduler, prune_checks_retention, rollup_old_checks def _run_all_checks(): @@ -54,7 +54,11 @@ def start_scheduler(): # Sync job list every 60 seconds (only adds/removes when services change) scheduler.add_job(sync_jobs, "interval", seconds=60, id="sync_jobs") - # Prune old checks every 15 minutes (retention/compression) - scheduler.add_job(prune_checks_retention, "interval", minutes=15, id="prune_checks") + # Roll up old checks into hourly buckets, then prune (every 15 min) + def rollup_and_prune(): + rollup_old_checks() + prune_checks_retention() + + scheduler.add_job(rollup_and_prune, "interval", minutes=15, id="prune_checks") scheduler.start() diff --git a/templates/report.html b/templates/report.html index a2526fb..3bafe54 100644 --- a/templates/report.html +++ b/templates/report.html @@ -16,6 +16,7 @@ Last 24h Last 7 days Last 30 days + Last 90 days {% if period_label %}
Showing: {{ period_label }}
@@ -141,7 +142,7 @@ {% else %} 1 {% if page > 3 %}…{% endif %} - {% for p in range(max(2, page - 1), min(total_pages, page + 1) + 1) %} + {% for p in range([2, page - 1] | max, [total_pages, page + 1] | min + 1) %} {% if p == page %} {{ p }} {% else %}