fix compression
This commit is contained in:
@@ -7,3 +7,6 @@ IMAGE_TAG=latest
|
|||||||
# Optional: check retention (limits DB growth)
|
# Optional: check retention (limits DB growth)
|
||||||
# CHECK_RETENTION_COUNT=5000 # keep last N checks per service (default 5000)
|
# CHECK_RETENTION_COUNT=5000 # keep last N checks per service (default 5000)
|
||||||
# CHECK_RETENTION_DAYS=30 # also delete checks older than N days (0=disabled)
|
# CHECK_RETENTION_DAYS=30 # also delete checks older than N days (0=disabled)
|
||||||
|
|
||||||
|
# Rollup: aggregate checks older than N hours into hourly buckets for 90+ day reporting
|
||||||
|
# ROLLUP_AGE_HOURS=24 # default 24; raw checks kept for this long, then rolled up
|
||||||
|
|||||||
14
README.md
14
README.md
@@ -42,16 +42,22 @@ docker run -p 8080:8080 -v $(pwd)/data:/app/data myapp:test
|
|||||||
|
|
||||||
Add services from the dashboard (e.g. `https://example.com`, `google.com:443` for TCP) and view reports.
|
Add services from the dashboard (e.g. `https://example.com`, `google.com:443` for TCP) and view reports.
|
||||||
|
|
||||||
### Check Retention
|
### Check Retention and Rollups
|
||||||
|
|
||||||
To limit database growth, the app prunes old checks every 15 minutes:
|
To limit database growth, the app **rolls up** old checks into hourly aggregates, then prunes raw data:
|
||||||
|
|
||||||
|
1. **Rollup** (every 15 min): Checks older than `ROLLUP_AGE_HOURS` are aggregated into hourly buckets (total, success count, latency stats) and stored in `uptime_rollups`. Raw checks in those hours are deleted.
|
||||||
|
2. **Prune**: Keeps the last `CHECK_RETENTION_COUNT` raw checks per service and, when `CHECK_RETENTION_DAYS` is set to a value greater than 0, also deletes raw checks older than that many days.
|
||||||
|
|
||||||
|
This lets you report accurate uptime over **90+ days** without storing millions of raw checks. Reports combine rollups (historical) + raw checks (recent).
|
||||||
|
|
||||||
| Env var | Default | Description |
|
| Env var | Default | Description |
|
||||||
|---------|---------|-------------|
|
|---------|---------|-------------|
|
||||||
| `CHECK_RETENTION_COUNT` | 5000 | Keep last N checks per service |
|
| `ROLLUP_AGE_HOURS` | 24 | Aggregate checks older than N hours into hourly buckets |
|
||||||
|
| `CHECK_RETENTION_COUNT` | 5000 | Keep last N raw checks per service |
|
||||||
| `CHECK_RETENTION_DAYS` | 0 (disabled) | Also delete checks older than N days |
|
| `CHECK_RETENTION_DAYS` | 0 (disabled) | Also delete checks older than N days |
|
||||||
|
|
||||||
Example: keep 2000 checks per service and drop anything older than 30 days:
|
Example: keep 2000 raw checks per service and drop anything older than 30 days:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
docker run -e CHECK_RETENTION_COUNT=2000 -e CHECK_RETENTION_DAYS=30 ...
|
docker run -e CHECK_RETENTION_COUNT=2000 -e CHECK_RETENTION_DAYS=30 ...
|
||||||
|
|||||||
@@ -19,6 +19,9 @@ def _parse_report_dates(from_ts, to_ts, preset):
|
|||||||
elif preset == "30d":
|
elif preset == "30d":
|
||||||
to_ts = now.isoformat()
|
to_ts = now.isoformat()
|
||||||
from_ts = (now - timedelta(days=30)).isoformat()
|
from_ts = (now - timedelta(days=30)).isoformat()
|
||||||
|
elif preset == "90d":
|
||||||
|
to_ts = now.isoformat()
|
||||||
|
from_ts = (now - timedelta(days=90)).isoformat()
|
||||||
if from_ts and len(from_ts) == 10:
|
if from_ts and len(from_ts) == 10:
|
||||||
from_ts = from_ts + "T00:00:00"
|
from_ts = from_ts + "T00:00:00"
|
||||||
if to_ts and len(to_ts) == 10:
|
if to_ts and len(to_ts) == 10:
|
||||||
|
|||||||
176
app/models.py
176
app/models.py
@@ -12,6 +12,9 @@ DB_PATH = Path(DATA_PATH) / "monitor.db"
|
|||||||
CHECK_RETENTION_COUNT = int(os.environ.get("CHECK_RETENTION_COUNT", "5000"))
|
CHECK_RETENTION_COUNT = int(os.environ.get("CHECK_RETENTION_COUNT", "5000"))
|
||||||
CHECK_RETENTION_DAYS = int(os.environ.get("CHECK_RETENTION_DAYS", "0")) or None
|
CHECK_RETENTION_DAYS = int(os.environ.get("CHECK_RETENTION_DAYS", "0")) or None
|
||||||
|
|
||||||
|
# Rollup: aggregate checks older than N hours into hourly buckets for long-term reporting
|
||||||
|
ROLLUP_AGE_HOURS = int(os.environ.get("ROLLUP_AGE_HOURS", "24"))
|
||||||
|
|
||||||
|
|
||||||
def _ensure_data_dir():
|
def _ensure_data_dir():
|
||||||
Path(DATA_PATH).mkdir(parents=True, exist_ok=True)
|
Path(DATA_PATH).mkdir(parents=True, exist_ok=True)
|
||||||
@@ -26,6 +29,32 @@ def _migrate_add_status(conn):
|
|||||||
conn.execute("UPDATE checks SET status = CASE WHEN success = 1 THEN 'OK' ELSE 'ERROR' END")
|
conn.execute("UPDATE checks SET status = CASE WHEN success = 1 THEN 'OK' ELSE 'ERROR' END")
|
||||||
|
|
||||||
|
|
||||||
|
def _migrate_add_rollups(conn):
    """Create the ``uptime_rollups`` table and its indexes if they are missing.

    Each row stores the aggregated statistics of one service for one period:
    check and success counts, the sum and count of recorded response times
    (so an average can be derived later), and the min/max response time.
    ``(service_id, period_start)`` is unique, which is what allows callers to
    upsert into an existing bucket.

    Safe to call repeatedly: every statement is ``IF NOT EXISTS`` or guarded.

    Args:
        conn: An open ``sqlite3`` connection. Nothing is committed here.
    """
    conn.execute("""
        CREATE TABLE IF NOT EXISTS uptime_rollups (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            service_id INTEGER NOT NULL,
            period_start TEXT NOT NULL,
            period_end TEXT NOT NULL,
            total_checks INTEGER NOT NULL,
            success_count INTEGER NOT NULL,
            sum_response_ms REAL NOT NULL,
            response_count INTEGER NOT NULL,
            min_response_ms REAL,
            max_response_ms REAL,
            FOREIGN KEY (service_id) REFERENCES services(id),
            UNIQUE(service_id, period_start)
        )
    """)
    conn.execute("CREATE INDEX IF NOT EXISTS idx_rollups_service ON uptime_rollups(service_id)")
    conn.execute("CREATE INDEX IF NOT EXISTS idx_rollups_period ON uptime_rollups(period_start)")
    # CREATE TABLE IF NOT EXISTS leaves an already-existing table untouched, so
    # probe for response_count and add it when the table lacks the column
    # (presumably a table created by an earlier schema version).
    try:
        conn.execute("SELECT response_count FROM uptime_rollups LIMIT 1")
    except sqlite3.OperationalError:
        conn.execute("ALTER TABLE uptime_rollups ADD COLUMN response_count INTEGER NOT NULL DEFAULT 0")
|
||||||
|
|
||||||
|
|
||||||
@contextmanager
|
@contextmanager
|
||||||
def get_db():
|
def get_db():
|
||||||
_ensure_data_dir()
|
_ensure_data_dir()
|
||||||
@@ -67,6 +96,7 @@ def init_db():
|
|||||||
conn.execute("CREATE INDEX IF NOT EXISTS idx_checks_timestamp ON checks(timestamp)")
|
conn.execute("CREATE INDEX IF NOT EXISTS idx_checks_timestamp ON checks(timestamp)")
|
||||||
_migrate_add_status(conn)
|
_migrate_add_status(conn)
|
||||||
conn.execute("CREATE INDEX IF NOT EXISTS idx_checks_status ON checks(status)")
|
conn.execute("CREATE INDEX IF NOT EXISTS idx_checks_status ON checks(status)")
|
||||||
|
_migrate_add_rollups(conn)
|
||||||
|
|
||||||
|
|
||||||
def list_services():
|
def list_services():
|
||||||
@@ -179,30 +209,67 @@ def get_checks(service_id: int, limit: int = 50, offset: int = 0, from_ts: str =
|
|||||||
|
|
||||||
|
|
||||||
def get_report_stats(service_id: int, from_ts: str = None, to_ts: str = None):
    """
    Compute uptime % and latency stats for a service over a time range.

    Statistics are assembled from two sources:

    1. ``uptime_rollups`` rows whose ``period_start`` lies in
       ``[from_ts, min(raw_cutoff, to_ts))``, where ``raw_cutoff`` is
       ``now - ROLLUP_AGE_HOURS``.
    2. Raw ``checks`` rows whose ``timestamp`` lies in
       ``[max(from_ts, raw_cutoff), to_ts]``.

    Rollup buckets are selected by their start only, so the edges of a range
    are accurate to the bucket size rather than to the second.

    Args:
        service_id: Service whose checks are summarised.
        from_ts: Inclusive lower bound as an ISO-8601 string; defaults to
            ``"1970-01-01T00:00:00"``.
        to_ts: Inclusive upper bound as an ISO-8601 string; defaults to the
            current UTC time.

    Returns:
        A dict with ``total`` (number of checks), ``uptime_pct`` (rounded to
        2 decimals, ``0`` when there are no checks), and ``avg_ms`` /
        ``min_ms`` / ``max_ms`` (rounded to 2 decimals, ``None`` when no
        response time was recorded).
    """
    now = datetime.now(timezone.utc)
    raw_cutoff = (now - timedelta(hours=ROLLUP_AGE_HOURS)).isoformat()
    to_ts = to_ts or now.isoformat()
    from_ts = from_ts or "1970-01-01T00:00:00"
    # NOTE(review): every bound below is compared as a plain string; this
    # assumes all timestamps are ISO-8601 in the same timezone so that
    # lexicographic order equals chronological order - confirm against writers.

    total = 0
    success_count = 0
    sum_response_ms = 0.0
    count_with_response = 0
    min_ms = None
    max_ms = None

    with get_db() as conn:
        # 1. Rollups: buckets starting before whichever comes first, the raw
        #    cutoff or the end of the requested range. Falling back to from_ts
        #    here (as before) skipped the rollups for any range that ended
        #    before the cutoff, so fully historical reports came back empty.
        rollup_end = min(raw_cutoff, to_ts)
        if from_ts < rollup_end:
            rollup_rows = conn.execute(
                """
                SELECT total_checks, success_count, sum_response_ms, response_count, min_response_ms, max_response_ms
                FROM uptime_rollups
                WHERE service_id = ? AND period_start >= ? AND period_start < ?
                """,
                (service_id, from_ts, rollup_end),
            ).fetchall()
            for r in rollup_rows:
                total += r["total_checks"]
                success_count += r["success_count"]
                sum_response_ms += r["sum_response_ms"] or 0
                count_with_response += r["response_count"] or 0
                if r["min_response_ms"] is not None:
                    min_ms = r["min_response_ms"] if min_ms is None else min(min_ms, r["min_response_ms"])
                if r["max_response_ms"] is not None:
                    max_ms = r["max_response_ms"] if max_ms is None else max(max_ms, r["max_response_ms"])

        # 2. Raw checks: the part of the range at or after the raw cutoff.
        #    The window is empty when the range ends before the cutoff.
        raw_from = max(from_ts, raw_cutoff)
        if raw_from <= to_ts:
            raw_rows = conn.execute(
                "SELECT success, response_time_ms FROM checks WHERE service_id = ? AND timestamp >= ? AND timestamp <= ?",
                (service_id, raw_from, to_ts),
            ).fetchall()
            for r in raw_rows:
                total += 1
                success_count += 1 if r["success"] else 0
                response_ms = r["response_time_ms"]
                if response_ms is not None:
                    sum_response_ms += response_ms
                    count_with_response += 1
                    min_ms = response_ms if min_ms is None else min(min_ms, response_ms)
                    max_ms = response_ms if max_ms is None else max(max_ms, response_ms)

    if total == 0:
        return {"total": 0, "uptime_pct": 0, "avg_ms": None, "min_ms": None, "max_ms": None}

    uptime_pct = (success_count / total) * 100
    avg_ms = round(sum_response_ms / count_with_response, 2) if count_with_response else None

    return {
        "total": total,
        "uptime_pct": round(uptime_pct, 2),
        "avg_ms": avg_ms,
        "min_ms": round(min_ms, 2) if min_ms is not None else None,
        "max_ms": round(max_ms, 2) if max_ms is not None else None,
    }
|
||||||
|
|
||||||
|
|
||||||
@@ -227,6 +294,7 @@ def delete_service(service_id: int) -> bool:
|
|||||||
"""Delete a service and its check history. Returns True if deleted."""
|
"""Delete a service and its check history. Returns True if deleted."""
|
||||||
with get_db() as conn:
|
with get_db() as conn:
|
||||||
conn.execute("DELETE FROM checks WHERE service_id = ?", (service_id,))
|
conn.execute("DELETE FROM checks WHERE service_id = ?", (service_id,))
|
||||||
|
conn.execute("DELETE FROM uptime_rollups WHERE service_id = ?", (service_id,))
|
||||||
cur = conn.execute("DELETE FROM services WHERE id = ?", (service_id,))
|
cur = conn.execute("DELETE FROM services WHERE id = ?", (service_id,))
|
||||||
return cur.rowcount > 0
|
return cur.rowcount > 0
|
||||||
|
|
||||||
@@ -238,6 +306,76 @@ def get_all_services_for_scheduler():
|
|||||||
return [dict(r) for r in rows]
|
return [dict(r) for r in rows]
|
||||||
|
|
||||||
|
|
||||||
|
def _hour_start(ts: str) -> str:
    """Return the ISO-8601 timestamp *ts* truncated to the start of its hour.

    A ``Z`` suffix is rewritten to ``+00:00`` before parsing. The result keeps
    whatever UTC offset the input carried, so naive and offset-aware inputs
    yield differently shaped strings:

        "2026-03-10T14:37:12"  -> "2026-03-10T14:00:00"
        "2026-03-10T14:37:12Z" -> "2026-03-10T14:00:00+00:00"

    Raises:
        ValueError: If *ts* is not a valid ISO-8601 timestamp.
    """
    parsed = datetime.fromisoformat(ts.replace("Z", "+00:00"))
    return parsed.replace(minute=0, second=0, microsecond=0).isoformat()
|
||||||
|
|
||||||
|
|
||||||
|
def rollup_old_checks() -> int:
    """
    Aggregate checks older than ROLLUP_AGE_HOURS into hourly buckets.

    Checks whose ``timestamp`` sorts before ``now - ROLLUP_AGE_HOURS`` are
    grouped by service and hour. Each group is upserted into
    ``uptime_rollups`` (merged into the existing bucket when one is already
    stored for that service and hour), after which exactly the aggregated
    raw rows are deleted by id.

    Returns:
        Number of raw checks that were rolled up and deleted.
    """
    cutoff_ts = (datetime.now(timezone.utc) - timedelta(hours=ROLLUP_AGE_HOURS)).isoformat()
    # Upper bound on ids bound per DELETE; SQLite caps the number of bound
    # parameters in one statement (999 by default in older builds).
    delete_batch_size = 500
    deleted = 0

    with get_db() as conn:
        # One row per (service, hour) for everything older than the cutoff,
        # carrying the ids of the checks that were aggregated.
        rows = conn.execute(
            """
            SELECT service_id,
                   strftime('%Y-%m-%dT%H:00:00', timestamp) as period_start,
                   COUNT(*) as total_checks,
                   SUM(success) as success_count,
                   SUM(CASE WHEN response_time_ms IS NOT NULL THEN response_time_ms ELSE 0 END) as sum_response_ms,
                   SUM(CASE WHEN response_time_ms IS NOT NULL THEN 1 ELSE 0 END) as response_count,
                   MIN(CASE WHEN response_time_ms IS NOT NULL THEN response_time_ms END) as min_response_ms,
                   MAX(response_time_ms) as max_response_ms,
                   GROUP_CONCAT(id) as check_ids
            FROM checks
            WHERE timestamp < ?
            GROUP BY service_id, period_start
            """,
            (cutoff_ts,),
        ).fetchall()

        for r in rows:
            period_start = r["period_start"]
            if period_start is None:
                # strftime() yields NULL for a timestamp it cannot parse. Such
                # rows cannot be assigned to a bucket, so leave them in place
                # rather than aborting the whole job.
                continue
            period_end_ts = (
                datetime.fromisoformat(period_start.replace("Z", "+00:00")) + timedelta(hours=1)
            ).isoformat()

            # Scalar MIN()/MAX() return NULL as soon as one argument is NULL,
            # so each side is COALESCEd onto the other: a batch without any
            # response time must not erase the stored extremes (or vice versa).
            conn.execute(
                """
                INSERT INTO uptime_rollups (service_id, period_start, period_end, total_checks, success_count, sum_response_ms, response_count, min_response_ms, max_response_ms)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
                ON CONFLICT(service_id, period_start) DO UPDATE SET
                    total_checks = total_checks + excluded.total_checks,
                    success_count = success_count + excluded.success_count,
                    sum_response_ms = sum_response_ms + excluded.sum_response_ms,
                    response_count = response_count + excluded.response_count,
                    min_response_ms = MIN(
                        COALESCE(min_response_ms, excluded.min_response_ms),
                        COALESCE(excluded.min_response_ms, min_response_ms)
                    ),
                    max_response_ms = MAX(
                        COALESCE(max_response_ms, excluded.max_response_ms),
                        COALESCE(excluded.max_response_ms, max_response_ms)
                    )
                """,
                (
                    r["service_id"],
                    period_start,
                    period_end_ts,
                    r["total_checks"],
                    r["success_count"] or 0,
                    r["sum_response_ms"] or 0,
                    r["response_count"] or 0,
                    r["min_response_ms"],
                    r["max_response_ms"],
                ),
            )

            # Delete only the rows that were just aggregated, in batches so a
            # busy bucket cannot exceed the bound-parameter limit.
            ids = [int(x) for x in (r["check_ids"] or "").split(",") if x]
            for offset in range(0, len(ids), delete_batch_size):
                batch = ids[offset:offset + delete_batch_size]
                placeholders = ",".join("?" * len(batch))
                cur = conn.execute(f"DELETE FROM checks WHERE id IN ({placeholders})", batch)
                deleted += cur.rowcount

    return deleted
|
||||||
|
|
||||||
|
|
||||||
def prune_checks_retention() -> int:
|
def prune_checks_retention() -> int:
|
||||||
"""
|
"""
|
||||||
Remove old checks to limit storage. Keeps last CHECK_RETENTION_COUNT per service.
|
Remove old checks to limit storage. Keeps last CHECK_RETENTION_COUNT per service.
|
||||||
|
|||||||
@@ -2,7 +2,7 @@
|
|||||||
from apscheduler.schedulers.background import BackgroundScheduler
|
from apscheduler.schedulers.background import BackgroundScheduler
|
||||||
|
|
||||||
from app.checker import run_check
|
from app.checker import run_check
|
||||||
from app.models import get_all_services_for_scheduler, prune_checks_retention
|
from app.models import get_all_services_for_scheduler, prune_checks_retention, rollup_old_checks
|
||||||
|
|
||||||
|
|
||||||
def _run_all_checks():
|
def _run_all_checks():
|
||||||
@@ -54,7 +54,11 @@ def start_scheduler():
|
|||||||
# Sync job list every 60 seconds (only adds/removes when services change)
|
# Sync job list every 60 seconds (only adds/removes when services change)
|
||||||
scheduler.add_job(sync_jobs, "interval", seconds=60, id="sync_jobs")
|
scheduler.add_job(sync_jobs, "interval", seconds=60, id="sync_jobs")
|
||||||
|
|
||||||
# Prune old checks every 15 minutes (retention/compression)
|
# Roll up old checks into hourly buckets, then prune (every 15 min)
|
||||||
scheduler.add_job(prune_checks_retention, "interval", minutes=15, id="prune_checks")
|
def rollup_and_prune():
|
||||||
|
rollup_old_checks()
|
||||||
|
prune_checks_retention()
|
||||||
|
|
||||||
|
scheduler.add_job(rollup_and_prune, "interval", minutes=15, id="prune_checks")
|
||||||
|
|
||||||
scheduler.start()
|
scheduler.start()
|
||||||
|
|||||||
@@ -16,6 +16,7 @@
|
|||||||
<a href="{{ url_for('report', service_id=service.id, preset='24h') }}" class="preset-btn{% if preset == '24h' %} preset-active{% endif %}">Last 24h</a>
|
<a href="{{ url_for('report', service_id=service.id, preset='24h') }}" class="preset-btn{% if preset == '24h' %} preset-active{% endif %}">Last 24h</a>
|
||||||
<a href="{{ url_for('report', service_id=service.id, preset='7d') }}" class="preset-btn{% if preset == '7d' %} preset-active{% endif %}">Last 7 days</a>
|
<a href="{{ url_for('report', service_id=service.id, preset='7d') }}" class="preset-btn{% if preset == '7d' %} preset-active{% endif %}">Last 7 days</a>
|
||||||
<a href="{{ url_for('report', service_id=service.id, preset='30d') }}" class="preset-btn{% if preset == '30d' %} preset-active{% endif %}">Last 30 days</a>
|
<a href="{{ url_for('report', service_id=service.id, preset='30d') }}" class="preset-btn{% if preset == '30d' %} preset-active{% endif %}">Last 30 days</a>
|
||||||
|
<a href="{{ url_for('report', service_id=service.id, preset='90d') }}" class="preset-btn{% if preset == '90d' %} preset-active{% endif %}">Last 90 days</a>
|
||||||
</div>
|
</div>
|
||||||
{% if period_label %}
|
{% if period_label %}
|
||||||
<p class="period-label">Showing: {{ period_label }}</p>
|
<p class="period-label">Showing: {{ period_label }}</p>
|
||||||
@@ -141,7 +142,7 @@
|
|||||||
{% else %}
|
{% else %}
|
||||||
<a href="{{ url_for('report', service_id=service.id, preset=preset or '', from=from_date, to=to_date, status=status_filter or '', search=search or '', per_page=per_page, page=1) }}" class="pagination-btn">1</a>
|
<a href="{{ url_for('report', service_id=service.id, preset=preset or '', from=from_date, to=to_date, status=status_filter or '', search=search or '', per_page=per_page, page=1) }}" class="pagination-btn">1</a>
|
||||||
{% if page > 3 %}<span class="pagination-ellipsis">…</span>{% endif %}
|
{% if page > 3 %}<span class="pagination-ellipsis">…</span>{% endif %}
|
||||||
{% for p in range(max(2, page - 1), min(total_pages, page + 1) + 1) %}
|
{% for p in range([2, page - 1] | max, [total_pages, page + 1] | min + 1) %}
|
||||||
{% if p == page %}
|
{% if p == page %}
|
||||||
<span class="pagination-btn pagination-current">{{ p }}</span>
|
<span class="pagination-btn pagination-current">{{ p }}</span>
|
||||||
{% else %}
|
{% else %}
|
||||||
|
|||||||
Reference in New Issue
Block a user