#!/opt/vexor/api/venv/bin/python
"""check_vexor_baseline - Predictive / baseline Nagios plugin.

Compares the most recent value of a perfdata metric to a baseline computed
from historical samples in InfluxDB, and alerts when the deviation exceeds a
configurable sensitivity (expressed as standard deviations / z-score).

Usage:
  check_vexor_baseline --host H --metric M [--service S]
                       [--baseline 7d] [--current 15m]
                       [--sensitivity 3.0] [--critical 4.5]
                       [--direction both|above|below]
                       [--min-stddev 0.0]

Exit codes: 0 OK, 1 WARN, 2 CRIT, 3 UNKNOWN
"""
from __future__ import annotations

import argparse
import os
import re
import sys


# Nagios plugin exit statuses.
OK = 0
WARN = 1
CRIT = 2
UNKNOWN = 3

# InfluxDB connection settings; each can be overridden through the
# environment variable of the same name.
INFLUX_URL = os.getenv("INFLUX_URL", "http://127.0.0.1:8086")
INFLUX_ORG = os.getenv("INFLUX_ORG", "vexor")
INFLUX_BUCKET = os.getenv("INFLUX_BUCKET", "perfdata")

# Allow-lists for values that get interpolated into Flux query text.
# NOTE: `$` also matches just before a trailing newline, so whole-string
# validation should go through fullmatch() rather than match().
_TAG_RE = re.compile(r"^[A-Za-z0-9._:/ +\-]{1,200}$")
_DUR_RE = re.compile(r"^\d{1,6}(ns|us|ms|s|m|h|d|w|y)$")


def die(code: int, msg: str) -> "None":
    """Print the plugin status line and terminate the process.

    Args:
        code: Process exit status (one of the Nagios status codes).
        msg: Text written to stdout, followed by a newline.

    Raises:
        SystemExit: Always, carrying ``code``; this function never returns.
    """
    sys.stdout.write(f"{msg}\n")
    raise SystemExit(code)


def _safe(v: str, what: str) -> str:
    """Validate a tag value before it is interpolated into a Flux query.

    Args:
        v: Candidate value (host, service or metric name).
        what: Human-readable name of the argument, used in the error text.

    Returns:
        ``v`` unchanged when it is non-empty and consists solely of the
        characters allowed by ``_TAG_RE``.

    Exits with UNKNOWN (via ``die``) when the value is empty or contains any
    character outside the allow-list.
    """
    # fullmatch, not match: the pattern ends in `$`, which also matches just
    # before a trailing newline, so match() would let "name\n" through.
    if not v or not _TAG_RE.fullmatch(v):
        die(UNKNOWN, f"UNKNOWN - invalid {what}")
    return v


def _safe_dur(v: str, what: str) -> str:
    """Validate a duration literal before it is interpolated into a Flux query.

    Args:
        v: Candidate duration such as ``15m`` or ``7d`` (1-6 digits followed
            by one of the units accepted by ``_DUR_RE``).
        what: Human-readable name of the argument, used in the error text.

    Returns:
        ``v`` unchanged when it is a well-formed duration.

    Exits with UNKNOWN (via ``die``) when the value is empty or malformed.
    """
    # fullmatch, not match: the pattern ends in `$`, which also matches just
    # before a trailing newline, so match() would let "7d\n" through.
    if not v or not _DUR_RE.fullmatch(v):
        die(UNKNOWN, f"UNKNOWN - invalid {what} (use eg 1h, 7d)")
    return v


class _PluginArgumentParser(argparse.ArgumentParser):
    """ArgumentParser that reports usage errors as a Nagios UNKNOWN.

    The stock parser writes the error to stderr and exits with status 2,
    which Nagios interprets as CRITICAL with no plugin output.  A bad command
    line is a plugin problem, not a service problem, so report it as UNKNOWN
    on stdout instead.
    """

    def error(self, message: str):
        # Keep the usage hint on stderr for interactive runs; the status
        # line itself goes to stdout through die().
        self.print_usage(sys.stderr)
        die(UNKNOWN, f"UNKNOWN - {message}")


def main() -> None:
    """Run the baseline check and exit with a Nagios status.

    Steps:
      1. Parse and validate the command line (invalid input -> UNKNOWN).
      2. Query InfluxDB for the baseline window (from ``-baseline`` up to
         ``-current``) and for the current window (the last ``current``).
      3. Compute the z-score of the current mean against the baseline mean
         and sample standard deviation.
      4. Map the score to OK / WARNING / CRITICAL according to
         ``--direction``, ``--sensitivity`` and ``--critical``.

    The function always terminates the process through ``die`` and therefore
    never returns normally.
    """
    ap = _PluginArgumentParser()
    ap.add_argument("--host", required=True)
    ap.add_argument("--service", default="")
    ap.add_argument("--metric", required=True)
    ap.add_argument("--baseline", default="7d",
                    help="Historical window to compute baseline (default 7d)")
    ap.add_argument("--current", default="15m",
                    help="Recent window to evaluate (default 15m)")
    ap.add_argument("--sensitivity", type=float, default=3.0,
                    help="WARN when |z| exceeds this many stddev (default 3.0)")
    ap.add_argument("--critical", type=float, default=4.5,
                    help="CRIT when |z| exceeds this many stddev (default 4.5)")
    ap.add_argument("--direction", default="both",
                    choices=["both", "above", "below"],
                    help="Which side of the baseline triggers (default both)")
    ap.add_argument("--min-stddev", type=float, default=0.0,
                    help="Floor for stddev to avoid div-by-zero on flat series")
    args = ap.parse_args()

    # Everything interpolated into the Flux text below is allow-list checked.
    host = _safe(args.host, "host")
    service = _safe(args.service, "service") if args.service else ""
    metric = _safe(args.metric, "metric")
    baseline = _safe_dur(args.baseline, "baseline")
    current = _safe_dur(args.current, "current")

    if args.critical < args.sensitivity:
        die(UNKNOWN, "UNKNOWN - --critical must be >= --sensitivity")

    # Imported lazily so a missing dependency yields UNKNOWN, not a traceback.
    try:
        from influxdb_client import InfluxDBClient
    except Exception as e:  # noqa: BLE001
        die(UNKNOWN, f"UNKNOWN - influxdb-client missing: {e}")

    token = os.environ.get("INFLUX_TOKEN", "")
    if not token:
        die(UNKNOWN, "UNKNOWN - INFLUX_TOKEN not set")

    # Service checks and host checks are read from different measurements.
    measurement = "service_perf" if service else "host_perf"
    svc_filter = f'  |> filter(fn: (r) => r.service == "{service}")\n' if service else ""

    def flux_query(range_args: str) -> str:
        # Both windows select the same series; only the time range differs.
        return f'''
from(bucket: "{INFLUX_BUCKET}")
  |> range({range_args})
  |> filter(fn: (r) => r._measurement == "{measurement}")
  |> filter(fn: (r) => r.host == "{host}")
{svc_filter}  |> filter(fn: (r) => r.metric == "{metric}")
  |> filter(fn: (r) => r._field == "value")
'''

    def fetch_values(query_api, flux: str) -> list:
        # Flatten every record of every returned table, skipping null values.
        values = []
        for table in query_api.query(flux):
            for record in table.records:
                value = record.get_value()
                if value is not None:
                    values.append(value)
        return values

    # The baseline window stops where the current window starts, so the
    # samples under evaluation do not contribute to their own baseline.
    flux_baseline = flux_query(f"start: -{baseline}, stop: -{current}")
    flux_current = flux_query(f"start: -{current}")

    try:
        with InfluxDBClient(url=INFLUX_URL, token=token, org=INFLUX_ORG) as client:
            qa = client.query_api()
            base_vals = fetch_values(qa, flux_baseline)
            cur_vals = fetch_values(qa, flux_current)
    except Exception as e:  # noqa: BLE001
        die(UNKNOWN, f"UNKNOWN - InfluxDB query failed: {e}")

    if len(base_vals) < 10:
        die(UNKNOWN, f"UNKNOWN - insufficient baseline data ({len(base_vals)} pts in {baseline})")
    if not cur_vals:
        die(UNKNOWN, f"UNKNOWN - no current data in last {current}")

    # Baseline mean and sample (n - 1) standard deviation, floored at
    # --min-stddev.
    n = len(base_vals)
    mean = sum(base_vals) / n
    var = sum((v - mean) ** 2 for v in base_vals) / max(n - 1, 1)
    stddev = max(var ** 0.5, args.min_stddev)
    cur = sum(cur_vals) / len(cur_vals)
    deviation = cur - mean
    # With a perfectly flat baseline and no --min-stddev floor, stddev is 0
    # and z is pinned to 0.0, so the check reports OK whatever the current
    # value is.
    z = deviation / stddev if stddev > 0 else 0.0

    # Fold the direction into one score that is compared against thresholds.
    if args.direction == "above":
        score = z
    elif args.direction == "below":
        score = -z
    else:
        score = abs(z)

    if score >= args.critical:
        status = "CRITICAL"
        code = CRIT
    elif score >= args.sensitivity:
        status = "WARNING"
        code = WARN
    else:
        status = "OK"
        code = OK

    # Status text, then "|", then perfdata (z carries the warn;crit thresholds).
    msg = (
        f"{status} - {metric} current={cur:.3f} baseline={mean:.3f} "
        f"stddev={stddev:.3f} z={z:+.2f} (window={baseline}, dir={args.direction}) | "
        f"current={cur:.3f} baseline={mean:.3f} stddev={stddev:.3f} "
        f"z={z:.3f};{args.sensitivity};{args.critical}"
    )
    die(code, msg)


if __name__ == "__main__":
    try:
        main()
    except Exception as e:  # noqa: BLE001
        # An uncaught exception would make the interpreter exit with status 1,
        # which Nagios reads as WARNING; report it as UNKNOWN instead.
        # SystemExit raised by die() is not an Exception and passes through.
        die(UNKNOWN, f"UNKNOWN - unexpected error: {type(e).__name__}: {e}")
