ML predictions Phase 4: SARIMA spending forecast with dual confidence bands

Replaces the unused Prophet dependency (unrunnable without cmdstan) with SARIMA (statsmodels SARIMAX) as the primary spending forecast algorithm. Strategy: SARIMA(1,1,1)(1,0,1,12) for 12+ months of data and non-seasonal ARIMA(1,1,1) for 3-11 months, falling back to Holt-Winters and then to a simple average if a fit fails; a simple average is used directly below 3 months. Adds 95% confidence bands alongside the existing 80% bands (model-based intervals for SARIMA/ARIMA; 1.96σ and 1.28σ respectively for the Holt-Winters and average paths). Extends the forecast horizon from 3 to 6 months and the actuals display from 6 to 12 months. Each category now carries an algorithm field that is surfaced as a badge in the UI. The frontend chart shows both confidence tiers as stacked bar overlays, with a 3-month summary grid below. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-28 10:30:26 +00:00 · 2026-04-28 10:30:26 +00:00 · 4572621f5d
commit 4572621f5d
parent 3b4787d8b9
4 changed files with 109 additions and 30 deletions
--- a/backend/app/ml/spending_forecast.py
+++ b/backend/app/ml/spending_forecast.py
@ -10,19 +10,55 @@ import pandas as pd
 warnings.filterwarnings("ignore")

 MIN_POINTS = 3
-FORECAST_MONTHS = 3
+FORECAST_MONTHS = 6


 def _next_month_starts(from_date: date, n: int) -> list[str]:
    months = []
-    d = (from_date.replace(day=1) + relativedelta(months=1))
+    d = from_date.replace(day=1) + relativedelta(months=1)
    for _ in range(n):
        months.append(d.strftime("%Y-%m-%d"))
        d += relativedelta(months=1)
    return months


-def _fit_holt(values: list[float], n: int) -> tuple[list[float], list[float], list[float]]:
+def _fit_sarima(values: list[float], n: int) -> tuple[list[float], list[float], list[float], list[float], list[float], str]:
+    """
+    Fit the primary SARIMAX forecast model and project ``n`` steps ahead.
+
+    With 12 or more observations the model is SARIMA(1,1,1)(1,0,1,12); with
+    fewer it is a non-seasonal ARIMA(1,1,1). If fitting or forecasting raises,
+    or yields non-finite numbers, the result of ``_fit_holt(values, n)`` is
+    returned instead.
+
+    Args:
+        values: Observed series, converted to a float array before fitting.
+        n: Number of future steps to forecast.
+
+    Returns:
+        ``(forecast, lower_80, upper_80, lower_95, upper_95, algorithm)``.
+        Lower bounds are clipped at 0; the forecast mean and upper bounds are
+        returned unclipped. ``algorithm`` is ``"sarima"`` for both SARIMAX
+        variants, otherwise whatever label ``_fit_holt`` reports.
+    """
+    from statsmodels.tsa.statespace.sarimax import SARIMAX
+
+    series = np.array(values, dtype=float)
+    # NOTE(review): the non-seasonal branch below is also reported as "sarima".
+    # Confirm whether consumers of the algorithm field expect a distinct label.
+    algo = "sarima"
+
+    try:
+        if len(series) >= 12:
+            # Seasonal ARIMA with annual period
+            model = SARIMAX(series, order=(1, 1, 1), seasonal_order=(1, 0, 1, 12),
+                            enforce_stationarity=False, enforce_invertibility=False)
+        else:
+            model = SARIMAX(series, order=(1, 1, 1),
+                            enforce_stationarity=False, enforce_invertibility=False)
+
+        fit = model.fit(disp=False, maxiter=200)
+        forecast_obj = fit.get_forecast(steps=n)
+
+        # The model is fitted on a bare NumPy array, so statsmodels returns
+        # ndarrays here rather than pandas objects. Calling .iloc on them would
+        # raise and silently force the fallback on every call; np.asarray
+        # accepts either return type.
+        mean = np.asarray(forecast_obj.predicted_mean, dtype=float)
+        ci_80 = np.asarray(forecast_obj.conf_int(alpha=0.20), dtype=float)   # 80% interval
+        ci_95 = np.asarray(forecast_obj.conf_int(alpha=0.05), dtype=float)   # 95% interval
+
+        # A fit on very short or degenerate series can return NaN/inf without
+        # raising; treat that as a failed fit so the fallback is used.
+        if not (np.isfinite(mean).all() and np.isfinite(ci_80).all() and np.isfinite(ci_95).all()):
+            raise ValueError("non-finite SARIMA forecast")
+
+        # Column 0 is the lower bound, column 1 the upper bound.
+        lower_80 = np.maximum(0, ci_80[:, 0]).tolist()
+        upper_80 = ci_80[:, 1].tolist()
+        lower_95 = np.maximum(0, ci_95[:, 0]).tolist()
+        upper_95 = ci_95[:, 1].tolist()
+        return mean.tolist(), lower_80, upper_80, lower_95, upper_95, algo
+
+    except Exception:
+        # Deliberate best-effort: any modelling failure degrades to Holt-Winters.
+        return _fit_holt(values, n)
+
+
+def _fit_holt(values: list[float], n: int) -> tuple[list[float], list[float], list[float], list[float], list[float], str]:
+    """Holt-Winters fallback."""
    from statsmodels.tsa.holtwinters import ExponentialSmoothing

    try:
@ -36,13 +72,22 @@ def _fit_holt(values: list[float], n: int) -> tuple[list[float], list[float], li
        fit = model.fit(optimized=True, disp=False)
        forecast = fit.forecast(n)
        sigma = float(np.std(fit.resid)) if len(fit.resid) > 1 else float(np.mean(values) * 0.15)
-        lower = np.maximum(0, forecast - 1.28 * sigma)
-        upper = forecast + 1.28 * sigma
-        return forecast.tolist(), lower.tolist(), upper.tolist()
+
+        lower_80 = np.maximum(0, forecast - 1.28 * sigma).tolist()
+        upper_80 = (forecast + 1.28 * sigma).tolist()
+        lower_95 = np.maximum(0, forecast - 1.96 * sigma).tolist()
+        upper_95 = (forecast + 1.96 * sigma).tolist()
+        return forecast.tolist(), lower_80, upper_80, lower_95, upper_95, "holt_winters"
+
    except Exception:
        avg = float(np.mean(values))
        sigma = float(np.std(values)) if len(values) > 1 else avg * 0.15
-        return [avg] * n, [max(0, avg - 1.28 * sigma)] * n, [(avg + 1.28 * sigma)] * n
+        fcast = [avg] * n
+        lower_80 = [max(0.0, avg - 1.28 * sigma)] * n
+        upper_80 = [(avg + 1.28 * sigma)] * n
+        lower_95 = [max(0.0, avg - 1.96 * sigma)] * n
+        upper_95 = [(avg + 1.96 * sigma)] * n
+        return fcast, lower_80, upper_80, lower_95, upper_95, "average"


 def forecast_spending(df: pd.DataFrame) -> list[dict]:
@ -61,31 +106,47 @@ def forecast_spending(df: pd.DataFrame) -> list[dict]:
        group = group.sort_values("ds")
        values = group["y"].tolist()
        actuals = [
-            {"date": row["ds"].strftime("%Y-%m-%d"), "amount": row["y"]}
+            {"date": row["ds"].strftime("%Y-%m-%d"), "amount": round(float(row["y"]), 2)}
            for _, row in group.iterrows()
        ]

        if len(values) < MIN_POINTS:
            avg = float(np.mean(values))
+            sigma = avg * 0.15
            forecast_pts = [
-                {"date": d, "amount": round(avg, 2), "lower": round(avg * 0.7, 2), "upper": round(avg * 1.3, 2)}
+                {
+                    "date": d,
+                    "amount": round(avg, 2),
+                    "lower": round(max(0.0, avg - 1.28 * sigma), 2),
+                    "upper": round(avg + 1.28 * sigma, 2),
+                    "lower_95": round(max(0.0, avg - 1.96 * sigma), 2),
+                    "upper_95": round(avg + 1.96 * sigma, 2),
+                }
                for d in future_dates
            ]
+            algo = "average"
        else:
-            fcast, lower, upper = _fit_holt(values, FORECAST_MONTHS)
+            fcast, lower_80, upper_80, lower_95, upper_95, algo = _fit_sarima(values, FORECAST_MONTHS)
            forecast_pts = [
-                {"date": d, "amount": round(max(0, f), 2), "lower": round(l, 2), "upper": round(u, 2)}
-                for d, f, l, u in zip(future_dates, fcast, lower, upper)
+                {
+                    "date": d,
+                    "amount": round(max(0.0, f), 2),
+                    "lower": round(l80, 2),
+                    "upper": round(u80, 2),
+                    "lower_95": round(l95, 2),
+                    "upper_95": round(u95, 2),
+                }
+                for d, f, l80, u80, l95, u95 in zip(future_dates, fcast, lower_80, upper_80, lower_95, upper_95)
            ]

        results.append({
-            "category_id": cat_id,
+            "category_id": str(cat_id),
            "category_name": cat_name,
            "monthly_avg": round(float(np.mean(values)), 2),
-            "actuals": actuals[-6:],  # last 6 months for display
+            "algorithm": algo,
+            "actuals": actuals[-12:],  # last 12 months for display
            "forecast": forecast_pts,
        })

-    # Sort by monthly_avg descending (highest spend first)
    results.sort(key=lambda x: x["monthly_avg"], reverse=True)
    return results