Add AI receipt scanning with OCR pipeline and debug toggle

- OCR pipeline: Tesseract (images) + pdfplumber (PDFs) → AI text prompt → rule-based regex fallback; works with any text model, not just vision models
- Scan Receipt toolbar button parses a photo and pre-fills the transaction form; the receipt image is automatically attached to the created transaction
- AI settings page: provider, API key (AES-256-GCM encrypted), custom URL, model, and per-user debug toggle that gates the OCR/AI debug panel
- Fix CSRF cookie: set secure=False so HTTP deployments work; add 7-day max_age
- Fix attachment_refs missing from _to_response (attachments never appeared in UI)
- Fix multipart boundary lost when Content-Type was set manually in axios calls
- nginx: raise client_max_body_size to 15 MB, add 120s proxy timeout for OCR
- Migration 0005: add ai_debug boolean to users table
- Update README and CLAUDE.md with AI scanning docs and architecture notes

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-22 22:07:38 +00:00 · 2026-04-22 22:07:38 +00:00 · 26e2a055db
commit 26e2a055db
parent a7c54ca61c
16 changed files with 397 additions and 99 deletions
--- a/backend/Dockerfile
+++ b/backend/Dockerfile
@ -5,6 +5,9 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
    gnupg \
    gzip \
    gosu \
+    tesseract-ocr \
+    tesseract-ocr-eng \
+    poppler-utils \
  && rm -rf /var/lib/apt/lists/*
 RUN pip install --no-cache-dir uv
 WORKDIR /app
--- a/backend/alembic/versions/0005_ai_debug.py
+++ b/backend/alembic/versions/0005_ai_debug.py
@ -0,0 +1,21 @@
+"""add ai_debug flag to users
+
+Revision ID: 0005
+Revises: 0004
+Create Date: 2026-04-22
+"""
+from alembic import op
+import sqlalchemy as sa
+
+revision = "0005"
+down_revision = "0004"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    op.add_column("users", sa.Column("ai_debug", sa.Boolean(), nullable=False, server_default="false"))
+
+
+def downgrade() -> None:
+    op.drop_column("users", "ai_debug")
--- a/backend/app/api/v1/settings.py
+++ b/backend/app/api/v1/settings.py
@ -22,6 +22,7 @@ class AiSettingsResponse(BaseModel):
    has_api_key: bool
    base_url: str | None
    model: str | None
+    debug: bool


 class AiSettingsSave(BaseModel):
@ -29,6 +30,7 @@ class AiSettingsSave(BaseModel):
    api_key: str = ""
    base_url: str = ""
    model: str = ""
+    debug: bool = False


@router.get("/ai", response_model=AiSettingsResponse)
@ -38,6 +40,7 @@ async def get_ai_settings(user: User = Depends(get_current_user)):
        has_api_key=bool(user.ai_api_key_enc),
        base_url=user.ai_base_url,
        model=user.ai_model,
+        debug=user.ai_debug,
    )


@ -54,6 +57,7 @@ async def save_ai_settings(
        "ai_provider": body.provider,
        "ai_base_url": body.base_url.rstrip("/") or None,
        "ai_model": body.model.strip() or None,
+        "ai_debug": body.debug,
    }

    if body.api_key.strip():
@ -68,6 +72,7 @@ async def save_ai_settings(
        has_api_key=True,
        base_url=values["ai_base_url"],
        model=values["ai_model"],
+        debug=body.debug,
    )


--- a/backend/app/api/v1/transactions.py
+++ b/backend/app/api/v1/transactions.py
@ -278,93 +278,230 @@ async def delete_attachment(
    await db.commit()


-_RECEIPT_PROMPT = (
-    "You are a receipt parser. Extract information from this receipt and return ONLY a JSON object "
-    "with exactly these keys (use null for any field you cannot determine):\n"
-    '{"merchant": "store name", "amount": 0.00, "currency": "GBP", '
+_RECEIPT_TEXT_PROMPT = (
+    "You are a receipt parser. Below is the raw text extracted from a receipt via OCR.\n\n"
+    "Receipt text:\n{ocr_text}\n\n"
+    "Extract the information and return ONLY a JSON object with exactly these keys "
+    "(use null for any field you cannot determine):\n"
+    '{{"merchant": "store name", "amount": 0.00, "currency": "GBP", '
    '"date": "YYYY-MM-DD", "description": "brief description", '
-    '"category": "one of: Food & Drink, Transport, Shopping, Entertainment, Health, Travel, Bills & Utilities, Other"}\n'
+    '"category": "one of: Food & Drink, Transport, Shopping, Entertainment, Health, Travel, Bills & Utilities, Other"}}\n'
    "Return ONLY the JSON object. No markdown, no explanation, no code fences."
 )

+_EMPTY_RESULT: dict = {
+    "merchant": None, "amount": None, "currency": None,
+    "date": None, "description": None, "category": None,
+    "raw": None, "ocr_text": None,
+}
+
+
+def _extract_ocr_text(file_bytes: bytes, mime_type: str) -> str:
+    """Extract text from an image or PDF. Returns empty string on failure."""
+    if mime_type == "application/pdf":
+        import io
+        import pdfplumber
+        try:
+            with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
+                pages_text = [page.extract_text() or "" for page in pdf.pages[:4]]
+            text = "\n".join(pages_text).strip()
+            if text:
+                return text
+        except Exception:
+            pass
+        # Scanned PDF — convert first page to image then OCR
+        try:
+            from pdf2image import convert_from_bytes
+            import pytesseract
+            images = convert_from_bytes(file_bytes, first_page=1, last_page=1, dpi=200)
+            if images:
+                return pytesseract.image_to_string(images[0])
+        except Exception:
+            pass
+        return ""
+    else:
+        import io
+        import pytesseract
+        from PIL import Image
+        try:
+            img = Image.open(io.BytesIO(file_bytes))
+            return pytesseract.image_to_string(img)
+        except Exception:
+            return ""
+
+
+def _rule_based_parse(ocr_text: str) -> dict:
+    """Extract receipt fields from OCR text using regex. Best-effort."""
+    import re
+    from datetime import datetime
+
+    lines = [ln.strip() for ln in ocr_text.splitlines() if ln.strip()]
+
+    # Merchant: skip very short lines and lines that look like addresses/phone numbers
+    merchant = None
+    for ln in lines[:5]:
+        if len(ln) > 2 and not re.match(r"^[\d\s\-\+\(\)]+$", ln) and not re.match(r"^\d+\s+\w+", ln):
+            merchant = ln
+            break
+
+    # Currency from symbols
+    currency = None
+    if "£" in ocr_text:
+        currency = "GBP"
+    elif "€" in ocr_text:
+        currency = "EUR"
+    elif "$" in ocr_text:
+        currency = "USD"
+
+    # Amount: prefer lines containing total/amount keywords, then fall back to largest number
+    amount = None
+    total_line_pat = re.compile(
+        r"(?:total|amount\s*due|grand\s*total|balance\s*due|subtotal|net\s*total)"
+        r"[^\d£$€]*([£$€]?\s*\d{1,6}[.,]\d{2})\b",
+        re.IGNORECASE,
+    )
+    all_amount_pat = re.compile(r"[£$€]?\s*(\d{1,6}[.,]\d{2})\b")
+
+    for m in total_line_pat.finditer(ocr_text):
+        raw = re.sub(r"[£$€\s]", "", m.group(1)).replace(",", ".")
+        try:
+            amount = float(raw)
+            break
+        except ValueError:
+            pass
+
+    if amount is None:
+        candidates = []
+        for m in all_amount_pat.finditer(ocr_text):
+            try:
+                candidates.append(float(m.group(1).replace(",", ".")))
+            except ValueError:
+                pass
+        if candidates:
+            amount = max(candidates)
+
+    # Date: try common formats
+    date = None
+    date_patterns = [
+        (r"\b(\d{4}[-/]\d{2}[-/]\d{2})\b", ["%Y-%m-%d", "%Y/%m/%d"]),
+        (r"\b(\d{2}[-/]\d{2}[-/]\d{4})\b", ["%d-%m-%Y", "%d/%m/%Y", "%m/%d/%Y"]),
+        (r"\b(\d{1,2}\s+(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\.?\s+\d{4})\b", ["%d %B %Y", "%d %b %Y"]),
+        (r"\b((?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\.?\s+\d{1,2},?\s+\d{4})\b", ["%B %d, %Y", "%b %d, %Y"]),
+    ]
+    for pattern, fmts in date_patterns:
+        m = re.search(pattern, ocr_text, re.IGNORECASE)
+        if m:
+            raw_date = m.group(1).rstrip(".")
+            for fmt in fmts:
+                try:
+                    date = datetime.strptime(raw_date, fmt).strftime("%Y-%m-%d")
+                    break
+                except ValueError:
+                    pass
+            if date:
+                break
+
+    description = merchant  # simple default
+
+    return {
+        "merchant":    merchant,
+        "amount":      amount,
+        "currency":    currency,
+        "date":        date,
+        "description": description,
+        "category":    None,
+        "raw":         None,
+        "ocr_text":    ocr_text,
+    }
+
+
+def _strip_code_fence(text: str) -> str:
+    if text.startswith("```"):
+        parts = text.split("```")
+        text = parts[1] if len(parts) > 1 else text
+        if text.startswith("json"):
+            text = text[4:]
+    return text.strip()
+

 async def _call_ai_parse(file_bytes: bytes, mime_type: str, user_row) -> dict:
-    """Call the configured AI provider and return parsed receipt fields."""
-    import base64
+    """
+    Parse a receipt: OCR text extraction → AI (text prompt) → rule-based fallback.
+    AI is optional; rules always run as fallback if AI is unconfigured or fails.
+    """
    import json
    import httpx
    from app.core.security import decrypt_field

-    if not user_row.ai_provider or not user_row.ai_api_key_enc:
-        raise HTTPException(status_code=400, detail="No AI provider configured. Add your API key in Settings → AI.")
+    # Step 1: extract text via OCR / PDF text layer
+    ocr_text = _extract_ocr_text(file_bytes, mime_type)

-    api_key = decrypt_field(user_row.ai_api_key_enc)
-    b64 = base64.standard_b64encode(file_bytes).decode()
-    custom_base_url = (user_row.ai_base_url or "").rstrip("/")
-    custom_model = (user_row.ai_model or "").strip()
+    has_ai = bool(user_row and user_row.ai_provider and user_row.ai_api_key_enc)
+
+    # Step 2: attempt AI parse if configured
+    if has_ai and ocr_text.strip():
+        api_key = decrypt_field(user_row.ai_api_key_enc)
+        custom_base_url = (user_row.ai_base_url or "").rstrip("/")
+        custom_model = (user_row.ai_model or "").strip()
+        prompt = _RECEIPT_TEXT_PROMPT.format(ocr_text=ocr_text)
+
+        try:
+            if user_row.ai_provider == "anthropic":
+                base_url = custom_base_url or "https://api.anthropic.com"
+                model = custom_model or "claude-haiku-4-5-20251001"
+                async with httpx.AsyncClient(timeout=60) as client:
+                    resp = await client.post(
+                        f"{base_url}/v1/messages",
+                        headers={"x-api-key": api_key, "anthropic-version": "2023-06-01", "content-type": "application/json"},
+                        json={"model": model, "max_tokens": 512, "messages": [{"role": "user", "content": prompt}]},
+                    )
+                resp.raise_for_status()
+                raw = resp.json()["content"][0]["text"].strip()
+
+            elif user_row.ai_provider == "openai":
+                base_url = custom_base_url or "https://api.openai.com"
+                model = custom_model or "gpt-4o-mini"
+                async with httpx.AsyncClient(timeout=60) as client:
+                    resp = await client.post(
+                        f"{base_url}/v1/chat/completions",
+                        headers={"Authorization": f"Bearer {api_key}", "content-type": "application/json"},
+                        json={"model": model, "max_tokens": 512, "messages": [{"role": "user", "content": prompt}]},
+                    )
+                resp.raise_for_status()
+                raw = resp.json()["choices"][0]["message"]["content"].strip()

-    try:
-        if user_row.ai_provider == "anthropic":
-            base_url = custom_base_url or "https://api.anthropic.com"
-            model = custom_model or "claude-haiku-4-5-20251001"
-            if mime_type == "application/pdf":
-                content_block = {"type": "document", "source": {"type": "base64", "media_type": "application/pdf", "data": b64}}
            else:
-                content_block = {"type": "image", "source": {"type": "base64", "media_type": mime_type, "data": b64}}
-            async with httpx.AsyncClient(timeout=60) as client:
-                resp = await client.post(
-                    f"{base_url}/v1/messages",
-                    headers={"x-api-key": api_key, "anthropic-version": "2023-06-01", "content-type": "application/json"},
-                    json={"model": model, "max_tokens": 512, "messages": [{"role": "user", "content": [content_block, {"type": "text", "text": _RECEIPT_PROMPT}]}]},
-                )
-            resp.raise_for_status()
-            text = resp.json()["content"][0]["text"].strip()
+                raw = None

-        elif user_row.ai_provider == "openai":
-            base_url = custom_base_url or "https://api.openai.com"
-            model = custom_model or "gpt-4o-mini"
-            if mime_type == "application/pdf" and not custom_base_url:
-                raise HTTPException(status_code=400, detail="PDF parsing is not supported with the OpenAI provider. Use an image format or switch to Anthropic.")
-            async with httpx.AsyncClient(timeout=60) as client:
-                resp = await client.post(
-                    f"{base_url}/v1/chat/completions",
-                    headers={"Authorization": f"Bearer {api_key}", "content-type": "application/json"},
-                    json={"model": model, "max_tokens": 512, "messages": [{"role": "user", "content": [
-                        {"type": "image_url", "image_url": {"url": f"data:{mime_type};base64,{b64}"}},
-                        {"type": "text", "text": _RECEIPT_PROMPT},
-                    ]}]},
-                )
-            resp.raise_for_status()
-            text = resp.json()["choices"][0]["message"]["content"].strip()
+            if raw:
+                cleaned = _strip_code_fence(raw)
+                try:
+                    parsed = json.loads(cleaned)
+                    return {
+                        "merchant":    parsed.get("merchant"),
+                        "amount":      parsed.get("amount"),
+                        "currency":    parsed.get("currency"),
+                        "date":        parsed.get("date"),
+                        "description": parsed.get("description"),
+                        "category":    parsed.get("category"),
+                        "raw":         raw,
+                        "ocr_text":    ocr_text,
+                    }
+                except json.JSONDecodeError:
+                    # AI returned something non-JSON — fall through to rules, keep raw for debug
+                    pass

-        else:
-            raise HTTPException(status_code=400, detail="Unknown provider")
+        except (httpx.HTTPStatusError, httpx.RequestError):
+            pass  # fall through to rule-based

-    except httpx.HTTPStatusError as e:
-        raise HTTPException(status_code=502, detail=f"AI provider error: {e.response.status_code}")
-    except httpx.RequestError:
-        raise HTTPException(status_code=502, detail="Could not reach AI provider")
+    # Step 3: rule-based fallback (also used when AI is not configured)
+    if ocr_text.strip():
+        return _rule_based_parse(ocr_text)

-    if text.startswith("```"):
-        text = text.split("```")[1]
-        if text.startswith("json"):
-            text = text[4:]
-        text = text.strip()
-
-    try:
-        parsed = json.loads(text)
-    except json.JSONDecodeError:
-        raise HTTPException(status_code=502, detail="AI returned an unexpected response. Try again.")
-
-    return {
-        "merchant":    parsed.get("merchant"),
-        "amount":      parsed.get("amount"),
-        "currency":    parsed.get("currency"),
-        "date":        parsed.get("date"),
-        "description": parsed.get("description"),
-        "category":    parsed.get("category"),
-        "raw":         text,
-    }
+    # Nothing worked
+    if has_ai:
+        raise HTTPException(status_code=400, detail="Could not extract any text from the file. Try a clearer image.")
+    raise HTTPException(status_code=400, detail="No AI configured and OCR extracted no text. Add an API key in Settings → AI or try a clearer image.")


@router.post("/parse-receipt")
--- a/backend/app/core/middleware.py
+++ b/backend/app/core/middleware.py
@ -7,7 +7,6 @@ from fastapi import Request, Response
 from starlette.middleware.base import BaseHTTPMiddleware
 from starlette.responses import JSONResponse

-from app.config import get_settings

 SAFE_METHODS = {"GET", "HEAD", "OPTIONS"}

@ -57,7 +56,8 @@ class CSRFMiddleware(BaseHTTPMiddleware):
                    "csrf_token", token,
                    httponly=False,   # must be readable by JS
                    samesite="lax",
-                    secure=not get_settings().is_development,
+                    secure=False,     # CSRF token is public by design; Secure would break HTTP deployments
+                    max_age=604800,   # 7 days — survive browser restarts
                )
            return response

@ -65,7 +65,7 @@ class CSRFMiddleware(BaseHTTPMiddleware):
            response = await call_next(request)
            if not existing_csrf:
                token = str(uuid.uuid4())
-                response.set_cookie("csrf_token", token, httponly=False, samesite="lax", secure=not get_settings().is_development)
+                response.set_cookie("csrf_token", token, httponly=False, samesite="lax", secure=False, max_age=604800)
            return response

        if request.url.path in {"/api/v1/auth/login", "/api/v1/auth/login/totp"}:
--- a/backend/app/db/models/user.py
+++ b/backend/app/db/models/user.py
@ -33,6 +33,7 @@ class User(Base):
    ai_api_key_enc: Mapped[bytes | None] = mapped_column(LargeBinary, nullable=True)
    ai_base_url: Mapped[str | None] = mapped_column(Text, nullable=True)
    ai_model: Mapped[str | None] = mapped_column(Text, nullable=True)
+    ai_debug: Mapped[bool] = mapped_column(Boolean, default=False, nullable=False)

    accounts: Mapped[list["Account"]] = relationship(back_populates="user", lazy="noload")  # type: ignore[name-defined]
    sessions: Mapped[list["Session"]] = relationship(back_populates="user", lazy="noload")  # type: ignore[name-defined]
--- a/backend/app/services/transaction_service.py
+++ b/backend/app/services/transaction_service.py
@ -47,6 +47,7 @@ def _to_response(t: Transaction) -> dict:
        "notes": _dec(t.notes_enc),
        "tags": t.tags or [],
        "is_recurring": t.is_recurring,
+        "attachment_refs": t.attachment_refs or [],
        "created_at": t.created_at,
        "updated_at": t.updated_at,
    }
--- a/backend/pyproject.toml
+++ b/backend/pyproject.toml
@ -31,6 +31,9 @@ dependencies = [
    "structlog>=24.0",
    "pillow>=11.0",
    "python-magic>=0.4",
+    "pytesseract>=0.3",
+    "pdfplumber>=0.11",
+    "pdf2image>=1.17",
    "psycopg2-binary>=2.9",
 ]