""" Auto-detect CSV bank export formats and produce a column mapping. Supports: Monzo, Starling, Revolut, Barclays, Lloyds, NatWest/RBS, HSBC, Santander. Falls back to a generic best-effort mapping for unknown formats. """ from __future__ import annotations import csv import io from dataclasses import dataclass, field from typing import Literal @dataclass class CsvMapping: date: str description: str amount: str | None = None # single signed amount column debit: str | None = None # separate debit column (positive value = money out) credit: str | None = None # separate credit column (positive value = money in) balance: str | None = None reference: str | None = None detected_format: str | None = None def is_split(self) -> bool: return self.debit is not None and self.credit is not None KNOWN_FORMATS: list[dict] = [ { "name": "Monzo", "detect": lambda h: {"transaction id", "emoji"}.issubset(h), "date": "Date", "description": "Name", "amount": "Amount", "balance": None, "reference": "Notes and #tags", }, { "name": "Starling", "detect": lambda h: {"counter party", "spending category"}.issubset(h), "date": "Date", "description": "Counter Party", "amount": "Amount (GBP)", "balance": "Balance (GBP)", "reference": "Reference", }, { "name": "Revolut", "detect": lambda h: {"product", "started date", "completed date"}.issubset(h), "date": "Started Date", "description": "Description", "amount": "Amount", "balance": "Balance", "reference": None, }, { "name": "Barclays", "detect": lambda h: {"subcategory", "memo", "number"}.issubset(h), "date": "Date", "description": "Memo", "amount": "Amount", "balance": None, "reference": "Subcategory", }, { "name": "Lloyds Bank", "detect": lambda h: {"transaction date", "debit amount", "credit amount", "transaction description"}.issubset(h), "date": "Transaction Date", "description": "Transaction Description", "debit": "Debit Amount", "credit": "Credit Amount", "balance": "Balance", "reference": None, }, { "name": "Halifax", "detect": lambda h: {"transaction date", "debit amount", "credit amount", "transaction description"}.issubset(h), "date": "Transaction Date", "description": "Transaction Description", "debit": "Debit Amount", "credit": "Credit Amount", "balance": "Balance", "reference": None, }, { "name": "NatWest / RBS", "detect": lambda h: {"date", "type", "description", "value", "balance"}.issubset(h) and "value" in h, "date": "Date", "description": "Description", "amount": "Value", "balance": "Balance", "reference": None, }, { "name": "HSBC", "detect": lambda h: h == {"date", "description", "amount"} or h == {"date", "description", "debit", "credit", "balance"}, "date": "Date", "description": "Description", "amount": "Amount", "balance": None, "reference": None, }, { "name": "Santander", "detect": lambda h: {"date", "description", "debit", "credit", "balance"}.issubset(h), "date": "Date", "description": "Description", "debit": "Debit", "credit": "Credit", "balance": "Balance", "reference": None, }, { "name": "Nationwide", "detect": lambda h: {"date", "transaction", "payments out", "payments in", "balance"}.issubset(h), "date": "Date", "description": "Transaction", "debit": "Payments Out", "credit": "Payments In", "balance": "Balance", "reference": None, }, ] def _normalise_headers(raw_headers: list[str]) -> dict[str, str]: """Return {normalised_key: original_header}.""" return {h.strip().lower(): h.strip() for h in raw_headers if h} def detect_format(raw_headers: list[str]) -> CsvMapping: norm = _normalise_headers(raw_headers) norm_set = set(norm.keys()) for fmt in KNOWN_FORMATS: if fmt["detect"](norm_set): # Map logical names → actual header using case-insensitive lookup def resolve(col: str | None) -> str | None: if col is None: return None return norm.get(col.strip().lower(), col) if "debit" in fmt: return CsvMapping( date=resolve(fmt["date"]) or fmt["date"], description=resolve(fmt["description"]) or fmt["description"], debit=resolve(fmt["debit"]), credit=resolve(fmt["credit"]), balance=resolve(fmt.get("balance")), reference=resolve(fmt.get("reference")), detected_format=fmt["name"], ) else: return CsvMapping( date=resolve(fmt["date"]) or fmt["date"], description=resolve(fmt["description"]) or fmt["description"], amount=resolve(fmt["amount"]), balance=resolve(fmt.get("balance")), reference=resolve(fmt.get("reference")), detected_format=fmt["name"], ) # Generic fallback: guess by common column name patterns return _generic_mapping(norm) def _generic_mapping(norm: dict[str, str]) -> CsvMapping: def find(*candidates: str) -> str | None: for c in candidates: if c in norm: return norm[c] return None date_col = find("date", "transaction date", "trans date", "value date", "posting date") desc_col = find("description", "narrative", "details", "memo", "payee", "merchant", "name", "counter party") amt_col = find("amount", "value", "net amount", "transaction amount") debit_col = find("debit", "debit amount", "payments out", "money out", "withdrawal") credit_col = find("credit", "credit amount", "payments in", "money in", "deposit") bal_col = find("balance", "running balance") ref_col = find("reference", "notes", "tags", "category") if not date_col: date_col = list(norm.values())[0] if norm else "date" if not desc_col: desc_col = list(norm.values())[1] if len(norm) > 1 else "description" if debit_col and credit_col: return CsvMapping( date=date_col, description=desc_col, debit=debit_col, credit=credit_col, balance=bal_col, reference=ref_col, detected_format=None, ) return CsvMapping( date=date_col, description=desc_col, amount=amt_col or (list(norm.values())[2] if len(norm) > 2 else "amount"), balance=bal_col, reference=ref_col, detected_format=None, ) def parse_csv_content(content: bytes) -> tuple[list[str], list[dict]]: """Decode and return (headers, rows).""" for enc in ("utf-8-sig", "utf-8", "latin-1"): try: text = content.decode(enc) break except UnicodeDecodeError: continue else: raise ValueError("Cannot decode file — try saving as UTF-8") # Some bank exports (Lloyds, Barclays) include preamble lines before the header lines = text.splitlines() header_idx = 0 for i, line in enumerate(lines): if "," in line and len(line.split(",")) >= 2: header_idx = i break cleaned = "\n".join(lines[header_idx:]) reader = csv.DictReader(io.StringIO(cleaned)) headers = [h.strip() for h in (reader.fieldnames or []) if h and h.strip()] rows = [] for row in reader: clean_row = {k.strip(): (v.strip() if v else "") for k, v in row.items() if k and k.strip()} if any(clean_row.values()): rows.append(clean_row) return headers, rows