feat: Implement deterministic expense matching using configurable providers and keywords, integrating it into the processing pipeline.

This commit is contained in:
Marco Gallegos
2025-12-18 12:25:48 -06:00
parent 899482580e
commit 519a5ad705
9 changed files with 338 additions and 136 deletions

View File

@@ -0,0 +1,61 @@
"""
Configuration loader for providers and keywords.
"""
import csv
import os
import logging
from typing import List, Dict, Any
logger = logging.getLogger(__name__)

# Locations of the CSV configuration files.
# BASE_DIR is the directory three levels above this module's own file
# (presumably the repository root -- confirm against the project layout);
# both CSV files are expected in its ``config`` sub-directory.
_MODULE_DIR = os.path.dirname(os.path.abspath(__file__))
BASE_DIR = os.path.dirname(os.path.dirname(_MODULE_DIR))
_CONFIG_DIR = os.path.join(BASE_DIR, 'config')
PROVIDERS_PATH = os.path.join(_CONFIG_DIR, 'providers.csv')
KEYWORDS_PATH = os.path.join(_CONFIG_DIR, 'keywords.csv')
def load_providers() -> List[Dict[str, Any]]:
    """
    Loads the providers configuration from CSV.

    Each CSV row becomes a dict keyed by the header names. The ``aliases``
    column (a comma-separated string) is replaced by a list of stripped,
    lower-cased, non-empty aliases; a missing or empty cell yields ``[]``.

    Returns:
        The list of provider rows. The list is empty when the file does not
        exist, and may be partial when reading fails midway (the error is
        logged, not raised).
    """
    providers: List[Dict[str, Any]] = []
    if not os.path.exists(PROVIDERS_PATH):
        logger.warning(f"Providers file not found at {PROVIDERS_PATH}")
        return providers
    try:
        # newline='' lets the csv module handle line endings itself (as its
        # documentation requires); utf-8-sig transparently drops a leading
        # BOM that would otherwise become part of the first header name.
        with open(PROVIDERS_PATH, mode='r', encoding='utf-8-sig', newline='') as f:
            for row in csv.DictReader(f):
                # DictReader yields None for columns missing from a short row.
                raw_aliases = row.get('aliases') or ''
                cleaned = (part.strip().lower() for part in raw_aliases.split(','))
                # Drop empty fragments produced by input such as "a,,b" or "a, ".
                row['aliases'] = [alias for alias in cleaned if alias]
                providers.append(row)
        logger.info(f"Loaded {len(providers)} providers from {PROVIDERS_PATH}")
    except Exception as e:
        # Deliberately best-effort: a broken config file must not crash the caller.
        logger.error(f"Error loading providers: {e}")
    return providers
def load_keywords() -> List[Dict[str, Any]]:
    """
    Loads the keywords configuration from CSV.

    Each CSV row becomes a dict keyed by the header names. When the file has
    a ``keyword`` column its value is stripped and lower-cased; a cell that
    is missing from a short row is normalized to an empty string instead of
    aborting the whole load.

    Returns:
        The list of keyword rows. The list is empty when the file does not
        exist, and may be partial when reading fails midway (the error is
        logged, not raised).
    """
    keywords: List[Dict[str, Any]] = []
    if not os.path.exists(KEYWORDS_PATH):
        logger.warning(f"Keywords file not found at {KEYWORDS_PATH}")
        return keywords
    try:
        # newline='' lets the csv module handle line endings itself (as its
        # documentation requires); utf-8-sig transparently drops a leading
        # BOM that would otherwise become part of the first header name.
        with open(KEYWORDS_PATH, mode='r', encoding='utf-8-sig', newline='') as f:
            for row in csv.DictReader(f):
                if 'keyword' in row:
                    # DictReader yields None for columns missing from a short
                    # row; calling .strip() on it would raise and stop loading.
                    row['keyword'] = (row['keyword'] or '').strip().lower()
                keywords.append(row)
        logger.info(f"Loaded {len(keywords)} keywords from {KEYWORDS_PATH}")
    except Exception as e:
        # Deliberately best-effort: a broken config file must not crash the caller.
        logger.error(f"Error loading keywords: {e}")
    return keywords

View File

@@ -0,0 +1,90 @@
"""
Matching logic for providers and keywords.
"""
import logging
from typing import Optional, Dict, Any
from app.preprocessing.config_loader import load_providers, load_keywords
logger = logging.getLogger(__name__)

# Module-level cache for the parsed configuration.
# ``None`` means "not loaded yet"; get_config() populates both on first use.
_PROVIDERS: Optional[list] = None
_KEYWORDS: Optional[list] = None
def get_config():
    """
    Return the ``(providers, keywords)`` configuration pair.

    Each half is loaded through its loader the first time it is needed and
    kept in the module-level cache afterwards. Only ``None`` triggers a
    load, so an empty result is cached as well and is not reloaded.
    """
    global _PROVIDERS, _KEYWORDS

    _PROVIDERS = load_providers() if _PROVIDERS is None else _PROVIDERS
    _KEYWORDS = load_keywords() if _KEYWORDS is None else _KEYWORDS
    return _PROVIDERS, _KEYWORDS
def match_provider(description: str) -> Optional[Dict[str, Any]]:
    """
    Searches for a provider name or alias in the description.

    Matching is a case-insensitive substring test against the provider name
    first and then each alias. Providers are checked in configuration order
    and the first one that matches is returned.

    Args:
        description: Free-text expense description. ``None`` or an empty
            string never matches.

    Returns:
        The matching provider row, or ``None`` when nothing matches.
    """
    # Nothing can match an absent description; bail out before touching the
    # configuration so a None value does not raise on .lower().
    if not description:
        return None

    providers, _ = get_config()
    desc_lower = description.lower()
    for p in providers:
        # A row may carry provider_name=None (short CSV row); treat as empty.
        name = (p.get('provider_name') or '').lower()
        candidates = [name, *(p.get('aliases') or [])]
        # Empty candidates are skipped: '' is a substring of every string.
        if any(c and c in desc_lower for c in candidates):
            return p
    return None
def match_keywords(description: str) -> Optional[Dict[str, Any]]:
    """
    Searches for keywords in the description.

    Matching is a case-insensitive substring test. Keywords are checked in
    configuration order and the first one that matches is returned.

    Args:
        description: Free-text expense description. ``None`` or an empty
            string never matches.

    Returns:
        The matching keyword row, or ``None`` when nothing matches.
    """
    # Nothing can match an absent description; bail out before touching the
    # configuration so a None value does not raise on .lower().
    if not description:
        return None

    _, keywords = get_config()
    desc_lower = description.lower()
    for k in keywords:
        # A row may carry keyword=None (short CSV row); treat as empty.
        keyword = (k.get('keyword') or '').lower()
        # Empty keywords are skipped: '' is a substring of every string.
        if keyword and keyword in desc_lower:
            return k
    return None
def get_metadata_from_match(description: str) -> Dict[str, Any]:
    """
    Attempts to find metadata (category, subcategory, etc.) for a description.
    Priority: Provider Match > Keyword Match.

    Args:
        description: Free-text expense description. ``None`` or an empty
            string yields no metadata.

    Returns:
        A dict with the keys ``category``, ``subcategory``, ``expense_type``,
        ``match_type`` (``"provider"`` or ``"keyword"``) and ``matched_name``
        taken from the first matching configuration row, or an empty dict
        when neither a provider nor a keyword matches.
    """
    if not description:
        return {}

    # Ordered by priority; the keyword matcher only runs when no provider matched.
    strategies = (
        ("provider", "provider_name", match_provider),
        ("keyword", "keyword", match_keywords),
    )
    for match_type, name_field, finder in strategies:
        row = finder(description)
        if not row:
            continue
        # .get() rather than indexing: a row matched through an alias may
        # lack the name column, which must not raise KeyError here.
        matched_name = row.get(name_field)
        logger.info("Matched %s: %s", match_type, matched_name)
        return {
            "category": row.get('categoria_principal'),
            "subcategory": row.get('subcategoria'),
            "expense_type": row.get('tipo_gasto_default'),
            "match_type": match_type,
            "matched_name": matched_name,
        }
    return {}

View File

@@ -8,6 +8,7 @@ import logging
from app.schema.base import RawInput, ProvisionalExpense, FinalExpense, ExpenseStatus
from app.ingestion import text, image, audio, document
from app.ai import extractor, classifier
from app.preprocessing import matcher
from app.persistence import repositories
from sqlalchemy.orm import Session
@@ -59,18 +60,22 @@ def process_expense_input(db: Session, raw_input: RawInput) -> FinalExpense:
audited_expense = classifier.classify_and_audit(provisional_expense)
# 3.5 Deterministic Matching (Phase 3)
# Enrich data with categories from providers/keywords if available
match_metadata = matcher.get_metadata_from_match(extracted_data.description)
# For now, we auto-confirm if confidence is high.
if audited_expense.confidence_score > 0.7:
final_expense = FinalExpense(
user_id=audited_expense.user_id,
provider_name=audited_expense.extracted_data.description, # Simplified mapping
provider_name=match_metadata.get("matched_name") or audited_expense.extracted_data.description,
amount=audited_expense.extracted_data.amount,
currency=audited_expense.extracted_data.currency,
expense_date=audited_expense.extracted_data.expense_date,
description=audited_expense.extracted_data.description,
category=audited_expense.category,
expense_type="personal", # Default
initial_processing_method=audited_expense.processing_method,
category=match_metadata.get("category") or audited_expense.category,
expense_type=match_metadata.get("expense_type") or "personal",
initial_processing_method=match_metadata.get("match_type") or audited_expense.processing_method,
confirmed_by="auto-confirm"
)