feat: Implement deterministic expense matching using configurable providers and keywords, integrating it into the processing pipeline.

This commit is contained in:
Marco Gallegos
2025-12-18 12:25:48 -06:00
parent 899482580e
commit 519a5ad705
9 changed files with 338 additions and 136 deletions

View File

@@ -0,0 +1,61 @@
"""
Configuration loader for providers and keywords.
"""
import csv
import os
import logging
from typing import List, Dict, Any
logger = logging.getLogger(__name__)

# BASE_DIR is the directory three levels above this module's absolute path.
_MODULE_FILE = os.path.abspath(__file__)
BASE_DIR = os.path.dirname(os.path.dirname(os.path.dirname(_MODULE_FILE)))

# Both CSV configuration files are expected under <BASE_DIR>/config.
_CONFIG_DIR = os.path.join(BASE_DIR, 'config')
PROVIDERS_PATH = os.path.join(_CONFIG_DIR, 'providers.csv')
KEYWORDS_PATH = os.path.join(_CONFIG_DIR, 'keywords.csv')
def load_providers() -> List[Dict[str, Any]]:
    """
    Loads the providers configuration from CSV.

    Every row of the file at ``PROVIDERS_PATH`` becomes a dict keyed by the
    CSV header. The ``aliases`` column is split on commas into a list of
    stripped, lower-cased, non-empty strings; a missing or empty column
    yields ``[]``.

    Returns:
        The parsed rows. The list is empty when the file does not exist, and
        contains only the rows read so far when reading fails part-way.
        Failures are logged, never raised.
    """
    providers: List[Dict[str, Any]] = []
    try:
        # utf-8-sig drops a leading BOM (otherwise it ends up glued to the
        # first header name); newline='' lets the csv module handle line
        # endings itself, as its documentation requires.
        with open(PROVIDERS_PATH, mode='r', encoding='utf-8-sig', newline='') as f:
            for row in csv.DictReader(f):
                # DictReader fills short rows with None, hence the `or ''`.
                raw_aliases = row.get('aliases') or ''
                cleaned = (part.strip().lower() for part in raw_aliases.split(','))
                # Drop blanks produced by "a,,b" or a trailing comma.
                row['aliases'] = [alias for alias in cleaned if alias]
                providers.append(row)
    except FileNotFoundError:
        logger.warning("Providers file not found at %s", PROVIDERS_PATH)
    except Exception as e:
        # Best-effort loader: keep whatever was parsed before the failure.
        logger.exception("Error loading providers: %s", e)
    else:
        logger.info("Loaded %d providers from %s", len(providers), PROVIDERS_PATH)
    return providers
def load_keywords() -> List[Dict[str, Any]]:
    """
    Loads the keywords configuration from CSV.

    Every row of the file at ``KEYWORDS_PATH`` becomes a dict keyed by the
    CSV header. When a ``keyword`` column is present its value is stripped
    and lower-cased; a value missing from a short row is normalised to ``''``.

    Returns:
        The parsed rows. The list is empty when the file does not exist, and
        contains only the rows read so far when reading fails part-way.
        Failures are logged, never raised.
    """
    keywords: List[Dict[str, Any]] = []
    try:
        # utf-8-sig drops a leading BOM (otherwise it ends up glued to the
        # first header name); newline='' lets the csv module handle line
        # endings itself, as its documentation requires.
        with open(KEYWORDS_PATH, mode='r', encoding='utf-8-sig', newline='') as f:
            for row in csv.DictReader(f):
                if 'keyword' in row:
                    # DictReader fills short rows with None; without the
                    # `or ''` one such row would abort the whole load.
                    row['keyword'] = (row['keyword'] or '').strip().lower()
                keywords.append(row)
    except FileNotFoundError:
        logger.warning("Keywords file not found at %s", KEYWORDS_PATH)
    except Exception as e:
        # Best-effort loader: keep whatever was parsed before the failure.
        logger.exception("Error loading keywords: %s", e)
    else:
        logger.info("Loaded %d keywords from %s", len(keywords), KEYWORDS_PATH)
    return keywords

View File

@@ -0,0 +1,90 @@
"""
Matching logic for providers and keywords.
"""
import logging
from typing import Optional, Dict, Any
from app.preprocessing.config_loader import load_providers, load_keywords
logger = logging.getLogger(__name__)

# Module-level cache for the parsed configuration. Both slots start as None,
# which get_config() treats as "not loaded yet".
_PROVIDERS = _KEYWORDS = None
def get_config():
    """
    Return the ``(providers, keywords)`` configuration pair.

    Each half is fetched through its loader only while its module-level
    cache slot is still ``None``; afterwards the cached object is returned
    as-is, so repeated calls hand back the same lists.
    """
    global _PROVIDERS, _KEYWORDS
    # Providers are resolved before keywords, matching the loaders' call order.
    _PROVIDERS = load_providers() if _PROVIDERS is None else _PROVIDERS
    _KEYWORDS = load_keywords() if _KEYWORDS is None else _KEYWORDS
    return _PROVIDERS, _KEYWORDS
def match_provider(description: str) -> Optional[Dict[str, Any]]:
    """
    Searches for a provider name or alias in the description.

    Matching is a case-insensitive substring test. Providers are tried in
    configuration order; for each one the name is checked first, then its
    aliases, and the first hit wins.

    Args:
        description: Free-text expense description. ``None`` or an empty
            string never matches.

    Returns:
        The matching provider row, or ``None`` when nothing matches.
    """
    if not description:
        return None
    providers, _ = get_config()
    desc_lower = description.lower()
    for p in providers:
        # `.get(key, '')` would still return None when the key exists with a
        # None value (csv.DictReader does that for short rows), so coalesce.
        name = (p.get('provider_name') or '').lower()
        if name and name in desc_lower:
            return p
        # Aliases are used as stored; blank entries are skipped.
        for alias in p.get('aliases') or []:
            if alias and alias in desc_lower:
                return p
    return None
def match_keywords(description: str) -> Optional[Dict[str, Any]]:
    """
    Searches for keywords in the description.

    Matching is a case-insensitive substring test; keyword rows are tried in
    configuration order and the first hit wins.

    Args:
        description: Free-text expense description. ``None`` or an empty
            string never matches.

    Returns:
        The matching keyword row, or ``None`` when nothing matches.
    """
    if not description:
        return None
    _, keywords = get_config()
    desc_lower = description.lower()
    for k in keywords:
        # `.get(key, '')` would still return None when the key exists with a
        # None value (csv.DictReader does that for short rows), so coalesce.
        keyword = (k.get('keyword') or '').lower()
        if keyword and keyword in desc_lower:
            return k
    return None
def _build_match_metadata(row: Dict[str, Any], match_type: str, name_key: str) -> Dict[str, Any]:
    """
    Builds the metadata dict for a matched configuration row.

    ``name_key`` is the column whose value is reported as ``matched_name``;
    any missing column yields ``None`` instead of raising.
    """
    return {
        "category": row.get('categoria_principal'),
        "subcategory": row.get('subcategoria'),
        "expense_type": row.get('tipo_gasto_default'),
        "match_type": match_type,
        "matched_name": row.get(name_key),
    }


def get_metadata_from_match(description: str) -> Dict[str, Any]:
    """
    Attempts to find metadata (category, subcategory, etc.) for a description.
    Priority: Provider Match > Keyword Match.

    Args:
        description: Free-text expense description.

    Returns:
        A dict with ``category``, ``subcategory``, ``expense_type``,
        ``match_type`` (``"provider"`` or ``"keyword"``) and ``matched_name``,
        or an empty dict when neither a provider nor a keyword matches.
    """
    # 1. Try Provider Match
    provider = match_provider(description)
    if provider:
        # A provider can match through an alias alone, so its name column is
        # read with .get() rather than indexed directly.
        logger.info("Matched provider: %s", provider.get('provider_name'))
        return _build_match_metadata(provider, "provider", 'provider_name')

    # 2. Try Keyword Match
    keyword = match_keywords(description)
    if keyword:
        logger.info("Matched keyword: %s", keyword.get('keyword'))
        return _build_match_metadata(keyword, "keyword", 'keyword')

    return {}