mirror of
https://github.com/marcogll/telegram_expenses_controller.git
synced 2026-01-13 13:25:15 +00:00
feat: Implement deterministic expense matching using configurable providers and keywords, integrating it into the processing pipeline.
This commit is contained in:
61
app/preprocessing/config_loader.py
Normal file
61
app/preprocessing/config_loader.py
Normal file
@@ -0,0 +1,61 @@
|
||||
"""
|
||||
Configuration loader for providers and keywords.
|
||||
"""
|
||||
import csv
|
||||
import os
|
||||
import logging
|
||||
from typing import List, Dict, Any
|
||||
|
||||
logger = logging.getLogger(__name__)

# Locations of the CSV configuration files.
# This module sits at app/preprocessing/, so the project root is two
# directories above the directory that contains this file.
BASE_DIR = os.path.abspath(
    os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir, os.pardir)
)
PROVIDERS_PATH = os.path.join(BASE_DIR, "config", "providers.csv")
KEYWORDS_PATH = os.path.join(BASE_DIR, "config", "keywords.csv")
|
||||
|
||||
def load_providers() -> List[Dict[str, Any]]:
    """
    Loads the providers configuration from CSV.

    Every data row of ``PROVIDERS_PATH`` becomes a dict keyed by the CSV
    header. The ``aliases`` column (a comma-separated string) is replaced by
    a list of stripped, lower-cased, non-empty aliases; it is ``[]`` when the
    column is missing or blank.

    Returns:
        The provider rows. An empty list when the file does not exist. If
        reading fails part-way, the error is logged and the rows parsed
        before the failure are returned.
    """
    providers: List[Dict[str, Any]] = []

    try:
        # 'utf-8-sig' drops a leading BOM (common in spreadsheet exports) so
        # the first header name is not polluted; newline='' lets the csv
        # module handle line endings inside quoted fields itself.
        with open(PROVIDERS_PATH, mode='r', encoding='utf-8-sig', newline='') as f:
            for row in csv.DictReader(f):
                raw_aliases = row.get('aliases') or ''
                cleaned = (part.strip().lower() for part in raw_aliases.split(','))
                # Stray commas ("a,,b" or "a,") must not yield blank aliases.
                row['aliases'] = [alias for alias in cleaned if alias]
                providers.append(row)
    except FileNotFoundError:
        logger.warning(f"Providers file not found at {PROVIDERS_PATH}")
    except Exception as e:
        # Best-effort loader: never propagate, but keep the traceback in the log.
        logger.exception(f"Error loading providers: {e}")
    else:
        logger.info(f"Loaded {len(providers)} providers from {PROVIDERS_PATH}")

    return providers
|
||||
|
||||
def load_keywords() -> List[Dict[str, Any]]:
    """
    Loads the keywords configuration from CSV.

    Every data row of ``KEYWORDS_PATH`` becomes a dict keyed by the CSV
    header. When a ``keyword`` column exists its value is stripped and
    lower-cased; a missing value (short row) is normalized to ``''``.

    Returns:
        The keyword rows. An empty list when the file does not exist. If
        reading fails part-way, the error is logged and the rows parsed
        before the failure are returned.
    """
    keywords: List[Dict[str, Any]] = []

    try:
        # 'utf-8-sig' drops a leading BOM (common in spreadsheet exports) so
        # the first header name is not polluted; newline='' lets the csv
        # module handle line endings inside quoted fields itself.
        with open(KEYWORDS_PATH, mode='r', encoding='utf-8-sig', newline='') as f:
            for row in csv.DictReader(f):
                if 'keyword' in row:
                    # DictReader fills missing trailing fields with None;
                    # guard so one short row does not abort the whole load.
                    row['keyword'] = (row['keyword'] or '').strip().lower()
                keywords.append(row)
    except FileNotFoundError:
        logger.warning(f"Keywords file not found at {KEYWORDS_PATH}")
    except Exception as e:
        # Best-effort loader: never propagate, but keep the traceback in the log.
        logger.exception(f"Error loading keywords: {e}")
    else:
        logger.info(f"Loaded {len(keywords)} keywords from {KEYWORDS_PATH}")

    return keywords
|
||||
90
app/preprocessing/matcher.py
Normal file
90
app/preprocessing/matcher.py
Normal file
@@ -0,0 +1,90 @@
|
||||
"""
|
||||
Matching logic for providers and keywords.
|
||||
"""
|
||||
import logging
|
||||
from typing import Optional, Dict, Any
|
||||
from app.preprocessing.config_loader import load_providers, load_keywords
|
||||
|
||||
logger = logging.getLogger(__name__)

# Module-level cache; None means "not loaded yet".
_PROVIDERS = None
_KEYWORDS = None


def get_config():
    """
    Returns the ``(providers, keywords)`` pair.

    Each half is fetched through its loader only while its cache slot is
    still ``None``; afterwards the cached value is handed back.
    """
    global _PROVIDERS, _KEYWORDS
    _PROVIDERS = load_providers() if _PROVIDERS is None else _PROVIDERS
    _KEYWORDS = load_keywords() if _KEYWORDS is None else _KEYWORDS
    return _PROVIDERS, _KEYWORDS
|
||||
|
||||
def match_provider(description: Optional[str]) -> Optional[Dict[str, Any]]:
    """
    Searches for a provider name or alias in the description.

    Matching is a case-insensitive substring test. Providers are tried in
    configuration order; for each one the ``provider_name`` is checked
    first, then its ``aliases``.

    Args:
        description: Free-text expense description. ``None`` or an empty
            string never matches.

    Returns:
        The first provider row that matches, or ``None`` if none does.
    """
    # Nothing to search in; bail out before touching the configuration.
    if not description:
        return None

    providers, _ = get_config()
    desc_lower = description.lower()

    for provider in providers:
        # `or ''` also covers a key that is present with a None value,
        # which csv.DictReader produces for short rows.
        name = (provider.get('provider_name') or '').lower()
        if name and name in desc_lower:
            return provider

        aliases = provider.get('aliases') or []
        if any(alias and alias in desc_lower for alias in aliases):
            return provider

    return None
|
||||
|
||||
def match_keywords(description: Optional[str]) -> Optional[Dict[str, Any]]:
    """
    Searches for keywords in the description.

    Matching is a case-insensitive substring test; keyword rows are tried
    in configuration order.

    Args:
        description: Free-text expense description. ``None`` or an empty
            string never matches.

    Returns:
        The first keyword row whose ``keyword`` occurs in the description,
        or ``None`` if none does.
    """
    # Nothing to search in; bail out before touching the configuration.
    if not description:
        return None

    _, keywords = get_config()
    desc_lower = description.lower()

    for entry in keywords:
        # `or ''` also covers a key that is present with a None value,
        # which csv.DictReader produces for short rows.
        keyword = (entry.get('keyword') or '').lower()
        if keyword and keyword in desc_lower:
            return entry

    return None
|
||||
|
||||
def _metadata_from_row(row: Dict[str, Any], match_type: str, matched_name: Any) -> Dict[str, Any]:
    """Builds the metadata dict shared by provider and keyword matches."""
    return {
        "category": row.get('categoria_principal'),
        "subcategory": row.get('subcategoria'),
        "expense_type": row.get('tipo_gasto_default'),
        "match_type": match_type,
        "matched_name": matched_name,
    }


def get_metadata_from_match(description: Optional[str]) -> Dict[str, Any]:
    """
    Attempts to find metadata (category, subcategory, etc.) for a description.

    Priority: Provider Match > Keyword Match.

    Args:
        description: Free-text expense description. ``None`` or an empty
            string yields no match.

    Returns:
        A dict with the keys ``category``, ``subcategory``, ``expense_type``,
        ``match_type`` (``"provider"`` or ``"keyword"``) and ``matched_name``
        when something matched; an empty dict otherwise.
    """
    # Guard here as well so a missing description cannot reach the matchers.
    if not description:
        return {}

    # 1. Try Provider Match
    provider = match_provider(description)
    if provider:
        # .get(): a row matched through an alias may lack 'provider_name'.
        provider_name = provider.get('provider_name')
        logger.info(f"Matched provider: {provider_name}")
        return _metadata_from_row(provider, "provider", provider_name)

    # 2. Try Keyword Match
    keyword = match_keywords(description)
    if keyword:
        keyword_name = keyword.get('keyword')
        logger.info(f"Matched keyword: {keyword_name}")
        return _metadata_from_row(keyword, "keyword", keyword_name)

    return {}
|
||||
@@ -8,6 +8,7 @@ import logging
|
||||
from app.schema.base import RawInput, ProvisionalExpense, FinalExpense, ExpenseStatus
|
||||
from app.ingestion import text, image, audio, document
|
||||
from app.ai import extractor, classifier
|
||||
from app.preprocessing import matcher
|
||||
from app.persistence import repositories
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
@@ -59,18 +60,22 @@ def process_expense_input(db: Session, raw_input: RawInput) -> FinalExpense:
|
||||
|
||||
audited_expense = classifier.classify_and_audit(provisional_expense)
|
||||
|
||||
# 3.5 Deterministic Matching (Phase 3)
|
||||
# Enrich data with categories from providers/keywords if available
|
||||
match_metadata = matcher.get_metadata_from_match(extracted_data.description)
|
||||
|
||||
# For now, we auto-confirm if confidence is high.
|
||||
if audited_expense.confidence_score > 0.7:
|
||||
final_expense = FinalExpense(
|
||||
user_id=audited_expense.user_id,
|
||||
provider_name=audited_expense.extracted_data.description, # Simplified mapping
|
||||
provider_name=match_metadata.get("matched_name") or audited_expense.extracted_data.description,
|
||||
amount=audited_expense.extracted_data.amount,
|
||||
currency=audited_expense.extracted_data.currency,
|
||||
expense_date=audited_expense.extracted_data.expense_date,
|
||||
description=audited_expense.extracted_data.description,
|
||||
category=audited_expense.category,
|
||||
expense_type="personal", # Default
|
||||
initial_processing_method=audited_expense.processing_method,
|
||||
category=match_metadata.get("category") or audited_expense.category,
|
||||
expense_type=match_metadata.get("expense_type") or "personal",
|
||||
initial_processing_method=match_metadata.get("match_type") or audited_expense.processing_method,
|
||||
confirmed_by="auto-confirm"
|
||||
)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user