"""
Document Classification Schemas

Pydantic models for document type classification and categorization.
"""
| |
|
| | from enum import Enum |
| | from typing import List, Dict, Any, Optional |
| | from pydantic import BaseModel, Field |
| |
|
| | from .core import EvidenceRef |
| |
|
| |
|
class DocumentType(str, Enum):
    """
    Common document types for classification.
    Extensible for domain-specific types.
    """

    # Each member's value is its lowercase name. Because the enum mixes in
    # ``str``, members compare equal to their plain-string values.
    # Declaration order is significant: iterating the enum yields members in
    # exactly this order.

    # --- Commercial and legal ---
    CONTRACT = "contract"
    INVOICE = "invoice"
    RECEIPT = "receipt"
    PURCHASE_ORDER = "purchase_order"
    AGREEMENT = "agreement"
    NDA = "nda"
    TERMS_OF_SERVICE = "terms_of_service"

    # --- Technical and scientific ---
    PATENT = "patent"
    RESEARCH_PAPER = "research_paper"
    TECHNICAL_REPORT = "technical_report"
    SPECIFICATION = "specification"
    DATASHEET = "datasheet"
    USER_MANUAL = "user_manual"

    # --- Financial ---
    FINANCIAL_REPORT = "financial_report"
    BANK_STATEMENT = "bank_statement"
    TAX_FORM = "tax_form"
    BALANCE_SHEET = "balance_sheet"
    INCOME_STATEMENT = "income_statement"

    # --- Identity, certificates and forms ---
    ID_DOCUMENT = "id_document"
    PASSPORT = "passport"
    DRIVERS_LICENSE = "drivers_license"
    CERTIFICATE = "certificate"
    FORM = "form"
    APPLICATION = "application"

    # --- Healthcare and insurance ---
    MEDICAL_RECORD = "medical_record"
    PRESCRIPTION = "prescription"
    LAB_REPORT = "lab_report"
    INSURANCE_CLAIM = "insurance_claim"

    # --- General correspondence and publications ---
    LETTER = "letter"
    EMAIL = "email"
    MEMO = "memo"
    PRESENTATION = "presentation"
    SPREADSHEET = "spreadsheet"
    REPORT = "report"
    ARTICLE = "article"
    BOOK = "book"

    # --- Fallback values ---
    OTHER = "other"
    UNKNOWN = "unknown"
| |
|
| |
|
class ClassificationScore(BaseModel):
    """Score for a single document type classification."""

    # Required: the candidate document type this score refers to.
    document_type: DocumentType = Field(
        ...,
        description="Document type",
    )

    # Required: validated to lie within the closed interval [0.0, 1.0].
    confidence: float = Field(
        ...,
        ge=0.0,
        le=1.0,
        description="Classification confidence",
    )

    # Optional free-text explanation; ``None`` when not supplied.
    reasoning: Optional[str] = Field(
        default=None,
        description="Reasoning for classification",
    )
| |
|
| |
|
class DocumentClassification(BaseModel):
    """
    Document classification result with confidence scores.
    """

    document_id: str = Field(..., description="Document identifier")

    # Primary classification: the single most likely type and its confidence,
    # validated to lie within [0.0, 1.0].
    primary_type: DocumentType = Field(..., description="Most likely document type")
    primary_confidence: float = Field(
        ...,
        ge=0.0,
        le=1.0,
        description="Confidence in primary classification",
    )

    # Per-type scores. Defaults to an empty list, so a result may carry only
    # the primary classification above.
    scores: List[ClassificationScore] = Field(
        default_factory=list,
        description="Scores for all considered types",
    )

    # Evidence references backing the classification (empty by default).
    evidence: List[EvidenceRef] = Field(
        default_factory=list,
        description="Evidence supporting classification",
    )

    # How the classification was produced. ``method`` is a free-form string;
    # the values named in its description are not enforced by this model.
    method: str = Field(
        default="llm",
        description="Classification method used (llm/rule-based/hybrid)",
    )
    # NOTE(review): some pydantic v2 releases treat the ``model_`` prefix as a
    # protected namespace and may emit a warning for this field name - confirm
    # against the pinned pydantic version.
    model_used: Optional[str] = Field(default=None, description="Model used for classification")

    # Quality flags. These are plain stored values with fixed defaults; this
    # model does not derive them from the confidence fields.
    is_confident: bool = Field(
        default=True,
        description="Whether classification meets confidence threshold",
    )
    warnings: List[str] = Field(default_factory=list, description="Classification warnings")
    needs_human_review: bool = Field(
        default=False,
        description="Whether human review is recommended",
    )

    # Open-ended bag for any further detected attributes.
    attributes: Dict[str, Any] = Field(
        default_factory=dict,
        description="Additional detected attributes (language, domain, etc.)",
    )

    def get_top_k(self, k: int = 3) -> List[ClassificationScore]:
        """Return up to ``k`` entries from ``scores``, highest confidence first.

        Args:
            k: Maximum number of entries to return.

        Returns:
            A new list of at most ``k`` scores sorted by descending
            confidence. Entries with equal confidence keep their original
            relative order. An empty list is returned when ``k`` is zero or
            negative, or when ``scores`` is empty.
        """
        # A negative k would otherwise slice as ``[:k]`` and silently return
        # "all but the last |k|" entries instead of nothing.
        if k <= 0:
            return []
        ranked = sorted(self.scores, key=lambda x: x.confidence, reverse=True)
        return ranked[:k]

    def is_type(self, doc_type: DocumentType, min_confidence: float = 0.5) -> bool:
        """Check whether the document is classified as ``doc_type``.

        The primary classification is checked first, then every entry in
        ``scores``; either may satisfy the check.

        Args:
            doc_type: Document type to look for.
            min_confidence: Inclusive lower bound on the matching confidence.

        Returns:
            ``True`` if the primary classification or any entry in ``scores``
            has type ``doc_type`` with confidence >= ``min_confidence``,
            otherwise ``False``.
        """
        # ``scores`` defaults to empty, so the primary classification must be
        # checked explicitly; otherwise a result never matches its own type.
        if self.primary_type == doc_type and self.primary_confidence >= min_confidence:
            return True
        return any(
            score.document_type == doc_type and score.confidence >= min_confidence
            for score in self.scores
        )
| |
|
| |
|
class DocumentCategoryRule(BaseModel):
    """
    Rule for rule-based document classification.
    """

    # Identity of the rule and the type it assigns (both required).
    name: str = Field(..., description="Rule name")
    document_type: DocumentType = Field(..., description="Target document type")

    # Matching criteria. Every list defaults to empty. This model only stores
    # them; how they are evaluated is up to the code that consumes the rule.
    title_keywords: List[str] = Field(
        description="Keywords to match in title",
        default_factory=list,
    )
    content_keywords: List[str] = Field(
        description="Keywords to match in content",
        default_factory=list,
    )
    required_sections: List[str] = Field(
        description="Required section headings",
        default_factory=list,
    )
    # Stored as plain strings; the patterns are not compiled or validated here.
    file_patterns: List[str] = Field(
        description="Filename patterns (regex)",
        default_factory=list,
    )

    # Confidence parameters: the base value is bounded to [0.0, 1.0] and the
    # per-keyword boost to [0.0, 0.2].
    base_confidence: float = Field(
        default=0.8, ge=0.0, le=1.0, description="Base confidence when rule matches"
    )
    keyword_boost: float = Field(
        default=0.05, ge=0.0, le=0.2, description="Confidence boost per matched keyword"
    )

    # Unbounded integer; per its description, higher values are checked first.
    priority: int = Field(default=0, description="Rule priority (higher = checked first)")
| |
|
| |
|
class ClassificationConfig(BaseModel):
    """
    Configuration for document classification.
    """

    # Confidence thresholds, each validated to lie within [0.0, 1.0]. The
    # model does not enforce any ordering between the two values.
    min_confidence: float = Field(
        default=0.6, ge=0.0, le=1.0, description="Minimum confidence for classification"
    )
    human_review_threshold: float = Field(
        default=0.7, ge=0.0, le=1.0, description="Below this, flag for human review"
    )

    # Strategy switches.
    use_llm: bool = Field(
        default=True,
        description="Use LLM for classification",
    )
    use_rules: bool = Field(
        default=True,
        description="Use rule-based classification",
    )
    # NOTE(review): declared as a free-form string; the three modes named in
    # the description are not validated by this model.
    hybrid_mode: str = Field(
        default="llm_primary", description="Hybrid mode: llm_primary, rules_primary, or ensemble"
    )

    # User-supplied rules (none by default).
    custom_rules: List[DocumentCategoryRule] = Field(
        description="Custom classification rules",
        default_factory=list,
    )

    # Defaults to every DocumentType member, in declaration order. The factory
    # builds a fresh list for each config instance.
    enabled_types: List[DocumentType] = Field(
        description="Document types to consider",
        default_factory=lambda: list(DocumentType),
    )

    # Evidence handling.
    require_evidence: bool = Field(
        default=True,
        description="Require evidence for classification",
    )
    # NOTE(review): no lower bound is declared, so zero and negative values
    # pass validation - confirm whether that is intended.
    max_evidence_snippets: int = Field(
        default=3,
        description="Maximum evidence snippets to include",
    )
| |
|