File size: 1,302 Bytes
6fc3143
 
12fe8d7
6fc3143
 
 
 
 
 
 
 
 
 
12fe8d7
6fc3143
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
import logging
from pathlib import Path
from typing import Optional, Union
import PyPDF2

logger = logging.getLogger(__name__)

class PDFParser:
    """
    Extracts text from PDF files.

    The class holds no state; its single entry point, ``parse``, is a
    static method and can be called without creating an instance.
    """

    @staticmethod
    def parse(file_path: Union[str, Path]) -> str:
        """
        Extract text from a PDF file.

        Each page is read with ``PyPDF2.PdfReader``. Pages whose
        ``extract_text()`` result is empty (or otherwise falsy) are
        skipped; the remaining page texts are joined with a blank line
        between them.

        Args:
            file_path: Location of the PDF, given as a string or ``Path``.

        Returns:
            The text of all pages that yielded text, separated by blank
            lines. An empty string if no page yielded any text.

        Raises:
            FileNotFoundError: If ``file_path`` does not exist.
            RuntimeError: If opening or reading the PDF fails for any
                reason. The underlying exception is attached as
                ``__cause__``.
        """
        path = Path(file_path)
        if not path.exists():
            raise FileNotFoundError(f"PDF file not found: {path}")

        logger.info("Parsing PDF: %s", path)

        try:
            with open(path, "rb") as file:
                reader = PyPDF2.PdfReader(file)
                logger.info("PDF has %s pages", len(reader.pages))
                # Pages are read inside the ``with`` block so the file
                # handle passed to the reader is still open.
                page_texts = [page.extract_text() for page in reader.pages]
            full_text = "\n\n".join(text for text in page_texts if text)
        except Exception as e:
            # Boundary handler: any failure is reported to callers as a
            # RuntimeError, explicitly chained so the root cause stays
            # visible in the traceback.
            logger.error("Error parsing PDF: %s", e)
            raise RuntimeError(f"Failed to parse PDF: {e}") from e

        logger.info("Extracted %s characters from PDF", len(full_text))
        return full_text