Coverage for src/ragindexer/documents/PdfDocument.py: 73%
63 statements
« prev ^ index » next coverage.py v7.8.2, created at 2025-06-20 15:57 +0000
« prev ^ index » next coverage.py v7.8.2, created at 2025-06-20 15:57 +0000
1from pathlib import Path
2from typing import Iterable, Tuple
4import pytesseract
5from pdf2image import convert_from_path
6from pypdf import PdfReader
8from .. import logger
9from .ADocument import ADocument
10from ..config import config
def ocr_pdf(path: Path, k_page: int, ocr_dir: Path) -> "str | None":
    """Return the OCR text of one page of a PDF, using an on-disk cache.

    The text of page ``k_page`` is cached in
    ``ocr_dir / f"page{k_page:05}.cache"``. When that file exists its content
    is returned and no OCR is run. Otherwise the page is rendered at 300 dpi,
    passed to Tesseract with ``config.OCR_LANG``, and the result is written to
    the cache file.

    Args:
        path: PDF file to render.
        k_page: Page number, used as both ``first_page`` and ``last_page`` of
            ``convert_from_path`` (the caller in this file passes ``index + 1``).
        ocr_dir: Directory holding the per-page cache files; created if missing.

    Returns:
        The recognised text (possibly empty), or ``None`` when Tesseract raised.

    Raises:
        Whatever ``convert_from_path`` raises: rendering errors are not caught
        here, only errors from the OCR call itself.
    """
    ocr_dir.mkdir(parents=True, exist_ok=True)
    cache_file = ocr_dir / f"page{k_page:05}.cache"

    if cache_file.exists():
        # Fixed encoding so a cache written on one machine reads back the same
        # on another, whatever the locale's default encoding is.
        return cache_file.read_text(encoding="utf-8")

    # Convert the page to an image
    img = convert_from_path(path, first_page=k_page, last_page=k_page, dpi=300)[0]

    try:
        txt = pytesseract.image_to_string(img, lang=config.OCR_LANG)
    except Exception as e:
        logger.error(f"OCR failed : {e}")
        return None

    # Caching is best effort: a failed write must not throw away text that was
    # recognised successfully, so it is handled separately from the OCR call.
    try:
        cache_file.write_text(txt, encoding="utf-8")
    except (OSError, UnicodeError) as e:
        logger.error(f"Could not write OCR cache '{cache_file}': {e}")

    return txt
class PdfDocument(ADocument):
    """PDF document read page by page with pypdf, falling back to OCR.

    Pages whose extracted text is shorter than ``config.MIN_EXPECTED_CHAR``
    are sent to ``ocr_pdf``, which caches its results under ``self.ocr_dir``.
    """

    def __init__(self, abspath: Path):
        """Set up the document and work out where its OCR cache lives.

        Args:
            abspath: Path of the PDF file. Assumed to be a ``pathlib`` path,
                since its ``parent``, ``anchor`` and ``name`` are used.
        """
        super().__init__(abspath)

        # The cache mirrors the document's directory tree under
        # "<state db dir>/cache". The anchor ("/", "//", a drive...) has to be
        # stripped first: joining onto an anchored path would discard the
        # cache prefix instead of nesting beneath it.
        parent = abspath.parent
        if abspath.anchor:
            parent = parent.relative_to(abspath.anchor)
        self.ocr_dir = (
            config.STATE_DB_PATH.parent
            / "cache"
            / parent
            / (abspath.name + ".ocr")
        )

        # An existing cache directory means OCR was already needed for this
        # file; the flag only suppresses the "Using OCR" log message later on.
        self.using_ocr = self.ocr_dir.exists()
        if self.using_ocr:
            logger.info(f"Reusing OCR cache for {self.ocr_dir}")

    def iterate_raw_text(self) -> Iterable[Tuple[int, str, dict]]:
        """Yield the text of every page that has some.

        Yields:
            ``(page_index, text, file_metadata)`` where ``page_index`` is
            0-based and ``text`` is never empty. ``file_metadata`` is the same
            dict object for every page; its ``"ocr_used"`` entry switches to
            ``True`` as soon as one page has been sent to OCR, which also
            changes the dict already handed out for earlier pages.

        If the file cannot be opened, an error is logged and nothing is
        yielded.
        """
        path = self.get_abs_path()
        try:
            reader = PdfReader(path)
            nb_pages = len(reader.pages)
        except Exception:
            logger.error("Error while reading the file. Skipping")
            # This is a generator: a returned value would never reach a caller
            # iterating over it, so simply stop.
            return

        logger.info(f"Reading {nb_pages} pages pdf file")
        file_metadata = {"ocr_used": False}
        avct = -1
        for k_page, page in enumerate(reader.pages):
            # Log progress once per tenth of the document.
            new_avct = k_page * 10 // nb_pages
            if new_avct != avct:
                logger.info(f"Lecture page {k_page+1}/{nb_pages}")
                avct = new_avct

            try:
                txt = page.extract_text() or ""
            except Exception as e:
                logger.error(f"While extracting text: {e}")
                txt = ""

            # Too little embedded text: fall back to OCR for this page.
            if len(txt) < config.MIN_EXPECTED_CHAR:
                if not self.using_ocr:
                    self.using_ocr = True
                    logger.info(f"Using OCR for '{self.get_abs_path()}' in '{self.ocr_dir}")

                file_metadata["ocr_used"] = True
                # ocr_pdf is called with a page number one above the index.
                txt = ocr_pdf(path, k_page + 1, self.ocr_dir)

            # ocr_pdf returns None on failure; pages without text are skipped.
            if not txt:
                continue

            yield k_page, txt, file_metadata