Coverage for src/ragindexer/documents/DocDocument.py: 83%
20 statements
« prev ^ index » next coverage.py v7.8.2, created at 2025-06-20 15:57 +0000
1from typing import Iterable, Tuple
3import docx
5from .. import logger
6from .ADocument import ADocument
class DocDocument(ADocument):
    """Document backed by a Word file that is read through the ``docx`` package."""

    def iterate_raw_text(self) -> Iterable[Tuple[int, str, dict]]:
        """Yield the text of every paragraph of the document, in order.

        Yields:
            ``(index, text, metadata)`` triples where ``index`` is the
            zero-based position of the paragraph, ``text`` is the paragraph
            text with surrounding whitespace removed, and ``metadata`` is
            always ``{"ocr_used": False}``.

        If the file cannot be opened, a warning is logged and nothing is
        yielded.
        """
        try:
            doc = docx.Document(str(self.get_abs_path()))
        except Exception:
            # Best effort: an unreadable file is skipped rather than aborting
            # the caller's iteration over many documents.
            logger.warning("Error while reading the file. Skipping")
            # A value passed to ``return`` in a generator is never delivered
            # to a ``for`` loop, so simply end the iteration.
            return

        # ``doc.paragraphs`` is read once and reused for counting and iterating.
        paragraphs = doc.paragraphs
        paragraph_count = len(paragraphs)
        page_count = sum(p.contains_page_break for p in paragraphs) + 1
        logger.info(f"Reading {page_count} pages doc file")

        avct = -1
        for k_page, p in enumerate(paragraphs):
            # Progress is measured against the number of yielded items
            # (paragraphs) so that a message is emitted roughly every 10 %.
            # Dividing by the page count made the step change on almost every
            # paragraph. The loop only runs when paragraph_count > 0.
            new_avct = k_page * 10 // paragraph_count
            if new_avct != avct:
                logger.info(f"Lecture page {k_page+1}/{paragraph_count}")
                avct = new_avct

            # ``p.text`` is already a single string; joining it would insert a
            # newline between every character.
            yield k_page, p.text.strip(), {"ocr_used": False}