Coverage for src/ragindexer/documents/DocDocument.py: 83%

20 statements  

« prev     ^ index     » next       coverage.py v7.8.2, created at 2025-06-20 15:57 +0000

1from typing import Iterable, Tuple 

2 

3import docx 

4 

5from .. import logger 

6from .ADocument import ADocument 

7 

8 

class DocDocument(ADocument):
    """Reader for Word documents, backed by ``docx.Document``."""

    def iterate_raw_text(self) -> Iterable[Tuple[int, str, dict]]:
        """Yield the text of every paragraph of the document, in order.

        Yields:
            ``(index, text, metadata)`` tuples where ``index`` is the 0-based
            paragraph index, ``text`` is the paragraph text with surrounding
            whitespace stripped, and ``metadata`` is ``{"ocr_used": False}``.

        If the file cannot be opened, a warning is logged and the generator
        ends without yielding anything.
        """
        try:
            doc = docx.Document(str(self.get_abs_path()))
        except Exception:
            logger.warning("Error while reading the file. Skipping")
            # A value attached to ``return`` inside a generator is never seen
            # by code iterating over it, so simply end the iteration.
            return

        paragraphs = doc.paragraphs
        paragraph_count = len(paragraphs)

        # Page count is estimated as the number of paragraphs that contain a
        # page break, plus one for the first page.
        page_count = sum(p.contains_page_break for p in paragraphs) + 1
        logger.info(f"Reading {page_count} pages doc file")

        # Progress is logged once per 10 % step. The loop walks paragraphs,
        # so the step is computed against the paragraph count (not the page
        # count), otherwise the log would fire on almost every paragraph.
        last_step = -1
        for k_paragraph, paragraph in enumerate(paragraphs):
            step = k_paragraph * 10 // paragraph_count
            if step != last_step:
                logger.info(f"Lecture paragraphe {k_paragraph + 1}/{paragraph_count}")
                last_step = step

            # ``paragraph.text`` is already a single string; joining it with
            # "\n" would insert a newline between every character.
            yield k_paragraph, paragraph.text.strip(), {"ocr_used": False}