Coverage for src/ragindexer/documents/XlsDocument.py: 18%

25 statements  

« prev     ^ index     » next       coverage.py v7.8.2, created at 2025-06-20 15:57 +0000

1from typing import Iterable, Tuple 

2 

3import openpyxl 

4 

5from .. import logger 

6from .ADocument import ADocument 

7 

8 

class XlsDocument(ADocument):
    """ADocument implementation for Excel workbooks read through openpyxl."""

    def iterate_raw_text(self) -> Iterable[Tuple[int, str, dict]]:
        """Yield the text of each worksheet of the workbook.

        The workbook is opened read-only with ``data_only=True``. Every row
        holding at least one non-empty cell becomes one line of text, made of
        the ``str()`` of its non-empty cells joined by single spaces.

        Yields:
            ``(sheet_index, sheet_text, metadata)`` for each worksheet, where
            ``sheet_index`` is the 0-based position of the sheet, ``sheet_text``
            is the stripped, newline-joined text of that sheet (possibly empty)
            and ``metadata`` is ``{"ocr_used": False}``.

        If the workbook cannot be opened, a warning is logged and nothing is
        yielded.
        """
        try:
            wb = openpyxl.load_workbook(self.get_abs_path(), read_only=True, data_only=True)
        except Exception:
            logger.warning("Error while reading the file. Skipping")
            # This is a generator: a value passed to ``return`` is invisible to
            # ``for`` loops, so simply end the iteration without yielding.
            return

        try:
            worksheets = wb.worksheets
            nb_sheets = len(worksheets)
            logger.info(f"Reading {nb_sheets} pages excel file")

            # Last logged progress step (tenths of the workbook); -1 forces a
            # log message on the first sheet.
            avct = -1
            for k_sheet, sheet in enumerate(worksheets):
                new_avct = int(k_sheet / nb_sheets * 100 / 10)
                if new_avct != avct:
                    logger.info(f"Lecture page {k_sheet+1}/{nb_sheets}")
                    avct = new_avct

                # NOTE(review): the buffer is reset for every sheet so that a
                # record only carries the text of its own sheet, not the text
                # of all the sheets read before it — confirm this matches what
                # the indexing pipeline expects.
                sheet_text = []
                for row in sheet.iter_rows(values_only=True):
                    row_text = [str(cell) for cell in row if cell is not None]
                    if row_text:
                        sheet_text.append(" ".join(row_text))

                yield k_sheet, "\n".join(sheet_text).strip(), {"ocr_used": False}
        finally:
            # Read-only workbooks keep the underlying file open until they are
            # explicitly closed; this also runs when the consumer stops
            # iterating early and the generator is closed.
            wb.close()