Coverage for src/ragindexer/documents/XlsDocument.py: 18%
25 statements
« prev ^ index » next coverage.py v7.8.2, created at 2025-06-20 15:57 +0000
« prev ^ index » next coverage.py v7.8.2, created at 2025-06-20 15:57 +0000
1from typing import Iterable, Tuple
3import openpyxl
5from .. import logger
6from .ADocument import ADocument
class XlsDocument(ADocument):
    """Document reader that extracts the text of an Excel workbook, sheet by sheet."""

    def iterate_raw_text(self) -> Iterable[Tuple[int, str, dict]]:
        """Yield the text content of each worksheet of the workbook.

        The workbook located at ``self.get_abs_path()`` is opened with openpyxl
        in read-only mode and with ``data_only=True`` (per the openpyxl
        documentation, formula cells then expose their last cached value
        rather than the formula itself).

        For every row, the non-empty cells are converted with ``str`` and
        joined with a single space; rows without any value are skipped. The
        rows of one sheet are joined with newlines.

        Yields:
            Tuples ``(sheet_index, text, metadata)`` where ``sheet_index`` is
            the 0-based position of the sheet in the workbook, ``text`` is the
            stripped text of that sheet only, and ``metadata`` is
            ``{"ocr_used": False}``.

        If the workbook cannot be opened, a warning is logged and nothing is
        yielded.
        """
        try:
            wb = openpyxl.load_workbook(self.get_abs_path(), read_only=True, data_only=True)
        except Exception:
            logger.warning("Error while reading the file. Skipping")
            # A value returned from a generator is invisible to code that
            # simply iterates over it, so a bare return is used: the caller
            # gets an empty iteration.
            return

        # Read-only workbooks keep the underlying file open until close() is
        # called; the finally clause also runs when the consumer stops
        # iterating early and the generator is closed.
        try:
            nb_sheets = len(wb.worksheets)
            logger.info(f"Reading {nb_sheets} pages excel file")
            avct = -1
            for k_sheet, sheet in enumerate(wb.worksheets):
                # Progress is logged only when a new 10% step is reached.
                new_avct = int(k_sheet / nb_sheets * 100 / 10)
                if new_avct != avct:
                    logger.info(f"Lecture page {k_sheet+1}/{nb_sheets}")
                    avct = new_avct

                # Fresh buffer for every sheet, so that the text yielded for a
                # sheet does not also contain the rows of the previous sheets.
                sheet_text = []
                for row in sheet.iter_rows(values_only=True):
                    row_text = [str(cell) for cell in row if cell is not None]
                    if row_text:
                        sheet_text.append(" ".join(row_text))
                yield k_sheet, "\n".join(sheet_text).strip(), {"ocr_used": False}
        finally:
            wb.close()