Coverage for src/ragindexer/documents/PdfDocument.py: 73%

63 statements  

« prev     ^ index     » next       coverage.py v7.8.2, created at 2025-06-20 15:57 +0000

1from pathlib import Path 

2from typing import Iterable, Tuple 

3 

4import pytesseract 

5from pdf2image import convert_from_path 

6from pypdf import PdfReader 

7 

8from .. import logger 

9from .ADocument import ADocument 

10from ..config import config 

11 

12 

def ocr_pdf(path: Path, k_page: int, ocr_dir: Path) -> "str | None":
    """Return the OCR text of one page of a PDF, backed by an on-disk cache.

    The text of each page is cached in ``ocr_dir`` in a file named
    ``page<k_page zero-padded to 5 digits>.cache``. When that file exists its
    content is returned and no OCR is performed. Otherwise the page is
    rendered to an image at 300 dpi, recognised with Tesseract using
    ``config.OCR_LANG``, and the result is stored in the cache.

    Args:
        path: Path of the PDF file.
        k_page: Page number, forwarded as both ``first_page`` and
            ``last_page`` to ``convert_from_path`` (the visible caller passes
            a 1-based number).
        ocr_dir: Cache directory for this document; created if missing.

    Returns:
        The recognised text, or ``None`` when rendering or OCR failed.
    """
    ocr_dir.mkdir(parents=True, exist_ok=True)

    ocr_txt = ocr_dir / f"page{k_page:05}.cache"
    if ocr_txt.exists():
        try:
            # Explicit UTF-8: OCR output routinely contains characters that a
            # locale-dependent default encoding cannot represent.
            return ocr_txt.read_text(encoding="utf-8")
        except UnicodeDecodeError:
            # Cache entry written with another (locale) encoding: regenerate.
            logger.info(f"Unreadable OCR cache '{ocr_txt}', running OCR again")

    try:
        # Convert the page to an image, then recognise its text.
        img = convert_from_path(path, first_page=k_page, last_page=k_page, dpi=300)[0]
        txt = pytesseract.image_to_string(img, lang=config.OCR_LANG)
    except Exception as e:
        logger.error(f"OCR failed : {e}")
        return None

    # Caching is best-effort: a write failure must not discard text that was
    # recognised successfully.
    try:
        ocr_txt.write_text(txt, encoding="utf-8")
    except OSError as e:
        logger.error(f"Could not write OCR cache '{ocr_txt}': {e}")

    return txt

34 

35 

class PdfDocument(ADocument):
    """PDF document read page by page, with an OCR fallback.

    Text is extracted with ``pypdf``. A page whose extracted text is shorter
    than ``config.MIN_EXPECTED_CHAR`` is passed to ``ocr_pdf`` instead, which
    caches its results under ``self.ocr_dir``.
    """

    def __init__(self, abspath):
        """Initialise the document and compute its OCR cache directory.

        The cache directory is
        ``<config.STATE_DB_PATH.parent>/cache/<parent of abspath>/<file name>.ocr``,
        where the parent directory is made relative to the path's anchor.

        Args:
            abspath: ``pathlib.Path`` of the PDF file.
        """
        super().__init__(abspath)

        # Strip the root/drive anchor ("/" on POSIX, "C:\\" on Windows): joining
        # an anchored path would discard the cache root on the left-hand side.
        anchor = abspath.anchor
        rel_parent = abspath.parent.relative_to(anchor) if anchor else abspath.parent
        self.ocr_dir = (
            config.STATE_DB_PATH.parent / "cache" / rel_parent / (abspath.name + ".ocr")
        )

        # An existing cache directory means OCR was already used for this file.
        self.using_ocr = self.ocr_dir.exists()
        if self.using_ocr:
            logger.info(f"Reusing OCR cache for {self.ocr_dir}")

    def iterate_raw_text(self) -> Iterable[Tuple[int, str, dict]]:
        """Yield the text of every page that has some.

        Yields:
            ``(k_page, txt, file_metadata)`` where ``k_page`` is the 0-based
            page index, ``txt`` the non-empty page text and ``file_metadata``
            a dict with the key ``"ocr_used"``. The same dict object is
            yielded for every page, and ``"ocr_used"`` becomes ``True`` as
            soon as one page has gone through OCR.

        Nothing is yielded when the file cannot be opened. Pages with no text
        (including pages whose OCR failed) are skipped.
        """
        path = self.get_abs_path()
        try:
            reader = PdfReader(path)
            nb_pages = len(reader.pages)
        except Exception:
            logger.error("Error while reading the file. Skipping")
            # This is a generator: a returned value would never reach the
            # caller, so simply end the iteration.
            return

        logger.info(f"Reading {nb_pages} pages pdf file")
        file_metadata = {"ocr_used": False}
        avct = -1  # last progress step logged, in tenths of the document
        for k_page, page in enumerate(reader.pages):
            # Log progress only when a new tenth of the document is reached.
            new_avct = int(k_page / nb_pages * 100 / 10)
            if new_avct != avct:
                logger.info(f"Lecture page {k_page+1}/{nb_pages}")
                avct = new_avct

            try:
                txt = page.extract_text() or ""
            except Exception as e:
                logger.error(f"While extracting text: {e}")
                txt = ""

            # Too little text: fall back to OCR for this page.
            if len(txt) < config.MIN_EXPECTED_CHAR:
                if not self.using_ocr:
                    self.using_ocr = True
                    logger.info(f"Using OCR for '{self.get_abs_path()}' in '{self.ocr_dir}'")

                file_metadata["ocr_used"] = True
                # ocr_pdf is called with a 1-based page number.
                txt = ocr_pdf(path, k_page + 1, self.ocr_dir)

            # ocr_pdf returns None on failure; empty pages carry no content.
            if not txt:
                continue

            yield k_page, txt, file_metadata