Coverage for src/ragindexer/documents/PdfDocument.py: 73%

1from pathlib import Path

2from typing import Iterable, Tuple

4import pytesseract

5from pdf2image import convert_from_path

6from pypdf import PdfReader

8from .. import logger

9from .ADocument import ADocument

10from ..config import config

def ocr_pdf(path: Path, k_page: int, ocr_dir: Path) -> "str | None":
    """Return the OCR text of one page of a PDF, using an on-disk cache.

    The text of every page is cached in ``ocr_dir`` as ``page<NNNNN>.cache``
    (page number zero-padded to 5 digits, UTF-8 encoded). When that file
    already exists its content is returned directly and no OCR is run.
    Otherwise the page is rendered to an image at 300 dpi, handed to Tesseract
    with the language(s) from ``config.OCR_LANG``, and the result is written
    to the cache before being returned.

    Args:
        path: Path of the PDF file.
        k_page: 1-based page number; it is passed as both ``first_page`` and
            ``last_page`` to ``convert_from_path``.
        ocr_dir: Directory holding the per-page cache files. It is created
            (with its parents) if it does not exist.

    Returns:
        The text of the page, or ``None`` when Tesseract failed on it.

    Raises:
        Exception: whatever ``convert_from_path`` raises while rendering the
            page; rendering errors are not handled here.
    """
    ocr_dir.mkdir(parents=True, exist_ok=True)
    ocr_txt = ocr_dir / f"page{k_page:05}.cache"

    # Cache hit: explicit UTF-8 so the cache does not depend on the locale.
    if ocr_txt.exists():
        return ocr_txt.read_text(encoding="utf-8")

    # Convert the page to an image
    img = convert_from_path(path, first_page=k_page, last_page=k_page, dpi=300)[0]

    try:
        txt = pytesseract.image_to_string(img, lang=config.OCR_LANG)
    except Exception as e:
        logger.error(f"OCR failed : {e}")
        return None

    # A cache that cannot be written must not throw away text that was
    # recognised successfully: report the problem and still return the text.
    try:
        ocr_txt.write_text(txt, encoding="utf-8")
    except (OSError, UnicodeError) as e:
        logger.error(f"Could not write OCR cache '{ocr_txt}' : {e}")

    return txt

class PdfDocument(ADocument):
    """PDF document read page by page, with OCR as a fallback.

    Text is first extracted with pypdf. Pages that yield fewer than
    ``config.MIN_EXPECTED_CHAR`` characters are sent to ``ocr_pdf``, whose
    per-page results are cached in ``self.ocr_dir``.
    """

    def __init__(self, abspath):
        """Set up the document and locate its OCR cache directory.

        Args:
            abspath: Path of the PDF file (a ``pathlib`` path: its
                ``parent``, ``name``, ``anchor`` attributes are used).
        """
        super().__init__(abspath)

        # The cache mirrors the document's location below
        # "<state db directory>/cache". The root/drive has to be stripped
        # from an absolute path first: joining an absolute path onto the
        # cache directory would otherwise discard the cache prefix.
        parent = abspath.parent
        if abspath.is_absolute():
            parent = parent.relative_to(abspath.anchor)
        self.ocr_dir = (
            config.STATE_DB_PATH.parent / "cache" / parent / (abspath.name + ".ocr")
        )

        # An existing cache directory means OCR was already used for this file.
        self.using_ocr = self.ocr_dir.exists()
        if self.using_ocr:
            logger.info(f"Reusing OCR cache for {self.ocr_dir}")

    def iterate_raw_text(self) -> Iterable[Tuple[int, str, dict]]:
        """Yield the text of every non-empty page of the PDF.

        This is a generator. If the file cannot be opened, an error is logged
        and nothing is yielded.

        Yields:
            ``(k_page, txt, file_metadata)`` where ``k_page`` is the 0-based
            page index, ``txt`` the non-empty text of the page and
            ``file_metadata`` a dict with the key ``"ocr_used"``. The same
            dict object is yielded for every page, and ``"ocr_used"`` becomes
            ``True`` from the first page that needed OCR onwards.
        """
        path = self.get_abs_path()
        try:
            reader = PdfReader(path)
            nb_pages = len(reader.pages)
        except Exception:
            logger.error("Error while reading the file. Skipping")
            # In a generator a returned value never reaches a caller that
            # iterates over it, so simply end the iteration.
            return

        logger.info(f"Reading {nb_pages} pages pdf file")
        file_metadata = {"ocr_used": False}
        avct = -1
        for k_page, page in enumerate(reader.pages):
            # Progress in tenths of the file: log only when the decile changes.
            new_avct = k_page * 10 // nb_pages
            if new_avct != avct:
                logger.info(f"Lecture page {k_page+1}/{nb_pages}")
                avct = new_avct

            try:
                txt = page.extract_text() or ""
            except Exception as e:
                logger.error(f"While extracting text: {e}")
                txt = ""

            # Too little text: the page is probably a scan, fall back to OCR.
            if len(txt) < config.MIN_EXPECTED_CHAR:
                if not self.using_ocr:
                    self.using_ocr = True
                    logger.info(f"Using OCR for '{path}' in '{self.ocr_dir}'")

                file_metadata["ocr_used"] = True
                # ocr_pdf is called with a 1-based page number.
                txt = ocr_pdf(path, k_page + 1, self.ocr_dir)

            # Skip pages for which neither extraction nor OCR produced text.
            if not txt:
                continue

            yield k_page, txt, file_metadata