Coverage for src/ragindexer/documents/ADocument.py: 96%

36 statements  

coverage.py v7.8.2, created at 2025-06-20 15:57 +0000

 1  from abc import abstractmethod, ABC
 2  from pathlib import Path
 3  from typing import List, Tuple, Iterable
 4
 5  from nltk.tokenize import sent_tokenize
 6  from sentence_transformers import SentenceTransformer
 7
 8  from ..config import config
 9  from ..models import ChunkType, EmbeddingType
10
11
12  class ADocument(ABC):

13 """ 

14 Handle documents based on their extension 

15 

16 Args: 

17 abspath: Path to the file to handle 

18 

19 """ 

20
21      def __init__(self, abspath: Path):
22          self.__abspath = abspath
23
24      def get_abs_path(self) -> Path:
25          """
26          Get the abspath to the handled file
27
28          Returns:
29              Path to the handled file
30
31          """
32          return self.__abspath
33

34      @abstractmethod
35      def iterate_raw_text(self) -> Iterable[Tuple[int, str, dict]]:
36          """
37          Abstract method that concrete subclasses implement to extract raw text from the file.
38
39          Yields:
40              A tuple with the page index, the extracted text, and the file metadata
41
42          """
43

44      def __get_chunk_text(self, text: str, chunk_size: int, chunk_overlap: int) -> List[ChunkType]:
45          """
46          Splits text into overlapping chunks of ~chunk_size characters, aligned on sentences.
47          """
48          sentences = sent_tokenize(text)
49          chunks = []
50          current_chunk = ""
51          for sent in sentences:
52              if len(current_chunk) + len(sent) + 1 <= chunk_size:
53                  current_chunk += " " + sent if current_chunk else sent
54              else:
55                  chunks.append(current_chunk)
56                  # Start new chunk: include overlap
57                  overlap_text = (
58                      current_chunk[-chunk_overlap:]
59                      if chunk_overlap < len(current_chunk)
60                      else current_chunk
61                  )
62                  current_chunk = overlap_text + " " + sent
63
64          if current_chunk:
65              chunks.append(current_chunk)
66          return chunks

67
68      def __get_embeddings(
69          self, text: str, embedding_model: SentenceTransformer
70      ) -> Tuple[List[ChunkType], List[EmbeddingType]]:
71          chunks = self.__get_chunk_text(text, config.CHUNK_SIZE, config.CHUNK_OVERLAP)
72          embeddings = embedding_model.encode(chunks, device="cpu", show_progress_bar=False).tolist()
73          return chunks, embeddings
74

75      def process(
76          self, embedding_model: SentenceTransformer
77      ) -> Iterable[Tuple[int, List[ChunkType], List[EmbeddingType], dict]]:
78          for k_page, text, file_metadata in self.iterate_raw_text():
79              file_metadata["abspath"] = self.get_abs_path()
80
81              chunks, embeddings = self.__get_embeddings(text, embedding_model)
82              while "" in chunks:    [82 ↛ 83: line 82 didn't jump to line 83 because the condition on line 82 was never true]
83                  chunks.remove("")
84
85              yield k_page, chunks, embeddings, file_metadata
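
For reference, below is a minimal sketch of how a concrete subclass might plug into this base class. The PlainTextDocument name, the README.txt path, and the all-MiniLM-L6-v2 checkpoint are illustrative assumptions, not part of the module; the sketch also assumes the package is importable as ragindexer and that the NLTK tokenizer data used by sent_tokenize is available.

# Hypothetical concrete subclass, for illustration only: it treats the whole
# file as a single "page" and yields it through iterate_raw_text().
from pathlib import Path
from typing import Iterable, Tuple

from sentence_transformers import SentenceTransformer

from ragindexer.documents.ADocument import ADocument


class PlainTextDocument(ADocument):
    def iterate_raw_text(self) -> Iterable[Tuple[int, str, dict]]:
        # Single "page": index 0, the full file content, and some basic metadata.
        text = self.get_abs_path().read_text(encoding="utf-8")
        yield 0, text, {"suffix": self.get_abs_path().suffix}


if __name__ == "__main__":
    # Assumed embedding model; any SentenceTransformer checkpoint would work here.
    model = SentenceTransformer("all-MiniLM-L6-v2")
    doc = PlainTextDocument(Path("README.txt"))
    for k_page, chunks, embeddings, metadata in doc.process(model):
        # process() yields one item per page: the chunk texts, their embeddings,
        # and the metadata dict enriched with the document's absolute path.
        print(k_page, len(chunks), len(embeddings), metadata["abspath"])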