Coverage for src/ragindexer/documents/DocumentFactory.py: 81%

41 statements  

« prev     ^ index     » next       coverage.py v7.8.2, created at 2025-06-20 15:57 +0000

1from pathlib import Path 

2from typing import Iterable, List, Tuple 

3from solus import Singleton 

4from sentence_transformers import SentenceTransformer 

5 

6from ..models import ChunkType, EmbeddingType 

7from .ADocument import ADocument 

8from .XlsDocument import XlsDocument 

9from .PdfDocument import PdfDocument 

10from .MarkdownDocument import MarkdownDocument 

11from .DocDocument import DocDocument 

12 

13 

class DocumentFactory(Singleton):
    """Registry that maps file extensions to document classes.

    The factory keeps a table ``extension -> document class`` and the
    embedding model that is handed to every document when it is processed.
    Extensions are stored and looked up exactly as given (leading dot
    included, case-sensitive), so they must match ``Path.suffix`` verbatim.

    NOTE(review): the class derives from ``solus.Singleton``; presumably
    every ``DocumentFactory()`` call yields the same shared instance --
    confirm against the ``solus`` documentation.
    """

    def __init__(self):
        # Extension (with leading dot, e.g. ".pdf") -> document class.
        self.__association = {}
        # Model passed to ``ADocument.process``; stays ``None`` until
        # ``set_embedding_model`` is called.
        self.__embedding_model = None

    def filter_file(self, path: Path) -> bool:
        """Tell whether *path* should be handled by the factory.

        Args:
            path: Path of the candidate file.

        Returns:
            ``True`` when the suffix of *path* has a registered document
            class and its stem does not start with ``".sftpgo-upload"``;
            ``False`` otherwise.
        """
        # Presumably these are temporary files of an in-progress SFTPGo
        # upload -- TODO confirm.
        is_partial_upload = path.stem.startswith(".sftpgo-upload")
        return path.suffix in self.__association and not is_partial_upload

    def register(self, ext: str, cls: "type[ADocument]") -> None:
        """Associate the extension *ext* with the document class *cls*.

        A later registration for the same extension replaces the earlier one.

        Args:
            ext: File extension including the leading dot (e.g. ``".pdf"``).
            cls: Class instantiated with the file path by ``processDocument``.
        """
        self.__association[ext] = cls

    def getBuild(self, ext: str) -> "type[ADocument]":
        """Return the document class registered for *ext*.

        Args:
            ext: File extension including the leading dot.

        Returns:
            The class (not an instance) registered for *ext*.

        Raises:
            KeyError: If no class has been registered for *ext*.
        """
        try:
            return self.__association[ext]
        except KeyError:
            # Same exception type as a plain dict lookup, with a message
            # that says what went wrong.
            raise KeyError(
                f"No document class registered for extension {ext!r}"
            ) from None

    def set_embedding_model(self, embedding_model: SentenceTransformer) -> None:
        """Store the embedding model passed to documents on processing.

        Args:
            embedding_model: Model forwarded to ``ADocument.process``.
        """
        self.__embedding_model = embedding_model

    def processDocument(
        self, abspath: Path
    ) -> Iterable[Tuple[int, List[ChunkType], List[EmbeddingType], dict]]:
        """Process the file at *abspath* with its registered document class.

        The class is selected from the suffix of *abspath*, instantiated
        with the path, and its ``process`` generator is run with the
        currently stored embedding model.

        Args:
            abspath: Path of the file to process.

        Yields:
            The items produced by ``ADocument.process``, annotated as
            ``(k_page, chunks, embeddings, file_metadata)`` tuples.

        Raises:
            KeyError: If the suffix of *abspath* has no registered class
                (raised on first iteration, since this is a generator).
        """
        doc_cls = self.getBuild(abspath.suffix)
        doc: ADocument = doc_cls(abspath)
        yield from doc.process(self.__embedding_model)

45 

46 

# Default extension -> document class table, applied when the module is
# imported. Order matches the original sequence of registrations.
_DEFAULT_REGISTRATIONS = (
    # Word-processor documents
    (".doc", DocDocument),
    (".docx", DocDocument),
    (".docm", DocDocument),
    # Spreadsheets
    (".xls", XlsDocument),
    (".xlsx", XlsDocument),
    (".xlsm", XlsDocument),
    # PDF
    (".pdf", PdfDocument),
    # Plain text and Markdown
    (".txt", MarkdownDocument),
    (".md", MarkdownDocument),
)

for _extension, _document_cls in _DEFAULT_REGISTRATIONS:
    # The factory is obtained anew for each registration, exactly as the
    # individual statements did.
    DocumentFactory().register(_extension, _document_cls)