Coverage for src/ragindexer/documents/DocumentFactory.py: 81%
41 statements
« prev ^ index » next coverage.py v7.8.2, created at 2025-06-20 15:57 +0000
« prev ^ index » next coverage.py v7.8.2, created at 2025-06-20 15:57 +0000
1from pathlib import Path
2from typing import Iterable, List, Tuple
3from solus import Singleton
4from sentence_transformers import SentenceTransformer
6from ..models import ChunkType, EmbeddingType
7from .ADocument import ADocument
8from .XlsDocument import XlsDocument
9from .PdfDocument import PdfDocument
10from .MarkdownDocument import MarkdownDocument
11from .DocDocument import DocDocument
class DocumentFactory(Singleton):
    """Registry that maps file extensions to document classes.

    Extensions are stored exactly as passed to :meth:`register` (with their
    leading dot, e.g. ``".pdf"``) and are compared case-sensitively against
    ``Path.suffix``.

    The class derives from ``solus.Singleton``; the module-level
    registrations rely on repeated ``DocumentFactory()`` calls sharing
    state (NOTE(review): confirm against the ``solus`` implementation).
    """

    def __init__(self):
        # Extension string -> class registered for that extension.
        self.__association = {}
        # Model forwarded to ``ADocument.process``; stays None until
        # set_embedding_model() is called.
        self.__embedding_model = None

    def filter_file(self, path: Path) -> bool:
        """Tell whether *path* is a file this factory should process.

        Args:
            path: Path of the candidate file.

        Returns:
            ``False`` if the suffix of *path* has no registered class or if
            its stem starts with ``".sftpgo-upload"``; ``True`` otherwise.
        """
        if path.suffix not in self.__association:
            return False

        # Presumably temporary files written by SFTPGo while an upload is
        # in progress -- TODO confirm.
        if path.stem.startswith(".sftpgo-upload"):
            return False

        return True

    def register(self, ext: str, cls: type):
        """Associate the extension *ext* with the document class *cls*.

        A later registration for the same extension replaces the earlier one.

        Args:
            ext: Extension used as lookup key (e.g. ``".pdf"``).
            cls: Class to instantiate for files with that extension.
        """
        self.__association[ext] = cls

    def getBuild(self, ext: str) -> type[ADocument]:
        """Return the class (not an instance) registered for *ext*.

        Args:
            ext: Extension to look up, exactly as it was registered.

        Raises:
            KeyError: If no class has been registered for *ext*.
        """
        try:
            return self.__association[ext]
        except KeyError:
            # Same exception type as a plain dict lookup, with a message
            # that names the actual problem.
            raise KeyError(
                f"No document class registered for extension {ext!r}"
            ) from None

    def set_embedding_model(self, embedding_model: SentenceTransformer):
        """Store the model that :meth:`processDocument` passes to documents.

        Args:
            embedding_model: Model forwarded to ``ADocument.process``.
        """
        self.__embedding_model = embedding_model

    def processDocument(
        self, abspath: Path
    ) -> Iterable[Tuple[int, List[ChunkType], List[EmbeddingType], dict]]:
        """Process the file at *abspath* with the class registered for its suffix.

        This is a generator: nothing runs (including the extension lookup)
        until the first item is requested.

        Args:
            abspath: Path of the file to process; its suffix selects the
                document class.

        Yields:
            The ``(k_page, chunks, embeddings, file_metadata)`` tuples
            produced by the document's ``process`` method.

        Raises:
            KeyError: If the suffix of *abspath* has no registered class.
        """
        ext = abspath.suffix
        cls = self.getBuild(ext)
        doc: ADocument = cls(abspath)
        # Unpack and re-yield so every item leaves as a 4-tuple.
        for k_page, chunks, embeddings, file_metadata in doc.process(self.__embedding_model):
            yield k_page, chunks, embeddings, file_metadata
# Default extension -> document class registrations, applied at import time.
for _ext, _doc_cls in (
    # Word documents
    (".doc", DocDocument),
    (".docx", DocDocument),
    (".docm", DocDocument),
    # Excel workbooks
    (".xls", XlsDocument),
    (".xlsx", XlsDocument),
    (".xlsm", XlsDocument),
    # PDF
    (".pdf", PdfDocument),
    # Plain text and Markdown share the same document class
    (".txt", MarkdownDocument),
    (".md", MarkdownDocument),
):
    DocumentFactory().register(_ext, _doc_cls)

# Keep the module namespace free of the loop variables.
del _ext, _doc_cls