# Source code for autorag.data

import logging
from typing import Callable, List

from langchain_community.document_loaders import (
	BSHTMLLoader,
	CSVLoader,
	DirectoryLoader,
	JSONLoader,
	PDFMinerLoader,
	PDFPlumberLoader,
	PyMuPDFLoader,
	PyPDFium2Loader,
	PyPDFLoader,
	UnstructuredFileLoader,
	UnstructuredMarkdownLoader,
	UnstructuredPDFLoader,
	UnstructuredXMLLoader,
)
from langchain_text_splitters import (
	CharacterTextSplitter,
	KonlpyTextSplitter,
	RecursiveCharacterTextSplitter,
	SentenceTransformersTokenTextSplitter,
)
from llama_index.core.node_parser import (
	SemanticDoubleMergingSplitterNodeParser,
	SemanticSplitterNodeParser,
	SentenceSplitter,
	SentenceWindowNodeParser,
	SimpleFileNodeParser,
	TokenTextSplitter,
)

from autorag import LazyInit

logger = logging.getLogger("AutoRAG")


class UnstructuredLoader:
	"""Load several files with ``UnstructuredFileLoader`` and merge the results.

	:param file_path_list: Paths of the files to load. One
		``UnstructuredFileLoader`` is created per path.
	:param kwargs: Extra keyword arguments, forwarded unchanged to every
		``UnstructuredFileLoader`` that gets created.
	"""

	def __init__(self, file_path_list: List[str], **kwargs):
		self._kwargs = kwargs
		self._file_path_list = file_path_list

	def load(self):
		"""Load every configured file and return all documents in one flat list.

		Files are handled in the order they were given, and the documents of
		each file keep the order in which its loader returned them.

		:return: The concatenated results of each per-file ``load()`` call.
		"""
		return [
			document
			for path in self._file_path_list
			for document in UnstructuredFileLoader(path, **self._kwargs).load()
		]
class UpstageLayoutAnalysisLoader:
	"""Factory that returns a loader instance from ``langchain_upstage``.

	Calling this class does not produce an instance of it: ``__new__`` imports
	a loader class from ``langchain_upstage`` on demand and returns an instance
	of that class built from the given arguments.
	"""

	def __new__(cls, *args, **kwargs):
		"""Instantiate the first importable Upstage loader with the given arguments.

		``UpstageDocumentParseLoader`` is tried first, and
		``UpstageLayoutAnalysisLoader`` is used as the fallback.

		:raises ImportError: If neither loader can be imported from
			``langchain_upstage``. The second import failure is attached as
			the cause.
		"""
		try:
			from langchain_upstage import UpstageDocumentParseLoader
		except Exception:
			# Any failure of the first import (not only ImportError) moves on
			# to the second loader name.
			try:
				from langchain_upstage import (
					UpstageLayoutAnalysisLoader as fallback_loader,
				)
			except Exception as exc:
				raise ImportError(
					"The 'upstagedocumentparse' parser requires a compatible "
					"langchain-upstage installation. Install a version that supports "
					"your current langchain-core release."
				) from exc
			backend = fallback_loader
		else:
			backend = UpstageDocumentParseLoader
		return backend(*args, **kwargs)
# Document loader classes, looked up by parser name.
parse_modules = {
	# --- PDF ---
	"pdfminer": PDFMinerLoader,
	"pdfplumber": PDFPlumberLoader,
	"pypdfium2": PyPDFium2Loader,
	"pypdf": PyPDFLoader,
	"pymupdf": PyMuPDFLoader,
	"unstructuredpdf": UnstructuredPDFLoader,
	# --- Common file types ---
	# CSV
	"csv": CSVLoader,
	# JSON
	"json": JSONLoader,
	# Markdown
	"unstructuredmarkdown": UnstructuredMarkdownLoader,
	# HTML
	"bshtml": BSHTMLLoader,
	# XML
	"unstructuredxml": UnstructuredXMLLoader,
	# --- Any file type ---
	"directory": DirectoryLoader,
	"unstructured": UnstructuredLoader,
	# Resolved through the UpstageLayoutAnalysisLoader factory in this module.
	"upstagedocumentparse": UpstageLayoutAnalysisLoader,
}

# Text splitter / node parser classes, looked up by chunker name.
chunk_modules = {
	# --- Llama Index ---
	# Token based
	"token": TokenTextSplitter,
	# Sentence based
	"sentence": SentenceSplitter,
	# Sentence window
	"sentencewindow": SentenceWindowNodeParser,
	# Semantic
	"semantic_llama_index": SemanticSplitterNodeParser,
	"semanticdoublemerging": SemanticDoubleMergingSplitterNodeParser,
	# Simple file
	"simplefile": SimpleFileNodeParser,
	# --- LangChain ---
	# Token based
	"sentencetransformerstoken": SentenceTransformersTokenTextSplitter,
	# Character based
	"recursivecharacter": RecursiveCharacterTextSplitter,
	"character": CharacterTextSplitter,
	# Sentence based
	"konlpy": KonlpyTextSplitter,
}
def split_by_sentence_kiwi() -> Callable[[str], List[str]]:
	"""Build a sentence splitter backed by ``Kiwi`` from ``kiwipiepy``.

	``kiwipiepy`` is imported inside this function, so it is only required
	when the splitter is actually built. A single ``Kiwi`` instance is created
	here and reused by every call of the returned function.

	:return: A function that takes a text and returns its sentences as a
		list of strings.
	:raises ImportError: If ``kiwipiepy`` cannot be imported. The original
		import failure is attached as the cause.
	"""
	try:
		from kiwipiepy import Kiwi
	except ImportError as exc:
		# Chain explicitly so the underlying import failure stays visible as
		# the direct cause of the installation hint below.
		raise ImportError(
			"You need to install kiwipiepy to use 'ko_kiwi' tokenizer. "
			"Please install kiwipiepy by running 'pip install kiwipiepy'. "
			"Or install Korean version of AutoRAG by running 'pip install AutoRAG[ko]'."
		) from exc

	kiwi = Kiwi()

	def split(text: str) -> List[str]:
		"""Return the text of each sentence ``Kiwi`` finds in ``text``."""
		return [sentence.text for sentence in kiwi.split_into_sents(text)]

	return split
# Sentence splitters, looked up by name. The kiwi factory is wrapped in
# LazyInit rather than called here.
# NOTE(review): LazyInit is defined in the autorag package; presumably it
# defers calling the factory until first use - confirm there.
sentence_splitter_modules = {
	"kiwi": LazyInit(split_by_sentence_kiwi),
}