Source code for autorag.data

import logging
from typing import Callable, List

from langchain_community.document_loaders import (
	BSHTMLLoader,
	CSVLoader,
	DirectoryLoader,
	JSONLoader,
	PDFMinerLoader,
	PDFPlumberLoader,
	PyMuPDFLoader,
	PyPDFium2Loader,
	PyPDFLoader,
	UnstructuredFileLoader,
	UnstructuredMarkdownLoader,
	UnstructuredPDFLoader,
	UnstructuredXMLLoader,
)
from langchain_text_splitters import (
	CharacterTextSplitter,
	KonlpyTextSplitter,
	RecursiveCharacterTextSplitter,
	SentenceTransformersTokenTextSplitter,
)
from llama_index.core.node_parser import (
	SemanticDoubleMergingSplitterNodeParser,
	SemanticSplitterNodeParser,
	SentenceSplitter,
	SentenceWindowNodeParser,
	SimpleFileNodeParser,
	TokenTextSplitter,
)

from autorag import LazyInit

# Module-level logger, registered under the fixed name "AutoRAG" (not __name__).
logger = logging.getLogger("AutoRAG")



[docs]
class UnstructuredLoader:
	"""Load a list of files by running ``UnstructuredFileLoader`` on each one.

	A separate ``UnstructuredFileLoader`` is created per path, and every
	loader receives the same extra keyword arguments.
	"""

	def __init__(self, file_path_list: List[str], **kwargs):
		"""Remember the paths to load and the options to forward.

		:param file_path_list: Paths of the files to load. A single path given
			as a plain string is treated as a one-element list.
		:param kwargs: Keyword arguments passed unchanged to every
			``UnstructuredFileLoader`` that ``load`` creates.
		"""
		# A bare string would otherwise be iterated character by character
		# in load(), producing one bogus "path" per character.
		if isinstance(file_path_list, str):
			file_path_list = [file_path_list]
		self._file_path_list = file_path_list
		self._kwargs = kwargs

	def load(self) -> list:
		"""Load every configured file and return all results in one flat list.

		:return: The concatenation, in path order, of what each per-file
			``UnstructuredFileLoader(...).load()`` call returned. Empty when
			no paths were given.
		"""
		documents = []
		for file_path in self._file_path_list:
			documents.extend(UnstructuredFileLoader(file_path, **self._kwargs).load())
		return documents





[docs]
class UpstageLayoutAnalysisLoader:
	"""Stand-in class that constructs a loader imported from ``langchain_upstage``.

	Calling this class does not produce an instance of it: ``__new__`` returns
	whatever the resolved ``langchain_upstage`` loader class builds from the
	given arguments.
	"""

	def __new__(cls, *args, **kwargs):
		"""Resolve the Upstage loader class at call time and instantiate it.

		``UpstageDocumentParseLoader`` is tried first. If importing it raises
		any ``Exception``, ``UpstageLayoutAnalysisLoader`` from the same
		package is tried instead.

		:param args: Positional arguments forwarded to the resolved loader.
		:param kwargs: Keyword arguments forwarded to the resolved loader.
		:return: An instance created by the resolved ``langchain_upstage`` class.
		:raises ImportError: When neither loader can be imported; the failure
			of the second import is attached as the cause.
		"""
		try:
			from langchain_upstage import UpstageDocumentParseLoader as backend
		except Exception:
			# The fallback import stays inside this handler so the first
			# failure remains part of the exception context.
			try:
				from langchain_upstage import UpstageLayoutAnalysisLoader as backend
			except Exception as import_failure:
				message = (
					"The 'upstagedocumentparse' parser requires a compatible "
					"langchain-upstage installation. Install a version that supports "
					"your current langchain-core release."
				)
				raise ImportError(message) from import_failure
		# Instantiate outside the try blocks so constructor errors propagate as-is.
		return backend(*args, **kwargs)



# Registry of document-loader classes keyed by parse-method name.
# Most values are loaders imported from langchain_community.document_loaders;
# "unstructured" and "upstagedocumentparse" map to the wrapper classes
# defined in this module.
parse_modules = {
	# PDF
	"pdfminer": PDFMinerLoader,
	"pdfplumber": PDFPlumberLoader,
	"pypdfium2": PyPDFium2Loader,
	"pypdf": PyPDFLoader,
	"pymupdf": PyMuPDFLoader,
	"unstructuredpdf": UnstructuredPDFLoader,
	# Common File Types
	# 1. CSV
	"csv": CSVLoader,
	# 2. JSON
	"json": JSONLoader,
	# 3. Markdown
	"unstructuredmarkdown": UnstructuredMarkdownLoader,
	# 4. HTML
	"bshtml": BSHTMLLoader,
	# 5. XML
	"unstructuredxml": UnstructuredXMLLoader,
	# 6. All files
	"directory": DirectoryLoader,
	# Takes a list of file paths and loads them one by one.
	"unstructured": UnstructuredLoader,
	# Resolves the actual loader class from langchain_upstage when instantiated.
	"upstagedocumentparse": UpstageLayoutAnalysisLoader,
}

# Registry of text-splitter / node-parser classes keyed by chunk-method name.
# The first group comes from llama_index.core.node_parser, the second from
# langchain_text_splitters (see the imports at the top of this module).
chunk_modules = {
	# Llama Index
	# Token
	"token": TokenTextSplitter,
	# Sentence
	"sentence": SentenceSplitter,
	# window
	"sentencewindow": SentenceWindowNodeParser,
	# Semantic
	"semantic_llama_index": SemanticSplitterNodeParser,
	"semanticdoublemerging": SemanticDoubleMergingSplitterNodeParser,
	# Simple
	"simplefile": SimpleFileNodeParser,
	# LangChain
	# Token
	"sentencetransformerstoken": SentenceTransformersTokenTextSplitter,
	# Character
	"recursivecharacter": RecursiveCharacterTextSplitter,
	"character": CharacterTextSplitter,
	# Sentence
	"konlpy": KonlpyTextSplitter,
}



[docs]
def split_by_sentence_kiwi() -> Callable[[str], List[str]]:
	"""Build a sentence splitter backed by kiwipiepy's ``Kiwi``.

	``kiwipiepy`` is imported inside the function, so it is only required
	when this factory is actually called. One ``Kiwi`` instance is created
	per call and shared by the returned function.

	:return: A function that takes a text and returns the ``text`` of each
		sentence produced by ``Kiwi.split_into_sents``.
	:raises ImportError: When ``kiwipiepy`` cannot be imported; the original
		import error is attached as the cause.
	"""
	try:
		from kiwipiepy import Kiwi
	except ImportError as exc:
		# Chain explicitly so the traceback shows the original import failure
		# as the direct cause of this installation hint.
		raise ImportError(
			"You need to install kiwipiepy to use 'ko_kiwi' tokenizer. "
			"Please install kiwipiepy by running 'pip install kiwipiepy'. "
			"Or install Korean version of AutoRAG by running 'pip install AutoRAG[ko]'."
		) from exc
	kiwi = Kiwi()

	def split(text: str) -> List[str]:
		"""Return the text of every sentence Kiwi finds in ``text``."""
		return [sentence.text for sentence in kiwi.split_into_sents(text)]

	return split



# Registry of sentence-splitter factories keyed by name.
# split_by_sentence_kiwi is passed to LazyInit uncalled, so kiwipiepy is not
# imported by this statement itself.
# NOTE(review): LazyInit presumably defers the call until first use — confirm
# against its definition in the autorag package.
sentence_splitter_modules = {"kiwi": LazyInit(split_by_sentence_kiwi)}