Source code for autorag.data.parse.langchain_parse
import multiprocessing as mp
from itertools import chain
from typing import List, Tuple
from autorag.data import parse_modules
from autorag.data.parse.base import parser_node
[docs]
@parser_node
def langchain_parse(
    data_path_list: List[str], parse_method: str, **kwargs
) -> Tuple[List[str], List[str], List[int]]:
    """
    Parse documents to use langchain document_loaders(parse) method

    The "directory" and "unstructured" methods are handed the whole path list
    in a single ``parse_all_files`` call. Every other method is applied to
    each path separately via ``langchain_parse_pure`` in a process pool.

    :param data_path_list: The list of data paths to parse.
    :param parse_method: A langchain document_loaders(parse) method to use.
    :param kwargs: The extra parameters for creating the langchain document_loaders(parse) instance.
        On the per-file path they are sent to worker processes, so they have to be picklable.
    :return: tuple of lists containing the parsed texts, path and pages.
        The three lists are parallel. For "directory" and "unstructured" every page entry is -1.
        An empty ``data_path_list`` on the per-file path yields three empty lists.
    """
    if parse_method in ["directory", "unstructured"]:
        texts, path = parse_all_files(data_path_list, parse_method, **kwargs)
        # These loaders report no page numbers here, so every entry gets -1.
        pages = [-1] * len(texts)
        return texts, path, pages

    # Without this guard zip(*results) below is empty and the three-way
    # unpacking raises ValueError instead of returning an empty result.
    if not data_path_list:
        return [], [], []

    # Never start more workers than there are files to parse.
    num_workers = min(mp.cpu_count(), len(data_path_list))

    # Execute parallel processing. starmap only unpacks positional tuples,
    # so kwargs travels as a plain dict argument.
    with mp.Pool(num_workers) as pool:
        results = pool.starmap(
            langchain_parse_pure,
            [(data_path, parse_method, kwargs) for data_path in data_path_list],
        )

    # results holds one (texts, path, pages) triple per file: transpose it
    # and flatten each column, keeping the input file order.
    texts, path, pages = (
        list(chain.from_iterable(column)) for column in zip(*results)
    )
    return texts, path, pages
[docs]
def langchain_parse_pure(
    data_path: str, parse_method: str, kwargs
) -> Tuple[List[str], List[str], List[int]]:
    """
    Parse a single file using the specified parse method.

    :param data_path: The file path to parse.
    :param parse_method: The parsing method to use. It is looked up in ``parse_modules``,
        so an unregistered name surfaces as the lookup error of that container.
    :param kwargs: Dict of additional keyword arguments for the parse instance.
        It is a plain positional parameter (not ``**kwargs``); it is unpacked
        when the parse instance is created.
    :return: Three parallel lists with one entry per loaded document:
        the page contents, ``data_path`` repeated, and the page numbers.
        Page numbers count up from 1 for "pymupdf", "pdfplumber", "pypdf" and
        "pypdfium2"; for every other method each entry is -1.
    """
    parse_instance = parse_modules[parse_method](data_path, **kwargs)

    # Load the text from the file
    documents = parse_instance.load()
    texts = [document.page_content for document in documents]
    path = [data_path] * len(texts)

    if parse_method in ("pymupdf", "pdfplumber", "pypdf", "pypdfium2"):
        # Numbering is positional: assumes these loaders return one document
        # per page, in page order -- TODO confirm against the loaders.
        pages = list(range(1, len(texts) + 1))
    else:
        pages = [-1] * len(texts)

    # No explicit `del parse_instance`: the local is released on return.
    return texts, path, pages
[docs]
def parse_all_files(
    data_path_list: List[str], parse_method: str, **kwargs
) -> Tuple[List[str], List[str]]:
    """
    Load every document in one call with a loader that handles many files.

    :param data_path_list: The list of data paths to parse. It is passed to
        the loader only for "unstructured"; "directory" is built from
        ``kwargs`` alone.
    :param parse_method: Either "unstructured" or "directory".
    :param kwargs: The extra parameters for creating the loader instance.
    :return: tuple of two parallel lists: the page content of each loaded
        document and its ``metadata["source"]`` value.
    :raises ValueError: If ``parse_method`` is neither supported method.
    """
    # Reject unknown methods up front, before any loader is looked up.
    if parse_method not in ("unstructured", "directory"):
        raise ValueError(f"Unsupported parse method: {parse_method}")

    loader_factory = parse_modules[parse_method]
    if parse_method == "unstructured":
        loader = loader_factory(data_path_list, **kwargs)
    else:
        # "directory": the paths are not forwarded, only the keyword arguments.
        loader = loader_factory(**kwargs)

    loaded = loader.load()
    contents = [item.page_content for item in loaded]
    sources = [item.metadata["source"] for item in loaded]

    # Drop the loader reference before handing the results back.
    del loader
    return contents, sources