Source code for autorag.data.legacy.corpus.langchain
import uuid
from typing import List, Optional
import pandas as pd
from langchain_core.documents import Document
from autorag.data.utils.util import add_essential_metadata
from autorag.utils.util import save_parquet_safe
def langchain_documents_to_parquet(
    langchain_documents: List[Document],
    output_filepath: Optional[str] = None,
    upsert: bool = False,
) -> pd.DataFrame:
"""
Langchain documents to corpus dataframe.
Corpus dataframe will be saved to filepath(file_dir/filename) if given.
Return corpus dataframe whether the filepath is given.
You can use this method to create corpus.parquet after load and chunk using Llama Index.
:param langchain_documents: List of langchain documents.
:param output_filepath: Optional filepath to save the parquet file.
If None, the function will return the processed_data as pd.DataFrame, but do not save as parquet.
File directory must exist. File extension must be .parquet
:param upsert: If true, the function will overwrite the existing file if it exists.
Default is False.
:return: Corpus data as pd.DataFrame
"""
    corpus_df = pd.DataFrame(
        list(
            map(
                lambda doc: {
                    "doc_id": str(uuid.uuid4()),
                    "contents": doc.page_content,
                    "metadata": add_essential_metadata(doc.metadata),
                },
                langchain_documents,
            )
        )
    )
    if output_filepath is not None:
        save_parquet_safe(corpus_df, output_filepath, upsert=upsert)
    return corpus_df
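
A minimal usage sketch of the function above, assuming the documents have already been loaded and chunked with LangChain; the document contents, metadata, and output path below are illustrative placeholders, not values from this module.

from langchain_core.documents import Document

from autorag.data.legacy.corpus.langchain import langchain_documents_to_parquet

# Hypothetical documents; in practice these usually come from a LangChain loader and text splitter.
docs = [
    Document(page_content="AutoRAG builds a corpus from chunked documents.", metadata={"source": "intro.md"}),
    Document(page_content="Parquet is a columnar file format.", metadata={"source": "guide.md"}),
]

# Build the corpus dataframe and also write it to ./corpus.parquet (illustrative path).
corpus_df = langchain_documents_to_parquet(
    docs, output_filepath="corpus.parquet", upsert=True
)
print(corpus_df[["doc_id", "contents"]])

Each row gets a fresh UUID as doc_id, so calling the function twice on the same documents produces different doc_id values; pass upsert=True only if overwriting an existing parquet file is acceptable.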