Source code for autorag.data.legacy.corpus.llama_index
import uuid
from typing import List, Optional
import pandas as pd
from llama_index.core import Document
from llama_index.core.schema import TextNode
from autorag.data.utils.util import (
    add_essential_metadata,
    add_essential_metadata_llama_text_node,
)
from autorag.utils.util import save_parquet_safe
def llama_documents_to_parquet(
    llama_documents: List[Document],
    output_filepath: Optional[str] = None,
    upsert: bool = False,
) -> pd.DataFrame:
"""
Llama Index documents to corpus dataframe.
Corpus dataframe will be saved to filepath(file_dir/filename) if given.
Return corpus dataframe whether the filepath is given.
You can use this method to create corpus.parquet after load and chunk using Llama Index.
:param llama_documents: List[Document]
:param output_filepath: Optional filepath to save the parquet file.
If None, the function will return the processed_data as pd.DataFrame, but do not save as parquet.
File directory must exist. File extension must be .parquet
:param upsert: If true, the function will overwrite the existing file if it exists.
Default is False.
:return: Corpus data as pd.DataFrame
"""
    # Build one row per document: a newly generated UUID, the document text,
    # and its metadata with essential fields filled in.
    doc_lst = list(
        map(
            lambda doc: {
                "doc_id": str(uuid.uuid4()),
                "contents": doc.text,
                "metadata": add_essential_metadata(doc.metadata),
            },
            llama_documents,
        )
    )
    processed_df = pd.DataFrame(doc_lst)
    if output_filepath is not None:
        save_parquet_safe(processed_df, output_filepath, upsert=upsert)
    return processed_df
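
# A minimal usage sketch (not part of the module): the "raw_docs" directory,
# the SimpleDirectoryReader choice, and the output path below are illustrative
# assumptions, not something this module prescribes.
#
#     from llama_index.core import SimpleDirectoryReader
#     from autorag.data.legacy.corpus.llama_index import llama_documents_to_parquet
#
#     documents = SimpleDirectoryReader("raw_docs").load_data()
#     corpus_df = llama_documents_to_parquet(
#         documents, output_filepath="./data/corpus.parquet", upsert=True
#     )
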
def llama_text_node_to_parquet(
    text_nodes: List[TextNode],
    output_filepath: Optional[str] = None,
    upsert: bool = False,
) -> pd.DataFrame:
"""
Llama Index text nodes to corpus dataframe.
Corpus dataframe will be saved to filepath(file_dir/filename) if given.
Return corpus dataframe whether the filepath is given.
You can use this method to create corpus.parquet after load and chunk using Llama Index.
:param text_nodes: List of llama index text nodes.
:param output_filepath: Optional filepath to save the parquet file.
If None, the function will return the processed_data as pd.DataFrame, but do not save as parquet.
File directory must exist. File extension must be .parquet
:param upsert: If true, the function will overwrite the existing file if it exists.
Default is False.
:return: Corpus data as pd.DataFrame
"""
    # Build one row per node, reusing the node_id as doc_id and keeping
    # relationship information in the essential metadata.
    corpus_df = pd.DataFrame(
        list(
            map(
                lambda node: {
                    "doc_id": node.node_id,
                    "contents": node.text,
                    "metadata": add_essential_metadata_llama_text_node(
                        node.metadata, node.relationships
                    ),
                },
                text_nodes,
            )
        )
    )
    if output_filepath is not None:
        save_parquet_safe(corpus_df, output_filepath, upsert=upsert)
    return corpus_df
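
# A minimal usage sketch (not part of the module): the "raw_docs" directory,
# the SentenceSplitter settings, and the output path below are illustrative
# assumptions, not something this module prescribes.
#
#     from llama_index.core import SimpleDirectoryReader
#     from llama_index.core.node_parser import SentenceSplitter
#     from autorag.data.legacy.corpus.llama_index import llama_text_node_to_parquet
#
#     documents = SimpleDirectoryReader("raw_docs").load_data()
#     nodes = SentenceSplitter(chunk_size=512).get_nodes_from_documents(documents)
#     corpus_df = llama_text_node_to_parquet(
#         nodes, output_filepath="./data/corpus.parquet", upsert=True
#     )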