Commit
Merge pull request #21 from feast-dev/rag
chore: Updating the chunking to do sentence chunking
franciscojavierarceo authored Jan 7, 2025
2 parents c5af6c5 + 7a76a33 commit f0f64fd
Showing 2 changed files with 27 additions and 11 deletions.
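For orientation: the change below splits each Wiki Summary into sentence chunks with NLTK's sent_tokenize and explodes them into one row per sentence before scoring. A minimal, self-contained sketch of that chunking step (illustrative only, with a made-up summary; not code from this commit):

import nltk
import pandas as pd
from nltk.tokenize import sent_tokenize

nltk.download("punkt", quiet=True)  # sent_tokenize needs the pretrained punkt models

# Stand-in for data/city_wikipedia_summaries.csv
df = pd.DataFrame({"Wiki Summary": ["New York is a large city. It sits on the Atlantic coast."]})

# Split each summary into sentences, then give each sentence its own row
df["Sentence Chunks"] = df["Wiki Summary"].apply(sent_tokenize)
chunked_df = df.explode("Sentence Chunks")
print(chunked_df["Sentence Chunks"].tolist())
# ['New York is a large city.', 'It sits on the Atlantic coast.']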
38 changes: 27 additions & 11 deletions module_4_rag/batch_score_documents.py
@@ -1,11 +1,17 @@
 import os
 import pandas as pd
+from nltk.tokenize import sent_tokenize
 from transformers import AutoTokenizer, AutoModel
 import torch
 import torch.nn.functional as F
 
-INPUT_FILENAME = "./data/city_wikipedia_summaries.csv"
-EXPORT_FILENAME = "./data/city_wikipedia_summaries_with_embeddings.parquet"
+BASE_DIR = os.path.abspath(os.path.join(os.getcwd(), "feature_repo"))
+DATA_DIR = os.path.join(BASE_DIR, "data")
+INPUT_FILENAME = os.path.join(DATA_DIR, "city_wikipedia_summaries.csv")
+CHUNKED_FILENAME = os.path.join(DATA_DIR, "city_wikipedia_summaries_chunked.csv")
+EXPORT_FILENAME = os.path.join(
+    DATA_DIR, "city_wikipedia_summaries_with_embeddings.parquet"
+)
 TOKENIZER = "sentence-transformers/all-MiniLM-L6-v2"
 MODEL = "sentence-transformers/all-MiniLM-L6-v2"
 
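Note on the sent_tokenize import added above: NLTK's sentence tokenizer relies on the pretrained "punkt" models, which are not bundled with the package, so the script raises a LookupError if they are missing. A one-time download fixes that (sketch; some newer NLTK releases look for a "punkt_tab" resource instead, so downloading both is a safe default):

import nltk

nltk.download("punkt")      # sentence tokenizer models used by sent_tokenize
nltk.download("punkt_tab")  # resource name used by newer NLTK releases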
@@ -36,23 +42,33 @@ def run_model(sentences, tokenizer, model):
 
 
 def score_data() -> None:
-    if EXPORT_FILENAME not in os.listdir():
-        print("scored data not found...generating embeddings...")
-        df = pd.read_csv(INPUT_FILENAME)
+    os.makedirs(DATA_DIR, exist_ok=True)
+
+    if not os.path.exists(EXPORT_FILENAME):
+        print("Scored data not found... generating embeddings...")
+
+        if not os.path.exists(CHUNKED_FILENAME):
+            print("Chunked data not found... generating chunked data...")
+            df = pd.read_csv(INPUT_FILENAME)
+            df["Sentence Chunks"] = df["Wiki Summary"].apply(lambda x: sent_tokenize(x))
+            chunked_df = df.explode("Sentence Chunks")
+            chunked_df.to_csv(CHUNKED_FILENAME, index=False)
+            df = chunked_df
+        else:
+            df = pd.read_csv(CHUNKED_FILENAME)
 
         tokenizer = AutoTokenizer.from_pretrained(TOKENIZER)
         model = AutoModel.from_pretrained(MODEL)
         embeddings = run_model(df["Wiki Summary"].tolist(), tokenizer, model)
         print(embeddings)
-        print("shape = ", df.shape)
-        df["Embeddings"] = list(embeddings.detach().cpu().numpy())
+        print("embeddings generated...")
+        df["Embeddings"] = list(embeddings.detach().cpu().numpy())
         df["event_timestamp"] = pd.to_datetime("today")
         df["item_id"] = df.index
         print(df.head())
 
         df.to_parquet(EXPORT_FILENAME, index=False)
-        print("...data exported. job complete")
+        print("...data exported. Job complete")
     else:
-        print("scored data found...skipping generating embeddings.")
+        print("Scored data found... skipping generating embeddings.")
 
 
 if __name__ == "__main__":
Binary file not shown.
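The body of run_model is collapsed in this view. For readers following along, a typical mean-pooling helper for sentence-transformers/all-MiniLM-L6-v2 with the imports shown above looks roughly like the sketch below; the signature comes from the hunk header, but the body is an assumption, not the file's actual contents.

import torch
import torch.nn.functional as F


def run_model(sentences, tokenizer, model):
    # Tokenize the batch with padding/truncation so it fits the model's max length
    encoded = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        output = model(**encoded)

    # Mean-pool token embeddings, ignoring padding via the attention mask
    token_embeddings = output[0]
    mask = encoded["attention_mask"].unsqueeze(-1).expand(token_embeddings.size()).float()
    embeddings = (token_embeddings * mask).sum(1) / mask.sum(1).clamp(min=1e-9)

    # L2-normalize so cosine similarity reduces to a dot product
    return F.normalize(embeddings, p=2, dim=1)


# Hypothetical usage mirroring score_data():
# tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
# model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
# embeddings = run_model(["A sentence.", "Another one."], tokenizer, model)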

0 comments on commit f0f64fd