Skip to content

Commit

Permalink
updated to export fields with appropriate name
Browse files Browse the repository at this point in the history
Signed-off-by: Francisco Javier Arceo <[email protected]>
  • Loading branch information
franciscojavierarceo committed Jan 9, 2025
1 parent 8271955 commit e959053
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 2 deletions.
24 changes: 22 additions & 2 deletions module_4_rag/batch_score_documents.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,20 +59,40 @@ def score_data() -> None:

tokenizer = AutoTokenizer.from_pretrained(TOKENIZER)
model = AutoModel.from_pretrained(MODEL)
embeddings = run_model(df["Wiki Summary"].tolist(), tokenizer, model)
embeddings = run_model(df["Sentence Chunks"].tolist(), tokenizer, model)
print("embeddings generated...")
df["id"] = [i for i in range(len(df))]
df["Embeddings"] = list(embeddings.detach().cpu().numpy())
df["event_timestamp"] = pd.to_datetime("today")
df["item_id"] = df.index
df = _rename_df(df)

df.to_parquet(EXPORT_FILENAME, index=False)
print("...data exported. Job complete")
else:
df = pd.read_parquet(EXPORT_FILENAME)
print("Scored data found... skipping generating embeddings.")

print("preview of data:")
print(df.head().T)


def _rename_df(df: pd.DataFrame) -> pd.DataFrame:
df.columns = [c.replace(" ", "_").lower() for c in df.columns]
df.rename({"embeddings": "vector"}, axis=1, inplace=True)
df = df[
[
"id",
"item_id",
"event_timestamp",
"state",
"wiki_summary",
"sentence_chunks",
"vector",
]
]
return df


if __name__ == "__main__":
    # Script entry point: generate embeddings (or load the cached parquet
    # export if it already exists) and print a preview of the scored data.
    score_data()
Binary file not shown.

0 comments on commit e959053

Please sign in to comment.