Skip to content

Commit

Permalink
Add embedding scripts
Browse files Browse the repository at this point in the history
Signed-off-by: Evangelos Lamprou <[email protected]>
  • Loading branch information
vagos committed Jan 14, 2025
1 parent fa0d9db commit 690e0fb
Show file tree
Hide file tree
Showing 2 changed files with 161 additions and 40 deletions.
61 changes: 61 additions & 0 deletions infrastructure/do_embedding.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
import json
import glob
import os
from openai import OpenAI
import pandas as pd
import dotenv

# Load environment variables via dotenv before the API key is read below.
dotenv.load_dotenv()

# Module-level OpenAI client used by read_json_and_generate_embeddings.
# The key comes from the OPENAI_API_KEY environment variable.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

def read_json_and_generate_embeddings(json_file):
    """
    Build one embedding per benchmark from the scripts listed in a JSON file.

    The JSON file maps each benchmark name to an object whose optional
    "scripts" entry is a list of glob patterns. Every pattern is resolved as
    "../<pattern>" (relative to the current working directory), the matching
    files are concatenated, and the combined text is sent to the OpenAI
    embeddings API through the module-level ``client``.

    Parameters:
        json_file (str): Path to the JSON file describing the benchmarks.

    Returns:
        pd.DataFrame: A single dataframe with the columns "benchmark" and
        "embedding", holding one row per successfully embedded benchmark.
        Benchmarks with no script content, or whose API request fails, are
        reported on stdout and left out.
    """
    with open(json_file, 'r', encoding="utf-8") as file:
        data = json.load(file)

    # Collect rows in a plain list and build the dataframe once at the end;
    # this avoids the private DataFrame._append and repeated frame copies.
    rows = []

    for benchmark, details in data.items():
        print(f"Processing benchmark: {benchmark}")

        # Combine all script contents into a single string.
        parts = []
        for script_glob in details.get("scripts", []):
            # glob returns matches in arbitrary order; sort them so the text
            # that gets embedded is reproducible across runs and machines.
            for script_file in sorted(glob.glob(f"../{script_glob}")):
                # errors="replace" keeps one stray byte from aborting the run.
                with open(script_file, 'r', encoding="utf-8", errors="replace") as f:
                    parts.append(f.read() + "\n")
        combined_script = "".join(parts)

        print(f"Combined script for {benchmark}: {combined_script}")

        # Nothing matched (or every file was missing): there is no text to
        # embed, so skip the request instead of sending an empty input.
        if not combined_script:
            print(f"Error generating embedding for {benchmark}: no script content found")
            continue

        # Best-effort: a failure for one benchmark must not stop the others.
        try:
            response = client.embeddings.create(
                model="text-embedding-ada-002",  # Use a suitable model for embedding
                input=combined_script,
            )
            embedding = response.data[0].embedding
        except Exception as e:
            print(f"Error generating embedding for {benchmark}: {e}")
            continue

        rows.append({"benchmark": benchmark, "embedding": embedding})

    # Passing columns explicitly keeps the schema stable even with zero rows.
    return pd.DataFrame(rows, columns=["benchmark", "embedding"])

# Example usage
if __name__ == "__main__":
    # Embed every benchmark described by the glob manifest.
    embeddings_df = read_json_and_generate_embeddings("./data/script-globs.json")

    # Show the resulting table, then persist it as CSV without the index.
    print(embeddings_df)
    embeddings_df.to_csv("./data/embeddings.csv", index=False)
140 changes: 100 additions & 40 deletions infrastructure/do_pca.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,50 +3,110 @@
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from adjustText import adjust_text

def _prepare_pca(dataframe, n_components=4):
    """Standardize the numeric columns of *dataframe* and project them with PCA.

    Returns a tuple ``(pca_df, labels)``: ``pca_df`` has the columns
    ``PC1`` .. ``PC<n_components>`` and ``labels`` holds one annotation label
    per retained row (the ``benchmark`` column when present, otherwise the
    row index rendered as strings).
    """
    # Ensure numeric columns are selected for PCA
    numeric_cols = dataframe.select_dtypes(include=[np.number]).columns
    if numeric_cols.empty:
        raise ValueError("No numeric columns available in the dataframe for PCA.")
    print(f"Numeric columns selected for PCA: {numeric_cols}")

    # Drop rows with NaN in any numeric column; labels are taken from the
    # cleaned frame so they stay aligned with the PCA rows.
    dataframe_clean = dataframe.dropna(subset=numeric_cols)
    if 'benchmark' in dataframe_clean.columns:
        labels = dataframe_clean['benchmark'].values
    else:
        labels = dataframe_clean.index.astype(str).values

    # Standardize, then reduce to the requested number of components
    data_scaled = StandardScaler().fit_transform(dataframe_clean[numeric_cols])
    principal_components = PCA(n_components=n_components).fit_transform(data_scaled)

    pca_df = pd.DataFrame(
        data=principal_components,
        columns=[f'PC{i}' for i in range(1, n_components + 1)]
    )
    return pca_df, labels


def _plot_with_labels(ax, x, y, labels, title, first_component):
    """Scatter *x* against *y* on *ax* and annotate every point with its label."""
    ax.scatter(x, y, c='black', alpha=0.7)
    if title:
        ax.set_title(title, fontsize=14, loc='left')
    ax.set_xlabel(f'Component {first_component}', fontsize=14)
    ax.set_ylabel(f'Component {first_component + 1}', fontsize=14)
    ax.grid(color='lightgray', linestyle='--', linewidth=0.5)

    # Iterate positionally so the annotation does not depend on the index
    # labels of x and y.
    texts = [
        ax.text(px, py, label, fontsize=14, ha='center', va='center')
        for px, py, label in zip(x, y, labels)
    ]
    # adjust_text nudges the labels apart to avoid collisions
    adjust_text(texts, ax=ax, arrowprops=dict(arrowstyle='-', color='gray', lw=0.5))


def perform_pca_and_plot(dataframe1, dataframe2, name='row_analysis'):
    """
    Performs PCA on the numeric columns of two input dataframes and plots each pair of principal components
    (1&2 and 3&4) in a 2x2 grid, with one dataset per row and a title on the left plot of each row.
    Each point is annotated with the corresponding benchmark name, avoiding label collisions.
    The figure is written to 'pca-row-plot-<name>.pdf' in the current working directory.

    Parameters:
    dataframe1 (pd.DataFrame): First input dataframe (titled 'PCA from collected metrics').
    dataframe2 (pd.DataFrame): Second input dataframe (titled 'PCA from language model embeddings').
    name (str): Name for saving the plots.

    Returns:
    tuple: Two dataframes containing the principal components (PC1..PC4) for each input dataframe.

    Raises:
    ValueError: If a dataframe has no numeric columns. The PCA step itself may also raise
    when the data cannot support four components.
    """
    # Perform PCA on both dataframes
    pca_df1, benchmarks1 = _prepare_pca(dataframe1)
    pca_df2, benchmarks2 = _prepare_pca(dataframe2)

    # Create a 2x2 grid for the plots: one dataset per row
    fig, axes = plt.subplots(2, 2, figsize=(12, 12), constrained_layout=True)

    datasets = (
        (pca_df1, benchmarks1, 'PCA from collected metrics'),
        (pca_df2, benchmarks2, 'PCA from language model embeddings'),
    )
    for row, (pca_df, benchmarks, title) in enumerate(datasets):
        # Left column: components 1 and 2 (carries the dataset title)
        _plot_with_labels(axes[row, 0], pca_df['PC1'], pca_df['PC2'], benchmarks, title, 1)
        # Right column: components 3 and 4 (untitled)
        _plot_with_labels(axes[row, 1], pca_df['PC3'], pca_df['PC4'], benchmarks, '', 3)

    # Save the plots, then release the figure so repeated calls do not
    # accumulate open figures.
    fig.savefig(f'pca-row-plot-{name}.pdf', format='pdf')
    plt.close(fig)

    return pca_df1, pca_df2

0 comments on commit 690e0fb

Please sign in to comment.