Add embedding scripts

Signed-off-by: Evangelos Lamprou <[email protected]>
binpash · Jan 14, 2025 · 690e0fb · 690e0fb
1 parent fa0d9db
commit 690e0fb
Show file tree

Hide file tree

Showing 2 changed files with 161 additions and 40 deletions.
diff --git a/infrastructure/do_embedding.py b/infrastructure/do_embedding.py
@@ -0,0 +1,61 @@
+import json
+import glob
+import os
+from openai import OpenAI
+import pandas as pd
+import dotenv
+
+# Load environment variables
+dotenv.load_dotenv()
+
+# Shared OpenAI client used by read_json_and_generate_embeddings. The API key
+# is read from the OPENAI_API_KEY environment variable, so set it (for example
+# in a .env file) before running this script.
+client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
+
+
+def read_json_and_generate_embeddings(json_file, *, base_dir="..", model="text-embedding-ada-002"):
+    """
+    Reads the JSON file, combines the script contents of each benchmark, sends them to the
+    OpenAI embedding API, and returns a single dataframe with one embedding per benchmark.
+
+    The JSON file is expected to map each benchmark name to an object whose optional
+    "scripts" entry is a list of glob patterns. Benchmarks with no readable script content,
+    or whose embedding request fails, are reported on stdout and left out of the result.
+
+    Parameters:
+        json_file (str): Path to the JSON file that lists the script globs per benchmark.
+        base_dir (str): Directory the glob patterns are resolved against.
+        model (str): Name of the OpenAI embedding model to query.
+
+    Returns:
+        pd.DataFrame: One row per embedded benchmark, with the columns "benchmark" and
+        "embedding" (the embedding vector as returned by the API).
+    """
+    # Read the JSON file
+    with open(json_file, 'r', encoding='utf-8') as file:
+        data = json.load(file)
+
+    # Rows are collected in a list and turned into a dataframe once at the end;
+    # this avoids the private DataFrame._append and its copy-per-row cost.
+    rows = []
+
+    # Process each benchmark
+    for benchmark, details in data.items():
+        print(f"Processing benchmark: {benchmark}")
+
+        # Resolve the globs to regular files. Matches are sorted because glob
+        # returns them in arbitrary order and the embedding depends on the
+        # order in which the scripts are concatenated.
+        script_files = [
+            script_file
+            for script_glob in details.get("scripts", [])
+            for script_file in sorted(glob.glob(f"{base_dir}/{script_glob}"))
+            if os.path.isfile(script_file)
+        ]
+
+        # Combine all script contents into a single string
+        parts = []
+        for script_file in script_files:
+            # Undecodable bytes are replaced so one odd script cannot abort the run
+            with open(script_file, 'r', encoding='utf-8', errors='replace') as f:
+                parts.append(f.read() + "\n")
+        combined_script = "".join(parts)
+
+        # Printed once per benchmark, after every glob has been processed
+        print(f"Combined script for {benchmark}: {combined_script}")
+
+        # An empty input cannot be embedded; skip it without spending an API call
+        if not combined_script.strip():
+            print(f"Error generating embedding for {benchmark}: no script content found")
+            continue
+
+        # Generate embedding using OpenAI's API
+        try:
+            response = client.embeddings.create(model=model, input=combined_script)
+            embedding = response.data[0].embedding
+        except Exception as e:
+            # Best effort: a failed request must not abort the remaining benchmarks
+            print(f"Error generating embedding for {benchmark}: {e}")
+            continue
+
+        rows.append({"benchmark": benchmark, "embedding": embedding})
+
+    return pd.DataFrame(rows, columns=["benchmark", "embedding"])
+
+# Script entry point: embed every benchmark listed in the glob manifest
+if __name__ == "__main__":
+    embeddings_df = read_json_and_generate_embeddings("./data/script-globs.json")
+
+    # Show the resulting table, then store it as CSV without the index column
+    print(embeddings_df)
+    embeddings_df.to_csv("./data/embeddings.csv", index=False)
diff --git a/infrastructure/do_pca.py b/infrastructure/do_pca.py
@@ -3,50 +3,110 @@
 from sklearn.decomposition import PCA
 from sklearn.preprocessing import StandardScaler
 import matplotlib.pyplot as plt
+from adjustText import adjust_text
 
-def perform_pca_and_plot(dataframe):
+def perform_pca_and_plot(dataframe1, dataframe2, name='row_analysis'):
     """
-    Performs PCA on the numeric columns of the input dataframe and plots the first two principal components.
+    Performs PCA on the numeric columns of two input dataframes and plots each pair of principal components
+    (1&2 and 3&4) in a 2x2 grid, with one dataset per row and unified titles for each dataset.
+    Each point is annotated with the corresponding benchmark name, avoiding label collisions.
+    The figure is saved as 'pca-row-plot-{name}.pdf' (a relative path).
 
     Parameters:
-        dataframe (pd.DataFrame): Input dataframe containing data for PCA.
+        dataframe1 (pd.DataFrame): First input dataframe; plotted in the top row under the title
+            'PCA from collected metrics'. Must contain a 'benchmark' column.
+        dataframe2 (pd.DataFrame): Second input dataframe; plotted in the bottom row under the title
+            'PCA from language model embeddings'. Must contain a 'benchmark' column.
+        name (str): Name for saving the plots.
 
     Returns:
-        pd.DataFrame: A dataframe containing the principal components.
+        tuple: Two dataframes containing the principal components for each input dataframe.
+
+    Raises:
+        ValueError: If either dataframe has no numeric columns.
     """
-    # Ensure numeric columns are selected for PCA
-    numeric_cols = dataframe.select_dtypes(include=[np.number]).columns
-    if numeric_cols.empty:
-        raise ValueError("No numeric columns available in the dataframe for PCA.")
-    print(f"Numeric columns selected for PCA: {numeric_cols}")
-
-    # Drop rows with NaN values in numeric columns (if any)
-    dataframe_numeric = dataframe[numeric_cols].dropna()
-
-    # Standardize the data
-    scaler = StandardScaler()
-    data_scaled = scaler.fit_transform(dataframe_numeric)
-
-    # Perform PCA
-    pca = PCA(n_components=2)  # Reduce to 2 components for visualization
-    principal_components = pca.fit_transform(data_scaled)
-
-    # Create a new dataframe with the principal components
-    pca_df = pd.DataFrame(data=principal_components, columns=['PC1', 'PC2'])
-
-    # Plot the results
-    plt.figure(figsize=(10, 7))
-    plt.scatter(pca_df['PC1'], pca_df['PC2'], alpha=0.7)
-    plt.title('PCA of Input DataFrame', fontsize=16)
-    plt.xlabel('Principal Component 1', fontsize=12)
-    plt.ylabel('Principal Component 2', fontsize=12)
-    plt.grid(True)
-
-    # Optionally, add labels for points (if 'benchmark' column exists)
-    if 'benchmark' in dataframe.columns:
-        for i, label in enumerate(dataframe['benchmark']):
-            plt.annotate(label, (pca_df['PC1'][i], pca_df['PC2'][i]), fontsize=8, alpha=0.6)
-
-    plt.savefig('pca_plot.pdf')
-
-    return pca_df
+    def prepare_pca(dataframe):
+        # Returns (pca_df, benchmark_names) for one input dataframe.
+        # Ensure numeric columns are selected for PCA
+        numeric_cols = dataframe.select_dtypes(include=[np.number]).columns
+        if numeric_cols.empty:
+            raise ValueError("No numeric columns available in the dataframe for PCA.")
+        print(f"Numeric columns selected for PCA: {numeric_cols}")
+
+        # Drop rows with NaN values in numeric columns and retain their indices for annotation
+        dataframe_clean = dataframe.dropna(subset=numeric_cols)
+        # Labels are taken from the cleaned frame so they stay aligned with the PCA rows;
+        # the 'benchmark' column is required here.
+        benchmark_names = dataframe_clean['benchmark'].values
+
+        # Standardize the data
+        scaler = StandardScaler()
+        data_scaled = scaler.fit_transform(dataframe_clean[numeric_cols])
+
+        # Perform PCA
+        # NOTE(review): 4 components presumably need at least 4 rows and 4 numeric
+        # columns after cleaning; confirm against the scikit-learn PCA documentation.
+        pca = PCA(n_components=4)  # Reduce to 4 components for analysis
+        principal_components = pca.fit_transform(data_scaled)
+
+        # Create a new dataframe with the principal components
+        # (built from a plain array, so row i corresponds to benchmark_names[i])
+        pca_df = pd.DataFrame(
+            data=principal_components, 
+            columns=['PC1', 'PC2', 'PC3', 'PC4']
+        )
+        return pca_df, benchmark_names
+
+    # Perform PCA on both dataframes
+    pca_df1, benchmarks1 = prepare_pca(dataframe1)
+    pca_df2, benchmarks2 = prepare_pca(dataframe2)
+
+    # Create a 2x2 grid for the plots
+    fig, axes = plt.subplots(2, 2, figsize=(12, 12), constrained_layout=True)
+
+    # Set the main titles for each dataset
+    # (only the left-hand plot of each row carries a title)
+    axes[0, 0].set_title('PCA from collected metrics', fontsize=14, loc='left')
+    axes[1, 0].set_title('PCA from language model embeddings', fontsize=14, loc='left')
+
+    # Helper function to plot and annotate
+    # 'secondary' switches the axis labels from components 1/2 to components 3/4.
+    # 'title' is accepted but currently unused: the set_title call below is commented out.
+    def plot_with_labels(ax, x, y, labels, title, secondary=False):
+        scatter = ax.scatter(x, y, c='black', alpha=0.7)
+        # ax.set_title(title, fontsize=14, loc='left')
+        ax.set_xlabel(f'Component {1 if not secondary else 3}', fontsize=14)
+        ax.set_ylabel(f'Component {2 if not secondary else 4}', fontsize=14)
+        ax.grid(color='lightgray', linestyle='--', linewidth=0.5)
+
+        # Add text annotations
+        # One text label per point; adjust_text (adjustText package) is then used to
+        # keep the labels from colliding, drawing thin gray connector lines.
+        texts = [ax.text(x[i], y[i], labels[i], fontsize=14, ha='center', va='center') for i in range(len(labels))]
+        adjust_text(texts, ax=ax, arrowprops=dict(arrowstyle='-', color='gray', lw=0.5))
+
+    # Plot Components 1 and 2 for Dataset 1
+    plot_with_labels(
+        axes[0, 0], 
+        pca_df1['PC1'], 
+        pca_df1['PC2'], 
+        benchmarks1, 
+        'PCA from collected metrics'
+    )
+
+    # Plot Components 3 and 4 for Dataset 1
+    plot_with_labels(
+        axes[0, 1], 
+        pca_df1['PC3'], 
+        pca_df1['PC4'], 
+        benchmarks1, 
+        '',
+        True
+    )
+
+    # Plot Components 1 and 2 for Dataset 2
+    plot_with_labels(
+        axes[1, 0], 
+        pca_df2['PC1'], 
+        pca_df2['PC2'], 
+        benchmarks2, 
+        'PCA from language model embeddings'
+    )
+
+    # Plot Components 3 and 4 for Dataset 2
+    plot_with_labels(
+        axes[1, 1], 
+        pca_df2['PC3'], 
+        pca_df2['PC4'], 
+        benchmarks2, 
+        '',
+        True
+    )
+
+    # Save the plots
+    # plt.savefig writes the current pyplot figure, presumably the grid created above.
+    plt.savefig(f'pca-row-plot-{name}.pdf', format='pdf')
+
+    return pca_df1, pca_df2