Merge pull request #127 from nishant-sachdeva/seedemb-opt
Extending support for vocabularies of different dimensions
svkeerthy authored Oct 9, 2024
2 parents 6bd3ddc + 12da415 commit 30b8324
Showing 23 changed files with 658 additions and 292 deletions.
2 changes: 1 addition & 1 deletion Manylinux2014_Compliant_Source/pkg/build.sh
@@ -18,7 +18,7 @@ cd ..
cmake -DCMAKE_BUILD_TYPE=Release .. && make -j"$(nproc)" && make install

cd ..
cp build/vocabulary.h Manylinux2014_Compliant_Source/pkg/ir2vec/
cp build/include/Vocabulary*.h Manylinux2014_Compliant_Source/pkg/ir2vec/
cp src/include/utils.h Manylinux2014_Compliant_Source/pkg/ir2vec/
cp src/include/IR2Vec.h Manylinux2014_Compliant_Source/pkg/ir2vec/
cp build/src/version.h Manylinux2014_Compliant_Source/pkg/ir2vec/
26 changes: 14 additions & 12 deletions Manylinux2014_Compliant_Source/pkg/ir2vec/core.cpp
@@ -20,7 +20,6 @@
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/IR/CFG.h"
@@ -75,19 +74,21 @@ class IR2VecHandler {
std::string outputFile;
std::string mode;
std::string level;
unsigned dim;

public:
IR2VecHandler(std::string fileName, std::string outputFile, std::string mode,
std::string level)
: fileName(fileName), outputFile(outputFile), mode(mode), level(level) {}
std::string level, unsigned dim)
: fileName(fileName), outputFile(outputFile), mode(mode), level(level),
dim(dim) {}

std::string getFile() { return fileName; }
std::string getOutputFile() { return outputFile; }
std::string getMode() { return mode; }
std::string getLevel() { return level; }

// Function to get Program Vector List
PyObject *createProgramVectorList(llvm::SmallVector<double, DIM> llvmPgmVec) {
PyObject *createProgramVectorList(IR2Vec::Vector llvmPgmVec) {
// for PgmVector
PyObject *PgmList = PyList_New(0);
for (auto &Pgm_it : llvmPgmVec)
@@ -138,7 +139,6 @@ class IR2VecHandler {
PyObject *instructionVectorList = PyList_New(0);
for (auto &Inst_it : llvmInstVecMap) {
PyObject *instructionVector = PyList_New(0);
// copy this SmallVector into c++ Vector
for (auto &Vec_it : Inst_it.second) {
PyList_Append(instructionVector, PyFloat_FromDouble(Vec_it));
}
@@ -166,10 +166,10 @@ class IR2VecHandler {
ofstream output;
output.open(outFile, ios_base::app);
emb = std::move(new IR2Vec::Embeddings(
*Module, ir2vecMode, (this->level)[0], &output, funcName));
*Module, ir2vecMode, (this->level)[0], &output, this->dim, funcName));
} else {
emb = std::move(new IR2Vec::Embeddings(
*Module, ir2vecMode, (this->level)[0], nullptr, funcName));
*Module, ir2vecMode, (this->level)[0], nullptr, this->dim, funcName));
}

if (!emb) {
@@ -178,7 +178,7 @@
}

if (type == OpType::Program) {
llvm::SmallVector<double, DIM> progVector = emb->getProgramVector();
IR2Vec::Vector progVector = emb->getProgramVector();
return this->createProgramVectorList(progVector);
} else if (type == OpType::Function) {
llvm::SmallMapVector<const llvm::Function *, IR2Vec::Vector, 16>
@@ -293,9 +293,10 @@ PyObject *getFunctionVectors(PyObject *self, PyObject *args) {

IR2VecHandlerObject *createIR2VECObject(const char *filename,
const char *output_file,
const char *mode, const char *level) {
const char *mode, const char *level,
unsigned dim) {
IR2VecHandler *ir2vecObj =
new IR2VecHandler(filename, output_file, mode, level);
new IR2VecHandler(filename, output_file, mode, level, dim);
if (!ir2vecObj) {
return nullptr;
}
@@ -314,8 +315,9 @@ PyObject *initEmbedding(PyObject *self, PyObject *args) {
const char *mode = "\0";
const char *level = "\0";
const char *output_file = "\0";
unsigned dim = 300;

if (!PyArg_ParseTuple(args, "sss|s", &filename, &mode, &level,
if (!PyArg_ParseTuple(args, "sss|Is", &filename, &mode, &level, &dim,
&output_file)) {
// raise error here
PyErr_SetString(PyExc_TypeError, "Invalid Arguments");
@@ -348,7 +350,7 @@ PyObject *initEmbedding(PyObject *self, PyObject *args) {
}

IR2VecHandlerObject *ir2vecObj =
createIR2VECObject(filename, output_file, mode, level);
createIR2VECObject(filename, output_file, mode, level, dim);

if (!ir2vecObj) {
PyErr_SetString(PyExc_TypeError, "Embedding Object not created");
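For context on the `PyArg_ParseTuple` change above: the format string `"sss|Is"` parses three required strings (`filename`, `mode`, `level`), then an optional unsigned int (`dim`, which the declaration above defaults to `300`) and an optional output-file string; everything after the `|` may be omitted. A minimal sketch of the resulting call shapes on the Python side, with hypothetical paths, inferred from this diff rather than from separate documentation:

```python
import ir2vec

# Pre-existing calls keep working: dim falls back to 300 inside core.cpp.
obj_default = ir2vec.initEmbedding("/path/to/input.ll", "fa", "p")

# New optional positional arguments introduced by this patch: dim, then the output file.
obj_dim100 = ir2vec.initEmbedding("/path/to/input.ll", "fa", "p", 100)
obj_logged = ir2vec.initEmbedding("/path/to/input.ll", "fa", "p", 75, "embeddings.txt")
```

Because the arguments are positional, an output file can only be supplied together with an explicit `dim`.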
2 changes: 1 addition & 1 deletion Manylinux2014_Compliant_Source/pkg/tests/test_ir2vec.py
@@ -160,7 +160,7 @@ def test_fa_f():
path = (TEST_SUITE_DIR / file).resolve()
full_path = str(path).strip()

initObj = ir2vec.initEmbedding(full_path, "fa", "f")
initObj = ir2vec.initEmbedding(full_path, "fa", "f", 300)
assert initObj is not None

functionVectorMap = ir2vec.getFunctionVectors(initObj)
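The updated test now passes the default dimension explicitly. A hypothetical companion test, not part of this commit, sketching how a non-default dimension could be exercised through the same calls; `TEST_SUITE_DIR` and the file name below are placeholders modelled on the existing suite:

```python
import pathlib
import ir2vec

TEST_SUITE_DIR = pathlib.Path(__file__).parent  # placeholder; the real suite defines its own

def test_fa_f_dim75():
    # Same flow as test_fa_f above, but requesting 75-dimensional embeddings.
    full_path = str((TEST_SUITE_DIR / "example.ll").resolve()).strip()
    initObj = ir2vec.initEmbedding(full_path, "fa", "f", 75)
    assert initObj is not None

    functionVectorMap = ir2vec.getFunctionVectors(initObj)
    assert functionVectorMap is not None
```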
24 changes: 18 additions & 6 deletions README.md
@@ -113,13 +113,16 @@ To ensure the correctness, run `make check`
instructions.

### Using Binary
> ir2vec -\<mode\> -o \<output-file\> -level \<p|f\> -class \<class-number\> -funcName=\<function-name\> \<input-ll-file\>
> ir2vec -\<mode\> -dim \<dimensions\> -o \<output-file\> -level \<p|f\> -class \<class-number\> -funcName=\<function-name\> \<input-ll-file\>
#### Command-Line options

- `mode` - can be one of `sym`/`fa`
- `sym` denotes Symbolic representation
- `fa` denotes Flow-Aware representation
- `dim` - Dimensions of embeddings
- This is an optional argument. Defaults to `300`.
- Other supported dimensions are `75` and `100`
- `o` - file in which the embeddings are to be appended; (Note : If file doesn’t exist, new file would be created, else embeddings would be appended)
- `level` - can be one of chars `p`/`f`.
- `p` denotes `program level` encoding
@@ -141,16 +144,16 @@ Please use `--help` for further details.
#### Flow-Aware Embeddings
For all functions
* `` ir2vec -fa -o <output_file> -level <p|f> -class <class-number> <input_ll_file>``
* `` ir2vec -fa -dim <dimension> -o <output_file> -level <p|f> -class <class-number> <input_ll_file>``

For a specific function
* `` ir2vec -fa -o <output_file> -level f -class <class-number> -funcName=\<function-name\> <input_ll_file>``
* `` ir2vec -fa -dim <dimension> -o <output_file> -level f -class <class-number> -funcName=\<function-name\> <input_ll_file>``

#### Symbolic Embeddings
For all functions
* `` ir2vec -sym -o <output_file> -level <p|f> -class <class-number> <input_ll_file>``
* `` ir2vec -sym -dim <dimension> -o <output_file> -level <p|f> -class <class-number> <input_ll_file>``
For a specific function
* `` ir2vec -sym -o <output_file> -level f -class <class-number> -funcName=\<function-name\> <input_ll_file>``
* `` ir2vec -sym -dim <dimension> -o <output_file> -level f -class <class-number> -funcName=\<function-name\> <input_ll_file>``

## Using Libraries
The libraries can be installed by passing the installation location to the `CMAKE_INSTALL_PREFIX` flag during `cmake` followed by `make install`.
@@ -178,7 +181,7 @@ The following example snippet shows how to query the exposed vector representati

// Creating object to generate FlowAware representation
auto ir2vec =
IR2Vec::Embeddings(<LLVM Module>, IR2Vec::IR2VecMode::FlowAware);
IR2Vec::Embeddings(<LLVM Module>, IR2Vec::IR2VecMode::FlowAware, <DIM>);

// Getting Instruction vectors corresponding to the instructions in <LLVM Module>
auto instVecMap = ir2vec.getInstVecMap();
@@ -218,6 +221,8 @@ for (auto val : pgmVec)
* `file_path`: str - Path to the `.ll` or `.bc` file.
* `encoding_type`: str - Choose `fa` (Flow-Aware) or `sym` (Symbolic).
* `level`: str - Choose `p` for program-level or `f` for function-level.
* `dim`: uint - Choose from `[300, 100, 75]`. Default value is `300`
* `output_file`: str - If provided, embeddings are saved to this file. Default is an empty string.
**Returns:**
@@ -228,7 +233,14 @@ for (auto val : pgmVec)
```python
import ir2vec
# Approach 1
initObj = ir2vec.initEmbedding("/path/to/file.ll", "fa", "p")
# Approach 2
initObj = ir2vec.initEmbedding("/path/to/file.ll", "fa", "p", 100)
# Approach 3
initObj = ir2vec.initEmbedding("/path/to/file.ll", "fa", "p", 100, "output.txt")
```

### getProgramVector
15 changes: 8 additions & 7 deletions seed_embeddings/OpenKE/analogy.py
@@ -6,13 +6,14 @@
import numpy as np
from sklearn.metrics.pairwise import euclidean_distances


class AnalogyScorer:
def __init__(self, analogy_file="analogies.txt"):
self.entity_dict = {}
self.analogies = self._load_analogies(analogy_file)

def _load_analogies(self, file_path):
with open(file_path, 'r') as f:
with open(file_path, "r") as f:
return [tuple(line.strip().split()) for line in f if line.strip()]

def find_vec(self, str1):
@@ -22,24 +23,24 @@ def gen_similarity_table(self, vec):
keys = list(self.entity_dict.keys())
entity_matrix = np.array(list(self.entity_dict.values()))
vec = vec.reshape(1, -1)

# Calculate distances using euclidean_distances
distances = euclidean_distances(vec, entity_matrix)[0]

return dict(zip(keys, distances))

def findTopk(self, dict1, k, values):
sortedByVal = dict(sorted(dict1.items(), key=lambda x: x[1]))
del sortedByVal[values[0].upper()]
del sortedByVal[values[1].upper()]
del sortedByVal[values[2].upper()]
return {k: sortedByVal[k] for k in list(sortedByVal)[:k]}

def get_analogy_score(self, entity_dict):
def get_analogy_score(self, entity_dict):
self.entity_dict = entity_dict
total_count = len(self.analogies)
correct_count = 0

for values in self.analogies:
vecA = self.find_vec(values[0])
vecB = self.find_vec(values[1])
Expand All @@ -56,4 +57,4 @@ def get_analogy_score(self, entity_dict):

if values[3].upper() in top_k_dict:
correct_count += 1
return correct_count
return correct_count
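Pieced together from this file, the scorer's flow is: each analogy is a 4-tuple of entity names; a query vector is built from the first three, Euclidean distances to every entity are ranked, the three query entities are dropped, and the analogy counts as correct when the fourth entity lands in the top-k. A compact sketch of that loop under those assumptions; since the query arithmetic sits in the collapsed part of the hunk, the `B - A + C` form below is the conventional choice, not a quote from this file:

```python
import numpy as np
from sklearn.metrics.pairwise import euclidean_distances

def analogy_accuracy(entity_dict, analogies, k=5):
    """Fraction of (A, B, C, D) analogies whose D is among the k nearest
    entities to the query vector; entity_dict maps UPPER-CASE names to vectors."""
    names = list(entity_dict.keys())
    matrix = np.array(list(entity_dict.values()))
    correct = 0
    for a, b, c, d in analogies:
        # Conventional word-analogy query (assumed; the original line is collapsed above).
        query = (np.asarray(entity_dict[b.upper()]) - np.asarray(entity_dict[a.upper()])
                 + np.asarray(entity_dict[c.upper()])).reshape(1, -1)
        dists = dict(zip(names, euclidean_distances(query, matrix)[0]))
        for name in (a.upper(), b.upper(), c.upper()):
            dists.pop(name, None)  # exclude the query entities, as findTopk does
        top_k = sorted(dists, key=dists.get)[:k]
        correct += d.upper() in top_k
    return correct / len(analogies)
```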
51 changes: 35 additions & 16 deletions seed_embeddings/OpenKE/config/Trainer.py
@@ -34,7 +34,7 @@ def __init__(
save_steps=None,
checkpoint_dir=None,
index_dir=None,
out_path=None,
analogy_file="analogies.txt",
):

self.work_threads = 8
Expand All @@ -52,10 +52,10 @@ def __init__(
self.save_steps = save_steps
self.checkpoint_dir = checkpoint_dir
# self.out_path = out_path

self.entity_names = self.load_entity_names(index_dir)
self.analogies = analogy.AnalogyScorer(analogy_file="analogies.txt")
self.analogies = analogy.AnalogyScorer(analogy_file=analogy_file)

def load_entity_names(self, index_dir):
with open(os.path.join(index_dir, "entity2id.txt")) as fEntity:
content = fEntity.read()
@@ -93,8 +93,8 @@ def getEntityDict(self, ent_embeddings):
mapping entity names to their corresponding embeddings.
"""
entity_dict = {}
for i, entity_name in enumerate(self.entity_dict):

for i, entity_name in enumerate(self.entity_names):
entity_dict[entity_name] = ent_embeddings[i].tolist()

return entity_dict
@@ -139,7 +139,7 @@ def run(
weight_decay=self.weight_decay,
)
print("Finish initializing...")

best_metric_val = 0.0
training_range = tqdm(range(self.train_times))
for epoch in training_range:
res = 0.0
@@ -148,6 +148,7 @@
res += loss
training_range.set_description("Epoch %d | loss: %f" % (epoch, res))
checkpoint = None
save_ckpt = False
if ray and epoch % freq == 0:
metrics = {"loss": res}
# Link Prediction
@@ -170,27 +171,45 @@
"hit1": hit1,
}
)
if best_metric_val <= hit1:
best_metric_val = hit1
save_ckpt = True
print("Link Prediction Scores Completed")

if is_analogy:
elif is_analogy:
# self.model => Negative Sampling object
# self.mode.model => Transe model

ent_embeddings = self.model.model.ent_embeddings.weight.data.numpy()
ent_embeddings = (
self.model.model.ent_embeddings.weight.data.cpu().numpy()
)
entity_dict = self.getEntityDict(ent_embeddings)
analogy_score = self.analogies.get_analogy_score(entity_dict)
metrics.update({"AnalogiesScore": analogy_score})
print("Analogy Score Completed")
print("Analogy Score completed")

del entity_dict

if best_metric_val <= analogy_score:
best_metric_val = analogy_score
save_ckpt = True

else: # loss
if best_metric_val >= res:
best_metric_val = res
save_ckpt = True

with tempfile.TemporaryDirectory() as temp_checkpoint_dir:
# Save the checkpoint...
self.model.save_checkpoint(
os.path.join(
temp_checkpoint_dir,
"checkpoint" + "-" + str(epoch) + ".ckpt",
checkpoint = None
if save_ckpt:
self.model.save_checkpoint(
os.path.join(
temp_checkpoint_dir,
"checkpoint" + "-" + str(epoch) + ".ckpt",
)
)
)
checkpoint = Checkpoint.from_directory(temp_checkpoint_dir)
checkpoint = Checkpoint.from_directory(temp_checkpoint_dir)

train.report(metrics, checkpoint=checkpoint)

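The net effect of this hunk: a checkpoint is written only when the tracked metric improves, where higher is better for `hit1` and for the analogy score and lower is better for the raw loss. A stripped-down sketch of that gating logic, outside the Ray/OpenKE specifics of this trainer; the names here are illustrative, not the repository's:

```python
def should_save(metric_name, value, best_so_far):
    """Return (save, new_best): save only when the tracked metric has improved."""
    higher_is_better = metric_name in ("hit1", "AnalogiesScore")
    improved = value >= best_so_far if higher_is_better else value <= best_so_far
    return improved, (value if improved else best_so_far)

# Illustrative loop: checkpoints land only on epochs 0, 1 and 3.
best = 0.0
for epoch, hit1 in enumerate([0.10, 0.12, 0.11, 0.15]):
    save, best = should_save("hit1", hit1, best)
    if save:
        print(f"epoch {epoch}: checkpoint saved (hit1={hit1})")
```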
Diffs for the remaining changed files are not shown.