Merge pull request #127 from nishant-sachdeva/seedemb-opt
Extending support for vocabularies of different dimensions
svkeerthy authored Oct 9, 2024
2 parents 6bd3ddc + 12da415 commit 30b8324
Showing 23 changed files with 658 additions and 292 deletions.
2 changes: 1 addition & 1 deletion Manylinux2014_Compliant_Source/pkg/build.sh
@@ -18,7 +18,7 @@ cd ..
cmake -DCMAKE_BUILD_TYPE=Release .. && make -j"$(nproc)" && make install

cd ..
cp build/vocabulary.h Manylinux2014_Compliant_Source/pkg/ir2vec/
cp build/include/Vocabulary*.h Manylinux2014_Compliant_Source/pkg/ir2vec/
cp src/include/utils.h Manylinux2014_Compliant_Source/pkg/ir2vec/
cp src/include/IR2Vec.h Manylinux2014_Compliant_Source/pkg/ir2vec/
cp build/src/version.h Manylinux2014_Compliant_Source/pkg/ir2vec/
26 changes: 14 additions & 12 deletions Manylinux2014_Compliant_Source/pkg/ir2vec/core.cpp
@@ -20,7 +20,6 @@
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/IR/CFG.h"
@@ -75,19 +74,21 @@ class IR2VecHandler {
std::string outputFile;
std::string mode;
std::string level;
unsigned dim;

public:
IR2VecHandler(std::string fileName, std::string outputFile, std::string mode,
std::string level)
: fileName(fileName), outputFile(outputFile), mode(mode), level(level) {}
std::string level, unsigned dim)
: fileName(fileName), outputFile(outputFile), mode(mode), level(level),
dim(dim) {}

std::string getFile() { return fileName; }
std::string getOutputFile() { return outputFile; }
std::string getMode() { return mode; }
std::string getLevel() { return level; }

// Function to get Program Vector List
PyObject *createProgramVectorList(llvm::SmallVector<double, DIM> llvmPgmVec) {
PyObject *createProgramVectorList(IR2Vec::Vector llvmPgmVec) {
// for PgmVector
PyObject *PgmList = PyList_New(0);
for (auto &Pgm_it : llvmPgmVec)
@@ -138,7 +139,6 @@ class IR2VecHandler {
PyObject *instructionVectorList = PyList_New(0);
for (auto &Inst_it : llvmInstVecMap) {
PyObject *instructionVector = PyList_New(0);
// copy this SmallVector into c++ Vector
for (auto &Vec_it : Inst_it.second) {
PyList_Append(instructionVector, PyFloat_FromDouble(Vec_it));
}
@@ -166,10 +166,10 @@ class IR2VecHandler {
ofstream output;
output.open(outFile, ios_base::app);
emb = std::move(new IR2Vec::Embeddings(
*Module, ir2vecMode, (this->level)[0], &output, funcName));
*Module, ir2vecMode, (this->level)[0], &output, this->dim, funcName));
} else {
emb = std::move(new IR2Vec::Embeddings(
*Module, ir2vecMode, (this->level)[0], nullptr, funcName));
*Module, ir2vecMode, (this->level)[0], nullptr, this->dim, funcName));
}

if (!emb) {
@@ -178,7 +178,7 @@
}

if (type == OpType::Program) {
llvm::SmallVector<double, DIM> progVector = emb->getProgramVector();
IR2Vec::Vector progVector = emb->getProgramVector();
return this->createProgramVectorList(progVector);
} else if (type == OpType::Function) {
llvm::SmallMapVector<const llvm::Function *, IR2Vec::Vector, 16>
@@ -293,9 +293,10 @@ PyObject *getFunctionVectors(PyObject *self, PyObject *args) {

IR2VecHandlerObject *createIR2VECObject(const char *filename,
const char *output_file,
const char *mode, const char *level) {
const char *mode, const char *level,
unsigned dim) {
IR2VecHandler *ir2vecObj =
new IR2VecHandler(filename, output_file, mode, level);
new IR2VecHandler(filename, output_file, mode, level, dim);
if (!ir2vecObj) {
return nullptr;
}
@@ -314,8 +315,9 @@ PyObject *initEmbedding(PyObject *self, PyObject *args) {
const char *mode = "\0";
const char *level = "\0";
const char *output_file = "\0";
unsigned dim = 300;

if (!PyArg_ParseTuple(args, "sss|s", &filename, &mode, &level,
if (!PyArg_ParseTuple(args, "sss|Is", &filename, &mode, &level, &dim,
&output_file)) {
// raise error here
PyErr_SetString(PyExc_TypeError, "Invalid Arguments");
@@ -348,7 +350,7 @@ PyObject *initEmbedding(PyObject *self, PyObject *args) {
}

IR2VecHandlerObject *ir2vecObj =
createIR2VECObject(filename, output_file, mode, level);
createIR2VECObject(filename, output_file, mode, level, dim);

if (!ir2vecObj) {
PyErr_SetString(PyExc_TypeError, "Embedding Object not created");
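For context on the `PyArg_ParseTuple` change above: the format string `"sss|Is"` parses three required strings (`filename`, `mode`, `level`), then an optional unsigned int (`dim`, which the declaration above defaults to `300`) and an optional output-file string; everything after the `|` may be omitted. A minimal sketch of the resulting call shapes on the Python side, with hypothetical paths, inferred from this diff rather than from separate documentation:

```python
import ir2vec

# Pre-existing calls keep working: dim falls back to 300 inside core.cpp.
obj_default = ir2vec.initEmbedding("/path/to/input.ll", "fa", "p")

# New optional positional arguments introduced by this patch: dim, then the output file.
obj_dim100 = ir2vec.initEmbedding("/path/to/input.ll", "fa", "p", 100)
obj_logged = ir2vec.initEmbedding("/path/to/input.ll", "fa", "p", 75, "embeddings.txt")
```

Because the arguments are positional, an output file can only be supplied together with an explicit `dim`.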
2 changes: 1 addition & 1 deletion Manylinux2014_Compliant_Source/pkg/tests/test_ir2vec.py
@@ -160,7 +160,7 @@ def test_fa_f():
path = (TEST_SUITE_DIR / file).resolve()
full_path = str(path).strip()

initObj = ir2vec.initEmbedding(full_path, "fa", "f")
initObj = ir2vec.initEmbedding(full_path, "fa", "f", 300)
assert initObj is not None

functionVectorMap = ir2vec.getFunctionVectors(initObj)
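The updated test now passes the default dimension explicitly. A hypothetical companion test, not part of this commit, sketching how a non-default dimension could be exercised through the same calls; `TEST_SUITE_DIR` and the file name below are placeholders modelled on the existing suite:

```python
import pathlib
import ir2vec

TEST_SUITE_DIR = pathlib.Path(__file__).parent  # placeholder; the real suite defines its own

def test_fa_f_dim75():
    # Same flow as test_fa_f above, but requesting 75-dimensional embeddings.
    full_path = str((TEST_SUITE_DIR / "example.ll").resolve()).strip()
    initObj = ir2vec.initEmbedding(full_path, "fa", "f", 75)
    assert initObj is not None

    functionVectorMap = ir2vec.getFunctionVectors(initObj)
    assert functionVectorMap is not None
```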
24 changes: 18 additions & 6 deletions README.md
@@ -113,13 +113,16 @@ To ensure the correctness, run `make check`
instructions.

### Using Binary
> ir2vec -\<mode\> -o \<output-file\> -level \<p|f\> -class \<class-number\> -funcName=\<function-name\> \<input-ll-file\>
> ir2vec -\<mode\> -dim \<dimensions\> -o \<output-file\> -level \<p|f\> -class \<class-number\> -funcName=\<function-name\> \<input-ll-file\>
#### Command-Line options

- `mode` - can be one of `sym`/`fa`
- `sym` denotes Symbolic representation
- `fa` denotes Flow-Aware representation
- `dim` - Dimensions of embeddings
- This is an optional argument. Defaults to `300`.
- Other supported dimensions are `75` and `100`
- `o` - file in which the embeddings are to be appended; (Note : If file doesn’t exist, new file would be created, else embeddings would be appended)
- `level` - can be one of chars `p`/`f`.
- `p` denotes `program level` encoding
@@ -141,16 +144,16 @@ Please use `--help` for further details.
#### Flow-Aware Embeddings
For all functions
* `` ir2vec -fa -o <output_file> -level <p|f> -class <class-number> <input_ll_file>``
* `` ir2vec -fa -dim <dimension> -o <output_file> -level <p|f> -class <class-number> <input_ll_file>``

For a specific function
* `` ir2vec -fa -o <output_file> -level f -class <class-number> -funcName=\<function-name\> <input_ll_file>``
* `` ir2vec -fa -dim <dimension> -o <output_file> -level f -class <class-number> -funcName=\<function-name\> <input_ll_file>``

#### Symbolic Embeddings
For all functions
* `` ir2vec -sym -o <output_file> -level <p|f> -class <class-number> <input_ll_file>``
* `` ir2vec -sym -dim <dimension> -o <output_file> -level <p|f> -class <class-number> <input_ll_file>``
For a specific function
* `` ir2vec -sym -o <output_file> -level f -class <class-number> -funcName=\<function-name\> <input_ll_file>``
* `` ir2vec -sym -dim <dimension> -o <output_file> -level f -class <class-number> -funcName=\<function-name\> <input_ll_file>``

## Using Libraries
The libraries can be installed by passing the installation location to the `CMAKE_INSTALL_PREFIX` flag during `cmake` followed by `make install`.
@@ -178,7 +181,7 @@ The following example snippet shows how to query the exposed vector representati

// Creating object to generate FlowAware representation
auto ir2vec =
IR2Vec::Embeddings(<LLVM Module>, IR2Vec::IR2VecMode::FlowAware);
IR2Vec::Embeddings(<LLVM Module>, IR2Vec::IR2VecMode::FlowAware, <DIM>);

// Getting Instruction vectors corresponding to the instructions in <LLVM Module>
auto instVecMap = ir2vec.getInstVecMap();
@@ -218,6 +221,8 @@ for (auto val : pgmVec)
* `file_path`: str - Path to the `.ll` or `.bc` file.
* `encoding_type`: str - Choose `fa` (Flow-Aware) or `sym` (Symbolic).
* `level`: str - Choose `p` for program-level or `f` for function-level.
* `dim`: uint - Choose from `[300, 100, 75]`. Default value is `300`
* `output_file`: str - If provided, embeddings are saved to this file. Default is an empty string.
**Returns:**
@@ -228,7 +233,14 @@ for (auto val : pgmVec)
```python
import ir2vec
# Approach 1
initObj = ir2vec.initEmbedding("/path/to/file.ll", "fa", "p")
# Approach 2
initObj = ir2vec.initEmbedding("/path/to/file.ll", "fa", "p", 100)
# Approach 3
initObj = ir2vec.initEmbedding("/path/to/file.ll", "fa", "p", 100, "output.txt")
```

### getProgramVector
15 changes: 8 additions & 7 deletions seed_embeddings/OpenKE/analogy.py
@@ -6,13 +6,14 @@
import numpy as np
from sklearn.metrics.pairwise import euclidean_distances


class AnalogyScorer:
def __init__(self, analogy_file="analogies.txt"):
self.entity_dict = {}
self.analogies = self._load_analogies(analogy_file)

def _load_analogies(self, file_path):
with open(file_path, 'r') as f:
with open(file_path, "r") as f:
return [tuple(line.strip().split()) for line in f if line.strip()]

def find_vec(self, str1):
@@ -22,24 +23,24 @@ def gen_similarity_table(self, vec):
keys = list(self.entity_dict.keys())
entity_matrix = np.array(list(self.entity_dict.values()))
vec = vec.reshape(1, -1)

# Calculate distances using euclidean_distances
distances = euclidean_distances(vec, entity_matrix)[0]

return dict(zip(keys, distances))

def findTopk(self, dict1, k, values):
sortedByVal = dict(sorted(dict1.items(), key=lambda x: x[1]))
del sortedByVal[values[0].upper()]
del sortedByVal[values[1].upper()]
del sortedByVal[values[2].upper()]
return {k: sortedByVal[k] for k in list(sortedByVal)[:k]}

def get_analogy_score(self, entity_dict):
def get_analogy_score(self, entity_dict):
self.entity_dict = entity_dict
total_count = len(self.analogies)
correct_count = 0

for values in self.analogies:
vecA = self.find_vec(values[0])
vecB = self.find_vec(values[1])
Expand All @@ -56,4 +57,4 @@ def get_analogy_score(self, entity_dict):

if values[3].upper() in top_k_dict:
correct_count += 1
return correct_count
return correct_count
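Pieced together from this file, the scorer's flow is: each analogy is a 4-tuple of entity names; a query vector is built from the first three, Euclidean distances to every entity are ranked, the three query entities are dropped, and the analogy counts as correct when the fourth entity lands in the top-k. A compact sketch of that loop under those assumptions; since the query arithmetic sits in the collapsed part of the hunk, the `B - A + C` form below is the conventional choice, not a quote from this file:

```python
import numpy as np
from sklearn.metrics.pairwise import euclidean_distances

def analogy_accuracy(entity_dict, analogies, k=5):
    """Fraction of (A, B, C, D) analogies whose D is among the k nearest
    entities to the query vector; entity_dict maps UPPER-CASE names to vectors."""
    names = list(entity_dict.keys())
    matrix = np.array(list(entity_dict.values()))
    correct = 0
    for a, b, c, d in analogies:
        # Conventional word-analogy query (assumed; the original line is collapsed above).
        query = (np.asarray(entity_dict[b.upper()]) - np.asarray(entity_dict[a.upper()])
                 + np.asarray(entity_dict[c.upper()])).reshape(1, -1)
        dists = dict(zip(names, euclidean_distances(query, matrix)[0]))
        for name in (a.upper(), b.upper(), c.upper()):
            dists.pop(name, None)  # exclude the query entities, as findTopk does
        top_k = sorted(dists, key=dists.get)[:k]
        correct += d.upper() in top_k
    return correct / len(analogies)
```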
51 changes: 35 additions & 16 deletions seed_embeddings/OpenKE/config/Trainer.py
@@ -34,7 +34,7 @@ def __init__(
save_steps=None,
checkpoint_dir=None,
index_dir=None,
out_path=None,
analogy_file="analogies.txt",
):

self.work_threads = 8
Expand All @@ -52,10 +52,10 @@ def __init__(
self.save_steps = save_steps
self.checkpoint_dir = checkpoint_dir
# self.out_path = out_path

self.entity_names = self.load_entity_names(index_dir)
self.analogies = analogy.AnalogyScorer(analogy_file="analogies.txt")
self.analogies = analogy.AnalogyScorer(analogy_file=analogy_file)

def load_entity_names(self, index_dir):
with open(os.path.join(index_dir, "entity2id.txt")) as fEntity:
content = fEntity.read()
@@ -93,8 +93,8 @@ def getEntityDict(self, ent_embeddings):
mapping entity names to their corresponding embeddings.
"""
entity_dict = {}
for i, entity_name in enumerate(self.entity_dict):

for i, entity_name in enumerate(self.entity_names):
entity_dict[entity_name] = ent_embeddings[i].tolist()

return entity_dict
@@ -139,7 +139,7 @@ def run(
weight_decay=self.weight_decay,
)
print("Finish initializing...")

best_metric_val = 0.0
training_range = tqdm(range(self.train_times))
for epoch in training_range:
res = 0.0
@@ -148,6 +148,7 @@
res += loss
training_range.set_description("Epoch %d | loss: %f" % (epoch, res))
checkpoint = None
save_ckpt = False
if ray and epoch % freq == 0:
metrics = {"loss": res}
# Link Prediction
@@ -170,27 +171,45 @@
"hit1": hit1,
}
)
if best_metric_val <= hit1:
best_metric_val = hit1
save_ckpt = True
print("Link Prediction Scores Completed")

if is_analogy:
elif is_analogy:
# self.model => Negative Sampling object
# self.mode.model => Transe model

ent_embeddings = self.model.model.ent_embeddings.weight.data.numpy()
ent_embeddings = (
self.model.model.ent_embeddings.weight.data.cpu().numpy()
)
entity_dict = self.getEntityDict(ent_embeddings)
analogy_score = self.analogies.get_analogy_score(entity_dict)
metrics.update({"AnalogiesScore": analogy_score})
print("Analogy Score Completed")
print("Analogy Score completed")

del entity_dict

if best_metric_val <= analogy_score:
best_metric_val = analogy_score
save_ckpt = True

else: # loss
if best_metric_val >= res:
best_metric_val = res
save_ckpt = True

with tempfile.TemporaryDirectory() as temp_checkpoint_dir:
# Save the checkpoint...
self.model.save_checkpoint(
os.path.join(
temp_checkpoint_dir,
"checkpoint" + "-" + str(epoch) + ".ckpt",
checkpoint = None
if save_ckpt:
self.model.save_checkpoint(
os.path.join(
temp_checkpoint_dir,
"checkpoint" + "-" + str(epoch) + ".ckpt",
)
)
)
checkpoint = Checkpoint.from_directory(temp_checkpoint_dir)
checkpoint = Checkpoint.from_directory(temp_checkpoint_dir)

train.report(metrics, checkpoint=checkpoint)

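The net effect of this hunk: a checkpoint is written only when the tracked metric improves, where higher is better for `hit1` and for the analogy score and lower is better for the raw loss. A stripped-down sketch of that gating logic, outside the Ray/OpenKE specifics of this trainer; the names here are illustrative, not the repository's:

```python
def should_save(metric_name, value, best_so_far):
    """Return (save, new_best): save only when the tracked metric has improved."""
    higher_is_better = metric_name in ("hit1", "AnalogiesScore")
    improved = value >= best_so_far if higher_is_better else value <= best_so_far
    return improved, (value if improved else best_so_far)

# Illustrative loop: checkpoints land only on epochs 0, 1 and 3.
best = 0.0
for epoch, hit1 in enumerate([0.10, 0.12, 0.11, 0.15]):
    save, best = should_save("hit1", hit1, best)
    if save:
        print(f"epoch {epoch}: checkpoint saved (hit1={hit1})")
```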
Diffs for the remaining changed files are not shown.