
Commit

✨ bayes
userElaina committed Dec 21, 2024
1 parent ee78093 commit 00c3009
Showing 17 changed files with 2,010 additions and 0 deletions.
2 changes: 2 additions & 0 deletions textattack/algorithms/__init__.py
@@ -0,0 +1,2 @@
from algorithms.discrete_block_bayesian_opt import BlockBayesAttack
from algorithms.discrete_bayesian_opt import BayesOpt
@@ -0,0 +1,12 @@
import torch
from torch.distributions.normal import Normal

def expected_improvement(mean, var, reference):
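    # Expected improvement (maximization form):
    #   z  = (mean - reference) / std
    #   EI = std * (phi(z) + z * Phi(z)) = (mean - reference) * Phi(z) + std * phi(z)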
predictive_normal = Normal(mean.new_zeros(mean.size()), mean.new_ones(mean.size()))
std = torch.sqrt(var)
standardized = (mean - reference) / std

ucdf = predictive_normal.cdf(standardized)
updf = torch.exp(predictive_normal.log_prob(standardized))
ei = std * (updf + standardized * ucdf)
return ei
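
A minimal usage sketch (not part of the commit), assuming a surrogate has already produced per-candidate posterior means and variances as 1-D tensors:

mean = torch.tensor([0.2, 0.6, 1.1])
var = torch.tensor([0.50, 0.25, 0.10])
ei = expected_improvement(mean, var, reference=0.5)  # larger EI = more promising candidate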
@@ -0,0 +1,2 @@
from algorithms.bayesopt.acquisition.algorithm.kmeanspp import kmeans_pp
from algorithms.bayesopt.acquisition.algorithm.greedy_ascent import acquisition_maximization_with_indices
140 changes: 140 additions & 0 deletions textattack/algorithms/bayesopt/acquisition/algorithm/greedy_ascent.py
@@ -0,0 +1,140 @@
import copy
import time

import numpy as np
import torch

from algorithms.bayesopt.acquisition.acquisition_function.acquisition_functions import expected_improvement
from algorithms.bayesopt.dpp.dpp import dpp_sample


def find_remained_indices(candidates, history_X, N):
if len(candidates)==0:
return []
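    # Exact-match lookup: compare every candidate row against every row of history_X;
    # topk over the boolean match matrix yields, for each history row, the index of a
    # matching candidate (values != 0 only where such a match exists).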
values, indices = torch.topk(((candidates.long().t() == history_X.long().unsqueeze(-1)).all(dim=1)).int(),1,1 )
rm_ids = set([int(ind) for ind in indices[values!=0]])
remained_indices = list(set(list(range(N))) - rm_ids)
return remained_indices

def find_removed_indices(candidates, history_X):
values, indices = torch.topk(((candidates.long().t() == history_X.long().unsqueeze(-1)).all(dim=1)).int(),1,1 )
rm_ids = set([int(ind) for ind in indices[values!=0]])
return list(rm_ids)

def greedy_ascent_with_indices(center_indices, opt_indices, stage, hb, surrogate_model, batch_size, reference=None, filter=True, return_ei=False, acq_with_opt_indices=True):
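    # One greedy-ascent step on the Hamming ball: enumerate neighbors of center_indices
    # (restricted to opt_indices), optionally drop already-evaluated candidates, score the
    # rest with the surrogate's acquisition, and return the top batch_size candidates
    # (with their acquisition values when return_ei=True).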

candidates = hb.neighbors(center_indices, stage+1, 0, indices=opt_indices)
N, L = candidates.shape

# filtering observed candidates
if filter:
remained_indices = find_remained_indices(candidates, hb.eval_X_reduced, N)
else:
remained_indices = list(set(list(range(N))))

testX_cate = candidates[remained_indices]

# calculate acquisition
if acq_with_opt_indices:
testX = testX_cate[:,opt_indices]
centerX = center_indices.view(1,-1)[:,opt_indices]
else:
testX = testX_cate
centerX = center_indices.view(1,-1)

    if reference is None:
_, reference, best_ind = hb.best_of_hamming(hb.orig_X, stage+1)

if len(remained_indices)==0:
"something wrong, do larger space in greedy ascent with indices"
if return_ei:
center_ei = surrogate_model.acquisition(centerX, bias=reference)
return center_indices.view(1,-1), torch.Tensor([center_ei])
else:
return center_indices.view(1,-1)

ei = surrogate_model.acquisition(testX, bias=reference)
#center_ei = surrogate_model.acquisition(centerX, bias=reference)

topk_values, topk_indices = torch.topk(ei, min(len(ei),batch_size))
best_candidates_indices = torch.cat([testX_cate[idx].view(1,-1) for idx in topk_indices],dim=0)
if return_ei:
return best_candidates_indices, topk_values
else:
return best_candidates_indices

def acquisition_maximization_with_indices(cur_seqs, opt_indices, batch_size, stage, hb, surrogate_model, reference=None, dpp_type='no', acq_with_opt_indices=True):
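    # Propose the next evaluation batch: take one greedy-ascent step from every current
    # sequence, pool and deduplicate the candidates, drop points already evaluated, then
    # select the batch either by top acquisition value ('no'/'no_one') or by sampling a
    # diverse subset via a DPP over the surrogate covariance ('dpp_posterior').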
global_candidates_, global_eis_ = [], []

t0 = time.time()
for cur_seq in cur_seqs:
cur_indices = hb.reduce_seq(cur_seq).view(1,-1)
if acq_with_opt_indices:
cur_ei = surrogate_model.acquisition(cur_indices[:,opt_indices], bias=reference)
else:
cur_ei = surrogate_model.acquisition(cur_indices, bias=reference)
global_candidates_.append(cur_indices)
global_eis_.append(cur_ei)

num_next = int(np.ceil(100 / len(cur_seqs)))
filtering = True
new_candidates_ = []
new_eis_ = []
new_candidates, new_eis = greedy_ascent_with_indices(cur_indices, opt_indices, stage, hb, surrogate_model, batch_size=num_next, reference=reference, filter=filtering, return_ei=True, acq_with_opt_indices=acq_with_opt_indices)
new_candidates_.append(new_candidates)
new_eis_.extend(new_eis)
N = len(new_candidates_)
new_candidates_ = torch.cat(new_candidates_, dim=0)

candidates, indices = unique(new_candidates_, dim=0)
eis = [new_eis_[ind] for ind in indices]
assert len(candidates) == len(eis), f'something wrong {len(candidates)}, {len(eis)}'

global_candidates_.append(candidates)
global_eis_.extend(eis)
t1 = time.time()

global_candidates, indices = unique(torch.cat(global_candidates_, dim=0), dim=0)
global_eis = [global_eis_[ind] for ind in indices]
N, L = global_candidates.shape
remained_indices = find_remained_indices(global_candidates, hb.eval_X_reduced, N)
t2 = time.time()

global_candidates = global_candidates[remained_indices]
global_eis = [global_eis[ind] for ind in remained_indices]
assert len(global_candidates) == len(global_eis), f'something wrong {len(global_candidates)}, {len(global_eis)}'

global_eis = torch.Tensor(global_eis)

if len(global_candidates) == 0:
return None

t3 = time.time()
    if dpp_type in ('no', 'no_one'):
topk_values, topk_indices = torch.topk(global_eis, min(len(global_eis),batch_size))
candidates = [hb.seq_by_indices(global_candidates[ind]) for ind in topk_indices]
elif dpp_type == 'dpp_posterior':
t4 = time.time()
topk_values, topk_indices = torch.topk(global_eis, min(len(global_eis),100))
global_candidates = global_candidates[topk_indices]

num = min(len(global_candidates), batch_size)
if acq_with_opt_indices:
Lmat = surrogate_model.get_covar(global_candidates[:,opt_indices].cuda()).cpu().detach().numpy()
else:
Lmat = surrogate_model.get_covar(global_candidates.cuda()).cpu().detach().numpy()
Lmat = Lmat / np.mean(np.abs(Lmat))
if Lmat.shape[0] == num:
best_indices = list(range(num))
else:
best_indices = dpp_sample(Lmat, k=num, T=0)
candidates = [hb.seq_by_indices(global_candidates[ind]) for ind in best_indices]
t5 = time.time()
if len(candidates):
return candidates
else:
"something wrong, do larger space"
return None

def unique(x, dim=None):
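    # Return the unique rows of x together with, for each unique row, the index of its
    # first occurrence in x (so acquisition values stay aligned after deduplication).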
unique, inverse = torch.unique(
x, sorted=True, return_inverse=True, dim=dim)
perm = torch.arange(inverse.size(0), dtype=inverse.dtype,
device=inverse.device)
inverse, perm = inverse.flip([0]), perm.flip([0])
return unique, inverse.new_empty(unique.size(dim)).scatter_(0, inverse, perm)
140 changes: 140 additions & 0 deletions textattack/algorithms/bayesopt/acquisition/algorithm/kmeanspp.py
@@ -0,0 +1,140 @@
import numpy as np
def euc_dist_sq(data1, data2):
'''
inputs:
data1 - numpy array of data points (n1, d)
data2 - numpy array of data points (n2, d)
'''
n1, d1 = data1.shape
n2, d2 = data2.shape
assert d1 == d2, f"the embedding dimension of data1, data2 are different {d1} != {d2}."
d = d1
c = np.reshape(data1,[n1,1,d]) - np.reshape(data2,[1,n2,d])
dist_sq = np.sum(np.square(c),axis=2)
return dist_sq

def euc_dist_sq_test():
data1 = np.random.randn(2,4)
data2 = np.random.randn(3,4)
print("data1 : ", data1)
print("data2 : ", data2)
print("euc_dist_sq : ", euc_dist_sq(data1,data2))

def hamming_dist_sq(data1, data2):
'''
inputs:
data1 - numpy array of data points (n1, d)
data2 - numpy array of data points (n2, d)
'''
n1, d1 = data1.shape
n2, d2 = data2.shape
assert d1 == d2, f"the embedding dimension of data1, data2 are different {d1} != {d2}."
d = d1
c = (np.reshape(data1,[n1,1,d]) != np.reshape(data2,[1,n2,d])) * 1.0
dist_sq = np.square(np.sum(c,axis=2))
return dist_sq

def hamming_dist_sq_test():
data1 = np.random.randint(5, size=[2,4])
data2 = np.random.randint(5, size=[3,4])
print("data1 : ", data1)
print("data2 : ", data2)
print("hamming_dist_sq : ", hamming_dist_sq(data1,data2))

def kmeans_pp(data, k, dist='euclidean', init_ind=None):
'''
    Initialize centroids with the (greedy, farthest-point) k-means++ seeding strategy.
inputs:
data - numpy array of data points having shape (n, d)
k - number of clusters (k <= n)
dist - the name of metric
init_ind - int (if None, random init index)
'''
## initialize the centroids list and add
## a randomly selected data point to the list
centroids = []
selected_indices = []

if init_ind is None:
init_ind = np.random.randint(data.shape[0])
centroids.append(data[init_ind, :])
selected_indices.append(init_ind)

    if dist == 'euclidean':
        d_sq_func = euc_dist_sq
    elif dist == 'hamming':
        d_sq_func = hamming_dist_sq
    else:
        raise ValueError(f"unsupported distance metric: {dist}")

## compute remaining centroids
for _ in range(k - 1):
all_indices = list(range(data.shape[0]))
unselected_indices = list(set(all_indices) - set(selected_indices))

d_sq_to_centroid = d_sq_func(data[unselected_indices], data[selected_indices])
min_d_sq_to_centroid = np.min(d_sq_to_centroid, axis=1)
if np.sum(min_d_sq_to_centroid)==0:
break
#prob = min_d_sq_to_centroid / np.sum(min_d_sq_to_centroid)
#next_centroid_ind = all_indices[np.random.choice(unselected_indices, p=prob)]
next_centroid_ind = unselected_indices[
np.argmax(min_d_sq_to_centroid)
]

selected_indices.append(next_centroid_ind)

centroids.append(data[next_centroid_ind, :])
return np.array(centroids), selected_indices

def kmeans_pp_test():
import matplotlib.pyplot as plt

## 1. euc
data = np.random.randn(100,2)
centroids, selected_indices = kmeans_pp(data, 5, dist='euclidean')
plt.scatter(data[:,0],data[:,1],label=0)
plt.scatter(data[selected_indices,0],data[selected_indices,1],label=1)
plt.savefig('kmeans_test1.png')
plt.close()
## 2. hamming
data = np.random.randint(20,size=[100,2])
centroids, selected_indices = kmeans_pp(data, 5, dist='hamming')
plt.scatter(data[:,0],data[:,1],label=0)
plt.scatter(data[selected_indices,0],data[selected_indices,1],label=1)
plt.savefig('kmeans_test2.png')

def kmeans_pp_test2():
dl = []
for i in range(10):
dl.append(np.random.randn(100,512) + i*10)
data = np.reshape(np.stack(dl), [1000,512])
print(data.shape)
losses = []
import time
tt = 0
for i in range(10):
print(i)
t0 = time.time()
centroids, selected_indices = kmeans_pp(data, 30, dist='euclidean')
t1 = time.time()
tt += t1 -t0
loss = np.sum(np.min(euc_dist_sq(data, centroids), axis=1))
losses.append(loss)
print("time : ", tt/10)

rnd_losses = []
for i in range(10):
indices = np.random.choice(data.shape[0], size=[30], replace=False)
centroids = data[indices,:]
loss = np.sum(np.min(euc_dist_sq(data, centroids), axis=1))
rnd_losses.append(loss)

print("kmeans", sum(losses)/ len(losses), losses)
print("random", sum(rnd_losses)/ len(rnd_losses), rnd_losses)



if __name__ == '__main__':
#euc_dist_sq_test()
#hamming_dist_sq_test()
kmeans_pp_test()
kmeans_pp_test2()
44 changes: 44 additions & 0 deletions textattack/algorithms/bayesopt/dpp/dpp.py
@@ -0,0 +1,44 @@
import numpy as np
from dppy.finite_dpps import FiniteDPP

def dpp_init(L,k):
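    # Greedy initialization: grow S one index at a time, always adding the index that
    # maximizes the determinant of the principal submatrix L[S][:, S].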
n, m = L.shape
    assert n == m, "L should be a square numpy matrix"
    assert n >= k, "candidate pool size must be at least k"

    S = [0]
    cur_det = float(L[0, 0])  # determinant of the initial 1x1 submatrix L[S][:, S]
while len(S) < k:
det_best = -1e9
S_best = None
for i in range(n):
if i in S:
continue
S_tmp = S + [i]
submat = L[S_tmp][:,S_tmp]
det = np.linalg.det(submat)
if det > det_best:
S_best = S_tmp
det_best = det
S = S_best
cur_det = det_best
return S, cur_det

def dpp_sample(L, k, T):
n, m = L.shape
    assert n == m, "L should be a square numpy matrix"
    assert n >= k, "candidate pool size must be at least k"

# greedy insertion
S, cur_det = dpp_init(L, k)
if T == 0:
return S
try:
DPP = FiniteDPP('likelihood', **{'L': L})
S = DPP.sample_mcmc_k_dpp(size=k, s_init=S, nb_iter=T)
return S
    except Exception:
        # The kernel may be numerically non-PSD; retry with a small diagonal jitter.
        L_ = L + 1e-8 * np.eye(n)
DPP = FiniteDPP('likelihood', **{'L': L_})
S = DPP.sample_mcmc_k_dpp(size=k, s_init=S, nb_iter=T)
return S
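
A minimal usage sketch (not part of the commit): selecting a diverse pair from a toy 4x4 similarity kernel. With T=0 only the greedy initialization is used; T>0 additionally runs dppy's MCMC k-DPP sampler.

L_toy = np.eye(4) + 0.1 * np.ones((4, 4))
subset = dpp_sample(L_toy, k=2, T=0)
print(subset)  # e.g. [0, 1]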