-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathSinglepredict.py
140 lines (118 loc) · 5.15 KB
/
Singlepredict.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
import esm
import time
import logging
import resource
import numpy as np
from functools import partial
from ProtFormerSiteSingle.data import ProteinDataset
from ProtFormerSiteSingle.models import SSpredictor
from ProtFormerSiteSingle.utils import prepare_protein_data, metrics, load_model
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader
from einops import rearrange
from minlora import add_lora, LoRAParametrization
import yaml
import argparse
# ---------------------------------------------------------------------------
# Logging setup: records at INFO level and above emitted through ``logger``
# are written to 'predict.log' (relative to the working directory) as
# "<time> - <logger name> - <level> - <message>".
# ---------------------------------------------------------------------------
formatter = logging.Formatter(
    '%(asctime)s - %(name)s - %(levelname)s - %(message)s')

# File sink carrying the format above.
handler = logging.FileHandler('predict.log')
handler.setFormatter(formatter)
handler.setLevel(logging.INFO)

# Module logger wired to the file sink.
logger = logging.getLogger(__name__)
logger.addHandler(handler)
logger.setLevel(logging.INFO)
# ---------------------------------------------------------------------------
def get_params():
    """Parse the command line and return the recognised options.

    Only ``--config`` (path to the config file) is defined. Any other
    command-line arguments are ignored instead of being rejected.

    Returns:
        argparse.Namespace: its ``config`` attribute holds the supplied path,
        or ``None`` when ``--config`` was not given.
    """
    cli = argparse.ArgumentParser('ProtFormer-Site')
    cli.add_argument("--config", type=str, help='config file')
    # parse_known_args() yields (namespace, leftover_args); leftovers are dropped.
    return cli.parse_known_args()[0]
def eval(val_loader, model, embedding_model, batch_converter, device, num_recycle):
    """Run inference over ``val_loader`` and log/return classification metrics.

    NOTE: the name shadows the built-in ``eval``; it is kept unchanged because
    callers in this file refer to it.

    Args:
        val_loader: Sized iterable yielding
            ``(protein_names, protein_seqs, protein_labels)`` batches.
        model: Prediction model. Called as
            ``model(reprs, mask=None, num_recycle=num_recycle)`` and must
            return a mapping with an ``"ss2"`` entry of per-position logits.
        embedding_model: Embedding model. Called with the token batch,
            ``repr_layers=[33]`` and ``need_head_weights=True``; its result
            must expose ``["representations"][33]`` and ``["attentions"]``.
        batch_converter: Callable whose third return value is the token
            tensor for the prepared ``(name, sequence)`` data.
        device: ``torch.device`` the tensors are moved to.
        num_recycle: Forwarded unchanged to ``model``.

    Returns:
        tuple: ``(loss, metrics)``. No loss is computed here, so ``loss`` is
        always ``0.0`` (kept so the return shape stays the same).
        ``metrics`` is a dict with the keys ``ACC``, ``AUC``, ``Recall``,
        ``Precision``, ``F1``, ``MCC`` and ``PRC``.
    """
    embedding_model.eval()
    model.eval()

    # Loop-invariant: autocast only needs the device family.
    device_type = 'cuda' if device.type == 'cuda' else 'cpu'

    # Placeholder: prediction computes no loss, but callers expect one back.
    val_loss = 0.0
    all_preds = []
    all_targets = []
    all_probs = []

    start_time = time.time()
    with torch.no_grad():
        for protein_names, protein_seqs, protein_labels in val_loader:
            protein_labels, protein_name_seq = prepare_protein_data(
                protein_labels, protein_names, protein_seqs)
            _, _, batch_tokens = batch_converter(protein_name_seq)
            batch_tokens = batch_tokens.to(device)
            protein_labels = protein_labels.to(device)

            # NOTE(review): the reviewed copy of this file had lost its
            # indentation, so the exact extent of the autocast region could
            # not be confirmed. Both forward passes are assumed to run under
            # autocast -- confirm against the original.
            with torch.autocast(device_type=device_type):
                results = embedding_model(
                    batch_tokens, repr_layers=[33],
                    need_head_weights=True, return_contacts=False)
                reprs = {
                    # Drop the first and last token positions (presumably the
                    # special begin/end tokens added by the converter -- confirm).
                    "single_repr": results["representations"][33][:, 1:-1],
                    # Index -1 on axis 1 (assumed to be the last layer -- TODO
                    # confirm), same position trimming, heads moved to the
                    # last axis.
                    "pair_repr": rearrange(
                        results["attentions"][:, -1, :, 1:-1, 1:-1],
                        'b h i j -> b i j h'),
                }
                output = model(reprs, mask=None, num_recycle=num_recycle)

            # Softmax in float32: under autocast the logits may be in a
            # reduced-precision dtype (bfloat16 on CPU), which
            # Tensor.numpy() cannot convert.
            predict_probs = F.softmax(output["ss2"].float(), dim=-1)
            predict = predict_probs.argmax(dim=-1)
            # Score of class index 1 (presumably the positive class -- confirm
            # against `metrics`).
            predict_probs = predict_probs[:, :, 1]

            predict = rearrange(predict, 'b l -> (b l)')
            protein_labels = rearrange(protein_labels, 'b l -> (b l)')

            all_preds.extend(predict.cpu().numpy())
            all_targets.extend(protein_labels.cpu().numpy())
            all_probs.extend(predict_probs.cpu().numpy().flatten())

    end_time = time.time()
    logger.info(f"run Time: {end_time - start_time}")

    acc, auc, rec, pre, f1, mcc, prc = metrics(all_targets, all_preds, all_probs)
    logger.info(
        f"Test Metrics: ACC - {acc}, AUC - {auc}, Recall - {rec}, Precision - {pre}, F1 - {f1}, MCC - {mcc}, PRC - {prc}")

    # Guard the average so an empty loader cannot raise ZeroDivisionError.
    num_batches = len(val_loader)
    mean_loss = val_loss / num_batches if num_batches else 0.0
    return mean_loss, {
        "ACC": acc,
        "AUC": auc,
        "Recall": rec,
        "Precision": pre,
        "F1": f1,
        "MCC": mcc,
        "PRC": prc,
    }
def main(args):
    """Build the models from a YAML config, load weights and run the test set.

    Args:
        args: Mapping with a ``"config"`` entry holding the path to a YAML
            configuration file. Keys read from that file: ``dim``,
            ``num_layers``, ``n_hidden``, ``pair_dim``, ``dropout``, ``task``,
            ``save_dir``, ``test_dataset_path``, ``max_len``, ``batch_size``
            and ``num_recycle``.

    Raises:
        ValueError: If no config path was supplied, or the file does not
            contain a YAML mapping.
    """
    # Validate the config path first so a missing --config yields a clear
    # error instead of a TypeError from open(None).
    config_path = args.get("config")
    if not config_path:
        raise ValueError("No config file given; pass one with --config <path>.")

    # Parse config.
    with open(config_path, 'r', encoding='utf-8') as file:
        config = yaml.safe_load(file)
    if not isinstance(config, dict):
        # safe_load returns None for an empty file and scalars/lists for
        # non-mapping documents; every lookup below needs a mapping.
        raise ValueError(f"Config file {config_path!r} must contain a YAML mapping.")

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # Initialize model.
    logger.info("Initializing model")
    model = SSpredictor(dim=config["dim"], num_layers=config["num_layers"], n_hidden=config["n_hidden"],
                        pair_dim=config["pair_dim"], dropout=config["dropout"])
    lora_config = {
        torch.nn.Linear: {
            "weight": partial(LoRAParametrization.from_linear, rank=3),
        },
    }
    pretrain_model, alphabet = esm.pretrained.esm2_t33_650M_UR50D()
    batch_converter = alphabet.get_batch_converter()
    # LoRA is attached before load_model runs, presumably so the saved LoRA
    # parameters have matching modules to load into -- confirm in load_model.
    add_lora(pretrain_model, lora_config)
    model = model.to(device)
    pretrain_model = pretrain_model.to(device)

    # Load model weights.
    load_model(model, pretrain_model, config['task'], save_dir=config['save_dir'])

    test_data = ProteinDataset(config["test_dataset_path"], max_len=config['max_len'])
    test_loader = DataLoader(test_data, batch_size=config['batch_size'], shuffle=False)

    start_time = time.time()
    test_loss, test_metrics = eval(test_loader, model, pretrain_model, batch_converter, device, config["num_recycle"])
    end_time = time.time()

    logger.info(f"run_time:{end_time-start_time}")
    logger.info(f"Test Loss: {test_loss}")
    logger.info(f"Test Metrics: {test_metrics}")
    logger.info("Finished Predicting")
def _raise_nofile_limit(target=4096):
    """Raise the soft open-file limit to ``target`` where the hard limit allows.

    The hard limit is left untouched and an already sufficient soft limit is
    never lowered. A failure to adjust the limit is logged as a warning and
    does not abort the run.

    Args:
        target: Desired minimum soft limit for ``RLIMIT_NOFILE``.
    """
    soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
    if soft == resource.RLIM_INFINITY:
        return  # Already unlimited.
    # An unprivileged process cannot exceed the hard limit, so cap the request.
    wanted = target if hard == resource.RLIM_INFINITY else min(target, hard)
    if soft >= wanted:
        return
    try:
        resource.setrlimit(resource.RLIMIT_NOFILE, (wanted, hard))
    except (ValueError, OSError) as error:
        logger.warning("Could not raise open-file limit to %s: %s", wanted, error)


if __name__ == "__main__":
    _raise_nofile_limit(4096)
    try:
        params = vars(get_params())
        main(params)
    except Exception as exception:
        # Record the traceback in the log file, then let the failure propagate.
        logger.exception(exception)
        raise