-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathSVM QA Model
91 lines (71 loc) · 3.3 KB
/
SVM QA Model
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from scipy.sparse import hstack
from nltk import pos_tag, word_tokenize
#Load and preprocess the training data
def load_training_data(file_path):
    """Load the pickled .npy training set and unpack its per-record fields.

    Each record is a dict carrying a question, its answer, three distractor
    strings, a label, the list of answer choices, and the choice ordering.
    Returns six parallel lists (one entry per record) in that order.
    """
    records = np.load(file_path, allow_pickle=True)
    questions, answer_texts, distractor_texts = [], [], []
    label_values, choice_lists, choice_orders = [], [], []
    for rec in records:
        questions.append(rec['question'])
        answer_texts.append(rec['answer'])
        # Fold the three distractor fields into one space-separated string.
        distractor_texts.append(
            str(rec['distractor1']) + " " + str(rec['distractor2']) + " " + str(rec['distractor(unsure)'])
        )
        label_values.append(rec['label'])
        choice_lists.append(rec['choice_list'])
        choice_orders.append(rec['choice_order'])
    return questions, answer_texts, distractor_texts, label_values, choice_lists, choice_orders
#BoW features with TF-IDF
def generate_tfidf_features(sentences):
    """Vectorize raw sentences into a sparse TF-IDF feature matrix.

    Text is lowercased and English stop words are removed before weighting.
    NOTE(review): the fitted vectorizer is discarded here, so later/unseen
    text cannot be projected into the same vocabulary — confirm intended.
    """
    tfidf = TfidfVectorizer(lowercase=True, stop_words="english")
    return tfidf.fit_transform(sentences)
#POS tagging
def generate_pos_features(sentences):
    """Build TF-IDF features over part-of-speech tag sequences.

    Each sentence is tokenized and POS-tagged with NLTK; the tag sequence
    is re-joined into a single string so a TfidfVectorizer can weight the
    tag "vocabulary" just like words.
    """
    tag_strings = []
    for sentence in sentences:
        tokens = word_tokenize(sentence)
        tag_strings.append(" ".join(tag for _, tag in pos_tag(tokens)))
    return TfidfVectorizer().fit_transform(tag_strings)
#Concatenate Answer, Question, and Distractor features
def combine_question_answer_distractor_features(question, answer, distractors):
    """TF-IDF features over each question + answer + distractors concatenation.

    The three parallel lists are zipped row-wise; each sample's texts are
    space-joined into one document before vectorization.
    """
    merged_docs = []
    for q, a, d in zip(question, answer, distractors):
        merged_docs.append(q + " " + a + " " + d)
    return generate_tfidf_features(merged_docs)
#Combine all features
def combine_all_features(tfidf_features, pos_features, combined_features):
    """Concatenate the three sparse feature matrices column-wise.

    Rows stay aligned per sample; the output's columns are the three
    feature spaces side by side, in the order given.
    """
    feature_blocks = [tfidf_features, pos_features, combined_features]
    return hstack(feature_blocks)
#Train SVM classifier
def train_svm(features, labels):
    """Fit and return a linear-kernel SVM.

    probability=True enables probability estimates on the fitted model
    (at extra training cost), matching the original configuration.
    """
    model = SVC(kernel='linear', probability=True)
    model.fit(features, labels)
    return model
# --- Main execution ---
# Load the training records from disk.
file_path = "sp-train.npy"
sentences, answers, distractors, labels, choice_list, choice_order = load_training_data(file_path)

# Map the raw labels onto integer class ids for the classifier.
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

# Build three feature views of the data:
#   1) TF-IDF over the raw question text,
#   2) TF-IDF over the questions' POS-tag sequences,
#   3) TF-IDF over question+answer+distractor concatenations,
# then stack them side by side into one sparse matrix.
tfidf_features = generate_tfidf_features(sentences)
pos_features = generate_pos_features(sentences)
combined_features = combine_question_answer_distractor_features(sentences, answers, distractors)
final_features = combine_all_features(tfidf_features, pos_features, combined_features)

# Hold out 20% of the samples for evaluation (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    final_features, encoded_labels, test_size=0.2, random_state=36
)

# Fit the SVM on the training split and predict on the held-out split.
svm_classifier = train_svm(X_train, y_train)
predictions = svm_classifier.predict(X_test)

# Report raw counts and accuracy on the held-out split.
correct_guesses = (predictions == y_test).sum()
total_guesses = len(y_test)
print(f"Total guesses: {total_guesses}")
print(f"Correct guesses: {correct_guesses}")
print(f"Accuracy: {correct_guesses / total_guesses:.2f}")