classifier_svm.py (forked from tobiasvanderwerff/HT-vs-MT)
#!/usr/bin/env python
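"""Train and evaluate a text classifier (linear SVM or multinomial Naive Bayes)
on bag-of-words or TF-IDF features, report precision/recall/F1 on the dev set,
and optionally print the most informative features and plot a confusion matrix."""
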
import sys
from functools import partial
import spacy
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.multiclass import OneVsRestClassifier
from sklearn import svm
from sklearn.calibration import CalibratedClassifierCV
from svm_utils import (
    load_data,
    print_division,
    print_best_features,
    plot_confusion_matrix,
    parse_args_svm,
)


def main():
    args = parse_args_svm()

    # Print the command line for reproducibility when running lots of experiments.
    print("Generated by:\npython {0}\n".format(" ".join(sys.argv)))

    # Read in the training and development data.
    X_train, Y_train = load_data(
        args.root_dir, "train", args.use_google_data, args.use_normalized_data
    )
    X_test, Y_test = load_data(
        args.root_dir, "dev", args.use_google_data, args.use_normalized_data
    )

    def _tokenizer(doc, spacy_tokenizer):
        return [str(tkn) for tkn in spacy_tokenizer(doc)]

    # Convert the texts to vectors.
    if args.tfidf:
        vec = TfidfVectorizer(min_df=args.min_df, ngram_range=(1, 2))
    else:
        # Simple BoW vectorizer with spaCy tokenization.
        tokenizer = partial(_tokenizer, spacy_tokenizer=spacy.load("en_core_web_sm"))
        vec = CountVectorizer(
            min_df=args.min_df, ngram_range=(1, 2), tokenizer=tokenizer
        )

    # Choose the algorithm.
    if args.algorithm == "nb":
        clf = Pipeline([("vec", vec), ("cls", MultinomialNB())])
    elif args.algorithm == "svm":
        clf = svm.LinearSVC(C=1)
        # Wrap in CalibratedClassifierCV so we can use predict_proba later.
        # (Newer scikit-learn versions use `estimator=` instead of `base_estimator=`.)
        if args.probabilities:
            clf = CalibratedClassifierCV(base_estimator=clf)
        clf = Pipeline([("vec", vec), ("cls", clf)])

    # Do we do one-vs-one or one-vs-rest?
    if args.one_vs_rest:
        clf = Pipeline([("cls", OneVsRestClassifier(clf))])

    # Train on the training set and evaluate on the separate dev set.
    clf.fit(X_train, Y_train)
    Y_pred = clf.predict(X_test)

    print_division(clf.named_steps["cls"].classes_, Y_train)
    print(classification_report(Y_test, Y_pred, digits=3))

    # Feature analysis, only possible for the (non one-vs-rest) SVM.
    if args.features and args.algorithm == "svm" and not args.one_vs_rest:
        print_best_features(vec, clf, X_train, args.only_word_features)

    # Plot a confusion matrix if requested.
    if args.confusion:
        Y_plot = Y_test
        plot_confusion_matrix(
            confusion_matrix(Y_plot, Y_pred),
            [c for c in clf.named_steps["cls"].classes_],
            args.confusion,
            normalize=False,
        )


if __name__ == "__main__":
    main()
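
# Example invocation (a sketch only: the exact command-line flags are defined in
# svm_utils.parse_args_svm, which is not part of this file, so the flag names
# below are assumptions for illustration):
#
#   python classifier_svm.py --root_dir data/ --algorithm svm --min_df 2 --tfidf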