-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcode.py
137 lines (124 loc) · 3.77 KB
/
code.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
'''
Author: Mohit Mangal
Email: [email protected], [email protected]
Description: This script classifies the questions given in
test.csv into the what, when, affirmation and unknown
classes.
It uses the questions in the files "what", "when" and
"affirmation" as the training data for the corresponding
class.
The output is written to testOut.csv.
'''
import nltk
import sklearn
import numpy as np
from sklearn.multiclass import OneVsRestClassifier
import csv
# Reading training dataset
# Symbol stored in place of the second word of a training pattern.
PLACEHOLDER_SYMBOL = '!!!!!@@@@@@'

# Every distinct symbol seen in the training patterns, in first-seen order.
distinctSymbols = []


def _register_symbol(symbol):
    """Append *symbol* to distinctSymbols the first time it is seen."""
    if symbol not in distinctSymbols:
        distinctSymbols.append(symbol)


def _load_patterns(path, keyword=None):
    """Read one training file and return its list of patterns.

    Each line is lower-cased and only the text after its last comma is
    used.  That text is POS-tagged with nltk and turned into the pattern
    ``[first word, second-word symbol, POS tag of the second word]``.

    The second-word symbol is PLACEHOLDER_SYMBOL when *keyword* is None
    or when *keyword* occurs anywhere in the text; otherwise it is the
    second word itself.  Every symbol of every pattern is also recorded
    in distinctSymbols.

    Lines with fewer than two tokens (for example a blank trailing line)
    cannot form a pattern and are skipped instead of aborting the run
    with an IndexError.
    """
    patterns = []
    # `with` guarantees the file is closed even if tagging a row fails.
    with open(path, "r") as trainingFile:
        for line in trainingFile:
            question = line.lower().split(',')[-1]
            posTags = nltk.pos_tag(question.strip().split())
            if len(posTags) < 2:
                continue
            firstWord = posTags[0][0]
            secondWord = posTags[1][0]
            secondTag = posTags[1][1]
            if keyword is None or keyword in question:
                secondWord = PLACEHOLDER_SYMBOL
            patterns.append([firstWord, secondWord, secondTag])
            # Registration order (first word, tag, second-word symbol)
            # determines the integer ids assigned to the symbols later,
            # so it must stay as it is.
            for symbol in (firstWord, secondTag, secondWord):
                _register_symbol(symbol)
    return patterns


# The files are read in this order (affirmation, what, when) because the
# order also influences the first-seen order of distinctSymbols.
affirmationPattern = _load_patterns("affirmation")
whatPattern = _load_patterns("what", keyword="what")
whenPattern = _load_patterns("when", keyword="when")
# words to numbers
# Each distinct symbol is mapped to its position in distinctSymbols.
tagsDict = {symbol: index for index, symbol in enumerate(distinctSymbols)}
# Integer class label -> class name.
classDict = {0: 'what', 1: 'when', 2: 'affirmation'}

# Encode the patterns class by class; the position of each pattern list in
# the tuple below is its integer label (the keys of classDict).
trainingX = []
trainingY = []
for label, classPatterns in enumerate((whatPattern, whenPattern, affirmationPattern)):
    for symbols in classPatterns:
        trainingX.append([tagsDict[symbol] for symbol in symbols])
        trainingY.append(label)
trainingX = np.array(trainingX)
trainingY = np.array(trainingY)
#print whatPattern
#print whenPattern
#print affirmationPattern
#print tagsDict
#print trainingX,trainingY
# Training SVM
# Importing the `sklearn` package does not by itself guarantee that the
# `sklearn.svm` submodule is loaded, so import it explicitly before use
# instead of relying on another import having loaded it as a side effect.
import sklearn.svm

# One-vs-rest wrapper around an SVC, fitted on the encoded training patterns.
trainedModel = OneVsRestClassifier(sklearn.svm.SVC(random_state=0)).fit(trainingX, trainingY)
# Testing
# Classify every line of test.csv and write one "<original line>,<category>"
# row per line to testOut.csv.  Both files are managed by `with` so they are
# closed even if an unexpected error escapes the loop.
# NOTE(review): under Python 3 the csv documentation recommends opening the
# output file with newline=''; the mode is left unchanged here because the
# target Python version is not visible from this file - confirm.
with open("test.csv", "r") as fp, open("testOut.csv", "w") as fp1:
    writer = csv.writer(fp1, delimiter=",")
    writer.writerow(['Question', 'Catagory'])
    for originalRow in fp:
        # Only the text after the last comma is classified.
        row = originalRow.lower().strip().split(",")[-1]
        posTags = nltk.pos_tag(row.split())
        try:
            secondWord = posTags[1][0]
            # A second word that never occurred in training is encoded as
            # the placeholder symbol.
            if secondWord in tagsDict:
                secondWordNumber = tagsDict[secondWord]
            else:
                secondWordNumber = tagsDict['!!!!!@@@@@@']
            testData = np.array([[tagsDict[posTags[0][0]],
                                  secondWordNumber,
                                  tagsDict[posTags[1][1]]]])
            catagory = classDict[trainedModel.predict(testData)[0]]
            # An "affirmation" prediction is overridden when the text
            # itself contains 'what' or 'when' ('what' is checked first).
            if catagory == 'affirmation':
                if 'what' in row:
                    catagory = "what"
                elif 'when' in row:
                    catagory = "when"
        except Exception:
            # Deliberate best-effort: rows with fewer than two tokens
            # (IndexError), symbols unseen in training (KeyError) or any
            # other failure to encode/predict are reported as "Unknown".
            catagory = "Unknown"
        writer.writerow([originalRow.strip(), catagory])