from transformers import pipeline
import os
import time
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk import pos_tag, RegexpParser
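# NOTE (setup assumption): sent_tokenize, word_tokenize, stopwords, WordNetLemmatizer
# and pos_tag all rely on NLTK data packages. If they are not already installed,
# one-off downloads along these lines are typically needed:
#   nltk.download('punkt')
#   nltk.download('stopwords')
#   nltk.download('wordnet')
#   nltk.download('averaged_perceptron_tagger')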
# Set the path to the keywords file
keywords_file = 'keywords.txt'
# Set the path to the INPUT folder
input_folder = 'INPUT'
# Set the path to the CODING folder
coding_folder = 'CODING'
# Set the path to the pairs.txt file
pairs_file = 'pairs.txt'
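# Assumed pairs.txt format (inferred from the "abcdefghij" split in generate_reply_sentences
# below): each line holds a prompt and a reply joined by that literal separator, e.g.
# (hypothetical line)
#   how are you abcdefghij I am doing fine, thanks for asking.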
# Function to check for the first non-empty file in the INPUT folder
def get_first_non_empty_file(input_folder):
    for file in os.listdir(input_folder):
        file_path = os.path.join(input_folder, file)
        if os.path.isfile(file_path) and os.path.getsize(file_path) > 0:
            return file_path
    return None

# Function to break text into sentences
def break_text_into_sentences(text):
    sentences = sent_tokenize(text)
    return sentences

# Function to determine the type of sentence (question, exclamation, declaration, etc.)
def determine_sentence_type(sentence):
    if sentence.endswith('?'):
        return 'question'
    elif sentence.endswith('!'):
        return 'exclamation'
    else:
        return 'declaration'
# Function to get keywords from a sentence
def get_keywords(sentence):
    # Define the stop words
    stop_words = set(stopwords.words('english'))
    # Tokenize the sentence into words
    words = word_tokenize(sentence)
    # Initialize the WordNet lemmatizer
    lemmatizer = WordNetLemmatizer()
    # Extract the keywords
    keywords = []
    for word in words:
        # Remove punctuation and convert to lowercase
        word = word.strip('.,!?()[]{}:;\'"').lower()
        # Skip stop words and short words
        if word in stop_words or len(word) < 3:
            continue
        # Lemmatize the word
        word = lemmatizer.lemmatize(word)
        # Add the word to the list of keywords
        keywords.append(word)
    # Append the keywords to the keywords file, one per line, with a trailing
    # newline so successive calls do not run together
    if keywords:
        with open(keywords_file, 'a') as f:
            f.write('\n'.join(keywords) + '\n')
    return keywords
# Function to generate grammatically correct sentences as fallback reply
def generate_fallback_reply(keywords):
    # Initialize the text generation pipeline
    text_generator = pipeline('text-generation', model='gpt2')
    # Generate text using the provided keywords as a prompt
    text = text_generator(' '.join(keywords))[0]['generated_text']
    # Set the fallback reply variable to the generated text
    fallback_reply = text
    return fallback_reply
# Function to check if a sentence is a coding question
def is_coding_question(sentence, keywords):
    # A sentence with no keywords cannot be scored; treat it as non-coding
    if not keywords:
        return False
    # Set the path to the file that contains the bias flag
    bias_file = 'bias.txt'
    # Read the bias flag from the file
    with open(bias_file, 'r') as f:
        bias_flag = f.read(1)
    # Set the bias towards coding or non-coding questions based on the bias flag
    if bias_flag == '1':
        coding_bias = 0.3
    else:
        coding_bias = 0.7
    # Define a list of coding-related keywords
    coding_keywords = ['code', 'coding', 'program',
                       'programming', 'algorithm', 'script', 'function', 'method']
    # Count the number of coding-related keywords in the sentence
    coding_keyword_count = 0
    for keyword in keywords:
        if keyword.lower() in coding_keywords:
            coding_keyword_count += 1
    # Calculate the ratio of coding-related keywords to total keywords
    coding_keyword_ratio = coding_keyword_count / len(keywords)
    # The sentence is a coding question if the ratio reaches the coding bias threshold
    return coding_keyword_ratio >= coding_bias
# Function to save keywords to the first empty file in the CODING folder
def save_keywords_to_coding_folder(keywords, coding_folder):
    for file in os.listdir(coding_folder):
        file_path = os.path.join(coding_folder, file)
        if os.path.isfile(file_path) and os.path.getsize(file_path) == 0:
            with open(file_path, 'w') as f:
                f.write('\n'.join(keywords))
            break
# Function to compare phrases with lines in pairs.txt file and generate reply sentences
def generate_reply_sentences(phrases, pairs_file):
    # Define the maximum number of matches to consider per phrase
    max_matches = 5
    # Read the lines from the pairs.txt file
    with open(pairs_file, 'r') as f:
        pairs_lines = f.readlines()
    # Initialize a list to store the matches
    matches = []
    # Iterate over the phrases
    for phrase in phrases:
        # Initialize a list to store the scores for this phrase
        scores = []
        # Iterate over the lines in the pairs.txt file
        for line in pairs_lines:
            # Split the line into two parts using the "abcdefghij" separator
            parts = line.split('abcdefghij')
            # Check if the line has two parts
            if len(parts) == 2:
                # Score the line by counting how many times the phrase appears in its first part
                score = parts[0].count(phrase)
                # Add the score and the second part of the line (the reply) to the list of scores
                scores.append((score, parts[1].strip()))
        # Sort the scores in descending order by score
        scores.sort(key=lambda x: x[0], reverse=True)
        # Add the top max_matches replies, skipping lines the phrase never appears in
        matches.extend([reply for score, reply in scores[:max_matches] if score > 0])
    # Generate reply sentences from matches
    reply_sentences = []
    for match in matches:
        reply_sentence = match  # Placeholder value
        reply_sentences.append(reply_sentence)
    return reply_sentences
# Function to make reply sentences more humane
def make_reply_sentences_more_humane(reply_sentences):
    # Initialize the text generation pipeline
    text_generator = pipeline('text-generation', model='gpt2')
    # Join the reply sentences into a single text
    text = ' '.join(reply_sentences)
    # Generate additional text using the reply sentences as a prompt
    additional_text = text_generator(text)[0]['generated_text']
    # Split the additional text into sentences
    additional_sentences = additional_text.split('. ')
    # Combine the original reply sentences with the additional sentences
    combined_sentences = reply_sentences + additional_sentences
    return combined_sentences
# Function to check and correct individual reply sentences
def check_and_correct_sentences(sentences, pairs_file, keywords_file):
    # Define the minimum and maximum sentence length (in words)
    min_sentence_length = 3
    max_sentence_length = 20
    # Read the lines from the pairs.txt file
    with open(pairs_file, 'r') as f:
        pairs_lines = f.readlines()
    # Read the known keywords from the keywords file (one keyword per line)
    with open(keywords_file, 'r') as f:
        known_keywords = {line.strip() for line in f if line.strip()}
    # Initialize a list to store the corrected sentences
    corrected_sentences = []
    # Iterate over the sentences
    for sentence in sentences:
        # Split the sentence into words
        words = sentence.split()
        # Check if the sentence is too short or too long
        if len(words) < min_sentence_length or len(words) > max_sentence_length:
            continue
        # Check if the sentence is a duplicate
        if sentence in corrected_sentences:
            continue
        # Check if the sentence is too similar to a sentence in the pairs.txt file
        too_similar = False
        for line in pairs_lines:
            if line.strip() and line.strip() in sentence:
                too_similar = True
                break
        if too_similar:
            continue
        # Check if the sentence repeats the same word twice in a row
        too_similar_keywords = False
        for i in range(len(words) - 1):
            if words[i] == words[i + 1]:
                too_similar_keywords = True
                break
        if too_similar_keywords:
            continue
        # Replace words that are not present in the keywords file
        for i in range(len(words)):
            if words[i] not in known_keywords:
                words[i] = '<REPLACEMENT>'
        # Reconstruct the sentence from the corrected words
        corrected_sentence = ' '.join(words)
        # Add punctuation between sentences (alternating between full stops, commas, and semicolons)
        if len(corrected_sentences) % 3 == 0:
            corrected_sentence += '.'
        elif len(corrected_sentences) % 3 == 1:
            corrected_sentence += ','
        else:
            corrected_sentence += ';'
        # Add the corrected sentence to the list of corrected sentences
        corrected_sentences.append(corrected_sentence)
    return corrected_sentences
# Function to break a sentence into noun phrases, dropping non-keyword parts
def extract_phrases(sentence):
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
    # Tokenize the sentence into words
    words = word_tokenize(sentence)
    # Perform POS tagging on the words
    tagged_words = pos_tag(words)
    # Define a regular expression pattern for phrase extraction
    grammar = r"""
        NP: {<DT>?<JJ>*<NN.*>+}    # Noun phrase
    """
    # Create a chunk parser with the defined grammar pattern
    chunk_parser = RegexpParser(grammar)
    # Parse the tagged words to extract noun phrases
    tree = chunk_parser.parse(tagged_words)
    # Initialize a list to store the extracted phrases
    phrases = []
    # Traverse the tree to extract the phrases
    for subtree in tree.subtrees():
        if subtree.label() == 'NP':
            # Extract the words in the phrase
            phrase_words = [lemmatizer.lemmatize(word.lower()) for word, tag in subtree.leaves()]
            # Remove stop words and short words from the phrase
            phrase_words = [word for word in phrase_words if word not in stop_words and len(word) >= 3]
            # Combine the words to form the phrase
            phrase = ' '.join(phrase_words)
            # Add the phrase to the list of extracted phrases (skip phrases that end up empty)
            if phrase:
                phrases.append(phrase)
    return phrases
while True:
    # Get the first non-empty file in the INPUT folder
    input_file = get_first_non_empty_file(input_folder)
    if input_file:
        # Read the content of the input file
        with open(input_file, 'r') as f:
            text = f.read()
        # Break the text into sentences
        sentences = break_text_into_sentences(text)
        for sentence in sentences:
            # Determine the type of sentence (question, exclamation, declaration, etc.)
            sentence_type = determine_sentence_type(sentence)
            # Get keywords from the sentence
            keywords = get_keywords(sentence)
            # Generate fallback reply using keywords
            fallback_reply = generate_fallback_reply(keywords)
            # Check if the sentence is a coding question
            if is_coding_question(sentence, keywords):
                # Save keywords to the first empty file in the CODING folder and stop processing this sentence
                save_keywords_to_coding_folder(keywords, coding_folder)
                continue
            # Break the sentence into phrases and drop non-keyword parts (conjunctions, prepositions, etc.)
            phrases = extract_phrases(sentence)
            # Compare phrases with lines in the pairs.txt file and generate reply sentences
            reply_sentences = generate_reply_sentences(phrases, pairs_file)
            # Make reply sentences more humane by generating additional sentences around the originals
            reply_sentences = make_reply_sentences_more_humane(reply_sentences)
            # Check and correct individual reply sentences (length limits, duplicates, keyword replacement, punctuation)
            reply_sentences = check_and_correct_sentences(reply_sentences, pairs_file, keywords_file)
            # Append the resulting reply to the original input file
            with open(input_file, 'a') as f:
                f.write('\n'.join(reply_sentences) + '\n')
    else:
        # No non-empty input file yet; wait briefly to avoid busy-waiting
        time.sleep(1)