-
-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathgen.py
326 lines (267 loc) · 13.3 KB
/
gen.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Script to Generate Vocabulary from a Large Corpus or Dataset and Update a Model Qelm/JSON with a Tkinter GUI.
You can alter this in any way as it's very rudi and may not reflect current models as the base may change consistantly.
"""
import json
import os
import tkinter as tk
from tkinter import filedialog, messagebox, scrolledtext, ttk
from nltk.tokenize import word_tokenize
from collections import Counter
import numpy as np
import nltk
# Download necessary NLTK data
nltk.download('punkt', quiet=True)
def generate_vocabulary_from_corpus(corpus: str, vocab_size: int) -> dict:
"""
Generate a vocabulary and token-to-ID mapping from the given text corpus.
"""
if not corpus.strip():
raise ValueError("Input corpus is empty or contains no valid text.")
# Tokenize the corpus
tokens = word_tokenize(corpus.lower())
if not tokens:
raise ValueError("Tokenization produced no tokens. Ensure the input corpus contains valid text.")
# Count token frequencies
token_freq = Counter(tokens)
# Select the most common tokens based on vocab_size
most_common_tokens = [token for token, _ in token_freq.most_common(vocab_size)]
# Create token-to-ID mapping
token_to_id = {token: idx for idx, token in enumerate(most_common_tokens)}
return token_to_id
def update_model_with_vocabulary(json_file: str, token_to_id: dict):
"""
Update the model JSON file with a `token_to_id` mapping and ensure embeddings match the vocabulary.
"""
# Load the model JSON file
if not os.path.exists(json_file):
raise FileNotFoundError(f"Model JSON file not found: {json_file}")
with open(json_file, "r") as f:
model_dict = json.load(f)
# Update the model with token mappings
model_dict["token_to_id"] = token_to_id
vocab_size = len(token_to_id)
# Ensure embeddings match the new vocabulary size
if "embeddings" in model_dict:
embed_dim = len(model_dict["embeddings"][0])
model_dict["embeddings"] = (np.random.randn(vocab_size, embed_dim) * 0.01).tolist()
else:
raise ValueError("Embeddings are missing in the model JSON.")
# Save the updated model JSON file
with open(json_file, "w") as f:
json.dump(model_dict, f, indent=4)
print(f"Model updated successfully with vocabulary. Saved to: {json_file}")
def validate_input_text(text: str, token_to_id: dict):
"""
Validate that the input text contains tokens present in the vocabulary.
"""
tokens = word_tokenize(text.lower())
valid_tokens = [token for token in tokens if token in token_to_id]
if not valid_tokens:
missing_tokens = [token for token in tokens if token not in token_to_id]
raise ValueError(
f"Input text contains no valid tokens. Missing tokens: {missing_tokens}. "
f"Consider expanding the corpus to include these words."
)
return valid_tokens
class VocabularyGUI:
def __init__(self, master):
self.master = master
master.title("Vocabulary Generator and Model Updater")
# Initialize variables
self.json_file = tk.StringVar()
self.vocab_size = tk.IntVar(value=100000)
self.sample_input = tk.StringVar()
self.corpus_mode = tk.StringVar(value="manual")
self.dataset_path = tk.StringVar()
# Layout configuration
self.create_widgets()
def create_widgets(self):
# Mode Selection
mode_frame = tk.Frame(self.master)
mode_frame.pack(padx=10, pady=5, fill=tk.X)
tk.Label(mode_frame, text="Corpus Source:").pack(anchor=tk.W)
modes = [("Manual Text Corpus", "manual"), ("Use Dataset", "dataset")]
for text, mode in modes:
tk.Radiobutton(mode_frame, text=text, variable=self.corpus_mode, value=mode, command=self.toggle_corpus_source).pack(anchor=tk.W)
# Dataset Selection
self.dataset_frame = tk.Frame(self.master)
self.dataset_frame.pack(padx=10, pady=5, fill=tk.X)
tk.Label(self.dataset_frame, text="Dataset File/Directory:").pack(side=tk.LEFT)
self.dataset_entry = tk.Entry(self.dataset_frame, textvariable=self.dataset_path, width=50, state='disabled')
self.dataset_entry.pack(side=tk.LEFT, padx=5)
self.browse_dataset_button = tk.Button(self.dataset_frame, text="Browse", command=self.browse_dataset, state='disabled')
self.browse_dataset_button.pack(side=tk.LEFT)
# JSON File Selection
json_frame = tk.Frame(self.master)
json_frame.pack(padx=10, pady=5, fill=tk.X)
tk.Label(json_frame, text="Model JSON File:").pack(side=tk.LEFT)
self.json_entry = tk.Entry(json_frame, textvariable=self.json_file, width=50)
self.json_entry.pack(side=tk.LEFT, padx=5)
tk.Button(json_frame, text="Browse", command=self.browse_json).pack(side=tk.LEFT)
# Vocabulary Size
vocab_frame = tk.Frame(self.master)
vocab_frame.pack(padx=10, pady=5, fill=tk.X)
tk.Label(vocab_frame, text="Vocabulary Size:").pack(side=tk.LEFT)
self.vocab_entry = tk.Entry(vocab_frame, textvariable=self.vocab_size, width=10)
self.vocab_entry.pack(side=tk.LEFT, padx=5)
# Text Corpus
self.corpus_frame = tk.Frame(self.master)
self.corpus_frame.pack(padx=10, pady=5, fill=tk.BOTH, expand=True)
tk.Label(self.corpus_frame, text="Text Corpus:").pack(anchor=tk.W)
self.corpus_text = scrolledtext.ScrolledText(self.corpus_frame, height=10)
self.corpus_text.pack(fill=tk.BOTH, expand=True)
# Insert default corpus
default_corpus = """
Quantum computing is an area of computing focused on developing computer technology
based on the principles of quantum theory. Language models aim to generate meaningful
text based on input, bridging the gap between technology and human interaction.
Great literature includes works like those of Shakespeare, Dickens, and Austen.
Scientific advancements span physics, biology, and chemistry. Everyday conversations
include phrases like "hello," "how are you?" and "what's the weather like today?"
Advanced topics in artificial intelligence, machine learning, and deep learning
pave the way for innovation. Historical texts, scientific papers, and novels
contribute to a rich corpus for modeling.
Popular culture references, news headlines, and casual dialogue make up modern
conversations. The integration of complex fields like robotics and autonomous
systems revolutionizes industries. Let's ensure our corpus is broad enough to
capture all nuances of human language!
"""
self.corpus_text.insert(tk.END, default_corpus.strip())
# Sample Input
sample_frame = tk.Frame(self.master)
sample_frame.pack(padx=10, pady=5, fill=tk.X)
tk.Label(sample_frame, text="Sample Input:").pack(side=tk.LEFT)
self.sample_entry = tk.Entry(sample_frame, textvariable=self.sample_input, width=50)
self.sample_entry.pack(side=tk.LEFT, padx=5)
self.sample_entry.insert(0, "hello world, what's quantum computing?")
# Buttons
button_frame = tk.Frame(self.master)
button_frame.pack(padx=10, pady=10)
tk.Button(button_frame, text="Generate Vocabulary", command=self.generate_vocabulary).pack(side=tk.LEFT, padx=5)
tk.Button(button_frame, text="Validate Input", command=self.validate_input).pack(side=tk.LEFT, padx=5)
tk.Button(button_frame, text="Update Model JSON", command=self.update_model).pack(side=tk.LEFT, padx=5)
# Log Area
log_frame = tk.Frame(self.master)
log_frame.pack(padx=10, pady=5, fill=tk.BOTH, expand=True)
tk.Label(log_frame, text="Log:").pack(anchor=tk.W)
self.log_text = scrolledtext.ScrolledText(log_frame, height=10, state='disabled')
self.log_text.pack(fill=tk.BOTH, expand=True)
def toggle_corpus_source(self):
mode = self.corpus_mode.get()
if mode == "manual":
self.dataset_entry.config(state='disabled')
self.browse_dataset_button.config(state='disabled')
self.corpus_text.config(state='normal')
self.log("Switched to Manual Text Corpus mode.")
else:
self.dataset_entry.config(state='normal')
self.browse_dataset_button.config(state='normal')
self.corpus_text.config(state='disabled')
self.log("Switched to Dataset mode.")
def browse_json(self):
filepath = filedialog.askopenfilename(
title="Select Model JSON File",
filetypes=[("JSON Files", "*.json"), ("All Files", "*.*")]
)
if filepath:
self.json_file.set(filepath)
self.log(f"Selected JSON file: {filepath}")
def browse_dataset(self):
filepath = filedialog.askopenfilename(
title="Select Dataset File or Directory",
filetypes=[("Text Files", "*.txt"), ("All Files", "*.*")]
)
if not filepath:
# If no file is selected, try selecting a directory
filepath = filedialog.askdirectory(title="Select Dataset Directory")
if filepath:
self.dataset_path.set(filepath)
self.log(f"Selected dataset path: {filepath}")
def log(self, message):
self.log_text.config(state='normal')
self.log_text.insert(tk.END, message + "\n")
self.log_text.see(tk.END)
self.log_text.config(state='disabled')
def load_corpus_from_dataset(self, path):
"""
Load text from a single file or all text files in a directory.
"""
if os.path.isfile(path):
if not path.lower().endswith('.txt'):
raise ValueError("Selected file is not a text file (*.txt).")
with open(path, 'r', encoding='utf-8') as f:
corpus = f.read()
self.log(f"Loaded corpus from file: {path}")
return corpus
elif os.path.isdir(path):
corpus = ""
txt_files = [f for f in os.listdir(path) if f.lower().endswith('.txt')]
if not txt_files:
raise ValueError("No text files (*.txt) found in the selected directory.")
for file in txt_files:
file_path = os.path.join(path, file)
with open(file_path, 'r', encoding='utf-8') as f:
corpus += f.read() + " "
self.log(f"Loaded text from: {file_path}")
return corpus
else:
raise ValueError("Selected path is neither a file nor a directory.")
def generate_vocabulary(self):
self.log("Generating vocabulary...")
mode = self.corpus_mode.get()
vocab_size = self.vocab_size.get()
try:
if mode == "manual":
corpus = self.corpus_text.get("1.0", tk.END)
token_to_id = generate_vocabulary_from_corpus(corpus, vocab_size)
self.log(f"Generated {len(token_to_id)} tokens for the vocabulary from manual corpus.")
else:
dataset_path = self.dataset_path.get()
if not dataset_path:
raise ValueError("Dataset path is not specified.")
corpus = self.load_corpus_from_dataset(dataset_path)
token_to_id = generate_vocabulary_from_corpus(corpus, vocab_size)
self.log(f"Generated {len(token_to_id)} tokens for the vocabulary from dataset.")
messagebox.showinfo("Success", f"Vocabulary generated with {len(token_to_id)} tokens.")
self.generated_token_to_id = token_to_id # Store for later use
except ValueError as e:
self.log(f"Error while generating vocabulary: {e}")
messagebox.showerror("Error", f"Failed to generate vocabulary:\n{e}")
def validate_input(self):
self.log("Validating sample input...")
sample = self.sample_input.get()
try:
if not hasattr(self, 'generated_token_to_id'):
raise ValueError("Vocabulary not generated yet. Please generate vocabulary first.")
valid_tokens = validate_input_text(sample, self.generated_token_to_id)
self.log(f"Valid tokens: {valid_tokens}")
messagebox.showinfo("Validation Success", f"Valid tokens:\n{valid_tokens}")
except ValueError as e:
self.log(f"Error while validating input: {e}")
messagebox.showerror("Validation Error", f"Failed to validate input:\n{e}")
def update_model(self):
self.log("Updating model JSON with vocabulary...")
json_file = self.json_file.get()
try:
if not hasattr(self, 'generated_token_to_id'):
raise ValueError("Vocabulary not generated yet. Please generate vocabulary first.")
if not json_file:
raise ValueError("JSON file path is not specified.")
update_model_with_vocabulary(json_file, self.generated_token_to_id)
self.log("Vocabulary update complete.")
messagebox.showinfo("Success", "Model JSON updated successfully.")
except (FileNotFoundError, ValueError) as e:
self.log(f"Error while updating model JSON: {e}")
messagebox.showerror("Update Error", f"Failed to update model JSON:\n{e}")
# ============================
# Entry Point
# ============================
def main():
root = tk.Tk()
gui = VocabularyGUI(root)
root.mainloop()
if __name__ == "__main__":
main()