# Chapter2.jl
# 2.2 Tokenizing Text
using HTTP
function download_github_raw(url::String)
response = HTTP.get(url)
return String(response.body)
end
url = "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch02/01_main-chapter-code/the-verdict.txt"
raw_text = download_github_raw(url)
# For demonstration purposes, we only show the first 100 characters of the text.
println("Total number of characters: ", length(raw_text))
println(first(raw_text, 99))
# Before transitioning to a prebuilt tokenizer
# import re
# text = "Hello, world. This, is a test."
# result = re.split(r'(\s)', text)
# print(result)
text = "Hello, world. This, is a test."
# result = split(text, r"\s") does not retain the whitespaces as with the split() in python.
# Capturing groups do not affect the behavior of split(). So, the whitespaces are not retained as with the split() in python.
# result = split(text, r"(\s)") <=> result = split(text, r"\s")
# Benchmark regex matching with retention of whitespaces
using BenchmarkTools
# Assign @btime's return value so that `result` is defined at top level;
# interpolate `text` with $ so global-variable access is not part of the timing
result = @btime [m.match for m in eachmatch(r"\S+|\s", $text)] evals = 1000
println(result)
# ==============================
# result = re.split(r'([,.]|\s)', text)
# print(result)
result = split(text, r"([,.]|\s)")
println(result)
# result = [item for item in result if item.strip()]
# print(result)
# Note: Julia comprehensions support an `if` filter clause much like Python's;
# here we use filter! instead (an equivalent comprehension is shown below)
filter!(x -> x != "", result)
println(result)
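# The equivalent comprehension, mirroring the Python list comprehension above:
result_alt = [item for item in split(text, r"([,.]|\s)") if item != ""]
@assert result_alt == result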
# "Let’s modify it a bit further so that it can also handle other types of punctuation, such as question marks, quotation marks, and the double-dashes"
# text = "Hello, world. Is this-- a test?"
# result = re.split(r'([,.:;?_!"()\']|--|\s)', text)
# result = [item.strip() for item in result if item.strip()]
# print(result)
text = """Hello, world. Is this-- a test?
new
line and another one. And
"""
# Define the regular expression pattern
pattern = r"([,.:;?_!\"()\']|--|\s)"
# Function to split text by matches while keeping the delimiters
import Base.split
function split(pattern::Regex, text::AbstractString, with_dlm::Bool=true)
    result = String[]
    last_index = firstindex(text)
    for m in eachmatch(pattern, text)
        start = m.offset                           # start index of the match
        stop = m.offset + ncodeunits(m.match) - 1  # end index (handles multi-char delimiters like "--")
        # Push the substring between the previous match and this one, if non-empty
        if last_index < start
            push!(result, text[last_index:start-1])
        end
        # Optionally keep the delimiter itself (mirrors Python's re.split with a capturing group)
        if with_dlm && !isempty(m.match)
            push!(result, m.match)
        end
        last_index = stop + 1
    end
    # Add the final substring if it's not empty
    if last_index <= lastindex(text)
        push!(result, text[last_index:end])
    end
    return result
end
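# Sanity check for the custom split: delimiters are retained by default,
# mirroring Python's re.split with a capturing group
@assert split(r"([,.]|\s)", "Hello, world.") == ["Hello", ",", " ", "world", "."]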
# Define the regular expression pattern
pattern = r"([,.:;?_!\"()']|--|\s)"
# Get the result
result = split(pattern, text, false)
result = strip.(result)
# Remove empty strings from the result
result = filter(!isempty, result)
println(result)
# Getting back to the original text
# preprocessed = re.split(r'([,.?_!"()\']|--|\s)', raw_text)
# preprocessed = [item.strip() for item in preprocessed if item.strip()]
# print(len(preprocessed))
preprocessed = split(r"([,.:;?_!\"()']|--|\s)", raw_text)
# 3 ways of achieving the same result:
# (1) an array comprehension (the book's Python code reports 4690 tokens; the count here may differ slightly)
preprocessed = [strip(item) for item in preprocessed if strip(item) != ""]
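# (2) broadcasting strip, then filtering out empties (equivalent, shown for reference):
# preprocessed = filter(!isempty, strip.(preprocessed))
# (3) filtering first, then broadcasting strip (also equivalent):
# preprocessed = strip.(filter(x -> strip(x) != "", preprocessed))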
println(length(preprocessed))
println(preprocessed)
println(preprocessed[begin:30])
# 2.3 Converting tokens into IDs
# all_words = sorted(set(preprocessed))
# vocab_size = len(all_words)
# print(vocab_size)
all_words = sort(collect(Set(preprocessed)))
vocab_size = length(all_words)
println(vocab_size)
# Listing 2.2 Creating a vocabulary
# vocab = {token:integer for integer,token in enumerate(all_words)}
# for i, item in enumerate(vocab.items()):
# print(item)
# if i >= 50:
# break
vocab = Dict(word => idx for (idx, word) in enumerate(all_words))
# Note: Julia Dicts are unordered, so these 50 entries print in arbitrary order
for (i, item) in enumerate(vocab)
println(item)
if i >= 50
break
end
end
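# For a deterministic preview instead, sort the (token => id) pairs by id:
for (token, id) in sort(collect(vocab), by=last)[1:50]
    println(token => id)
end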
# class SimpleTokenizerV1:
# def __init__(self, vocab):
# self.str_to_int = vocab #A
# self.int_to_str = {i:s for s,i in vocab.items()} #B
# def encode(self, text): #C
# preprocessed = re.split(r'([,.?_!"()\']|--|\s)', text)
# preprocessed = [
# item.strip() for item in preprocessed if item.strip()
# ]
# ids = [self.str_to_int[s] for s in preprocessed]
# return ids
# def decode(self, ids): #D
# text = " ".join([self.int_to_str[i] for i in ids])
# text = re.sub(r'\s+([,.?!"()\'])', r'\1', text) #E
# return text
# helper functions for encode/decode: preallocate the output vector, then fill it
function prealloc_str_int(preprocessed, vocab)
    n = length(preprocessed)
    result = Vector{Int}(undef, n)
    for (i, s) in enumerate(preprocessed)
        result[i] = vocab[s]
    end
    return result
end
function prealloc_int_str(vocab, ids)
    n = length(ids)
    result = Vector{String}(undef, n)
    for (i, id) in enumerate(ids)
        result[i] = vocab[id]
    end
    return result
end
struct SimpleTokenizerV1
str_to_int::Dict{String,Int}
int_to_str::Dict{Int,String}
function SimpleTokenizerV1(vocab::Dict{<:AbstractString,Int})
str_to_int = vocab
int_to_str = Dict(i => s for (s, i) in vocab)
new(str_to_int, int_to_str)
end
end
function encode(tokenizer::SimpleTokenizerV1, text::String)
preprocessed = split(r"([,.?_!\"()\']|--|\s)", text)
    # Drop whitespace tokens (the Julia analogue of Python's `if item.strip()`)
    preprocessed = filter(x -> !isempty(strip(x)), preprocessed)
ids = prealloc_str_int(preprocessed, tokenizer.str_to_int)
return ids
end
function decode(tokenizer::SimpleTokenizerV1, ids::Vector{Int})
text = join(prealloc_int_str(tokenizer.int_to_str, ids), " ")
text = replace(text, r"\s+([,.?!\"()\'])" => s"\1")
return text
end
# Example usage
vocab = Dict("hello" => 1, "world" => 2, "," => 3, "!" => 4)
tokenizer = SimpleTokenizerV1(vocab)
# Encoding
text = "hello, world!"
encoded = encode(tokenizer, text)
println(encoded) # Output: [1, 3, 2, 4]
# Decoding
decoded = decode(tokenizer, encoded)
println(decoded) # Output: "hello, world!"
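# Round trip: decode(encode(...)) recovers the original string here, because
# decode() re-attaches punctuation to the preceding word
@assert decode(tokenizer, encode(tokenizer, text)) == text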
# 2.4 Adding special context tokens
# all_tokens = sorted(list(set(preprocessed)))
# all_tokens.extend(["<|endoftext|>", "<|unk|>"])
# vocab = {token:integer for integer,token in enumerate(all_tokens)}
all_tokens = sort(collect(Set(preprocessed)))
push!(all_tokens, "<|endoftext|>", "<|unk|>")
vocab = Dict(token => integer for (integer, token) in enumerate(all_tokens))
# check whether the special tokens are in the vocab
# for i, item in enumerate(list(vocab.items())[-5:]):
# print(item)
# instead of looping over the last 5 items, we can check membership directly
println(all(key -> haskey(vocab, key), ["<|endoftext|>", "<|unk|>"]))
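# The analogue of Python's "last 5 items" check, with pairs sorted by id:
for (token, id) in sort(collect(vocab), by=last)[end-4:end]
    println(token => id)
end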
# class SimpleTokenizerV2:
# def __init__(self, vocab):
# self.str_to_int = vocab
# self.int_to_str = { i:s for s,i in vocab.items()}
# def encode(self, text):
# preprocessed = re.split(r'([,.?_!"()\']|--|\s)', text)
# preprocessed = [
# item.strip() for item in preprocessed if item.strip()
# ]
# preprocessed = [item if item in self.str_to_int #A
# else "<|unk|>" for item in preprocessed]
# ids = [self.str_to_int[s] for s in preprocessed]
# return ids
# def decode(self, ids):
# text = " ".join([self.int_to_str[i] for i in ids])
# text = re.sub(r'\s+([,.?!"()\'])', r'\1', text) #B
# return text
struct SimpleTokenizerV2
str_to_int::Dict{String,Int}
int_to_str::Dict{Int,String}
function SimpleTokenizerV2(vocab::Dict{<:AbstractString,Int})
str_to_int = vocab
int_to_str = Dict(i => s for (s, i) in vocab)
new(str_to_int, int_to_str)
end
end
function encode(tokenizer::SimpleTokenizerV2, text::String)
preprocessed = split(r"([,.?_!\"()\']|--|\s)", text)
    # Drop whitespace tokens (the Julia analogue of Python's `if item.strip()`)
    preprocessed = filter(x -> !isempty(strip(x)), preprocessed)
preprocessed = [haskey(tokenizer.str_to_int, item) ? item : "<|unk|>" for item in preprocessed]
ids = prealloc_str_int(preprocessed, tokenizer.str_to_int)
return ids
end
function decode(tokenizer::SimpleTokenizerV2, ids::Vector{Int})
text = join(prealloc_int_str(tokenizer.int_to_str, ids), " ")
text = replace(text, r"\s+([,.?!\"()\'])" => s"\1")
return text
end
# text1 = "Hello, do you like tea?"
# text2 = "In the sunlit terraces of the palace."
# text = " <|endoftext|> ".join((text1, text2))
# print(text)
text1 = "Hello, do you like, tea?"
text2 = "In the sunlit terraces of the palace."
text = join([text1, text2], " <|endoftext|> ")
tokenizer2 = SimpleTokenizerV2(vocab)
# Encoding
encoded = encode(tokenizer2, text)
println(encoded) # ids over the extended vocabulary
# Decoding
decoded = decode(tokenizer2, encoded)
println(decoded) # words absent from the vocabulary decode as <|unk|>
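# Locate the <|unk|> substitutions (tokens not found in the vocabulary):
unk_id = vocab["<|unk|>"]
println("Positions of <|unk|>: ", findall(==(unk_id), encoded))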
# 2.5 Byte Pair Encoding (BPE)
# from importlib.metadata import version
# import tiktoken
# print("tiktoken version:", version("tiktoken"))
using BytePairEncoding
# text = (
# "Hello, do you like tea? <|endoftext|> In the sunlit terraces",
# "of someunknownPlace."
# )
# integers = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
# print(integers)
# tokenizer = tiktoken.get_encoding("gpt2")
# Thorough discussion of GPT-2 and tiktoken (OpenAI's BPE tokenizer format): https://chatgpt.com/share/67248e53-89ac-8008-b643-4d0accce91e8
# tokenizer = BytePairEncoding.load_tiktoken("gpt2")
tokenizer = BytePairEncoding.load_tiktoken_encoder("gpt2")
integers = tokenizer.encode("Hello, do you like tea? <|endoftext|> In the sunlit terraces of someunknownPlace.")
println(integers)
# strings = tokenizer.decode(integers)
# print(strings)
strings = tokenizer.decode(integers)
println(strings)
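# BPE needs no <|unk|> token: unseen words are decomposed into known subword units.
# (Uses the same encode/decode interface as above.)
println(tokenizer.encode("someunknownPlace"))
println(tokenizer.decode(tokenizer.encode("someunknownPlace")))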
# 2.6 Data sampling with a sliding window
# with open("the-verdict.txt", "r", encoding="utf-8") as f:
# raw_text = f.read()
# enc_text = tokenizer.encode(raw_text)
# print(len(enc_text))
enc_text = tokenizer.encode(raw_text)
println(length(enc_text))
# enc_sample = enc_text[50:] starting from 51 due to python's zero indexing
enc_sample = enc_text[51:end]
# context_size = 4
# x = enc_sample[:context_size]
# y = enc_sample[1:context_size+1]
# print(f"x: {x}")
# print(f"y: {y}")
context_size = 4
x = enc_sample[1:context_size]
y = enc_sample[2:context_size+1]
print("x: $x")
print("y: $y")
# for i in range(1, context_size+1):
# context = enc_sample[:i]
# desired = enc_sample[i]
# print(context, "---->", desired)
# Start with an empty context and grow it one token per iteration
context = Vector{eltype(enc_sample)}()
for i in 1:context_size
# Add the current element to the context
push!(context, enc_sample[i])
desired = enc_sample[i+1]
println(context, " ----> ", desired)
end
# for i in range(1, context_size+1):
# context = enc_sample[:i]
# desired = enc_sample[i]
# print(tokenizer.decode(context), "---->", tokenizer.decode([desired]))
for i in 1:context_size
    # Rebuild the context from scratch (mirrors Python's enc_sample[:i])
    local context = enc_sample[1:i]
    desired = enc_sample[i+1]
    println(tokenizer.decode(context), " ----> ", tokenizer.decode([desired]))
end
# import torch
# from torch.utils.data import Dataset, DataLoader
# class GPTDatasetV1(Dataset):
# def __init__(self, txt, tokenizer, max_length, stride):
# self.input_ids = []
# self.target_ids = []
# token_ids = tokenizer.encode(txt) #1
# for i in range(0, len(token_ids) - max_length, stride): #2
# input_chunk = token_ids[i:i + max_length]
# target_chunk = token_ids[i + 1: i + max_length + 1]
# self.input_ids.append(torch.tensor(input_chunk))
# self.target_ids.append(torch.tensor(target_chunk))
# def __len__(self): #3
# return len(self.input_ids)
# def __getitem__(self, idx): #4
# return self.input_ids[idx], self.target_ids[idx]
using Flux
# The MLUtils package allows for creating a dataloader compatible with Flux.jl
using MLUtils
using BytePairEncoding
# using Transformers  # not needed: the tokenizer from BytePairEncoding.jl suffices, so we can skip Transformers.jl
# Define custom dataset
struct GPTDatasetV1
input_ids::Vector{Vector{Int}}
target_ids::Vector{Vector{Int}}
end
function GPTDatasetV1(txt, tokenizer; max_length, stride)
input_ids = Vector{Vector{Int}}()
target_ids = Vector{Vector{Int}}()
# Tokenize the input text
token_ids = tokenizer.encode(txt)
# Generate input and target chunks
for i in 1:stride:(length(token_ids)-max_length)
input_chunk = token_ids[i:i+max_length-1]
target_chunk = token_ids[i+1:i+max_length]
push!(input_ids, input_chunk)
push!(target_ids, target_chunk)
end
return GPTDatasetV1(input_ids, target_ids)
end
# Define the length function
Base.length(dataset::GPTDatasetV1) = length(dataset.input_ids)
# Define the indexing function
function Base.getindex(dataset::GPTDatasetV1, idx)
return dataset.input_ids[idx], dataset.target_ids[idx]
end
# Example usage:
tokenizer = BytePairEncoding.load_tiktoken_encoder("gpt2")
dataset = GPTDatasetV1(raw_text, tokenizer, max_length=128, stride=64)
data_loader = DataLoader(dataset, batchsize=16, shuffle=true, partial=false, parallel=false)
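# Inspect the first (input, target) pair: the target is the input shifted by one token
println("Number of (input, target) pairs: ", length(dataset))
first_input, first_target = dataset[1]
println("First input chunk:  ", first_input)
println("First target chunk: ", first_target)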
# def create_dataloader_v1(txt, batch_size=4, max_length=256,
# stride=128, shuffle=True, drop_last=True,
# num_workers=0):
# tokenizer = tiktoken.get_encoding("gpt2") #1
# dataset = GPTDatasetV1(txt, tokenizer, max_length, stride) #2
# dataloader = DataLoader(
# dataset,
# batch_size=batch_size,
# shuffle=shuffle,
# drop_last=drop_last, #3
# num_workers=num_workers #4
# )
# return dataloader
function create_dataloader_v1(txt; batch_size=4, max_length=6,
stride=4, shuffle=true, partial=false, parallel=false)
tokenizer = BytePairEncoding.load_tiktoken_encoder("gpt2")
dataset = GPTDatasetV1(txt, tokenizer, max_length=max_length, stride=stride)
return DataLoader(dataset, batchsize=batch_size, shuffle=shuffle, partial=partial, parallel=parallel)
end
raw_text = """
Lorem ipsum dolor sit amet, consectetur adipiscing elit. Morbi turpis diam, sollicitudin non pellentesque quis, lobortis tempor augue. Duis vestibulum id mi ut pellentesque. In hac habitasse platea dictumst. Praesent varius lectus neque, non mattis tortor vulputate ut. Fusce rhoncus elementum sodales. Quisque vel pellentesque leo. Quisque dapibus aliquam volutpat. Sed nec arcu sollicitudin, placerat mauris sed, pulvinar metus. Quisque volutpat est elit, vel tincidunt nibh congue quis. Vivamus eleifend luctus diam vel placerat. Praesent in velit mauris. Nulla non diam ac est mattis sagittis eget eu mauris.
Aliquam id mi vulputate, mattis erat in, sodales purus. Vivamus at purus eget ex commodo tristique ut ac magna. Cras efficitur dictum dui, eu efficitur ex aliquet eu. Donec vulputate odio eu sapien pharetra, vel mattis eros tristique. Nam vehicula mi non porta commodo. Pellentesque suscipit, sem nec ultricies accumsan, nulla enim eleifend lectus, eu malesuada diam neque et metus. Pellentesque auctor ultricies viverra. Quisque ut convallis est. Etiam mollis, odio eu molestie pellentesque, nulla libero feugiat ligula, sit amet lobortis est ante in justo. Aliquam erat volutpat. Donec sagittis gravida est. Sed rutrum sem a commodo convallis. Vestibulum non convallis nisi. Sed dignissim metus eget lacus viverra ultrices. Sed et dolor ut neque mollis posuere. Quisque ac molestie leo.
Phasellus porttitor hendrerit mauris a hendrerit. Ut eget rutrum ex. Nam fermentum semper ex, eu dignissim erat suscipit eu. Curabitur condimentum efficitur tincidunt. Nullam lacus ante, faucibus vitae feugiat vel, elementum sit amet dolor. Nulla facilisi. Nam laoreet, mauris sodales ornare pretium, enim est vehicula diam, at semper nisl tortor ut dolor. Proin ullamcorper ex at aliquam pulvinar. Aenean congue aliquam sagittis. Proin efficitur condimentum sapien at pretium. Etiam laoreet, lectus nec faucibus cursus, nisi nulla fermentum metus, sed aliquet ante lacus at turpis. Etiam nec arcu est.
"""
dataloaderV0 = create_dataloader_v1(raw_text)
# Iterating over data_loader
for (input_chunk, target_chunk) in dataloaderV0
println("Input: ========================", input_chunk)
println("Target: ========================", target_chunk)
end
# 2.7 Creating Token Embeddings
using Random
using Flux
# token ids and defining an embedding layer
# input_ids = torch.tensor([2, 3, 5, 1])
# vocab_size = 6
# output_dim = 3
# token ids and defining an embedding layer
input_ids = [2, 3, 5, 1]
vocab_size = 30 # replace with your vocabulary size (the book's example uses 6)
output_dim = 20 # replace with your desired output dimension (the book's example uses 3)
# Set random seed
# torch.manual_seed(123)
# embedding_layer = torch.nn.Embedding(vocab_size, output_dim)
# Print the embedding weights
# print(embedding_layer.weight)
# Get the embedding for a specific token (0-based indexing)
# print(embedding_layer(torch.tensor([3])))
# Set random seed
Random.seed!(123)
embedding_layer = Flux.Embedding(vocab_size => output_dim)
# Print the embedding weights
println(embedding_layer.weight)
# Get the embedding for a specific token
println(embedding_layer.weight[:, 3]) # column 3 holds the embedding of token id 3
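# An embedding lookup is just a column slice of the weight matrix:
@assert embedding_layer([3])[:, 1] == embedding_layer.weight[:, 3]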
# 2.8 Encoding word positions
using Flux
using Random
# Define the vocabulary size and output dimension
# vocab_size = 50257
# output_dim = 256
vocab_size = 50257
output_dim = 256
# Token embedding layer
# token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)
# Token embedding layer
token_embedding_layer = Flux.Embedding(vocab_size => output_dim)
# Maximum sequence length
# max_length = 4
# Maximum sequence length
max_length = 4
# Example data (replace `raw_text` with actual input text)
# dataloader = create_dataloader_v1(
# raw_text, batch_size=8, max_length=max_length,
# stride=max_length, shuffle=False
# )
batch_size = 8
dataloader = create_dataloader_v1(raw_text, batch_size=batch_size, max_length=max_length, stride=max_length, shuffle=false)
# Simulate loading the data
# data_iter = iter(dataloader)
# inputs, targets = next(data_iter)
# print("Token IDs:\n", inputs)
# print("\nInputs shape:\\n", inputs.shape)
# Simulate loading the data
inputs, targets = first(dataloader)
println("Token IDs:\n", inputs)
println("Targets:\n", targets)
# Generate token embeddings
# token_embeddings = token_embedding_layer(inputs)
# print(token_embeddings.shape)
# torch.Size([8, 4, 256])
# Generate token embeddings
# Stack the batch of token-id vectors into a (batch, seq) integer matrix
reduced_inputs = permutedims(reduce(hcat, inputs))
# Embedding returns (features, batch, seq); permute to PyTorch's (batch, seq, features)
token_embeddings = token_embedding_layer(reduced_inputs)
token_embeddings = permutedims(token_embeddings, (2, 3, 1))
println(size(token_embeddings)) # Output: (8, 4, 256)
# context_length = max_length
# pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)
# pos_embeddings = pos_embedding_layer(torch.arange(context_length))
# print(pos_embeddings.shape)
# Positional embedding layer
pos_embedding_layer = Flux.Embedding(max_length => output_dim)
pos_embeddings = pos_embedding_layer(collect(1:max_length))
println(size(pos_embeddings)) # (256, 4)
# Transpose to (seq, features) before reshaping so positions and features stay aligned
pos_embeddings = reshape(permutedims(pos_embeddings), 1, max_length, output_dim) # (1, 4, 256)
# No repeat needed: broadcasting below expands the singleton batch dimension to 8
# Add token and positional embeddings
# input_embeddings = token_embeddings + pos_embeddings
# print(input_embeddings.shape)
# torch.Size([8, 4, 256])
input_embeddings = token_embeddings .+ pos_embeddings # Resulting shape: (8, 4, 256)
println(size(input_embeddings)) # Output: (8, 4, 256)