forked from karpathy/ng-video-lecture
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata_loader.py
25 lines (21 loc) · 1009 Bytes
/
data_loader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
import pandas as pd
# Get the character vocabulary and the special begin/end-of-sequence (EOS) character. The vocabulary is taken from Amazon product reviews.
def get_vocab():
    """Return the character vocabulary and the sequence-boundary token.

    Returns:
        A ``(chars, EOS)`` tuple. ``chars`` is a string holding every
        character kept by the cleaning step: the printable ASCII range
        from space through ``~`` (the backslash is not included) followed
        by the newline character. ``EOS`` is the single character used as
        the begin and end token.

    Note:
        ``'^'`` is itself one of the characters in ``chars``.
    """
    eos_token = '^'  # marks both the beginning and the end of a sequence
    vocabulary = ''' !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[]^_`abcdefghijklmnopqrstuvwxyz{|}~\n'''
    return vocabulary, eos_token
def clean_dataset(text):
    """Strip out-of-vocabulary characters from ``text`` and wrap it in EOS tokens.

    Args:
        text: The raw text to clean; iterated character by character.

    Returns:
        A string of the form ``EOS + filtered_text + "\\n" + EOS``, where
        ``filtered_text`` keeps only the characters of ``text`` that occur
        in the vocabulary returned by ``get_vocab()``.
    """
    vocabulary, eos_token = get_vocab()
    # Keep only characters that belong to the vocabulary, preserving order.
    kept = [ch for ch in text if ch in vocabulary]
    # A newline plus the EOS token closes every cleaned text; EOS also opens it.
    return eos_token + ''.join(kept) + '\n' + eos_token
def load_shakespeare_dataset():
    """Read the Tiny Shakespeare corpus from disk and return it cleaned.

    The file ``tinyshakespeare_input.txt`` is expected in the current
    working directory. It can be fetched with:

        wget -O tinyshakespeare_input.txt https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

    Returns:
        The whole corpus as one string, passed through ``clean_dataset``.
    """
    corpus_path = 'tinyshakespeare_input.txt'
    with open(corpus_path, encoding='utf-8') as corpus_file:
        raw_corpus = corpus_file.read()
    return clean_dataset(raw_corpus)
def load_amazon_dataset():
    """Load the Amazon Fine Food Reviews texts and return them cleaned.

    Reads ``Reviews.csv`` from the current working directory. The dataset
    is available at
    https://www.kaggle.com/datasets/snap/amazon-fine-food-reviews/data

    Returns:
        A list with one entry per non-missing review, each being the
        review's ``Text`` value passed through ``clean_dataset``.
    """
    # Only the review body is used, so skip parsing the other columns.
    df = pd.read_csv('Reviews.csv', usecols=['Text'])
    # Empty CSV fields are read as NaN (a float), which cannot be iterated
    # character by character; drop those rows instead of crashing.
    reviews = df['Text'].dropna().tolist()
    return [clean_dataset(review) for review in reviews]