forked from karpathy/ng-video-lecture
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathamazon_data_loader.py
32 lines (27 loc) · 913 Bytes
/
amazon_data_loader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
import pandas as pd
# Sentinel that marks the beginning and the end of every cleaned review.
_EOS = '^'

# Characters kept in a review body. Built once at import time instead of on
# every call. The sentinel is removed from the set so that it can only ever
# appear as a review boundary, never inside the review text itself.
_ALLOWED_CHARS = frozenset(
    "abcdefghijklmnopqrstuvwxyz"
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "0123456789"
    " ,.!?()[]{};:'\"-+*/=@#$%^&_|<>`~"
) - {_EOS}


def clean_dataset(text):
    """Reduce a review to a fixed character set and wrap it in sentinels.

    Every character of ``text`` outside the allowed set (ASCII letters,
    digits, space and common punctuation) is dropped. That includes line
    breaks, tabs and non-ASCII characters. The sentinel character is also
    dropped from the body so that it stays unambiguous as a boundary marker.

    Args:
        text: The raw review string.

    Returns:
        The sentinel, the filtered text, a single line-feed character and
        the sentinel again, concatenated into one string.
    """
    body = ''.join(c for c in text if c in _ALLOWED_CHARS)
    # add next line character and EOS char to every review
    return f"{_EOS}{body}\n{_EOS}"
def load_amazon_dataset(path='Reviews.csv'):
    """Load the Amazon Fine Food Reviews and return the cleaned review texts.

    Dataset:
    https://www.kaggle.com/datasets/snap/amazon-fine-food-reviews/data

    Args:
        path: Location of the reviews CSV. Defaults to ``'Reviews.csv'``,
            the file name that was previously hard-coded.

    Returns:
        A list with one ``clean_dataset`` result per review that has text.
    """
    df = pd.read_csv(path)
    # Empty cells are read as float NaN, which clean_dataset cannot iterate
    # over; skip those rows instead of failing with a TypeError.
    reviews = df['Text'].dropna().tolist()
    return [clean_dataset(review) for review in reviews]
# Disabled snippet, kept as a bare string literal so it never executes.
# It tallies how often each character of `chars` occurs across the cleaned
# reviews and prints one "'<char>': <count>" line per character; characters
# that are not keys of `char_freq` are ignored.
# NOTE(review): `chars` is not defined anywhere in the visible code - it
# would have to be supplied before this snippet can be re-enabled.
"""
# char frequency stats
char_freq = {char: 0 for char in chars}
for review in reviews_cleanup:
for char in review:
if char in char_freq:
char_freq[char] += 1
for char, freq in char_freq.items():
print(f"'{char}': {freq}")
"""