preprocess.py
import pandas as pd
import re
import spacy
import nltk
import pickle
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter
# Don't use word_tokenize
def extract_words(ingredients):
    word_pattern = r'\b\w+\b'
    # list of strings
    words_list = re.findall(word_pattern, ingredients.lower())
    # convert it to a string
    words_str = ' '.join(words_list)
    return words_str
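# Quick illustration with a hypothetical ingredient string (not from the
# dataset): punctuation is dropped and hyphenated words are split:
#   extract_words('2 Tbsp. extra-virgin olive oil')
#   -> '2 tbsp extra virgin olive oil'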
# Super slow, but super accurate!
# Download necessary spaCy resource (run once if the model is missing)
# spacy.cli.download("en_core_web_sm")
# Load the English NLP model once at module level; reloading it inside the
# function for every row is the main reason this step is slow.
nlp = spacy.load("en_core_web_sm")

def lemmatization(ingredients):
    # Process the text
    doc = nlp(ingredients)
    # Lemmatize each token and remove duplicates (set() also discards word order)
    lemmatized_words_list = list(set(token.lemma_ for token in doc))
    lemmatized_words_str = ' '.join(lemmatized_words_list)
    return lemmatized_words_str
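# Rough example with a hypothetical input (exact lemmas depend on the spaCy
# model and its POS tagging, and set() makes the output order arbitrary):
#   lemmatization('chopped onions sliced tomatoes')
#   -> something like 'chop onion slice tomato', in no particular order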
# def lemmatization(ingredients):
#     # Download necessary NLTK resource
#     # nltk.download('wordnet')
#     lemmatizer = WordNetLemmatizer()
#     words_list = word_tokenize(ingredients)
#     # Lemmatize each token and remove duplicates!
#     lemmatized_words_list = list(set([lemmatizer.lemmatize(word) for word in words_list]))
#     lemmatized_words_str = ' '.join(lemmatized_words_list)
#     return lemmatized_words_str
def remove_stop_words(ingredients):
    # Download necessary NLTK resources
    # nltk.download('punkt')
    # nltk.download('stopwords')
    # Load English stopwords
    stop_words = set(stopwords.words('english'))
    # Tokenize the string into words; this is similar to what we did in extract_words
    words_list = word_tokenize(ingredients)
    filtered_words_list = [word for word in words_list if word not in stop_words]
    filtered_words_str = ' '.join(filtered_words_list)
    return filtered_words_str
def remove_numbers(ingredients):
    integer_float_pattern = r'\b\d*\.?\d+\b'
    unicode_fraction_pattern = r'\b\d*[\u00BC-\u00BE\u2150-\u215E]\b'
    combined_pattern = f'{integer_float_pattern}|{unicode_fraction_pattern}'
    new_ingredients = re.sub(combined_pattern, '', ingredients)
    # remove extra space
    new_ingredients = ' '.join(new_ingredients.split())
    return new_ingredients
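# Quick check on a hypothetical string (not from the dataset): integers,
# decimals, and unicode fractions are stripped while ordinary words are kept:
#   remove_numbers('2 cups flour 0.5 lemon 1½ sticks butter')
#   -> 'cups flour lemon sticks butter'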
def remove_cooking_metrics(ingredients):
    cooking_metrics = ['ml', 'milliliter', 'millilitre', 'cc', 'l', 'liter', 'litre', 'dl', 'deciliter', 'decilitre',
                       'teaspoon', 't', 'tsp', 'tablespoon', 'tbl', 'tbs', 'tbsp', 'fluid ounce', 'fl oz', 'gill', 'cup', 'c', 'pint', 'p', 'pt', 'fl pt', 'quart', 'q', 'qt', 'fl qt', 'gallon', 'g', 'gal',
                       'mg', 'milligram', 'milligramme', 'g', 'gram', 'gramme', 'kg', 'kilogram', 'kilogramme', 'pound', 'lb', 'ounce', 'oz',
                       'mm', 'millimeter', 'millimetre', 'cm', 'centimeter', 'centimetre', 'm', 'meter', 'metre', 'inch', 'in', 'yard',
                       'milli', 'centi', 'deci', 'hecto', 'kilo']
    cooking_metrics_pattern = r'\b(' + '|'.join(re.escape(metric) for metric in cooking_metrics) + r')\b'
    new_ingredients = re.sub(cooking_metrics_pattern, '', ingredients)
    new_ingredients = ' '.join(new_ingredients.split())
    return new_ingredients
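# The metric list includes single-letter abbreviations such as 'c', 't', 'l',
# and 'g', so any standalone occurrence of those letters is removed as well.
# Hypothetical example:
#   remove_cooking_metrics('2 c sugar 1 t vanilla extract')
#   -> '2 sugar 1 vanilla extract'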
def data_cleaning_1(ingredients):
    return remove_cooking_metrics(remove_numbers(remove_stop_words(lemmatization(extract_words(ingredients)))))
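# The first cleaning pass applies the steps innermost-first: extract_words,
# then lemmatization, then stop-word, number, and cooking-metric removal.
# Because lemmatization() deduplicates tokens with set(), the word order of
# the cleaned string is not meaningful.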
def find_common_items(df):
    string_list = []
    for ingredients in df['Cleaned_Ingredients']:
        string_list.append(ingredients)
    # combine list of strings into a single string
    total_string = ' '.join(string_list)
    words_list = re.findall(r'\b\w+\b', total_string)
    return Counter(words_list).most_common(50)
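# Top-50 token counts from find_common_items over the cleaned data, under each
# lemmatizer; these counts informed the hand-picked common_items list below.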
# NLTK lemmatization
# [('salt', 9002), ('oil', 7387), ('fresh', 6356), ('chopped', 6340), ('large', 5674), ('pepper', 5374), ('olive', 5007), ('ground', 4925), ('sugar', 4816), ('kosher', 4775), ('butter', 4371), ('garlic', 4195), ('cut', 3880), ('black', 3842), ('clove', 3831), ('finely', 3822), ('juice', 3820), ('sliced', 3745), ('plus', 3650), ('unsalted', 3546), ('freshly', 3452), ('egg', 3442), ('leaf', 3336), ('onion', 3217), ('lemon', 3201), ('grated', 3165), ('white', 3121), ('red', 3092), ('peeled', 2917), ('flour', 2855), ('divided', 2851), ('whole', 2570), ('thinly', 2497), ('piece', 2491), ('extra', 2477), ('vegetable', 2419), ('cream', 2410), ('stick', 2390), ('water', 2359), ('purpose', 2356), ('virgin', 2327), ('vinegar', 2315), ('medium', 2263), ('small', 2233), ('dried', 1923), ('milk', 1906), ('wine', 1877), ('powder', 1876), ('green', 1855), ('halved', 1821)]
# spaCy lemmatization
# [('salt', 9074), ('oil', 7389), ('chop', 6410), ('fresh', 6357), ('large', 5698), ('pepper', 5375), ('olive', 5007), ('sugar', 4816), ('kosher', 4775), ('slice', 4599), ('butter', 4375), ('garlic', 4195), ('ground', 4056), ('cut', 3882), ('black', 3842), ('juice', 3837), ('clove', 3831), ('finely', 3822), ('plus', 3650), ('freshly', 3452), ('egg', 3442), ('unsalted', 3378), ('peel', 3359), ('onion', 3217), ('lemon', 3201), ('dry', 3140), ('white', 3121), ('red', 3092), ('flour', 2856), ('divide', 2832), ('leave', 2826), ('whole', 2570), ('thinly', 2497), ('piece', 2491), ('extra', 2476), ('vegetable', 2419), ('cream', 2413), ('stick', 2391), ('water', 2359), ('purpose', 2356), ('virgin', 2327), ('vinegar', 2315), ('medium', 2263), ('small', 2249), ('seed', 2204), ('powder', 2039), ('grate', 2037), ('milk', 1906), ('wine', 1877), ('halve', 1861)]
def remove_common_items(ingredients):
    common_items = ['salt', 'oil', 'chop', 'fresh', 'large', 'sugar', 'kosher', 'slice', 'ground', 'cut', 'black', 'juice', 'clove', 'finely', 'plus', 'freshly', 'unsalted', 'peel', 'dry', 'white', 'red', 'divide', 'leave', 'whole', 'thinly', 'piece', 'extra', 'vegetable', 'stick', 'water', 'purpose', 'virgin', 'medium', 'small', 'seed', 'powder', 'grate', 'halve']
    common_items_pattern = r'\b(' + '|'.join(re.escape(item) for item in common_items) + r')\b'
    new_ingredients = re.sub(common_items_pattern, '', ingredients)
    new_ingredients = ' '.join(new_ingredients.split())
    return new_ingredients
def data_cleaning_2(ingredients):
    return remove_common_items(ingredients)
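# Hypothetical example of the second pass: very frequent, low-information
# words are dropped:
#   data_cleaning_2('kosher salt olive oil garlic clove')  ->  'olive garlic'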
if __name__ == "__main__":
    data = pd.read_csv('archive/data.csv')
    data.fillna("", inplace=True)
    # drop the first column and the dataset's original Cleaned_Ingredients
    # column, which is recomputed below
    data.drop(columns=[data.columns[0], 'Cleaned_Ingredients'], inplace=True)
    # every recipe starts with a default Rating of 50
    data['Rating'] = 50
    data['Cleaned_Ingredients'] = data['Ingredients'].apply(data_cleaning_1)
    # print(find_common_items(data))
    data['Cleaned_Ingredients'] = data['Cleaned_Ingredients'].apply(data_cleaning_2)
    # write the cleaned DataFrame to every location that consumes it,
    # closing each file handle properly
    for path in ('data.pkl', 'archive/data.pkl', 'Evaluation/data.pkl'):
        with open(path, 'wb') as f:
            pickle.dump(data, f)
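# Elsewhere in the project (the exact consumers are an assumption here), the
# cleaned DataFrame can be loaded back with pickle.load or with pandas:
#   loaded = pd.read_pickle('data.pkl')
#   print(loaded[['Ingredients', 'Cleaned_Ingredients', 'Rating']].head())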