-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrecommendationsystem.py
65 lines (48 loc) · 2.3 KB
/
recommendationsystem.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sqlalchemy import create_engine
from eunjeon import Mecab
import pandas as pd
import numpy as np
import re
# Module-level SQLAlchemy engine used by the query code in this file; it is
# created once, at import time.
# NOTE(review): the URL holds placeholder values (user, password, host, port,
# databasename) - presumably replaced with real settings at deploy time;
# confirm, and prefer loading credentials from configuration over source.
engine = create_engine('mysql+pymysql://user:password@host:port/databasename')
# Tokens removed from tokenized titles during preprocessing (checked with
# `token not in stopwords`). They look like common Korean particles/endings.
stopwords = ['의', '가', '이', '은', '들', '는', '좀', '잘', '걍', '과', '도', '를', '으로', '자', '에', '와', '한', '하다', "내", "네", "니다"]
def __Preprocessing(title: list) -> list:
    """Turn raw post titles into space-joined noun strings.

    Each title is stripped of every character that is not Hangul or an
    uppercase ASCII letter, tokenized into nouns with Mecab, and filtered so
    that stopwords and one-character tokens are dropped.

    Args:
        title: Iterable of title strings.

    Returns:
        A list with one string per input title, in the same order. A title
        with no surviving tokens yields an empty string.
    """
    # Keep only Hangul (compatibility jamo and syllables) and A-Z. Everything
    # else, including whitespace, is removed before tokenizing.
    # NOTE(review): lowercase ASCII letters and digits are stripped as well;
    # confirm that this is intended.
    disallowed = re.compile('[^ㄱ-ㅣ가-힣A-Z]')
    cleaned_titles = [disallowed.sub('', t) for t in title]

    mecab = Mecab()
    processed_data = []
    for text in cleaned_titles:
        nouns = mecab.nouns(text)
        # Drop stopwords and one-character tokens. The length check must be
        # applied to each token, not to the token list: checking the list
        # length would discard every single-noun title and never filter
        # short tokens.
        kept = [
            token for token in nouns
            if token not in stopwords and len(token) > 1
        ]
        processed_data.append(" ".join(kept))
    return processed_data
def Recomendation(post_id: int) -> np.ndarray:
    """Return ids of recent posts whose titles are similar to a given post.

    Titles of posts from the last week are preprocessed, vectorized with
    TF-IDF and compared to the title of ``post_id`` by cosine similarity.

    Args:
        post_id: Id of the post to find recommendations for.

    Returns:
        A NumPy array of post ids with similarity greater than zero, sorted
        from most to least similar and excluding ``post_id`` itself. The
        array is empty when there are no posts in the last week, when
        ``post_id`` is not among them, or when no title yields any usable
        term.
    """
    # Imported locally so this block does not depend on a file-level import.
    from sqlalchemy import text

    # Posts created within the last week.
    query = 'select post.post_id, post.title from post where post.posted_datetime > (now() - interval 1 week);'
    # Use an explicit connection: Engine.execute() no longer exists in
    # SQLAlchemy 2.0, while connect() + text() works on 1.x and 2.x alike.
    with engine.connect() as connection:
        rows = connection.execute(text(query)).fetchall()
    dataframe = pd.DataFrame([tuple(row) for row in rows], columns=['postId', 'title'])

    # Locate the target post by its actual id. Post ids are not guaranteed to
    # be contiguous or ordered, so "post_id - first_id" is not a safe index.
    post_ids = dataframe['postId'].tolist()
    if post_id not in post_ids:
        return np.array([])
    target_pos = post_ids.index(post_id)

    # Title preprocessing.
    processing_data = __Preprocessing(dataframe['title'].tolist())

    # TF-IDF. fit_transform raises ValueError when no title contributes any
    # term (empty vocabulary); that simply means nothing can be recommended.
    try:
        tfidf_matrix = TfidfVectorizer().fit_transform(processing_data)
    except ValueError:
        return np.array([])

    # Only the target row of the similarity matrix is needed.
    similarities = cosine_similarity(tfidf_matrix[target_pos], tfidf_matrix)[0]

    # Map each position back to its real post id, skipping the post itself
    # and posts that share no terms with it.
    scored = [
        (post_ids[pos], score)
        for pos, score in enumerate(similarities)
        if score > 0 and pos != target_pos
    ]
    # Stable sort: equally similar posts keep their query order.
    scored.sort(key=lambda pair: pair[1], reverse=True)

    return np.array([similar_id for similar_id, _ in scored])