Movie.py
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import sigmoid_kernel
from sklearn.metrics import average_precision_score
# Load movie credits and details datasets
movie_credits = pd.read_csv(r"C:\Users\HP\Documents\Data Science Project\Movie recommendation system\tmdb_5000_credits.csv")
movie_details = pd.read_csv(r"C:\Users\HP\Documents\Data Science Project\Movie recommendation system\tmdb_5000_movies.csv")
# Rename the 'movie_id' column in the credits dataset
credits_renamed = movie_credits.rename(columns={"movie_id": "id"})
# Merge movie details and credits datasets on the 'id' column
merged_data = movie_details.merge(credits_renamed, on='id')
# Remove unnecessary columns from the merged dataset
cleaned_data = merged_data.drop(columns=['homepage', 'title_x', 'title_y', 'status', 'production_countries'])
# Fill missing values in the 'overview' column with an empty string
cleaned_data['overview'] = cleaned_data['overview'].fillna('')
# Create a TF-IDF vectorizer for text processing
tfidf_vectorizer = TfidfVectorizer(
    min_df=3,
    max_features=None,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 3),
    stop_words='english'
)
# Fit the TF-IDF vectorizer on the 'overview' text and transform it into a TF-IDF matrix
tfidf_matrix = tfidf_vectorizer.fit_transform(cleaned_data['overview'])
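# A quick sanity check one might run here (not part of the original script): the fitted result
# is a sparse matrix with one row per overview and one column per retained n-gram.
# print(tfidf_matrix.shape)                              # (n_movies, vocabulary_size)
# print(tfidf_vectorizer.get_feature_names_out()[:10])   # sample of the learned vocabulary (sklearn >= 1.0)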
# Compute the sigmoid kernel for the TF-IDF matrix
sigmoid_scores = sigmoid_kernel(tfidf_matrix, tfidf_matrix)
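# Note: sklearn's sigmoid_kernel computes tanh(gamma * <x, y> + coef0) for every pair of rows,
# so sigmoid_scores[i, j] is a pairwise similarity between overview i and overview j.
# A common alternative (not used here) is cosine similarity on the same TF-IDF matrix, e.g.:
# from sklearn.metrics.pairwise import cosine_similarity
# cosine_scores = cosine_similarity(tfidf_matrix, tfidf_matrix)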
# Create a mapping of movie titles to their indices in the cleaned dataset
movie_indices = pd.Series(cleaned_data.index, index=cleaned_data['original_title']).drop_duplicates()
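# Example lookup (illustrative only; 'Avatar' is assumed to be one of the titles in the dataset):
# movie_indices['Avatar']  # -> integer row position used below to index into sigmoid_scores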
# Define a function to provide movie recommendations
def get_movie_recommendations(title, sigmoid_scores=sigmoid_scores):
    # Get the index corresponding to the original_title
    idx = movie_indices[title]
    # Get the pairwise similarity scores against every other movie
    similarity_scores = list(enumerate(sigmoid_scores[idx]))
    # Sort the movies by similarity score, highest first
    sorted_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
    # Take the 10 most similar movies, skipping position 0 (the queried movie itself)
    top_scores = sorted_scores[1:11]
    # Get the row positions of the recommended movies
    recommended_movie_indices = [i[0] for i in top_scores]
    # Return the top 10 most similar movie titles
    return cleaned_data['original_title'].iloc[recommended_movie_indices]
# Take user input for movie name and provide recommendations
user_movie_name = input("Enter movie name here: ")
print(get_movie_recommendations(user_movie_name))
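# Optional guard (a minimal sketch, not in the original script): titles are matched exactly
# against 'original_title', so an unknown or misspelled name raises a KeyError. One way to
# fail more gracefully:
# if user_movie_name in movie_indices:
#     print(get_movie_recommendations(user_movie_name))
# else:
#     print(f"'{user_movie_name}' was not found in the dataset.")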
ground_truth_indices = np.array([...]) # Replace with actual ground truth movie indices for the user
recommended_indices_1 = np.array([...]) # Replace with actual predicted recommended movie indices (recommendation 1)
recommended_indices_2 = np.array([...]) # Replace with actual predicted recommended movie indices (recommendation 2)
# Creating binary relevance arrays
true_relevance = np.isin(recommended_indices_1, ground_truth_indices).astype(int)
predicted_relevance = np.ones_like(true_relevance) # Assuming all recommended items are relevant
# Calculate Average Precision (AP) for the recommendation
average_precision = average_precision_score(true_relevance, predicted_relevance)
print("Average Precision:", average_precision)
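# Alternative metric (a sketch using the same placeholder arrays above): with identical scores
# for every item, average_precision_score reduces to the fraction of recommendations that are
# relevant, so a plain precision@k conveys the same idea more directly.
def precision_at_k(recommended, relevant, k=10):
    """Fraction of the first k recommended indices that appear in the relevant set."""
    top_k = np.asarray(recommended)[:k]
    return float(np.isin(top_k, relevant).mean()) if top_k.size else 0.0

# Example (only meaningful once the placeholder arrays are filled in):
# print("Precision@10:", precision_at_k(recommended_indices_1, ground_truth_indices))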