Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/age gap analysis #23

Open
wants to merge 22 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 19 commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
e4601e6
test
Lokhia May 29, 2022
b571425
Add .idea directory to gitignore
Lokhia May 29, 2022
c38d073
Remove toto test file
Lokhia May 29, 2022
f6a0062
Update dependencies
Lokhia May 29, 2022
cf863ed
First histogram plotted on streamlit
Lokhia May 31, 2022
fa119b6
Visualization of age gap according to the gender of the oldest
Lokhia May 31, 2022
bab4e41
Condensed histogram of age gap and file cleaning
Lokhia Jun 1, 2022
092bf4b
Histogram make-up (DFG colors, template)
Lokhia Jun 2, 2022
8ccc1ce
Plotly template changed
Lokhia Jun 3, 2022
a130a2e
Integration of Juliana's notebook into python file
Lokhia Jun 6, 2022
997ef5e
function to get images from TMDB for person id
jupeg Jun 6, 2022
c25cf06
compute scores for relationships from movie plot
jupeg Jun 6, 2022
1e8acab
first draft for visualisation/correction of couples detection
jupeg Jun 6, 2022
567a8ed
print couple details and correct characters order
jupeg Jun 10, 2022
44402af
also consider wikipedia Cast section for Q&A
jupeg Jun 10, 2022
409cef7
plot characters relationships with pyvis
jupeg Jun 10, 2022
e6bb03c
add actors and actresses age at release
jupeg Jun 10, 2022
a04f1c9
plot age gaps for couples in streamlit
jupeg Jun 12, 2022
b57ff19
combine relationship visualisation in the same streamlit
jupeg Jun 14, 2022
e71528a
some minors modif in Jupyter
Lokhia Jan 18, 2023
95fb5dc
Merge branch 'feature/age_gap_analysis' of https://github.com/datafor…
Lokhia Jan 18, 2023
afcee48
Pyproject update to python 11
Lokhia Feb 8, 2023
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -129,4 +129,7 @@ dmypy.json
.pyre/
.vscode
.DS_Store
*.csv
*.csv

# IDE
.idea/**
60 changes: 60 additions & 0 deletions bechdelai/data/tmdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from os import environ

import pandas as pd
import numpy as np
from dotenv import load_dotenv

from bechdelai.data.scrap import get_json_from_url
Expand Down Expand Up @@ -37,6 +38,7 @@ class APIKeyNotSetInEnv(Exception):
# URL templates for the TMDB API. Each one is an f-string that bakes in
# API_URL and API_KEY immediately; the doubled braces (e.g. ``{{movie_id}}``)
# survive as a literal ``{movie_id}`` placeholder that is filled in later
# with ``str.format``.
MOVIE_API_URL = f"{API_URL}/movie/{{movie_id}}?api_key={API_KEY}"
CAST_API_URL = f"{API_URL}/movie/{{movie_id}}/credits?api_key={API_KEY}"
PERSON_API_URL = f"{API_URL}/person/{{person_id}}?api_key={API_KEY}"
# Images endpoint for a person, used by get_person_image_from_id().
PERSON_IMG_API_URL = f"{API_URL}/person/{{person_id}}/images?api_key={API_KEY}"
# NOTE(review): the path below contains a double slash ("//find") and
# hard-codes the "tt" prefix in front of the id -- confirm both are intended.
SEARCH_IMDB_URL = (
f"{API_URL}//find/tt{{imdb_id}}?api_key={API_KEY}&external_source=imdb_id"
)
Expand Down Expand Up @@ -135,6 +137,18 @@ def get_person_details_from_id(person_id) -> dict:

return get_json_from_url(url)

def get_person_image_from_id(person_id) -> dict:
    """Query the TMDB API for the images attached to a person.

    Parameters
    ----------
    person_id : str or int
        TMDB id of the person whose images are requested

    Returns
    -------
    dict
        Whatever ``get_json_from_url`` returns for the person images URL
    """
    # The id is converted to text before being substituted in the template.
    return get_json_from_url(PERSON_IMG_API_URL.format(person_id=str(person_id)))

def format_results_for_suggestion(search_res: dict) -> list:
"""Format search movie results for `show_movie_suggestions()`

Expand Down Expand Up @@ -224,3 +238,49 @@ def get_movies_from_ids(movie_ids: list) -> tuple:
cast_df = pd.concat(cast_df)

return movies_df, crew_df, cast_df

def get_best_tmdb_id(title, release_year=None):
    """Get the most probable TMDB id for a movie title and release year.

    The release date stored in TMDB may differ from ``release_year``, so the
    candidate whose release year is closest to it is chosen.

    Parameters
    ----------
    title : str
        Movie title
    release_year : int, optional
        Year the movie was released. When None, the first search result is
        returned as is.

    Returns
    -------
    int or None
        TMDB id of the best match. None when the search returns no result,
        or when none of the inspected candidates has a usable release date.
    """
    movie_candidates = search_movie_from_query(title)
    if movie_candidates["total_results"] == 0:
        # Movie not found in TMDB with this query
        return None

    results = movie_candidates["results"]
    if not results:
        return None

    if release_year is None:
        # Nothing to compare against: trust the search ranking.
        return results[0]["id"]

    best_id = None
    smallest_gap = float("inf")
    # Only the first 5 matches are inspected; keep the one released closest
    # to release_year.
    for res in results[:5]:
        # A missing or null release date becomes "", which fails the int()
        # parse below and is skipped like any other malformed date.
        release_date = res.get("release_date") or ""
        try:
            res_release_year = int(release_date[:4])
        except ValueError:
            continue

        gap = abs(res_release_year - release_year)
        if gap < smallest_gap:
            best_id = res["id"]
            smallest_gap = gap
            if gap == 0:
                # Exact year match: no later candidate can do better.
                break

    return best_id
4 changes: 2 additions & 2 deletions bechdelai/data/wikipedia.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@
from bs4 import BeautifulSoup
import re
import outputformat as ouf
import wikipediaapi
from bechdelai.data.scrap import get_json_from_url
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Utilise plutôt ici from .scrap import get_json_from_url en import relatif, ça permet d'éviter des bugs

# import wikipediaapi
# from bechdelai.data.scrap import get_json_from_url

def get_sections(query, lang="en"):
"""Return all sections and subsections in the page and their corresponding indexes
Expand Down
Empty file added notebooks/age_gap/__init__.py
Empty file.
87 changes: 87 additions & 0 deletions notebooks/age_gap/age_gap_automation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
import sys
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Qu'est-ce que fait ce script ? Il permet d'aller sauvegarder des données ?
Est-ce qu'il faut le mettre dans la librairie ?

sys.path.append("../..")
import bechdelai.data.wikipedia as wiki
import bechdelai.data.tmdb as tmdb
import process_couples as pc
import outputformat as ouf
import pandas as pd
from datetime import datetime
import requests
import io
import spacy
from spacy import displacy
from spacy.matcher import Matcher
from spacy.tokens import Span
from spacy.matcher import PhraseMatcher
from pathlib import Path


class Movie:
    """A film with its Wikipedia plot and its cast.

    Everything is fetched eagerly at construction time, so creating an
    instance calls the ``wiki``, ``pc`` and ``tmdb`` helpers.

    Parameters
    ----------
    title : str
        Movie title
    release_year : int, optional
        Year the movie was released; used to disambiguate the Wikipedia
        query and the TMDB search
    """

    def __init__(self, title, release_year=None):
        self.title = title
        self.release_year = release_year
        # Order matters: get_cast_tmdb() reads self.cast_wiki.
        self.plot = self.get_plot()
        self.cast_wiki = self.get_cast_wiki()
        self.cast = self.get_cast_tmdb()

    def __repr__(self):
        return self.__str__()

    def __str__(self):
        return "Film : {}".format(self.title)

    def get_plot(self):
        """Return the 'Plot' section of the movie's Wikipedia page.

        Page titles are tried from the most to the least specific:
        "<title> (<year> film)", "<title> (film)", then "<title>". The
        year-qualified query is skipped when no release year is known.

        Returns
        -------
        The value stored under 'Plot' by ``wiki.get_section_text``, or None
        when every query raised ValueError.
        """
        query_suffixes = []
        if self.release_year is not None:
            query_suffixes.append(" ({} film)".format(self.release_year))
        query_suffixes += [" (film)", ""]

        for query_suffix in query_suffixes:
            try:
                return wiki.get_section_text(self.title + query_suffix, ["Plot"])["Plot"]  # to improve
            except ValueError:
                continue
        return None

    def get_cast_wiki(self):
        """Return the cast found by ``pc.get_cast_from_wikipedia`` for this movie."""
        return pc.get_cast_from_wikipedia(self.title, self.release_year)

    def get_cast_tmdb(self):
        """Build the cast table from TMDB, corrected with the Wikipedia cast.

        Returns
        -------
        The table produced by ``pc.correct_cast_with_wikipedia``, with
        double quotes replaced by single quotes, non-ASCII characters
        dropped from the 'name' and 'character' columns, and an
        'age_at_release' column added.

        Raises
        ------
        ValueError
            If no TMDB id could be found for the movie.
        """
        movie_id = tmdb.get_best_tmdb_id(self.title, self.release_year)
        if movie_id in (None, ""):
            # Fail here with a clear message rather than inside the API
            # helpers with a confusing one.
            raise ValueError(
                "No TMDB id found for movie '{}' ({})".format(self.title, self.release_year)
            )

        # get casting data
        data = tmdb.get_movie_cast_from_id(movie_id)
        tmdb_cast = pd.DataFrame(data["cast"])
        cast_df = pc.correct_cast_with_wikipedia(tmdb_cast, self.cast_wiki)

        # only use simple quotation marks
        cast_df.replace(regex=r'\"', value="'", inplace=True)

        # remove any accents: decompose the characters (NFKD), then drop
        # everything that is not ASCII
        for column in ("name", "character"):
            cast_df[column] = (
                cast_df[column]
                .str.normalize("NFKD")
                .str.encode("ascii", errors="ignore")
                .str.decode("utf-8")
            )

        # get release date
        release_date = tmdb.get_movie_details_from_id(movie_id)["release_date"]
        release_date = datetime.strptime(release_date, "%Y-%m-%d")
        # complete with actors/actresses ages
        cast_df["age_at_release"] = pc.compute_cast_age(cast_df, release_date)

        return cast_df

def main():
    """Compute the relationship scores of a few sample movies and save them.

    For each movie, the scores returned by
    ``pc.compute_relationships_in_movie`` are written to a CSV file in the
    current directory. The scores of the last movie are printed.
    """
    verbs = ['kisses', 'sleeps with', 'goes on a date with', 'has sex with', 'marries', 'is in love with','is in couple with',
             'is the father of', 'is the mother of','is a friend of', 'is in the family of', 'is the enemy of']

    # (title, release year, output file)
    movies = [
        ("Harry Potter and the Goblet of Fire", 2005, 'hp4.csv'),
        ("Call Me by Your Name", 2017, 'call_me.csv'),
        ("The Big Lebowski", 1998, 'lebowski.csv'),
    ]

    ans = None
    for title, release_year, csv_path in movies:
        movie = Movie(title, release_year)
        # Every movie goes through the same (cast, plot, verbs) call.
        ans = pc.compute_relationships_in_movie(movie.cast, movie.plot, verbs)
        ans.to_csv(csv_path)

    print(ans)


if __name__ == "__main__":
    main()
99 changes: 99 additions & 0 deletions notebooks/age_gap/age_gap_visualisation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
import sys
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

On peut soit garder pour le moment toute la démo streamlit dans les notebooks, soit le sortir dans un autre repo (peut être dans un second temps)

Si des scripts sont importants -> les mettre dans la librairie

sys.path.append("../..")
from age_gap_automation import Movie
import process_couples as pc
import bechdelai.data.wikipedia as wiki
import bechdelai.data.tmdb as tmdb

import streamlit as st
import pandas as pd
import plotly.express as px

# Movies offered in the select box, mapped to the CSV file that main() tries
# to load their relationship scores from.
MOVIE_FILES = {"Harry Potter and the Goblet of Fire":"hp4.csv",
"Call me by your name":"call_me.csv",
"The Big Lebowski":"lebowski.csv",
"Love Actually":"love_actually.csv"}
# Release year handed to Movie(...) for each of the titles above.
MOVIE_YEARS = {"Harry Potter and the Goblet of Fire":2005,
"Call me by your name":2017,
"The Big Lebowski":1998,
"Love Actually":2003}

# Verbs passed to pc.compute_relationships_in_movie() by load_data().
VERBS = ['kisses', 'sleeps with', 'goes on a date with', 'has sex with', 'marries', 'is in love with','is in couple with', 'is the father of', 'is the mother of']
# Subset of VERBS kept by main(): rows whose `question` is not listed here are skipped.
LOVE_VERBS = ['kisses', 'sleeps with', 'goes on a date with', 'has sex with', 'marries', 'is in love with','is in couple with']

@st.cache
def load_data_from_file(file):
    """Read a CSV file with pandas; the result is cached through ``st.cache``.

    NOTE(review): recent Streamlit releases deprecate ``st.cache`` in favour
    of ``st.cache_data`` -- confirm against the Streamlit version in use.
    """
    scores = pd.read_csv(file)
    return scores
def load_data(movie):
    """Compute the relationship scores of ``movie`` for every verb in VERBS."""
    cast, plot = movie.cast, movie.plot
    return pc.compute_relationships_in_movie(cast, plot, VERBS)


def _star_details(cast, name, character, person_id):
    """Gather what the page shows about one actor/actress of a couple."""
    cast_rows = cast[cast.name == name]
    # A person may have no picture in TMDB: keep None instead of failing.
    profiles = tmdb.get_person_image_from_id(person_id).get("profiles") or []
    return {'name': name,
            'character': character,
            'age': cast_rows['age_at_release'].iloc[0],
            'gender': cast_rows['gender'].iloc[0],
            'image': profiles[0]["file_path"] if profiles else None}


def _show_star_image(column, star):
    """Display the TMDB picture of a star in a column, when there is one."""
    if star['image'] is not None:
        column.image('https://image.tmdb.org/t/p/original' + star['image'], width=100)


def main():
    """Streamlit page showing the age gap of the romantic couples of a movie.

    The scores of the selected movie are read from its CSV file when that
    file exists, and computed otherwise. At most 10 couples are shown, by
    decreasing score, stopping at the first score below 0.7.
    """
    st.set_page_config(layout="wide")
    title = st.selectbox("Choose a movie:", list(MOVIE_FILES.keys()))
    st.title(title)
    st.subheader('Romantic relationships')

    movie = Movie(title, MOVIE_YEARS[title])
    cast = movie.cast

    try:
        scores = load_data_from_file(MOVIE_FILES[title])
    except FileNotFoundError:
        with st.spinner('Wait for it...'):
            scores = load_data(movie)

    # Work on new objects: the frame returned by the cached loader must not
    # be modified in place.
    scores = scores.sort_values('score', ascending=False)
    # TO DO: avoid duplicates when star1 and star2 are inversed
    scores = scores.drop_duplicates(['star1', 'star2'], keep='first')

    count = 0
    for i, row in scores.iterrows():

        if row.question not in LOVE_VERBS:
            continue

        if count == 10 or row.score < 0.7:
            break

        star_younger = _star_details(cast, row.star1, row.character1, row.star_id1)
        star_older = _star_details(cast, row.star2, row.character2, row.star_id2)

        if star_younger['age'] > star_older['age']:
            star_younger, star_older = star_older, star_younger

        st.subheader('{} and {}'.format(star_younger['character'], star_older['character']))
        st.write('They were played by {} and {} respectively. '.format(star_younger['name'], star_older['name']))
        st.write('Age gap: ', row.age_gap)

        col1, col2, col3, _, _ = st.columns([1.5, 5, 1.5, 2, 10])
        _show_star_image(col1, star_younger)

        # The slider is only a read-only display of the two ages. Its default
        # range is 10-50, widened when needed because Streamlit rejects a
        # value lying outside the slider bounds.
        col2.slider(
            '',
            min(10, star_younger['age']), max(50, star_older['age']),
            (star_younger['age'], star_older['age']),
            disabled=True, key="slider_" + str(i))

        _show_star_image(col3, star_older)

        # relationship_true = col4.radio('Is this relationship true?', ['Yes', 'No'],key = "radio_"+str(i))
        count += 1


if __name__ == "__main__":
    main()
Loading