# expdata1.py
# Exploratory plots from the Star Wars survey responses in starwars.csv.
from numpy import NaN
import numpy as np
import pandas as pd
# Route DataFrame.plot() through plotly instead of pandas' default backend.
# NOTE(review): no DataFrame.plot() call is visible in this file -- confirm
# this setting is still needed.
pd.options.plotting.backend = "plotly"
import matplotlib
# Select the non-interactive Agg backend (renders to files, no GUI window);
# done before pyplot is imported below.
matplotlib.use('agg')
import matplotlib.pyplot as plt
import seaborn as sns
# RespondentID,
# Have you seen any of the 6 films in the Star Wars franchise?,
# Do you consider yourself to be a fan of the Star Wars film franchise?,
# Which of the following Star Wars films have you seen? Please select all that apply.,,,,,,
# Please rank the Star Wars films in order of preference with 1 being your favorite film in the franchise and 6 being your least favorite film.,,,,,,
# "Please state whether you view the following characters favorably, unfavorably, or are unfamiliar with him/her.",,,,,,,,,,,,,,
# Which character shot first?,
# Are you familiar with the Expanded Universe?,
# Do you consider yourself to be a fan of the Expanded Universe?,
# Do you consider yourself to be a fan of the Star Trek franchise?,
# Gender,
# Age,
# Household Income
# ,Education,
# Location (Census Region)
# ,Response,Response,Star Wars: Episode I The Phantom Menace,Star Wars: Episode II Attack of the Clones,Star Wars: Episode III Revenge of the Sith,Star Wars: Episode IV A New Hope,Star Wars: Episode V The Empire Strikes Back,Star Wars: Episode VI Return of the Jedi,Star Wars: Episode I The Phantom Menace,Star Wars: Episode II Attack of the Clones,Star Wars: Episode III Revenge of the Sith,Star Wars: Episode IV A New Hope,Star Wars: Episode V The Empire Strikes Back,Star Wars: Episode VI Return of the Jedi,Han Solo,Luke Skywalker,Princess Leia Organa,Anakin Skywalker,Obi Wan Kenobi,Emperor Palpatine,Darth Vader,Lando Calrissian,Boba Fett,C-3P0,R2 D2,Jar Jar Binks,Padme Amidala,Yoda,Response,Response,Response,Response,Response,Response,Response,Response,Response
# 3292879998,Yes,Yes,Star Wars: Episode I The Phantom Menace,Star Wars: Episode II Attack of the Clones,Star Wars: Episode III Revenge of the Sith,Star Wars: Episode IV A New Hope,Star Wars: Episode V The Empire Strikes Back,Star Wars: Episode VI Return of the Jedi,3,2,1,4,5,6,Very favorably,Very favorably,Very favorably,Very favorably,Very favorably,Very favorably,Very favorably,Unfamiliar (N/A),Unfamiliar (N/A),Very favorably,Very favorably,Very favorably,Very favorably,Very favorably,I don't understand this question,Yes,No,No,Male,18-29,,High school degree,South Atlantic
# Possible answers to "Which character shot first?":
#   Han
#   Greedo
#   I don't understand this question
# Short column names, in the same order as the columns of the survey CSV.
_COLUMN_NAMES = [
    'ID', 'HasSeenAny', 'Fan',
    'SeenEp1', 'SeenEp2', 'SeenEp3', 'SeenEp4', 'SeenEp5', 'SeenEp6',
    'RankEp1', 'RankEp2', 'RankEp3', 'RankEp4', 'RankEp5', 'RankEp6',
    'RateHan', 'RateLuke', 'RateLeia', 'RateAnakin', 'RateObi',
    'RateEmperor', 'RateVader', 'RateLando', 'RateBoba', 'RateC3PO',
    'RateR2D2', 'RateJarJar', 'RatePadme', 'RateYoda',
    'ShotFirst', 'KnowsExpUni', 'FanExpUni', 'FanStarTrek',
    'Gender', 'Age', 'Income', 'Education', 'Region',
]

# Full film titles as they appear in the answers -> short episode labels.
_SHORT_TITLES = {
    "Star Wars: Episode I The Phantom Menace": "Ep1",
    "Star Wars: Episode II Attack of the Clones": "Ep2",
    "Star Wars: Episode III Revenge of the Sith": "Ep3",
    "Star Wars: Episode IV A New Hope": "Ep4",
    "Star Wars: Episode V The Empire Strikes Back": "Ep5",
    "Star Wars: Episode VI Return of the Jedi": "Ep6",
}

_EPISODES = ['Ep1', 'Ep2', 'Ep3', 'Ep4', 'Ep5', 'Ep6']
_SEEN_COLUMNS = ['SeenEp1', 'SeenEp2', 'SeenEp3', 'SeenEp4', 'SeenEp5', 'SeenEp6']
_RANK_COLUMNS = ['RankEp1', 'RankEp2', 'RankEp3', 'RankEp4', 'RankEp5', 'RankEp6']

# Income brackets from lowest to highest. A plain string sort would put them
# in the wrong order, so this list drives the categorical ordering instead.
_INCOME_ORDER = ["$0 - $24,999", "$25,000 - $49,999", "$50,000 - $99,999",
                 "$100,000 - $149,999", "$150,000+"]


def _load_survey(csv_path: str) -> pd.DataFrame:
    """Read the survey CSV, apply the short column names and shorten film titles."""
    survey = pd.read_csv(csv_path)
    survey = survey.rename(columns=dict(zip(survey.columns, _COLUMN_NAMES)))
    # Row 0 holds the per-column answer labels/options, not a respondent.
    survey = survey.drop(0)
    # One pass over the frame instead of one replace() call per title.
    return survey.replace(_SHORT_TITLES)


def _watch_counts(survey: pd.DataFrame) -> pd.DataFrame:
    """Return a one-row frame with the number of non-null 'Seen' answers per episode."""
    return pd.DataFrame({
        episode: [survey[column].notnull().sum()]
        for episode, column in zip(_EPISODES, _SEEN_COLUMNS)
    })


def _plot_watch_counts(watch_counts: pd.DataFrame) -> None:
    """Save a bar chart of watch count by episode to the figure file "WatchCount"."""
    sns.barplot(data=watch_counts)
    plt.title("Watch Count by Episode, all responses")
    plt.xlabel("Episode Number")
    plt.ylabel("WatchCount")
    plt.savefig("WatchCount")
    plt.close()


def _rankings_by_income(survey: pd.DataFrame) -> pd.DataFrame:
    """Return numeric episode rankings plus Income, sorted by income bracket.

    Only respondents who gave an income and ranked all six episodes are kept.
    """
    columns = _RANK_COLUMNS + ['Income']
    # copy() so the column assignments below act on an independent frame.
    rankings = survey.dropna(subset=columns, how="any")[columns].copy()
    # Rankings are read from the CSV as text; anything non-numeric becomes NaN.
    rankings[_RANK_COLUMNS] = rankings[_RANK_COLUMNS].apply(
        pd.to_numeric, errors='coerce')
    rankings["Income"] = pd.Categorical(rankings["Income"], _INCOME_ORDER)
    return rankings.sort_values("Income")


def _plot_rankings_by_income(rankings: pd.DataFrame) -> None:
    """Save a grouped bar chart of mean ranking per episode and income bracket.

    The figure is written to the file "RatingAndIncome".
    """
    # Long format: one row per (Income, Episode, Rating) instead of one
    # column per episode, which is what the hue-grouped barplot expects.
    ratings = pd.melt(rankings, id_vars=['Income'], value_vars=_RANK_COLUMNS,
                      var_name="Episode", value_name="Rating")
    ratings["Income"] = pd.Categorical(ratings["Income"], _INCOME_ORDER)
    ratings["Episode"] = pd.Categorical(ratings["Episode"], _RANK_COLUMNS)
    # Sort by income bracket first, then by episode within each bracket.
    ratings = ratings.sort_values(["Income", "Episode"])

    # ci=None turns off the confidence-interval error bars.
    # NOTE(review): seaborn >= 0.12 deprecates ci= in favor of errorbar=None;
    # switch once the minimum seaborn version is confirmed.
    sns.barplot(x="Income", y="Rating", hue="Episode", data=ratings, ci=None)
    plt.xlabel("Income Category")
    plt.yticks(np.arange(0, 6, .25))
    # The x tick labels are the long income-bracket strings, so use a small font.
    plt.xticks(fontsize=6)
    plt.yticks(fontsize=12)
    plt.ylim(2, 5)
    # Rankings: 1 is best, 6 is worst, so lower bars mean better-liked episodes.
    plt.ylabel("Ratings (1=Best)")
    plt.legend(bbox_to_anchor=(.9, .8))
    plt.title("Average Rating of Episodes by Income")
    plt.savefig("RatingAndIncome")
    plt.close()


def main(csv_path: str = "starwars.csv") -> None:
    """Run the Star Wars survey analysis and write its outputs.

    Reads the survey from ``csv_path``, then:

    * writes the cleaned responses to ``titles-shortened.csv``;
    * prints the per-episode watch counts and saves them as the bar chart
      ``WatchCount``;
    * prints the mean episode ranking and the respondent count per income
      bracket, and saves the grouped bar chart ``RatingAndIncome``.

    The figure names carry no extension, so matplotlib's default output
    format is used.

    Args:
        csv_path: Path of the survey CSV. Defaults to ``"starwars.csv"``.
    """
    survey = _load_survey(csv_path)
    survey.to_csv("titles-shortened.csv")

    watch_counts = _watch_counts(survey)
    print(watch_counts.head())
    _plot_watch_counts(watch_counts)

    rankings = _rankings_by_income(survey)
    print(rankings.head())

    # observed=False keeps every income bracket in the output (even an empty
    # one) and pins that behavior regardless of the pandas default.
    by_income = rankings.groupby('Income', observed=False)
    print(by_income.mean())
    print(by_income.size())

    _plot_rankings_by_income(rankings)
    # TODO determine if differences between income categories (e.g. 150k+
    # liking Episode 4 the most) are statistically significant
# Run the analysis only when this file is executed as a script, so that
# importing the module does not read the CSV or write any output files.
if __name__ == "__main__":
    main()