-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathch2.py
47 lines (35 loc) · 1.61 KB
/
ch2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
import json
from pandas import DataFrame, Series
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
# Input file, read as one JSON document per line.
path = 'dataset/ch2.txt'
# Parse each line as JSON. The with-block closes the file handle as soon as
# reading finishes instead of leaving it open until garbage collection.
with open(path) as f:
    records = [json.loads(line) for line in f]
# Tabulate the parsed records for the analysis below.
frame = DataFrame(records)
# print(frame)
# print(frame['tz'][:10]) # print 10 time zones
# tz_counts = frame['tz'].value_counts() # use value_counts() to count the frequency of a value
# print(tz_counts[:10])
# display the graph of top 10 timezones
# clean_tz = frame['tz'].fillna('Missing') # fill missing values
# clean_tz[clean_tz == ''] = 'Unknown' # fill empty strings with Unknown
# tz_counts = clean_tz.value_counts()
# # print(tz_counts[:10])
# tz_counts[:10].plot(kind='barh', rot=0) # make a horizontal bar plot using the plot method
# # print(frame['a'][1])
# plt.show() # display the top 10 time zones
# Leading whitespace-delimited token of every non-null entry in column 'a'
# (presumably the user-agent string -- confirm against the dataset).
results = Series([agent.split()[0] for agent in frame['a'].dropna()])
# print(results[:5])
# print(results.value_counts()[:8])
# Keep only the rows that have a value in column 'a', the string that is
# searched for 'Windows' below.
cframe = frame[frame.a.notnull()]
# Label every remaining row by whether its 'a' string contains 'Windows'.
operating_system = np.where(cframe['a'].str.contains('Windows'),
                            'Windows', 'Not Windows')
# print(operating_system[:5]) # first 5 labels, Windows or not
# Count rows per (time zone, OS label) pair. size() computes the counts,
# unstack() reshapes them into a table with one column per OS label, and
# fillna(0) fills the combinations that never occur.
by_tz_os = cframe.groupby(['tz', operating_system])
agg_counts = by_tz_os.size().unstack().fillna(0)
# print(agg_counts[:10])
# Positions that would sort the time zones by total row count, ascending.
indexer = agg_counts.sum(axis=1).argsort()
# print(indexer[:10])
# take() reorders the rows by those positions, so the last 10 rows are the
# time zones with the largest totals. The slice must be [-10:]; the previous
# [-10:0] stops at position 0 and therefore always selects nothing.
count_subset = agg_counts.take(indexer)[-10:]
print(count_subset)