# import libraries
from google.cloud import bigquery
# create a client object
client = bigquery.Client()
### --------------------------------------------------------------------------
### SELECT FROM
### --------------------------------------------------------------------------
# Construct a reference to the "openaq" dataset
dataset_ref = client.dataset("openaq", project="bigquery-public-data")
# https://openaq.org/#/?_k=esd1sb
# API request - fetch the dataset
dataset = client.get_dataset(dataset_ref)
# construct a reference to the "global_air_quality" table
table_ref = dataset_ref.table("global_air_quality")
# API request - fetch the table
table = client.get_table(table_ref)
# Print information on all the columns in the "global_air_quality" table in the "openaq" dataset
print(table.schema)
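# Optional sketch: each entry in table.schema is a SchemaField, so the column names,
# types, and modes can also be listed one per line.
for field in table.schema:
    print(field.name, field.field_type, field.mode)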
# Preview the first five rows of the table as a pandas DataFrame
full = client.list_rows(table, max_results=5).to_dataframe()
full.info()
# query
query = """
SELECT city
FROM `bigquery-public-data.openaq.global_air_quality`
WHERE country = 'US'
"""
# Set up the query
query_job = client.query(query)
# API request - run the query, and return a pandas DataFrame
us_cities = query_job.to_dataframe()
# What five cities have the most measurements?
print(us_cities['city'].value_counts().head())
### --------------------------------------------------------------------------
# select multiple columns
query = """
SELECT city, country
FROM `bigquery-public-data.openaq.global_air_quality`
WHERE country = 'US'
"""
### --------------------------------------------------------------------------
# select all columns (country values in this table are two-letter codes, e.g. 'FR' rather than 'France')
query = """
SELECT *
FROM `bigquery-public-data.openaq.global_air_quality`
WHERE country = 'FR'
"""
### --------------------------------------------------------------------------
# select distinct values
first_query = """
SELECT DISTINCT country
FROM `bigquery-public-data.openaq.global_air_quality`
WHERE unit = 'ppm'
"""
### --------------------------------------------------------------------------
# working with big datasets
### --------------------------------------------------------------------------
query = """
SELECT score, title
FROM `bigquery-public-data.hacker_news.full`
WHERE type = 'job'
"""
### --------------------------------------------------------------------------
### OPTION 1
### --------------------------------------------------------------------------
"""
Option 1: To begin,you can estimate the size of any query before running it. Here is an
example using the (very large!) Hacker News dataset. To see how much data a query will scan,
we create a
QueryJobConfig object and set the dry_run parameter to True .
"""
# Create a QueryJobConfig object to estimate the size of the query without running it
dry_run_config = bigquery.QueryJobConfig(dry_run=True)
# API request - dry run query to estimate costs
dry_run_query_job = client.query(query, job_config=dry_run_config)
print("This query will process {} bytes.".format(dry_run_query_job.total_bytes_processed))
### --------------------------------------------------------------------------
### OPTION 2
### --------------------------------------------------------------------------
"""
Option 2: You can also specify a parameter when running the query to limit how much
data you are willing to scan. Here's an example with a low limit.
"""
# Only run the query if it's less than 100 MB
ONE_HUNDRED_MB = 100*1000*1000
safe_config = bigquery.QueryJobConfig(maximum_bytes_billed=ONE_HUNDRED_MB)
# Set up the query (will only run if it's less than 100 MB)
safe_query_job = client.query(query, job_config=safe_config)
# API request - try to run the query; this raises an exception if the scan exceeds the limit
safe_query_job.to_dataframe()
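# Optional sketch: catching the failure instead of letting it crash the script. If the
# Hacker News query scans more than 100 MB, BigQuery rejects the job; the exact exception
# class can vary, so the broad GoogleAPIError base class is caught here.
from google.api_core.exceptions import GoogleAPIError
try:
    client.query(query, job_config=safe_config).to_dataframe()
except GoogleAPIError as e:
    print("Query was cancelled by the bytes-billed limit: {}".format(e))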
### --------------------------------------------------------------------------
# Only run the query if it's less than 1 GB
ONE_GB = 1000*1000*1000
safe_config = bigquery.QueryJobConfig(maximum_bytes_billed=ONE_GB)
# Set up the query (will only run if it's less than 1 GB)
safe_query_job = client.query(query, job_config=safe_config)
# API request - try to run the query, and return a pandas DataFrame
job_post_scores = safe_query_job.to_dataframe()
# Print the average score for job posts
print(job_post_scores.score.mean())