Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

refresh data for us_eia_opendata #1093

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions scripts/us_eia/opendata/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ Each dataset available as a Zip-file of JSONL content. See [here](https://www.ei
To download the latest versions of ALL datasets available, run the following command. Files will be downloaded and extracted to a tmp_raw_data folder.

```bash
python3 download_bulk.py
python3 process.py
```

### Data Exploration
Expand All @@ -22,9 +22,9 @@ This dataset is available for public use, license is available at https://www.ei

### Import procedure

- Download data
- Download and process the data
```bash
python3 download_bulk.py
python3 process.py --dataset=TOTAL
```

- Run the [processor](process/README.md)
- Run the [processor](process/README.md)
61 changes: 0 additions & 61 deletions scripts/us_eia/opendata/download_bulk.py

This file was deleted.

33 changes: 17 additions & 16 deletions scripts/us_eia/opendata/generate_jsonl_for_bq.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,10 +40,9 @@
IN_DATA_PATH = 'tmp_raw_data'
OUT_DATA_PATH = 'tmp_bq_import'
DATASETS = [
'AEO.2014', 'AEO.2015', 'AEO.2016', 'AEO.2017', 'AEO.2018', 'AEO.2019',
'AEO.2020', 'AEO.2021', 'COAL', 'EBA', 'ELEC', 'EMISS', 'IEO.2017',
'IEO.2019', 'INTL', 'NG', 'NUC_STATUS', 'PET', 'PET_IMPORTS', 'SEDS',
'STEO', 'TOTAL'
'AEO.2020', 'AEO.2021', 'AEO.2022', 'AEO.2023', 'AEO.IEO2', 'COAL', 'EBA',
'ELEC', 'EMISS', 'IEO', 'INTL', 'NG', 'NUC_STATUS', 'PET', 'PET_IMPORTS',
'SEDS', 'STEO', 'TOTAL'
]


Expand Down Expand Up @@ -77,17 +76,18 @@ def process_dataset(dataset, in_file_path, out_file_path):
with open(out_file_path + '.series.jsonl', 'w+') as series_fp:
with open(out_file_path + '.categories.jsonl', 'w+') as category_fp:
for line in data_fp:
data = json.loads(line)
series_id = data.get('series_id', None)
if series_id:
jsonl = extract_series_to_jsonl(line, dataset)
series_fp.write(json.dumps(jsonl))
series_fp.write('\n')
category_id = data.get('category_id', None)
if category_id:
jsonl = extract_category_to_jsonl(line, dataset)
category_fp.write(json.dumps(jsonl))
category_fp.write('\n')
if line.startswith('{'):
data = json.loads(line)
series_id = data.get('series_id', None)
if series_id:
jsonl = extract_series_to_jsonl(line, dataset)
series_fp.write(json.dumps(jsonl))
series_fp.write('\n')
category_id = data.get('category_id', None)
if category_id:
jsonl = extract_category_to_jsonl(line, dataset)
category_fp.write(json.dumps(jsonl))
category_fp.write('\n')


def process_single(subdir, file):
Expand All @@ -103,7 +103,8 @@ def process_all():
for file in sorted(files):
if not file.endswith('.txt'):
continue
print(f'Processing {subdir}/{file}')
print(f'Processing1 {subdir}/{file}')

process_single(subdir, file)


Expand Down
130 changes: 130 additions & 0 deletions scripts/us_eia/opendata/manifest.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
{
"import_specifications": [
{
"import_name": "EIA_Electricity",
"curator_emails": [
"[email protected]"
],
"provenance_url": "https://www.eia.gov/opendata/v1/qb.php?category=0",
"provenance_description": "Electricity dataset has country, state-level and plant-level information on electricity generation, consumption, sales etc by energy source and “sectors” (like residential, commercial, etc.).",
"scripts": [
"process.py --dataset=ELEC"
],
"import_inputs": [
{
"template_mcf": "tmp_raw_data/ELEC/ELEC.tmcf",
"cleaned_csv": "tmp_raw_data/ELEC/ELEC.csv"
}
],
"cron_schedule": "0 8 1 2 *"
},
{
"import_name": "EIA_NaturalGas",
"curator_emails": [
"[email protected]"
],
"provenance_url": "https://www.eia.gov/opendata/v1/qb.php?category=0",
"provenance_description": "Natural gas dataset has country and state-level data.",
"scripts": [
"process.py --dataset=NG"
],
"import_inputs": [
{
"template_mcf": "tmp_raw_data/NG/NG.tmcf",
"cleaned_csv": "tmp_raw_data/NG/NG.csv"
}
],
"cron_schedule": "05 10 * * *"
},
{
"import_name": "EIA_NuclearOutages",
"curator_emails": [
"[email protected]"
],
"provenance_url": "https://www.eia.gov/opendata/v1/qb.php?category=0",
"provenance_description": "Nuclear outage dataset has nuclear-plant and national data about Nuclear energy generation capacity and planned outages.",
"scripts": [
"process.py --dataset=NUC_STATUS"
],
"import_inputs": [
{
"template_mcf": "tmp_raw_data/NUC_STATUS/NUC_STATUS.tmcf",
"cleaned_csv": "tmp_raw_data/NUC_STATUS/NUC_STATUS.csv"
}
],
"cron_schedule": "01 9 * * *"
},
{
"import_name": "EIA_Petroleum",
"curator_emails": [
"[email protected]"
],
"provenance_url": "https://www.eia.gov/opendata/v1/qb.php?category=0",
"provenance_description": "EIA Petroleum dataset has country and state-level data.",
"scripts": [
"process.py --dataset=PET"
],
"import_inputs": [
{
"template_mcf": "tmp_raw_data/PET/PET.tmcf",
"cleaned_csv": "tmp_raw_data/PET/PET.csv"
}
],
"cron_schedule": "5 9 2 2 *"
},
{
"import_name": "EIA_International",
"curator_emails": [
"[email protected]"
],
"provenance_url": "https://www.eia.gov/opendata/v1/qb.php?category=0",
"provenance_description": "EIA International Energy dataset has country, continent and world-level data.",
"scripts": [
"process.py --dataset=INTL"
],
"import_inputs": [
{
"template_mcf": "tmp_raw_data/INTL/INTL.tmcf",
"cleaned_csv": "tmp_raw_data/INTL/INTL.csv"
}
],
"cron_schedule": "1 7 * 1,4,7,10 *"
},
{
"import_name": "EIA_SEDS",
"curator_emails": [
"[email protected]"
],
"provenance_url": "https://www.eia.gov/opendata/v1/qb.php?category=0",
"provenance_description": "EIA State Energy Data System (SEDS) dataset has US country-level and state-level data.",
"scripts": [
"process.py --dataset=SEDS"
],
"import_inputs": [
{
"template_mcf": "tmp_raw_data/SEDS/SEDS.tmcf",
"cleaned_csv": "tmp_raw_data/SEDS/SEDS.csv"
}
],
"cron_schedule": "0 0 1 1 *"
},
{
"import_name": "EIA_TotalEnergy",
"curator_emails": [
"[email protected]"
],
"provenance_url": "https://www.eia.gov/opendata/v1/qb.php?category=0",
"provenance_description": "Total Energy dataset has US country-level data.",
"scripts": [
"process.py --dataset=TOTAL"
],
"import_inputs": [
{
"template_mcf": "tmp_raw_data/TOTAL/TOTAL.tmcf",
"cleaned_csv": "tmp_raw_data/TOTAL/TOTAL.csv"
}
],
"cron_schedule": "0 0 1 * *"
}
]
}
96 changes: 96 additions & 0 deletions scripts/us_eia/opendata/process.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
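Utility to download EIA bulk data listed in
https://api.eia.gov/bulk/manifest.txt and process it into CSV, TMCF and MCF.
Files are downloaded and extracted under the --data_dir folder
(default: tmp_raw_data).

Run this script in this folder:
python3 process.py --dataset=ELEC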
"""

import io
import os
import sys
import zipfile

import requests

from absl import flags
from absl import app

from process import coal, common, elec, intl, ng, nuclear, pet, seds, total

# URL of the EIA bulk-download manifest; fetched and parsed as JSON by
# download_manifest().
MANIFEST_URL = "https://api.eia.gov/bulk/manifest.txt"

# Command-line flags (absl). Both values are read in main().
FLAGS = flags.FLAGS
flags.DEFINE_string('data_dir', 'tmp_raw_data', 'Data dir to download into')
flags.DEFINE_string('dataset', '',
'Datasets to download. Everything, if empty.')

# Datasets this script can process, keyed by EIA dataset code.
# Value: (name, extract_fn, schema_fn)
# schema_fn is None for datasets with no stat-var schema generator registered.
_DATASETS = {
'COAL': ('Coal', coal.extract_place_statvar, coal.generate_statvar_schema),
'ELEC': ('Electricity', elec.extract_place_statvar,
elec.generate_statvar_schema),
'INTL': ('Energy Overview (INTL)', intl.extract_place_statvar, None),
'PET': ('Petroleum', pet.extract_place_statvar, None),
'NG': ('Natural Gas', ng.extract_place_statvar, None),
'NUC_STATUS': ('Nuclear Outages', nuclear.extract_place_statvar,
nuclear.generate_statvar_schema),
'SEDS': ('Consumption, Production, Prices and Expenditure (SEDS)',
seds.extract_place_statvar, None),
'TOTAL': ('Energy Overview (TOTAL)', total.extract_place_statvar, None)
}


def download_file(url: str, save_path: str, timeout: float = 60):
    """Download a zip archive from `url` and extract it into `save_path`.

    Args:
        url: Location of the zip archive to fetch.
        save_path: Directory the archive members are extracted into.
        timeout: Seconds to wait for the server to connect or send data
            before giving up. Without it a stalled server blocks forever.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        zipfile.BadZipFile: If the response body is not a valid zip archive.
    """
    print(f'Downloading {url} to {save_path}')
    # The whole body is needed in memory for ZipFile, so streaming buys nothing.
    response = requests.get(url, timeout=timeout)
    # Fail on HTTP errors here instead of handing an error page to ZipFile.
    response.raise_for_status()
    with zipfile.ZipFile(io.BytesIO(response.content)) as archive:
        archive.extractall(save_path)


def download_manifest(timeout: float = 60):
    """Fetch the EIA bulk-download manifest and return it as parsed JSON.

    Args:
        timeout: Seconds to wait for the server to connect or send data
            before giving up.

    Returns:
        The decoded JSON document served at MANIFEST_URL.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    response = requests.get(MANIFEST_URL, timeout=timeout)
    # Surface HTTP failures directly rather than as a JSON decoding error.
    response.raise_for_status()
    return response.json()


def main(_):
    """Download the requested EIA bulk dataset(s) and process the supported ones.

    Every dataset listed in the manifest is downloaded when --dataset is
    empty; otherwise only the named dataset(s) are (a comma-separated list is
    accepted). Datasets that have an entry in _DATASETS are then converted by
    common.process into CSV, TMCF and MCF files next to the extracted data.
    Datasets without an entry are downloaded only.
    """
    assert FLAGS.data_dir
    # Match requested names exactly; a substring test would let
    # --dataset=PET_IMPORTS also select PET.
    requested = {
        name.strip() for name in FLAGS.dataset.split(',') if name.strip()
    }

    manifest_json = download_manifest()
    datasets = manifest_json.get('dataset', {})

    missing = requested - set(datasets)
    if missing:
        print(f'Requested dataset(s) not in EIA manifest: {sorted(missing)}')

    for dataset_name, dataset in datasets.items():
        if requested and dataset_name not in requested:
            continue

        dataset_dir = os.path.join(FLAGS.data_dir, dataset_name)
        download_file(dataset['accessURL'], dataset_dir)

        # Look up by the dataset being handled, not by the raw flag value:
        # the flag is empty in "download everything" mode and may name a
        # dataset that has no processor.
        if dataset_name not in _DATASETS:
            print(f'No processor registered for {dataset_name}; '
                  'downloaded only.')
            continue

        display_name, extract_fn, schema_fn = _DATASETS[dataset_name]
        file_prefix = os.path.join(dataset_dir, dataset_name)
        print(f'Processing {file_prefix}.txt')
        # dataset_name takes the human-readable name (first tuple element),
        # not the whole (name, extract_fn, schema_fn) tuple.
        common.process(dataset=dataset_name,
                       dataset_name=display_name,
                       in_json=file_prefix + '.txt',
                       out_csv=file_prefix + '.csv',
                       out_sv_mcf=file_prefix + '.mcf',
                       out_svg_mcf=file_prefix + '.svg.mcf',
                       out_tmcf=file_prefix + '.tmcf',
                       extract_place_statvar_fn=extract_fn,
                       generate_statvar_schema_fn=schema_fn)


if __name__ == '__main__':
app.run(main)
4 changes: 2 additions & 2 deletions scripts/us_eia/opendata/process/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -58,13 +58,13 @@ takes a raw stat-var and generates a fully defined stat-var for it.

Download and unzip the data files based on the
[manifest](https://api.eia.gov/bulk/manifest.txt) by running the
[`download_bulk.py`](https://github.com/datacommonsorg/data/blob/master/scripts/us_eia/opendata/download_bulk.py)
[`process.py`](https://github.com/datacommonsorg/data/blob/master/scripts/us_eia/opendata/process.py) (for example, `python3 process.py --dataset=TOTAL`)
script.

To generate CSV, TMCF and stat-var MCF for a supported dataset:

```bash
python3 main.py --data_dir=tmp_raw_data/ELEC --dataset=ELEC
python3 process.py --dataset=ELEC
```

Replace `ELEC` with any of the other dataset codes listed above.
Expand Down
1 change: 1 addition & 0 deletions scripts/us_eia/opendata/process/coal.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@ def extract_place_statvar(series_id, counters):
measure = m.group(1)
if measure.startswith("SHIP"):
# TODO: model destination / source port as well
print(series_id)
return (None, None, None)
if measure == "PROD_DIST_STOCKS":
assert m.group(2) == "TOT"
Expand Down
Loading
Loading