datacommonsorg · Harsha-chandaluri · Oct 16, 2024 · Oct 17, 2024 · Oct 21, 2024 · Oct 21, 2024
diff --git a/scripts/us_eia/opendata/README.md b/scripts/us_eia/opendata/README.md
@@ -6,12 +6,6 @@
 
 Each dataset available as a Zip-file of JSONL content. See [here](https://www.eia.gov/opendata/bulkfiles.php) for more details.
 
-To download the latest versions of ALL datasets available, run the following command. Files will be downloaded and extracted to a tmp_raw_data folder.
-
-```bash
-python3 download_bulk.py
-```
-
 ### Data Exploration
 
 To ease analysis of the datasets, see [`generate_jsonl_for_bq.py`](generate_jsonl_for_bq.py) for instructions to convert and import the data into BigQuery.
@@ -20,11 +14,44 @@ To ease analysis of the datasets, see [`generate_jsonl_for_bq.py`](generate_json
 
 This dataset is available for public use, license is available at https://www.eia.gov/about/copyrights_reuse.php
 
-### Import procedure
 
-- Download data 
-    ```bash
-    python3 download_bulk.py
-    ```
+- Run the [processor](process/README.md)
+
+### Downloading and Processing Data
+
+
+    To only download the data (without processing it), run the commands below:
 
-- Run the [processor](process/README.md)
+        python3 process.py --dataset=INTL --mode=download
+        python3 process.py --dataset=ELEC --mode=download
+        python3 process.py --dataset=PET --mode=download
+        python3 process.py --dataset=NG --mode=download
+        python3 process.py --dataset=SEDS --mode=download
+        python3 process.py --dataset=NUC_STATUS --mode=download
+        python3 process.py --dataset=TOTAL --mode=download
+
+
+
+   To only process previously downloaded data, run the commands below:
+
+   Running these commands generates the input_files and the csv, mcf, tmcf, and svg.mcf files.
+
+        python3 process.py --dataset=INTL --mode=process
+        python3 process.py --dataset=ELEC --mode=process
+        python3 process.py --dataset=PET --mode=process
+        python3 process.py --dataset=NG --mode=process
+        python3 process.py --dataset=SEDS --mode=process
+        python3 process.py --dataset=NUC_STATUS --mode=process
+        python3 process.py --dataset=TOTAL --mode=process
+
+    To download and process the data together, run the commands below:
+    ```bash
+    python3 process.py --dataset=TOTAL
+    python3 process.py --dataset=INTL
+    python3 process.py --dataset=ELEC
+    python3 process.py --dataset=NG
+    python3 process.py --dataset=PET
+    python3 process.py --dataset=SEDS
+    python3 process.py --dataset=NUC_STATUS
+
+    ```
diff --git a/scripts/us_eia/opendata/download_bulk.py b/scripts/us_eia/opendata/download_bulk.py
diff --git a/scripts/us_eia/opendata/generate_jsonl_for_bq.py b/scripts/us_eia/opendata/generate_jsonl_for_bq.py
@@ -40,10 +40,9 @@
 IN_DATA_PATH = 'tmp_raw_data'
 OUT_DATA_PATH = 'tmp_bq_import'
 DATASETS = [
-    'AEO.2014', 'AEO.2015', 'AEO.2016', 'AEO.2017', 'AEO.2018', 'AEO.2019',
-    'AEO.2020', 'AEO.2021', 'COAL', 'EBA', 'ELEC', 'EMISS', 'IEO.2017',
-    'IEO.2019', 'INTL', 'NG', 'NUC_STATUS', 'PET', 'PET_IMPORTS', 'SEDS',
-    'STEO', 'TOTAL'
+    'AEO.2020', 'AEO.2021', 'AEO.2022', 'AEO.2023', 'AEO.IEO2', 'COAL', 'EBA',
+    'ELEC', 'EMISS', 'IEO', 'INTL', 'NG', 'NUC_STATUS', 'PET', 'PET_IMPORTS',
+    'SEDS', 'STEO', 'TOTAL'
 ]
 
 
@@ -77,17 +76,18 @@ def process_dataset(dataset, in_file_path, out_file_path):
         with open(out_file_path + '.series.jsonl', 'w+') as series_fp:
             with open(out_file_path + '.categories.jsonl', 'w+') as category_fp:
                 for line in data_fp:
-                    data = json.loads(line)
-                    series_id = data.get('series_id', None)
-                    if series_id:
-                        jsonl = extract_series_to_jsonl(line, dataset)
-                        series_fp.write(json.dumps(jsonl))
-                        series_fp.write('\n')
-                    category_id = data.get('category_id', None)
-                    if category_id:
-                        jsonl = extract_category_to_jsonl(line, dataset)
-                        category_fp.write(json.dumps(jsonl))
-                        category_fp.write('\n')
+                    if line.startswith('{'):
+                        data = json.loads(line)
+                        series_id = data.get('series_id', None)
+                        if series_id:
+                            jsonl = extract_series_to_jsonl(line, dataset)
+                            series_fp.write(json.dumps(jsonl))
+                            series_fp.write('\n')
+                        category_id = data.get('category_id', None)
+                        if category_id:
+                            jsonl = extract_category_to_jsonl(line, dataset)
+                            category_fp.write(json.dumps(jsonl))
+                            category_fp.write('\n')
 
 
 def process_single(subdir, file):
@@ -103,7 +103,8 @@ def process_all():
         for file in sorted(files):
             if not file.endswith('.txt'):
                 continue
-            print(f'Processing {subdir}/{file}')
+            print(f'Processing1 {subdir}/{file}')
+
             process_single(subdir, file)
 
 

diff --git a/scripts/us_eia/opendata/manifest.json b/scripts/us_eia/opendata/manifest.json
@@ -0,0 +1,130 @@
+{
+  "import_specifications": [
+    {
+      "import_name": "EIA_Electricity",
+      "curator_emails": [
+        "[email protected]"
+      ],
+      "provenance_url": "https://www.eia.gov/opendata/v1/qb.php?category=0",
+      "provenance_description": "Electricity dataset has country, state-level and plant-level information on electricity generation, consumption, sales etc by energy source and “sectors” (like residential, commercial, etc.).",
+      "scripts": [
+        "process.py --dataset=ELEC"
+      ],
+      "import_inputs": [
+        {
+          "template_mcf": "tmp_raw_data/ELEC/ELEC.tmcf",
+          "cleaned_csv": "tmp_raw_data/ELEC/ELEC.csv"
+        }
+      ],
+      "cron_schedule": "0 1  1 * *"
+    },
+    {
+      "import_name": "EIA_NaturalGas",
+      "curator_emails": [
+        "[email protected]"
+      ],
+      "provenance_url": "https://www.eia.gov/opendata/v1/qb.php?category=0",
+      "provenance_description": "Natural gas dataset has country and state-level data.",
+      "scripts": [
+        "process.py --dataset=NG"
+      ],
+      "import_inputs": [
+        {
+          "template_mcf": "tmp_raw_data/NG/NG.tmcf",
+          "cleaned_csv": "tmp_raw_data/NG/NG.csv"
+        }
+      ],
+      "cron_schedule": "0 2  1 * *"
+    },
+    {
+      "import_name": "EIA_NuclearOutages",
+      "curator_emails": [
+        "[email protected]"
+      ],
+      "provenance_url": "https://www.eia.gov/opendata/v1/qb.php?category=0",
+      "provenance_description": "Nuclear outage dataset has nuclear-plant and national data about Nuclear energy generation capacity and planned outages.",
+      "scripts": [
+        "process.py --dataset=NUC_STATUS"
+      ],
+      "import_inputs": [
+        {
+          "template_mcf": "tmp_raw_data/NUC_STATUS/NUC_STATUS.tmcf",
+          "cleaned_csv": "tmp_raw_data/NUC_STATUS/NUC_STATUS.csv"
+        }
+      ],
+      "cron_schedule": "0 3  1 * *"
+    },
+    {
+      "import_name": "EIA_Petroleum",
+      "curator_emails": [
+        "[email protected]"
+      ],
+      "provenance_url": "https://www.eia.gov/opendata/v1/qb.php?category=0",
+      "provenance_description": "EIA Petroleum dataset has country and state-level data.",
+      "scripts": [
+        "process.py --dataset=PET"
+      ],
+      "import_inputs": [
+        {
+          "template_mcf": "tmp_raw_data/PET/PET.tmcf",
+          "cleaned_csv": "tmp_raw_data/PET/PET.csv"
+        }
+      ],
+      "cron_schedule": "0 4  1 * *"
+    },
+    {
+      "import_name": "EIA_International",
+      "curator_emails": [
+        "[email protected]"
+      ],
+      "provenance_url": "https://www.eia.gov/opendata/v1/qb.php?category=0",
+      "provenance_description": "EIA International Energy dataset has country, continent and world-level data.",
+      "scripts": [
+        "process.py --dataset=INTL"
+      ],
+      "import_inputs": [
+        {
+          "template_mcf": "tmp_raw_data/INTL/INTL.tmcf",
+          "cleaned_csv": "tmp_raw_data/INTL/INTL.csv"
+        }
+      ],
+      "cron_schedule": "0 5  1 * *"
+    },
+    {
+      "import_name": "EIA_SEDS",
+      "curator_emails": [
+        "[email protected]"
+      ],
+      "provenance_url": "https://www.eia.gov/opendata/v1/qb.php?category=0",
+      "provenance_description": "EIA SEDS (State Energy Data System) dataset has US country-level and state-level data.",
+      "scripts": [
+        "process.py --dataset=SEDS"
+      ],
+      "import_inputs": [
+        {
+          "template_mcf": "tmp_raw_data/SEDS/SEDS.tmcf",
+          "cleaned_csv": "tmp_raw_data/SEDS/SEDS.csv"
+        }
+      ],
+      "cron_schedule": "0 6  1 * *"
+    },
+    {
+      "import_name": "EIA_TotalEnergy",
+      "curator_emails": [
+        "[email protected]"
+      ],
+      "provenance_url": "https://www.eia.gov/opendata/v1/qb.php?category=0",
+      "provenance_description": "Total Energy dataset has US country-level data.",
+      "scripts": [
+        "process.py --dataset=TOTAL"
+      ],
+      "import_inputs": [
+        {
+          "template_mcf": "tmp_raw_data/TOTAL/TOTAL.tmcf",
+          "cleaned_csv": "tmp_raw_data/TOTAL/TOTAL.csv"
+        }
+      ],
+      "cron_schedule": "20 6  1 * *"
+    }
+  ]
+}