Skip to content

Commit

Permalink
Fix: Resolved issue in GSOD_BY_YEAR where files with 402 errors broke the process (#783)
Browse files Browse the repository at this point in the history
* Fix: Resolved an issue in GSOD_BY_YEAR where the contents of files that returned 402 errors were being written to the source file, breaking the process.

* fix: Modify sequence of pipelines in DAG.
  • Loading branch information
nlarge-google authored Aug 29, 2024
1 parent 2f81461 commit 08b1529
Show file tree
Hide file tree
Showing 3 changed files with 15 additions and 8 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -641,6 +641,8 @@ def run_gsod_by_year(
if ((file_ptr % 100) == 0) or (file_ptr == file_cnt):
logging.info(f"Appended {file_ptr} files of total {file_cnt} files")
file_ptr += 1
# Remove bad file data: rewrite the source file in place, keeping only lines that begin with a double quote.
os.system(f"sed -ni '/^\"/p' {source_file}")
if number_of_header_rows > 0:
remove_header_rows(source_file, number_of_header_rows=number_of_header_rows)
else:
Expand Down
19 changes: 12 additions & 7 deletions datasets/noaa/pipelines/noaa/noaa_dag.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@
image="{{ var.json.noaa.container_registry.run_csv_transform_kub }}",
env_vars={
"PIPELINE_NAME": "GHCND by year",
"SOURCE_URL": '{\n "ghcnd_by_year": "ftp://ftp.ncei.noaa.gov/pub/data/ghcn/daily/by_year/.csv.gz"\n}',
"SOURCE_URL": '{\n "ghcnd_by_year": "http://www.ncei.noaa.gov/pub/data/ghcn/daily/by_year/.csv.gz"\n}',
"SOURCE_FILE": "files/data_ghcnd_by_year.csv",
"TARGET_FILE": "files/data_output_ghcnd_by_year.csv",
"CHUNKSIZE": "750000",
Expand Down Expand Up @@ -829,8 +829,6 @@

(
create_cluster
>> [ghcnd_by_year, lightning_strikes_by_year]
>> ghcnd_hurricanes
>> [
ghcnd_inventory,
spc_hail,
Expand All @@ -840,12 +838,19 @@
ghcnd_stations,
gsod_stations,
ghcnd_countries,
noaa_goes16_mcmip,
noaa_goes16_cmip,
noaa_goes16_glm,
noaa_goes16_radiance,
noaa_goes17_mcmip,
noaa_goes17_cmip,
noaa_goes17_glm,
noaa_goes17_radiance,
]
>> ghcnd_hurricanes
>> ghcnd_by_year
>> lightning_strikes_by_year
>> noaa_ghcn_m
>> [noaa_goes16_mcmip, noaa_goes16_cmip, noaa_goes16_glm]
>> noaa_goes16_radiance
>> [noaa_goes17_mcmip, noaa_goes17_cmip, noaa_goes17_glm]
>> noaa_goes17_radiance
>> storms_database_by_year
>> noaa_gsod_by_year
>> delete_cluster
Expand Down
2 changes: 1 addition & 1 deletion datasets/noaa/pipelines/noaa/pipeline.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2993,4 +2993,4 @@ dag:
name: noaa

graph_paths:
- "create_cluster >> [ghcnd_by_year, lightning_strikes_by_year ] >> ghcnd_hurricanes >> [ ghcnd_inventory, spc_hail, spc_wind, spc_tornado, ghcnd_states, ghcnd_stations, gsod_stations, ghcnd_countries] >> noaa_ghcn_m >> [noaa_goes16_mcmip, noaa_goes16_cmip, noaa_goes16_glm] >> noaa_goes16_radiance >> [noaa_goes17_mcmip, noaa_goes17_cmip, noaa_goes17_glm] >> noaa_goes17_radiance >> storms_database_by_year >> noaa_gsod_by_year >> delete_cluster"
- "create_cluster >> [ ghcnd_inventory, spc_hail, spc_wind, spc_tornado, ghcnd_states, ghcnd_stations, gsod_stations, ghcnd_countries, noaa_goes16_mcmip, noaa_goes16_cmip, noaa_goes16_glm, noaa_goes16_radiance, noaa_goes17_mcmip, noaa_goes17_cmip, noaa_goes17_glm, noaa_goes17_radiance] >> ghcnd_hurricanes >> ghcnd_by_year >> lightning_strikes_by_year >> noaa_ghcn_m >> storms_database_by_year >> noaa_gsod_by_year >> delete_cluster"

0 comments on commit 08b1529

Please sign in to comment.