diff --git a/datasets/noaa/pipelines/_images/run_csv_transform_kub/csv_transform.py b/datasets/noaa/pipelines/_images/run_csv_transform_kub/csv_transform.py index f616a9434..c71725fdc 100644 --- a/datasets/noaa/pipelines/_images/run_csv_transform_kub/csv_transform.py +++ b/datasets/noaa/pipelines/_images/run_csv_transform_kub/csv_transform.py @@ -641,6 +641,8 @@ def run_gsod_by_year( if ((file_ptr % 100) == 0) or (file_ptr == file_cnt): logging.info(f"Appended {file_ptr} files of total {file_cnt} files") file_ptr += 1 + # Remove bad file data + os.system(f"sed -ni '/^\"/p' {source_file}") if number_of_header_rows > 0: remove_header_rows(source_file, number_of_header_rows=number_of_header_rows) else: diff --git a/datasets/noaa/pipelines/noaa/noaa_dag.py b/datasets/noaa/pipelines/noaa/noaa_dag.py index c2652ec48..33b8f283d 100644 --- a/datasets/noaa/pipelines/noaa/noaa_dag.py +++ b/datasets/noaa/pipelines/noaa/noaa_dag.py @@ -61,7 +61,7 @@ image="{{ var.json.noaa.container_registry.run_csv_transform_kub }}", env_vars={ "PIPELINE_NAME": "GHCND by year", - "SOURCE_URL": '{\n "ghcnd_by_year": "ftp://ftp.ncei.noaa.gov/pub/data/ghcn/daily/by_year/.csv.gz"\n}', + "SOURCE_URL": '{\n "ghcnd_by_year": "http://www.ncei.noaa.gov/pub/data/ghcn/daily/by_year/.csv.gz"\n}', "SOURCE_FILE": "files/data_ghcnd_by_year.csv", "TARGET_FILE": "files/data_output_ghcnd_by_year.csv", "CHUNKSIZE": "750000", @@ -829,8 +829,6 @@ ( create_cluster - >> [ghcnd_by_year, lightning_strikes_by_year] - >> ghcnd_hurricanes >> [ ghcnd_inventory, spc_hail, @@ -840,12 +838,19 @@ ghcnd_stations, gsod_stations, ghcnd_countries, + noaa_goes16_mcmip, + noaa_goes16_cmip, + noaa_goes16_glm, + noaa_goes16_radiance, + noaa_goes17_mcmip, + noaa_goes17_cmip, + noaa_goes17_glm, + noaa_goes17_radiance, ] + >> ghcnd_hurricanes + >> ghcnd_by_year + >> lightning_strikes_by_year >> noaa_ghcn_m - >> [noaa_goes16_mcmip, noaa_goes16_cmip, noaa_goes16_glm] - >> noaa_goes16_radiance - >> [noaa_goes17_mcmip, noaa_goes17_cmip, noaa_goes17_glm] - >> noaa_goes17_radiance >> storms_database_by_year >> noaa_gsod_by_year >> delete_cluster diff --git a/datasets/noaa/pipelines/noaa/pipeline.yaml b/datasets/noaa/pipelines/noaa/pipeline.yaml index c2d0b86b5..c14a1dc29 100644 --- a/datasets/noaa/pipelines/noaa/pipeline.yaml +++ b/datasets/noaa/pipelines/noaa/pipeline.yaml @@ -2993,4 +2993,4 @@ dag: name: noaa graph_paths: - - "create_cluster >> [ghcnd_by_year, lightning_strikes_by_year ] >> ghcnd_hurricanes >> [ ghcnd_inventory, spc_hail, spc_wind, spc_tornado, ghcnd_states, ghcnd_stations, gsod_stations, ghcnd_countries] >> noaa_ghcn_m >> [noaa_goes16_mcmip, noaa_goes16_cmip, noaa_goes16_glm] >> noaa_goes16_radiance >> [noaa_goes17_mcmip, noaa_goes17_cmip, noaa_goes17_glm] >> noaa_goes17_radiance >> storms_database_by_year >> noaa_gsod_by_year >> delete_cluster" + - "create_cluster >> [ ghcnd_inventory, spc_hail, spc_wind, spc_tornado, ghcnd_states, ghcnd_stations, gsod_stations, ghcnd_countries, noaa_goes16_mcmip, noaa_goes16_cmip, noaa_goes16_glm, noaa_goes16_radiance, noaa_goes17_mcmip, noaa_goes17_cmip, noaa_goes17_glm, noaa_goes17_radiance] >> ghcnd_hurricanes >> ghcnd_by_year >> lightning_strikes_by_year >> noaa_ghcn_m >> storms_database_by_year >> noaa_gsod_by_year >> delete_cluster"