From f318d4066ca713511925e6812713593478719c69 Mon Sep 17 00:00:00 2001 From: Jan Griesfeller Date: Tue, 19 Dec 2023 09:51:57 +0100 Subject: [PATCH] start reading tar file as input --- .../AeronetSdaTimeseriesReader.py | 24 +++++++++++++++++++ tests/test_AERONETSDATimeSeriesReader.py | 21 +++++++++++++++- 2 files changed, 44 insertions(+), 1 deletion(-) diff --git a/src/pyaro_readers/aeronetsdareader/AeronetSdaTimeseriesReader.py b/src/pyaro_readers/aeronetsdareader/AeronetSdaTimeseriesReader.py index abad4b4..5eb84db 100644 --- a/src/pyaro_readers/aeronetsdareader/AeronetSdaTimeseriesReader.py +++ b/src/pyaro_readers/aeronetsdareader/AeronetSdaTimeseriesReader.py @@ -10,6 +10,7 @@ ) import numpy as np import requests +import tarfile from pyaro.timeseries import ( AutoFilterReaderEngine, Data, @@ -101,6 +102,29 @@ def __init__( # read only 1st file here break except BadZipFile: + # try reading as tar.gz file + # Aeronet's tar files differ from the zip files by providing one file per station instead of one file + # with all stations + # the general format of the data is the same though. 
+ # so we just keep the header lines of the 1st station, and add all data lines of all stations + # That way we get to the same file format as the zip file + r = requests.get(self._filename) + with tarfile.open(fileobj=BytesIO(r.content), mode="r") as tf: + lines = [] + for _midx, member in enumerate(tf.getmembers()): + f = tf.extractfile(member) + if _midx == 0: + lines.extend( + [line.decode("utf-8") for line in f.readlines()] + ) + else: + # skip the header lines + for _hidx in range(HEADER_LINE_NO): + f.readline() + + lines.extend([line.decode("utf-8") for line in f.readlines()]) + except tarfile.TarError: + # read as text file response = urlopen(self._filename) lines = [line.decode("utf-8") for line in response.readlines()] diff --git a/tests/test_AERONETSDATimeSeriesReader.py b/tests/test_AERONETSDATimeSeriesReader.py index 14f8e84..9a67acb 100644 --- a/tests/test_AERONETSDATimeSeriesReader.py +++ b/tests/test_AERONETSDATimeSeriesReader.py @@ -7,6 +7,9 @@ from pyaro.timeseries.Wrappers import VariableNameChangingReader TEST_URL = "https://pyaerocom.met.no/pyaro-suppl/testdata/aeronetsda_testdata.csv" +TEST_TAR_URL = ( + "https://pyaerocom.met.no/pyaro-suppl/testdata/SDA_Level20_Daily_V3_testdata.tar.gz" +) TEST_ZIP_URL = ( "https://pyaerocom.met.no/pyaro-suppl/testdata/aeronetsda_testdata.csv.zip" ) @@ -29,6 +32,22 @@ def external_resource_available(self, url): except: return False + def test_dl_data_tared(self): + if not self.external_resource_available(TEST_TAR_URL): + self.skipTest(f"external resource not available: {TEST_TAR_URL}") + engine = pyaro.list_timeseries_engines()["aeronetsdareader"] + with engine.open( + TEST_TAR_URL, + filters=[], + fill_country_flag=False, + tqdm_desc="test_sda_dl_data_tared", + ) as ts: + count = 0 + for var in ts.variables(): + count += len(ts.data(var)) + self.assertEqual(count, 79944) + self.assertEqual(len(ts.stations()), 4) + def test_dl_data_unzipped(self): if not self.external_resource_available(TEST_URL): 
self.skipTest(f"external resource not available: {TEST_URL}") @@ -121,7 +140,7 @@ def test_variables_filter(self): engine = pyaro.list_timeseries_engines()["aeronetsdareader"] new_var_name = "od550gt1aer" vfilter = pyaro.timeseries.filters.get( - "variables", reader_to_new={"Coarse_Mode_AOD_500nm[tau_c]": new_var_name} + "variables", reader_to_new={"AODGT1_550nm": new_var_name} ) with engine.open( self.file, filters=[vfilter], tqdm_desc="test_sda_variables_filter"