From 46c15ebd50eb943d7e32e4470e150a2a42d5e77a Mon Sep 17 00:00:00 2001
From: Jan Jurgen Griesfeller <jan.griesfeller@met.no>
Date: Wed, 6 Dec 2023 15:49:56 +0100
Subject: [PATCH] read AERONET sun data from URLs (plain or zipped) and add download tests

---
 .../AeronetSunTimeseriesReader.py             | 192 ++++++++++--------
 tests/test_AERONETTimeSeriesReader.py         |  27 ++-
 2 files changed, 135 insertions(+), 84 deletions(-)

diff --git a/src/pyaro_readers/aeronetsunreader/AeronetSunTimeseriesReader.py b/src/pyaro_readers/aeronetsunreader/AeronetSunTimeseriesReader.py
index 3ef9c8e..ce5429b 100644
--- a/src/pyaro_readers/aeronetsunreader/AeronetSunTimeseriesReader.py
+++ b/src/pyaro_readers/aeronetsunreader/AeronetSunTimeseriesReader.py
@@ -9,9 +9,12 @@
     Engine,
 )
 
-# import requests, zipfile, io
+import requests, zipfile, io
 import geocoder
 
+from urllib.parse import urlparse
+
+
 # from tqdm import tqdm
 
 # default URL
@@ -72,87 +75,109 @@ def __init__(
         self._header = []
         _laststatstr = ""
 
-        with open(self._filename, newline="") as csvfile:
-            for _hidx in range(HEADER_LINE_NO - 1):
-                self._header.append(csvfile.readline())
-            # get fields from header line although csv can do that as well since we might want to adjust these names
-            self._fields = csvfile.readline().strip().split(",")
-
-            crd = csv.DictReader(csvfile, fieldnames=self._fields, **csvreader_kwargs)
-            for _ridx, row in enumerate(crd):
-                if row[SITE_NAME] != _laststatstr:
-                    print(f"reading station {row[SITE_NAME]}...")
-                    _laststatstr = row[SITE_NAME]
-                    # new station
-                    station = row[SITE_NAME]
-                    lon = float(row[LON_NAME])
-                    lat = float(row[LAT_NAME])
-                    alt = float(row["Site_Elevation(m)"])
-                    if fill_country_flag:
-                        try:
-                            country = geocoder.osm([lat, lon], method="reverse").json[
-                                "country_code"
-                            ]
-                            country = country.upper()
-                        except:
-                            country = "NN"
-                    else:
-                        country = "NN"
-                    # print(country)
-                    # units of Aeronet data are always 1
-                    units = "1"
-                    if not station in self._stations:
-                        self._stations[station] = Station(
-                            {
-                                "station": station,
-                                "longitude": lon,
-                                "latitude": lat,
-                                "altitude": alt,
-                                "country": country,
-                                "url": "",
-                                "long_name": station,
-                            }
-                        )
-                    # every line contains all variables, sometimes filled with NaNs though
-                    if _ridx == 0:
-                        for variable in DATA_VARS:
-                            if variable in self._data:
-                                da = self._data[variable]
-                                if da.units != units:
-                                    raise Exception(
-                                        f"unit change from '{da.units}' to 'units'"
-                                    )
-                            else:
-                                da = NpStructuredData(variable, units)
-                                self._data[variable] = da
-
-                day, month, year = row[DATE_NAME].split(":")
-                datestring = "-".join([year, month, day])
-                datestring = "T".join([datestring, row[TIME_NAME]])
-                start = np.datetime64(datestring)
-                end = start
-
-                ts_dummy_data = {}
-                for variable in DATA_VARS:
+        # check if file is a URL
+        if self.is_valid_url(self._filename):
+            from urllib.request import urlopen
+            from io import BytesIO
+            from zipfile import ZipFile
+
+            # try to open as zipfile
+            try:
+                r = requests.get(self._filename)
+                zip_ref = ZipFile(BytesIO(r.content))
+                for file in zip_ref.namelist():
+                    with zip_ref.open(file) as response:
+                        lines = [line.decode("utf-8") for line in response.readlines()]
+                    # read only 1st file here
+                    break
+            except:
+                response = urlopen(self._filename)
+                lines = [line.decode("utf-8") for line in response.readlines()]
+
+        else:
+            with open(self._filename, newline="") as csvfile:
+                lines = csvfile.readlines()
+
+        for _hidx in range(HEADER_LINE_NO - 1):
+            self._header.append(lines.pop(0))
+        # get fields from header line although csv can do that as well since we might want to adjust these names
+        self._fields = lines.pop(0).strip().split(",")
+
+        crd = csv.DictReader(lines, fieldnames=self._fields, **csvreader_kwargs)
+        for _ridx, row in enumerate(crd):
+            if row[SITE_NAME] != _laststatstr:
+                print(f"reading station {row[SITE_NAME]}...")
+                _laststatstr = row[SITE_NAME]
+                # new station
+                station = row[SITE_NAME]
+                lon = float(row[LON_NAME])
+                lat = float(row[LAT_NAME])
+                alt = float(row["Site_Elevation(m)"])
+                if fill_country_flag:
                     try:
-                        value = float(row[variable])
-                        if value == NAN_VAL:
-                            value = np.nan
-                        # store value in ts_dummy_data, so we don't need to perform the nan check
-                        # for each component of calculated values again
-                        ts_dummy_data[variable] = value
-                    except KeyError:
-                        # computed variable
-                        if variable == AOD550_NAME:
-                            value = self.compute_od_from_angstromexp(
-                                0.55,
-                                ts_dummy_data[AOD440_NAME],
-                                0.44,
-                                ts_dummy_data[ANG4487_NAME],
-                            )
-                    self._data[variable].append(
-                        value, station, lat, lon, alt, start, end, Flag.VALID, np.nan
+                        country = geocoder.osm([lat, lon], method="reverse").json[
+                            "country_code"
+                        ]
+                        country = country.upper()
+                    except:
+                        country = "NN"
+                else:
+                    country = "NN"
+
+                # units of Aeronet data are always 1
+                units = "1"
+                if not station in self._stations:
+                    self._stations[station] = Station(
+                        {
+                            "station": station,
+                            "longitude": lon,
+                            "latitude": lat,
+                            "altitude": alt,
+                            "country": country,
+                            "url": "",
+                            "long_name": station,
+                        }
                     )
+                # every line contains all variables, sometimes filled with NaNs though
+                if _ridx == 0:
+                    for variable in DATA_VARS:
+                        if variable in self._data:
+                            da = self._data[variable]
+                            if da.units != units:
+                                raise Exception(
+                                    f"unit change from '{da.units}' to 'units'"
+                                )
+                        else:
+                            da = NpStructuredData(variable, units)
+                            self._data[variable] = da
+
+            day, month, year = row[DATE_NAME].split(":")
+            datestring = "-".join([year, month, day])
+            datestring = "T".join([datestring, row[TIME_NAME]])
+            start = np.datetime64(datestring)
+            end = start
+
+            ts_dummy_data = {}
+            for variable in DATA_VARS:
+                try:
+                    value = float(row[variable])
+                    if value == NAN_VAL:
+                        value = np.nan
+                    # store value in ts_dummy_data, so we don't need to perform the nan check
+                    # for each component of calculated values again
+                    ts_dummy_data[variable] = value
+                except KeyError:
+                    # computed variable
+                    if variable == AOD550_NAME:
+                        value = self.compute_od_from_angstromexp(
+                            0.55,
+                            ts_dummy_data[AOD440_NAME],
+                            0.44,
+                            ts_dummy_data[ANG4487_NAME],
+                        )
+                self._data[variable].append(
+                    value, station, lat, lon, alt, start, end, Flag.VALID, np.nan
+                )
 
     def _unfiltered_data(self, varname) -> Data:
         return self._data[varname]
@@ -207,6 +232,23 @@ def calc_angstroem_coeff(
         """
         return -np.log(od1 / od2) / np.log(wl1 / wl2)
 
+    def is_valid_url(self, url):
+        """Return True if url parses as a URL with scheme and host.
+
+        Plain local file names yield False. Objects that are neither str
+        nor bytes (e.g. pathlib.Path) are also reported as False, because
+        urlparse() cannot handle them.
+        """
+        if not isinstance(url, (str, bytes)):
+            # urlparse() would raise AttributeError here, not ValueError
+            return False
+        try:
+            result = urlparse(url)
+            return all([result.scheme, result.netloc])
+        except ValueError:
+            # raised by urlparse() for malformed input, e.g. a bad IPv6 host
+            return False
+
 
 class AeronetSunTimeseriesEngine(Engine):
     def open(self, filename, *args, **kwargs) -> AeronetSunTimeseriesReader:
diff --git a/tests/test_AERONETTimeSeriesReader.py b/tests/test_AERONETTimeSeriesReader.py
index 4c025fb..2b4f242 100644
--- a/tests/test_AERONETTimeSeriesReader.py
+++ b/tests/test_AERONETTimeSeriesReader.py
@@ -1,11 +1,15 @@
 import unittest
 import os
 
-import numpy as np
 import pyaro
 import pyaro.timeseries
 from pyaro.timeseries.Wrappers import VariableNameChangingReader
 
+TEST_URL = "https://pyaerocom.met.no/pyaro-suppl/testdata/aeronetsun_testdata.csv"
+TEST_ZIP_URL = (
+    "https://pyaerocom.met.no/pyaro-suppl/testdata/aeronetsun_testdata.csv.zip"
+)
+
 
 class TestAERONETTimeSeriesReader(unittest.TestCase):
     file = os.path.join(
@@ -14,6 +18,24 @@ class TestAERONETTimeSeriesReader(unittest.TestCase):
         "aeronetsun_testdata.csv",
     )
 
+    def test_dl_data_unzipped(self):
+        # fetch the plain csv test file over http and check what was read
+        reader_engine = pyaro.list_timeseries_engines()["aeronetsunreader"]
+        opened = reader_engine.open(TEST_URL, filters=[], fill_country_flag=False)
+        with opened as ts:
+            total = sum(len(ts.data(name)) for name in ts.variables())
+            self.assertEqual(total, 49965)
+            self.assertEqual(len(ts.stations()), 4)
+
+    def test_dl_data_zipped(self):
+        # fetch the zipped csv test file over http and check what was read
+        reader_engine = pyaro.list_timeseries_engines()["aeronetsunreader"]
+        opened = reader_engine.open(TEST_ZIP_URL, filters=[], fill_country_flag=False)
+        with opened as ts:
+            total = sum(len(ts.data(name)) for name in ts.variables())
+            self.assertEqual(total, 49965)
+            self.assertEqual(len(ts.stations()), 4)
+
     def test_init(self):
         engine = pyaro.list_timeseries_engines()["aeronetsunreader"]
         self.assertEqual(engine.url(), "https://github.com/metno/pyaro-readers")
@@ -55,9 +77,6 @@ def test_variables_filter(self):
         with engine.open(self.file, filters=[vfilter]) as ts:
             self.assertEqual(ts.data(new_var_name).variable, new_var_name)
 
-    def test_downloaded_file(self):
-        pass
-
 
 if __name__ == "__main__":
     unittest.main()