EO Development (draft pull request) #3

Open · wants to merge 26 commits into base: master

Commits (26)
36007ea  Merge remote-tracking branch 'origin/development' into development-eo (annajungbluth, Apr 25, 2024)
0c2b26f  started eo training development (annajungbluth, Apr 25, 2024)
9993839  started testing training pipeline (annajungbluth, Apr 25, 2024)
3330469  wip - tested training pipeline (annajungbluth, Apr 25, 2024)
9071b24  made training pipeline run (annajungbluth, Apr 26, 2024)
d7f2c9d  moved geo dataset and editors into ITI repo (annajungbluth, Apr 26, 2024)
b6983b2  added new editor and modified training script (annajungbluth, Apr 27, 2024)
2879551  modified editor (annajungbluth, Apr 27, 2024)
12ee80e  tested callbacks (annajungbluth, Apr 28, 2024)
cdc4c7c  added normalisation steps to training script, and started writing a n… (lillif, May 13, 2024)
92c26d6  training script for miniset set up, mean std normalisation finished (lillif, May 14, 2024)
a18e68a  normalisation script finished, attempted training (lillif, May 17, 2024)
9d592bb  started hydra training file (annajungbluth, May 19, 2024)
6d24344  added normalization and fixed bugs in training script (annajungbluth, May 20, 2024)
1ddd8d5  merge with master (annajungbluth, Oct 3, 2024)
d7b31d0  fixed small merge bugs and added autoroot file (annajungbluth, Oct 3, 2024)
ad54e8e  Added file with dataset information (annajungbluth, Oct 4, 2024)
72c3f82  fixed goes metrics file (annajungbluth, Oct 18, 2024)
4533b53  updated summary files (annajungbluth, Oct 20, 2024)
2e72ef8  added new normalization routine and started first experiment (annajungbluth, Oct 20, 2024)
7a79a1e  reduced val data (annajungbluth, Oct 20, 2024)
d3a4ad4  optimized dataloader to reduce memory consumption (annajungbluth, Oct 24, 2024)
e2c5675  added normalization files for subset of data (annajungbluth, Oct 31, 2024)
625852a  debugging constant channels (annajungbluth, Oct 31, 2024)
8770404  started new experiment (annajungbluth, Oct 31, 2024)
5b98ee1  added seed to training script (annajungbluth, Nov 1, 2024)
Empty file added .project-root
Empty file.
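The empty .project-root file is the kind of marker usually consumed by a project-root helper (the commit log mentions an "autoroot file"). A minimal, hypothetical sketch, assuming a rootutils-style helper; the package choice and arguments below are not confirmed by this diff:

import rootutils

# Hypothetical sketch: locate the repository root via the empty .project-root marker.
# rootutils and these arguments are assumptions; the PR itself only adds the marker file.
root = rootutils.setup_root(
    __file__,                   # search upward from the calling script
    indicator=".project-root",  # the empty marker added in this PR
    pythonpath=True,            # put the root on sys.path so itipy imports resolve
)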
54 changes: 54 additions & 0 deletions config/example-hydra-config/data.yaml
@@ -0,0 +1,54 @@
A_data:
A_path: null
A_train_dataset:
_target_: iti.data.geo_datasets.GeoDataset # TODO: make specific msg dataset?
data_dir: null
editors: null # TODO: hard code in dataset?
splits_dict:
train:
years: [2020]
months: [10]
days: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
load_coords: False
load_cloudmask: False
A_val_dataset:
_target_: iti.data.geo_datasets.GeoDataset # TODO: make specific msg dataset?
data_dir: null
editors: null # TODO: hard code in dataset?
splits_dict:
train:
years: [2020]
months: [10]
days: [20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]
load_coords: False
load_cloudmask: False
A_plot_settings: null

B_data:
B_path: null
B_train_dataset:
_target_: iti.data.geo_datasets.GeoDataset # TODO: make specific goes dataset?
data_dir: null
editors: null # TODO: hard code in dataset?
splits_dict:
train:
years: [2020]
months: [10]
days: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
load_coords: False
load_cloudmask: False
B_val_dataset:
_target_: iti.data.geo_datasets.GeoDataset # TODO: make specific goes dataset?
data_dir: null
editors: null # TODO: hard code in dataset?
splits_dict:
train:
years: [2020]
months: [10]
days: [20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]
load_coords: False
load_cloudmask: False
B_plot_settings: null

num_workers: 4
iterations_per_epoch: 1000
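For context, a minimal sketch of instantiating the A-side training dataset from this config group; the config path, placeholder values, and the Hydra/OmegaConf calls are illustrative assumptions, not part of the PR:

from hydra.utils import instantiate
from omegaconf import OmegaConf

# Hypothetical usage sketch: fill the nulls left open by the example config
# and build the dataset named by _target_. Paths are placeholders.
cfg = OmegaConf.load("config/example-hydra-config/data.yaml")
cfg.A_data.A_train_dataset.data_dir = ["/path/to/msg/netcdf"]  # placeholder directory
cfg.A_data.A_train_dataset.editors = []                        # no extra editors

train_ds_A = instantiate(cfg.A_data.A_train_dataset)
print(len(train_ds_A))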
9 changes: 9 additions & 0 deletions config/example-hydra-config/model.yaml
@@ -0,0 +1,9 @@
model:
__target__: null
input_dim_a: 11
input_dim_b: 16
upsampling: 0
discriminator_mode: CHANNELS
lambda_diversity: 0
norm: 'none'
use_batch_statistic: False
1 change: 1 addition & 0 deletions config/example-hydra-config/train.yaml
@@ -0,0 +1 @@
base_dir: /home/freischem/outputs/miniset/
6 changes: 6 additions & 0 deletions config/example-hydra-config/wandb.yaml
@@ -0,0 +1,6 @@
experiment_name: null
tags: null
wandb_entity: null
wandb_project: null
wandb_name: null
wandb_id: null
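As a rough sketch of how these four groups (data, model, train, wandb) might be composed: a top-level config with a defaults list and a decorated entry point would typically consume them. The config name, path, and entry point below are assumptions rather than files in this PR:

import hydra
from omegaconf import DictConfig, OmegaConf

# Hypothetical entry point; assumes a top-level config.yaml whose defaults list
# pulls in the data, model, train, and wandb groups shown above.
@hydra.main(version_base=None, config_path="config/example-hydra-config", config_name="config")
def main(cfg: DictConfig) -> None:
    print(OmegaConf.to_yaml(cfg))  # the fully composed config
    print(cfg.model.input_dim_a)   # 11 in the example model.yaml
    print(cfg.base_dir)            # from train.yaml

if __name__ == "__main__":
    main()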
27 changes: 27 additions & 0 deletions config/msg_to_goes.yaml
@@ -0,0 +1,27 @@
base_dir: /home/anna.jungbluth/outputs/msg-to-goes/
data:
A_path: /mnt/disks/eo-data/msg/
# converted_A_path:
B_path: /mnt/disks/eo-data/goes/
# converted_B_path:
num_workers: 4
iterations_per_epoch: 10
patch_size: (256, 256)
skip_constant_channels: False # If True, patches with constant channels will be skipped. Note, this massively slows down training.
model:
input_dim_a: 7 # 11
input_dim_b: 9 # 16
upsampling: 0
discriminator_mode: CHANNELS
lambda_diversity: 0
norm: 'none' # 'in_rs_aff'
use_batch_statistic: False
logging:
wandb_entity: itieo
wandb_project: msg-to-goes
wandb_name: msg-to-goes-infrared-channels
training:
epochs: 100
normalization: # TODO: Change to avoid absolute paths
A_norm_dir: /home/anna.jungbluth/InstrumentToInstrument/dataset/msg_2020_hourly_subset.csv
B_norm_dir: /home/anna.jungbluth/InstrumentToInstrument/dataset/goes_2020_hourly_subset.csv
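One detail worth noting: plain YAML parses "patch_size: (256, 256)" as the string "(256, 256)", so downstream code has to convert it. A minimal sketch, assuming ast.literal_eval handles the conversion (the actual training script may parse it differently):

import ast
from omegaconf import OmegaConf

# Sketch: load the experiment config and recover patch_size as a tuple.
# The conversion via ast.literal_eval is an assumption, not code from this PR.
cfg = OmegaConf.load("config/msg_to_goes.yaml")
patch_size = ast.literal_eval(cfg.data.patch_size)  # "(256, 256)" -> (256, 256)
assert patch_size == (256, 256)
print(cfg.model.input_dim_a, cfg.model.input_dim_b)  # 7 and 9 in this experiment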
8,689 changes: 8,689 additions & 0 deletions dataset/goes_2020_hourly.csv

Large diffs are not rendered by default.

4,345 changes: 4,345 additions & 0 deletions dataset/goes_2020_hourly_subset.csv

Large diffs are not rendered by default.

8,707 changes: 8,707 additions & 0 deletions dataset/msg_2020_hourly.csv

Large diffs are not rendered by default.

4,353 changes: 4,353 additions & 0 deletions dataset/msg_2020_hourly_subset.csv

Large diffs are not rendered by default.

6 changes: 3 additions & 3 deletions itipy/callback.py
@@ -102,7 +102,7 @@ def __init__(self, data, model, plot_settings_A=None, plot_settings_B=None, plot

plot_settings = [*plot_settings_A, *plot_settings_B, *plot_settings_A]

- super().__init__(data, model, path, plot_id, plot_settings, **kwargs)
+ super().__init__(data, model, plot_id, plot_settings, **kwargs)

def predict(self, x):
x_ab, x_aba = self.model.forwardABA(x)
@@ -138,7 +138,7 @@ def __init__(self, data, model, plot_settings_A=None, plot_settings_B=None, plot

plot_settings = [*plot_settings_B, *plot_settings_A, *plot_settings_B]

- super().__init__(data, model, path, plot_id, plot_settings, **kwargs)
+ super().__init__(data, model, plot_id, plot_settings, **kwargs)

def predict(self, x):
x_ba, x_bab = self.model.forwardBAB(x)
@@ -169,7 +169,7 @@ def __init__(self, data, model, plot_settings_A=None, plot_settings_B=None, plot

plot_settings = [*plot_settings_A, *plot_settings_B]

- super().__init__(data, model, path, plot_id, plot_settings, **kwargs)
+ super().__init__(data, model, plot_id, plot_settings, **kwargs)

def predict(self, input_data):
x_ab = self.model.forwardAB(input_data)
149 changes: 149 additions & 0 deletions itipy/data/geo_datasets.py
@@ -0,0 +1,149 @@
from __future__ import annotations
import collections
import collections.abc

# hyper needs the following four aliases to be set manually.
collections.Iterable = collections.abc.Iterable
collections.Mapping = collections.abc.Mapping
collections.MutableSet = collections.abc.MutableSet
collections.MutableMapping = collections.abc.MutableMapping

import logging
import torch
import numpy as np
import xarray as xr
from typing import List, Union, Dict
from loguru import logger

from itipy.data.editor import Editor
from itipy.data.geo_editor import CenterWeightedCropDatasetEditor
from itipy.data.dataset import BaseDataset
from itipy.data.geo_utils import get_split, get_list_filenames, _check_any_constant_channels, _check_all_constant_channels

class GeoDataset(BaseDataset):
def __init__(
self,
data_dir: List[str],
editors: List[Editor],
splits_dict: Dict,
ext: str="nc",
limit: int=None,
load_coords: bool=True,
load_cloudmask: bool=True,
patch_size: tuple[int, int] = (256, 256),
skip_constant_channels: bool = False, # Could be used for filtering out night time observations
**kwargs
):
"""
Initialize the GeoDataset class.

Args:
data_dir (List[str]): A list of directories containing the data files.
editors (List[Editor]): A list of editors for data preprocessing.
splits_dict (Dict): A dictionary specifying the splits (e.g. years, months, days) for the dataset.
ext (str, optional): The file extension of the data files. Defaults to "nc".
limit (int, optional): The maximum number of files to load. Defaults to None.
load_coords (bool, optional): Whether to load the coordinates. Defaults to True.
load_cloudmask (bool, optional): Whether to load the cloud mask. Defaults to True.
patch_size (tuple[int, int], optional): The size of the patches to crop. Defaults to (256, 256).
skip_constant_channels (bool, optional): Whether to skip a patch if any channel is constant. Defaults to False.
**kwargs: Additional keyword arguments.

"""
self.data_dir = data_dir
self.editors = editors
self.splits_dict = splits_dict
self.ext = ext
self.limit = limit
self.load_coords = load_coords
self.load_cloudmask = load_cloudmask
self.patch_size = patch_size
self.skip_constant_channels = skip_constant_channels

self.files = self.get_files()

self.crop = CenterWeightedCropDatasetEditor(patch_shape=self.patch_size)

super().__init__(
data=self.files,
editors=self.editors,
ext=self.ext,
limit=self.limit,
**kwargs
)

def get_files(self):
# Get filenames from data_dir
files = get_list_filenames(data_path=self.data_dir, ext=self.ext)
# split files based on split criteria
files = get_split(files=files, split_dict=self.splits_dict)
return files

def __len__(self):
return len(self.files)

def getIndex(self, data_dict, idx):
# Attempt applying editors
try:
return self.convertData(data_dict)
except Exception as ex:
logging.error('Unable to convert %s: %s' % (self.files[idx], ex))
raise ex

def __getitem__(self, idx):
data_dict = {}

max_attempts = 20
attempts = 1

while attempts <= max_attempts:
if attempts == max_attempts:
raise Exception("Could not load data after %d attempts." % max_attempts)
# Load dataset
ds: xr.Dataset = xr.load_dataset(self.files[idx], engine="netcdf4")
# Crop data before computing
ds = self.crop(ds)
# Extract data
data = ds.Rad.compute().to_numpy()
# Check if all channels are constant -> Always performed
all_constant = _check_all_constant_channels(data)
# Check if any channel is constant -> Only relevant if skip_constant_channels is True
any_constant = _check_any_constant_channels(data)
if all_constant or (self.skip_constant_channels and any_constant):
# Retry loading data
logger.info("Found constant channels in %s. Attempting with other files." % self.files[idx])
idx = np.random.randint(0, len(self.files))
attempts += 1
else:
break

data_dict["data"] = data
del data # Delete data to reduce memory usage
# Extract wavelengths
wavelengths = ds.band_wavelength.compute().to_numpy()
data_dict["wavelengths"] = wavelengths
del wavelengths # Delete data to reduce memory usage

# Extract coordinates
if self.load_coords:
latitude = ds.latitude.compute().to_numpy()
longitude = ds.longitude.compute().to_numpy()
coords = np.stack([latitude, longitude], axis=0)
data_dict["coords"] = coords
del latitude, longitude # Delete data to reduce memory usage
del coords # Delete data to reduce memory usage

# Extract cloud mask
if self.load_cloudmask:
cloud_mask = ds.cloud_mask.compute().to_numpy()
data_dict["cloud_mask"] = cloud_mask
del cloud_mask # Delete data to reduce memory usage

# Delete dataset to reduce memory usage
del ds

# Apply editors
data, _ = self.getIndex(data_dict, idx)
return data
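For reference, a minimal sketch of constructing the new dataset directly; the data directory is a placeholder and the split values simply mirror the example data.yaml above:

from itipy.data.geo_datasets import GeoDataset

# Hypothetical usage sketch; only GeoDataset and its argument names come from the diff above.
splits_dict = {
    "train": {
        "years": [2020],
        "months": [10],
        "days": list(range(20)),  # days 0-19, as in the example data.yaml train split
    }
}

ds = GeoDataset(
    data_dir=["/path/to/msg/netcdf"],  # placeholder; expects NetCDF files with a Rad variable
    editors=[],                        # no additional preprocessing editors
    splits_dict=splits_dict,
    load_coords=False,
    load_cloudmask=False,
    patch_size=(256, 256),
    skip_constant_channels=False,
)
print(len(ds))
sample = ds[0]  # cropped patch after the editor pipeline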


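The constant-channel helpers imported from itipy.data.geo_utils are not shown in this diff; as a loose illustration only, they plausibly reduce to per-channel min/max comparisons along the lines below (the real implementations may differ):

import numpy as np

# Illustrative guesses at the geo_utils helpers used above; treat these as assumptions.
def _check_all_constant_channels(data: np.ndarray) -> bool:
    # True if every channel (axis 0) is spatially constant, e.g. an all-fill patch.
    return bool(np.all(data.min(axis=(1, 2)) == data.max(axis=(1, 2))))

def _check_any_constant_channels(data: np.ndarray) -> bool:
    # True if at least one channel is spatially constant.
    return bool(np.any(data.min(axis=(1, 2)) == data.max(axis=(1, 2))))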