Skip to content

Commit

Permalink
Merge pull request #13 from OCHA-DAP/exploration-notebooks
Browse files Browse the repository at this point in the history
Add exploration notebooks
  • Loading branch information
hannahker authored Nov 15, 2024
2 parents 8c2c576 + ea20a2c commit e621481
Show file tree
Hide file tree
Showing 2 changed files with 365 additions and 0 deletions.
102 changes: 102 additions & 0 deletions exploration/test_grid_alignment.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
---
jupyter:
jupytext:
text_representation:
extension: .md
format_name: markdown
format_version: '1.3'
jupytext_version: 1.16.3
kernelspec:
display_name: venv
language: python
name: python3
---

## Grid alignment investigation

This notebook investigates the source of some grid misalignment identified during preparation of the source raster data. We're looking at a sample SEAS5 COG and demonstrating how the order of clipping and resampling operations can impact the output raster stats.

Update: After further investigation, see [this comment](https://github.com/OCHA-DAP/ds-raster-stats/pull/13#issuecomment-2463353673) for a potential solution.

```python
from src.utils.raster_utils import upsample_raster
from src.utils.cog_utils import get_cog_url

import rioxarray as rxr
import geopandas as gpd
from rasterstats import zonal_stats
import numpy as np
import pandas as pd

MODE = "prod"
date = "2024-01-01"
```

#### 1. Read in source raster and geopandas dataframe

```python
# Or replace with other locally saved shapefile
gdf = gpd.read_file("data/eth_adm2_simplified.shp")
minx, miny, maxx, maxy = gdf.total_bounds

cog_name = f"seas5/monthly/processed/precip_em_i{date}_lt0.tif"
cog_url = get_cog_url(MODE, cog_name)
da_seas5 = rxr.open_rasterio(cog_url, chunks="auto")
```

#### Option A: 1) clip raster, then 2) upsample to 0.05 degree resolution

```python
da_1 = da_seas5.sel(x=slice(minx, maxx), y=slice(maxy, miny)).persist()
da_1 = upsample_raster(da_1)

print(da_1.rio.transform())
print(da_1.rio.resolution())
```

#### Option B: 1) Upsample to 0.05 degrees, then 2) clip raster

```python
da_2 = upsample_raster(da_seas5)
da_2 = da_2.sel(x=slice(minx, maxx), y=slice(maxy, miny)).persist()

print(da_2.rio.transform())
print(da_2.rio.resolution())
```

#### Now compare output zonal stats from both `da_1` and `da_2`

```python
stats = ["mean", "median", "min", "max", "count"]

da_1_results = zonal_stats(
vectors=gdf[["geometry"]],
raster=da_1.values[0],
affine=da_1.rio.transform(),
nodata=np.nan,
all_touched=False,
stats=stats,
)

da_2_results = zonal_stats(
vectors=gdf[["geometry"]],
raster=da_2.values[0],
affine=da_2.rio.transform(),
nodata=np.nan,
all_touched=False,
stats=stats,
)

df_1 = pd.DataFrame(da_1_results)
df_2 = pd.DataFrame(da_2_results)
```

We can see different results between both dataframes...

```python
df_1
```

```python
df_2
```
263 changes: 263 additions & 0 deletions exploration/validate_outputs.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,263 @@
---
jupyter:
jupytext:
text_representation:
extension: .md
format_name: markdown
format_version: '1.3'
jupytext_version: 1.16.3
kernelspec:
display_name: venv
language: python
name: python3
---

## Validate output stats

In this notebook we're taking a look at our output raster stats and comparing against the outputs from other raster stats calculation functions. We would expect our outputs to be the same as the `zonal_stats` outputs, and see some moderate differences with the `exactextract` outputs.

```python
%load_ext jupyter_black
%load_ext autoreload
%autoreload 2
```

```python
from exactextract import exact_extract
from rasterstats import zonal_stats
from dotenv import load_dotenv
from sqlalchemy import create_engine, text
import os
import pandas as pd
import geopandas as gpd
import rioxarray as rxr
import numpy as np

# os.chdir("..")
from src.utils.cloud_utils import get_cog_url
from src.utils.raster_utils import upsample_raster, prep_raster

load_dotenv()
```

```python
AZURE_DB_PW_PROD = os.getenv("AZURE_DB_PW_PROD")
MODE = "prod"

adm_level = 2
iso3 = "AFG"
date = "2024-01-01"
stats = ["mean", "median", "min", "max"]
engine = create_engine(
f"postgresql+psycopg2://chdadmin:{AZURE_DB_PW_PROD}@chd-rasterstats-prod.postgres.database.azure.com/postgres"
)
gdf = gpd.read_file(f"data/{iso3.lower()}/{iso3.lower()}_adm{adm_level}.shp")
gdf["geometry"] = gdf["geometry"].simplify(tolerance=0.001, preserve_topology=True)
```

## 1. ERA5


Read in the ERA5 COG

```python
# Read in the ERA5 COG
cog_name = f"era5/monthly/processed/precip_reanalysis_v{date}.tif"
cog_url = get_cog_url(MODE, cog_name)
da_era5 = rxr.open_rasterio(cog_url, chunks="auto")
da_era5_upsampled = prep_raster(da_era5, gdf)

# da_era5.rio.to_raster("da_era5.tif")
# da_era5_upsampled.rio.to_raster("da_era5_upsampled.tif")

coords_transform = da_era5_upsampled.rio.set_spatial_dims(
x_dim="x", y_dim="y"
).rio.transform()
```

#### 1.1: Rasterstats in database

```python
with engine.connect() as con:
query = text(
f"SELECT * FROM public.era5 WHERE iso3='{iso3}' AND adm_level='{adm_level}' AND valid_date='{date}'"
)
df_db = pd.read_sql_query(query, con)
df_db.rename(columns={"pcode": f"ADM{adm_level}_PCODE"}, inplace=True)
df_db.columns = [
f"{col}_db" if col != f"ADM{adm_level}_PCODE" else col for col in df_db.columns
]
```

#### 1.2: Calculate comparisons using `exactextract` and `rasterstats` packages

```python
df_ee = exact_extract(
da_era5,
gdf,
stats,
output="pandas",
include_cols=f"ADM{adm_level}_PCODE",
).sort_values(f"ADM{adm_level}_PCODE")

df_ee.columns = [
f"{col}_ee" if col != f"ADM{adm_level}_PCODE" else col for col in df_ee.columns
]

results_zs = zonal_stats(
vectors=gdf[["geometry"]],
raster=da_era5_upsampled.values[0],
affine=coords_transform,
nodata=np.nan,
all_touched=False,
stats=stats,
)
df_zs = (
pd.DataFrame.from_dict(results_zs)
.join(gdf)[stats + [f"ADM{adm_level}_PCODE"]]
.sort_values(f"ADM{adm_level}_PCODE")
)
df_zs.columns = [
f"{col}_zs" if col != f"ADM{adm_level}_PCODE" else col for col in df_zs.columns
]

assert len(df_zs) == len(df_ee) == len(df_db)
```

#### 1.3 Compare using percent differences

```python
df_comparison = df_db.merge(df_ee, on=f"ADM{adm_level}_PCODE").merge(
df_zs, on=f"ADM{adm_level}_PCODE"
)

for stat in stats:
df_comparison[f"{stat}_diff_db_ee"] = abs(
(df_comparison[f"{stat}_ee"] - df_comparison[f"{stat}_db"])
/ ((df_comparison[f"{stat}_ee"] + df_comparison[f"{stat}_db"]) / 2)
* 100
).round(4)

df_comparison[f"{stat}_diff_db_zs"] = abs(
(df_comparison[f"{stat}_zs"] - df_comparison[f"{stat}_db"])
/ ((df_comparison[f"{stat}_zs"] + df_comparison[f"{stat}_db"]) / 2)
* 100
).round(4)
```

#### 1.4 Look for any values that are > 5% absolute difference

```python
threshold = 5

diff_cols = [col for col in df_comparison.columns if "_diff_" in col]
significant_diffs = df_comparison[diff_cols].abs().gt(threshold).any(axis=1)
significant_rows = df_comparison[significant_diffs].copy()
max_diffs = significant_rows[diff_cols].abs().max(axis=1)
significant_rows = significant_rows.assign(max_difference=max_diffs)
significant_rows = significant_rows.sort_values("max_difference", ascending=False)

significant_rows
```

# SEAS5

#### Read in the SEAS5 COG

```python
cog_name = f"seas5/monthly/processed/precip_em_i{date}_lt0.tif"
cog_url = get_cog_url(MODE, cog_name)
da_seas5 = rxr.open_rasterio(cog_url, chunks="auto")
da_seas5_upsampled = prep_raster(da_seas5, gdf)

# da_seas5_upsampled.rio.to_raster("da_seas5_upsampled.tif")

coords_transform = da_seas5_upsampled.rio.set_spatial_dims(
x_dim="x", y_dim="y"
).rio.transform()
```

#### 2.1 Rasterstats in database

```python
with engine.connect() as con:
query = text(
f"SELECT * FROM public.seas5 WHERE iso3='{iso3}' AND adm_level='{adm_level}' AND valid_date='{date}' and leadtime=0"
)
df_db = pd.read_sql_query(query, con)
df_db.rename(columns={"pcode": f"ADM{adm_level}_PCODE"}, inplace=True)
df_db.columns = [
f"{col}_db" if col != f"ADM{adm_level}_PCODE" else col for col in df_db.columns
]
```

#### 2.2 Calculate comparisons using `exactextract` and `rasterstats` packages

```python
df_ee = exact_extract(
da_seas5,
gdf,
stats,
output="pandas",
include_cols=f"ADM{adm_level}_PCODE",
).sort_values(f"ADM{adm_level}_PCODE")

df_ee.columns = [
f"{col}_ee" if col != f"ADM{adm_level}_PCODE" else col for col in df_ee.columns
]

results_zs = zonal_stats(
vectors=gdf[["geometry"]],
raster=da_seas5_upsampled.values[0],
affine=coords_transform,
nodata=np.nan,
all_touched=False,
stats=stats,
)
df_zs = (
pd.DataFrame.from_dict(results_zs)
.join(gdf)[stats + [f"ADM{adm_level}_PCODE"]]
.sort_values(f"ADM{adm_level}_PCODE")
)
df_zs.columns = [
f"{col}_zs" if col != f"ADM{adm_level}_PCODE" else col for col in df_zs.columns
]

assert len(df_zs) == len(df_ee) == len(df_db)
```

#### 2.3 Compare using percent differences

```python
df_comparison = df_db.merge(df_ee, on=f"ADM{adm_level}_PCODE").merge(
df_zs, on=f"ADM{adm_level}_PCODE"
)

for stat in stats:
df_comparison[f"{stat}_diff_db_ee"] = abs(
(df_comparison[f"{stat}_ee"] - df_comparison[f"{stat}_db"])
/ ((df_comparison[f"{stat}_ee"] + df_comparison[f"{stat}_db"]) / 2)
* 100
).round(4)

df_comparison[f"{stat}_diff_db_zs"] = abs(
(df_comparison[f"{stat}_zs"] - df_comparison[f"{stat}_db"])
/ ((df_comparison[f"{stat}_zs"] + df_comparison[f"{stat}_db"]) / 2)
* 100
).round(4)
```

#### 2.4 Look for any values that are > 5% absolute difference

```python
threshold = 5

diff_cols = [col for col in df_comparison.columns if "_diff_" in col]
significant_diffs = df_comparison[diff_cols].abs().gt(threshold).any(axis=1)
significant_rows = df_comparison[significant_diffs].copy()
max_diffs = significant_rows[diff_cols].abs().max(axis=1)
significant_rows = significant_rows.assign(max_difference=max_diffs)
significant_rows = significant_rows.sort_values("max_difference", ascending=False)

significant_rows
```

0 comments on commit e621481

Please sign in to comment.