-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #13 from OCHA-DAP/exploration-notebooks
Add exploration notebooks
- Loading branch information
Showing
2 changed files
with
365 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,102 @@ | ||
--- | ||
jupyter: | ||
jupytext: | ||
text_representation: | ||
extension: .md | ||
format_name: markdown | ||
format_version: '1.3' | ||
jupytext_version: 1.16.3 | ||
kernelspec: | ||
display_name: venv | ||
language: python | ||
name: python3 | ||
--- | ||
|
||
## Grid alignment investigation | ||
|
||
This notebook investigates the source of some grid misalignment identified during preparation of the source raster data. We're looking at a sample SEAS5 COG and demonstrating how the order of clipping and resampling operations can impact the output raster stats. | ||
|
||
Update: After further investigation, see [this comment](https://github.com/OCHA-DAP/ds-raster-stats/pull/13#issuecomment-2463353673) for a potential solution. | ||
|
||
```python | ||
from src.utils.raster_utils import upsample_raster | ||
from src.utils.cog_utils import get_cog_url | ||
|
||
import rioxarray as rxr | ||
import geopandas as gpd | ||
from rasterstats import zonal_stats | ||
import numpy as np | ||
import pandas as pd | ||
|
||
MODE = "prod" | ||
date = "2024-01-01" | ||
``` | ||
|
||
#### 1. Read in source raster and geopandas dataframe | ||
|
||
```python | ||
# Or replace with other locally saved shapefile | ||
gdf = gpd.read_file("data/eth_adm2_simplified.shp") | ||
minx, miny, maxx, maxy = gdf.total_bounds | ||
|
||
cog_name = f"seas5/monthly/processed/precip_em_i{date}_lt0.tif" | ||
cog_url = get_cog_url(MODE, cog_name) | ||
da_seas5 = rxr.open_rasterio(cog_url, chunks="auto") | ||
``` | ||
|
||
#### Option A: 1) clip raster, then 2) upsample to 0.05 degree resolution | ||
|
||
```python | ||
da_1 = da_seas5.sel(x=slice(minx, maxx), y=slice(maxy, miny)).persist() | ||
da_1 = upsample_raster(da_1) | ||
|
||
print(da_1.rio.transform()) | ||
print(da_1.rio.resolution()) | ||
``` | ||
|
||
#### Option B: 1) Upsample to 0.05 degrees, then 2) clip raster | ||
|
||
```python | ||
da_2 = upsample_raster(da_seas5) | ||
da_2 = da_2.sel(x=slice(minx, maxx), y=slice(maxy, miny)).persist() | ||
|
||
print(da_2.rio.transform()) | ||
print(da_2.rio.resolution()) | ||
``` | ||
|
||
#### Now compare output zonal stats from both `da_1` and `da_2` | ||
|
||
```python | ||
stats = ["mean", "median", "min", "max", "count"] | ||
|
||
da_1_results = zonal_stats( | ||
vectors=gdf[["geometry"]], | ||
raster=da_1.values[0], | ||
affine=da_1.rio.transform(), | ||
nodata=np.nan, | ||
all_touched=False, | ||
stats=stats, | ||
) | ||
|
||
da_2_results = zonal_stats( | ||
vectors=gdf[["geometry"]], | ||
raster=da_2.values[0], | ||
affine=da_2.rio.transform(), | ||
nodata=np.nan, | ||
all_touched=False, | ||
stats=stats, | ||
) | ||
|
||
df_1 = pd.DataFrame(da_1_results) | ||
df_2 = pd.DataFrame(da_2_results) | ||
``` | ||
|
||
We can see different results between both dataframes... | ||
|
||
```python | ||
df_1 | ||
``` | ||
|
||
```python | ||
df_2 | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,263 @@ | ||
--- | ||
jupyter: | ||
jupytext: | ||
text_representation: | ||
extension: .md | ||
format_name: markdown | ||
format_version: '1.3' | ||
jupytext_version: 1.16.3 | ||
kernelspec: | ||
display_name: venv | ||
language: python | ||
name: python3 | ||
--- | ||
|
||
## Validate output stats | ||
|
||
In this notebook we're taking a look at our output raster stats and comparing against the outputs from other raster stats calculation functions. We would expect our outputs to be the same as the `zonal_stats` outputs, and see some moderate differences with the `exactextract` outputs. | ||
|
||
```python | ||
%load_ext jupyter_black | ||
%load_ext autoreload | ||
%autoreload 2 | ||
``` | ||
|
||
```python | ||
from exactextract import exact_extract | ||
from rasterstats import zonal_stats | ||
from dotenv import load_dotenv | ||
from sqlalchemy import create_engine, text | ||
import os | ||
import pandas as pd | ||
import geopandas as gpd | ||
import rioxarray as rxr | ||
import numpy as np | ||
|
||
# os.chdir("..") | ||
from src.utils.cloud_utils import get_cog_url | ||
from src.utils.raster_utils import upsample_raster, prep_raster | ||
|
||
load_dotenv() | ||
``` | ||
|
||
```python | ||
AZURE_DB_PW_PROD = os.getenv("AZURE_DB_PW_PROD") | ||
MODE = "prod" | ||
|
||
adm_level = 2 | ||
iso3 = "AFG" | ||
date = "2024-01-01" | ||
stats = ["mean", "median", "min", "max"] | ||
engine = create_engine( | ||
f"postgresql+psycopg2://chdadmin:{AZURE_DB_PW_PROD}@chd-rasterstats-prod.postgres.database.azure.com/postgres" | ||
) | ||
gdf = gpd.read_file(f"data/{iso3.lower()}/{iso3.lower()}_adm{adm_level}.shp") | ||
gdf["geometry"] = gdf["geometry"].simplify(tolerance=0.001, preserve_topology=True) | ||
``` | ||
|
||
## 1. ERA5 | ||
|
||
|
||
Read in the ERA5 COG | ||
|
||
```python | ||
# Read in the ERA5 COG | ||
cog_name = f"era5/monthly/processed/precip_reanalysis_v{date}.tif" | ||
cog_url = get_cog_url(MODE, cog_name) | ||
da_era5 = rxr.open_rasterio(cog_url, chunks="auto") | ||
da_era5_upsampled = prep_raster(da_era5, gdf) | ||
|
||
# da_era5.rio.to_raster("da_era5.tif") | ||
# da_era5_upsampled.rio.to_raster("da_era5_upsampled.tif") | ||
|
||
coords_transform = da_era5_upsampled.rio.set_spatial_dims( | ||
x_dim="x", y_dim="y" | ||
).rio.transform() | ||
``` | ||
|
||
#### 1.1: Rasterstats in database | ||
|
||
```python | ||
with engine.connect() as con: | ||
query = text( | ||
f"SELECT * FROM public.era5 WHERE iso3='{iso3}' AND adm_level='{adm_level}' AND valid_date='{date}'" | ||
) | ||
df_db = pd.read_sql_query(query, con) | ||
df_db.rename(columns={"pcode": f"ADM{adm_level}_PCODE"}, inplace=True) | ||
df_db.columns = [ | ||
f"{col}_db" if col != f"ADM{adm_level}_PCODE" else col for col in df_db.columns | ||
] | ||
``` | ||
|
||
#### 1.2: Calculate comparisons using `exactextract` and `rasterstats` packages | ||
|
||
```python | ||
df_ee = exact_extract( | ||
da_era5, | ||
gdf, | ||
stats, | ||
output="pandas", | ||
include_cols=f"ADM{adm_level}_PCODE", | ||
).sort_values(f"ADM{adm_level}_PCODE") | ||
|
||
df_ee.columns = [ | ||
f"{col}_ee" if col != f"ADM{adm_level}_PCODE" else col for col in df_ee.columns | ||
] | ||
|
||
results_zs = zonal_stats( | ||
vectors=gdf[["geometry"]], | ||
raster=da_era5_upsampled.values[0], | ||
affine=coords_transform, | ||
nodata=np.nan, | ||
all_touched=False, | ||
stats=stats, | ||
) | ||
df_zs = ( | ||
pd.DataFrame.from_dict(results_zs) | ||
.join(gdf)[stats + [f"ADM{adm_level}_PCODE"]] | ||
.sort_values(f"ADM{adm_level}_PCODE") | ||
) | ||
df_zs.columns = [ | ||
f"{col}_zs" if col != f"ADM{adm_level}_PCODE" else col for col in df_zs.columns | ||
] | ||
|
||
assert len(df_zs) == len(df_ee) == len(df_db) | ||
``` | ||
|
||
#### 1.3 Compare using percent differences | ||
|
||
```python | ||
df_comparison = df_db.merge(df_ee, on=f"ADM{adm_level}_PCODE").merge( | ||
df_zs, on=f"ADM{adm_level}_PCODE" | ||
) | ||
|
||
for stat in stats: | ||
df_comparison[f"{stat}_diff_db_ee"] = abs( | ||
(df_comparison[f"{stat}_ee"] - df_comparison[f"{stat}_db"]) | ||
/ ((df_comparison[f"{stat}_ee"] + df_comparison[f"{stat}_db"]) / 2) | ||
* 100 | ||
).round(4) | ||
|
||
df_comparison[f"{stat}_diff_db_zs"] = abs( | ||
(df_comparison[f"{stat}_zs"] - df_comparison[f"{stat}_db"]) | ||
/ ((df_comparison[f"{stat}_zs"] + df_comparison[f"{stat}_db"]) / 2) | ||
* 100 | ||
).round(4) | ||
``` | ||
|
||
#### 1.4 Look for any values that are > 5% absolute difference | ||
|
||
```python | ||
threshold = 5 | ||
|
||
diff_cols = [col for col in df_comparison.columns if "_diff_" in col] | ||
significant_diffs = df_comparison[diff_cols].abs().gt(threshold).any(axis=1) | ||
significant_rows = df_comparison[significant_diffs].copy() | ||
max_diffs = significant_rows[diff_cols].abs().max(axis=1) | ||
significant_rows = significant_rows.assign(max_difference=max_diffs) | ||
significant_rows = significant_rows.sort_values("max_difference", ascending=False) | ||
|
||
significant_rows | ||
``` | ||
|
||
# SEAS5 | ||
|
||
#### Read in the SEAS5 COG | ||
|
||
```python | ||
cog_name = f"seas5/monthly/processed/precip_em_i{date}_lt0.tif" | ||
cog_url = get_cog_url(MODE, cog_name) | ||
da_seas5 = rxr.open_rasterio(cog_url, chunks="auto") | ||
da_seas5_upsampled = prep_raster(da_seas5, gdf) | ||
|
||
# da_seas5_upsampled.rio.to_raster("da_seas5_upsampled.tif") | ||
|
||
coords_transform = da_seas5_upsampled.rio.set_spatial_dims( | ||
x_dim="x", y_dim="y" | ||
).rio.transform() | ||
``` | ||
|
||
#### 2.1 Rasterstats in database | ||
|
||
```python | ||
with engine.connect() as con: | ||
query = text( | ||
f"SELECT * FROM public.seas5 WHERE iso3='{iso3}' AND adm_level='{adm_level}' AND valid_date='{date}' and leadtime=0" | ||
) | ||
df_db = pd.read_sql_query(query, con) | ||
df_db.rename(columns={"pcode": f"ADM{adm_level}_PCODE"}, inplace=True) | ||
df_db.columns = [ | ||
f"{col}_db" if col != f"ADM{adm_level}_PCODE" else col for col in df_db.columns | ||
] | ||
``` | ||
|
||
#### 2.2 Calculate comparisons using `exactextract` and `rasterstats` packages | ||
|
||
```python | ||
df_ee = exact_extract( | ||
da_seas5, | ||
gdf, | ||
stats, | ||
output="pandas", | ||
include_cols=f"ADM{adm_level}_PCODE", | ||
).sort_values(f"ADM{adm_level}_PCODE") | ||
|
||
df_ee.columns = [ | ||
f"{col}_ee" if col != f"ADM{adm_level}_PCODE" else col for col in df_ee.columns | ||
] | ||
|
||
results_zs = zonal_stats( | ||
vectors=gdf[["geometry"]], | ||
raster=da_seas5_upsampled.values[0], | ||
affine=coords_transform, | ||
nodata=np.nan, | ||
all_touched=False, | ||
stats=stats, | ||
) | ||
df_zs = ( | ||
pd.DataFrame.from_dict(results_zs) | ||
.join(gdf)[stats + [f"ADM{adm_level}_PCODE"]] | ||
.sort_values(f"ADM{adm_level}_PCODE") | ||
) | ||
df_zs.columns = [ | ||
f"{col}_zs" if col != f"ADM{adm_level}_PCODE" else col for col in df_zs.columns | ||
] | ||
|
||
assert len(df_zs) == len(df_ee) == len(df_db) | ||
``` | ||
|
||
#### 2.3 Compare using percent differences | ||
|
||
```python | ||
df_comparison = df_db.merge(df_ee, on=f"ADM{adm_level}_PCODE").merge( | ||
df_zs, on=f"ADM{adm_level}_PCODE" | ||
) | ||
|
||
for stat in stats: | ||
df_comparison[f"{stat}_diff_db_ee"] = abs( | ||
(df_comparison[f"{stat}_ee"] - df_comparison[f"{stat}_db"]) | ||
/ ((df_comparison[f"{stat}_ee"] + df_comparison[f"{stat}_db"]) / 2) | ||
* 100 | ||
).round(4) | ||
|
||
df_comparison[f"{stat}_diff_db_zs"] = abs( | ||
(df_comparison[f"{stat}_zs"] - df_comparison[f"{stat}_db"]) | ||
/ ((df_comparison[f"{stat}_zs"] + df_comparison[f"{stat}_db"]) / 2) | ||
* 100 | ||
).round(4) | ||
``` | ||
|
||
#### 2.4 Look for any values that are > 5% absolute difference | ||
|
||
```python | ||
threshold = 5 | ||
|
||
diff_cols = [col for col in df_comparison.columns if "_diff_" in col] | ||
significant_diffs = df_comparison[diff_cols].abs().gt(threshold).any(axis=1) | ||
significant_rows = df_comparison[significant_diffs].copy() | ||
max_diffs = significant_rows[diff_cols].abs().max(axis=1) | ||
significant_rows = significant_rows.assign(max_difference=max_diffs) | ||
significant_rows = significant_rows.sort_values("max_difference", ascending=False) | ||
|
||
significant_rows | ||
``` |