From 09827030e909e01bca5e84924cf48932db4a1bca Mon Sep 17 00:00:00 2001 From: robinlovelace Date: Sat, 5 Oct 2024 20:00:17 +0100 Subject: [PATCH] Update qmd -> .ipynb -> .py files, close #266 --- code/chapters/01-spatial-data.py | 391 +++++++---- code/chapters/02-attribute-operations.py | 178 ++--- code/chapters/03-spatial-operations.py | 474 ++++++------- code/chapters/04-geometry-operations.py | 541 +++++++-------- code/chapters/05-raster-vector.py | 278 ++++---- code/chapters/06-reproj.py | 282 ++++---- code/chapters/07-read-write.py | 445 ++++++------ code/chapters/08-mapping.py | 445 +++++++++--- .../chapters/book_options.py | 0 .../chapters/book_options_pdf.py | 0 map_to_png.py => code/chapters/map_to_png.py | 0 code/chapters/references.py | 7 + ipynb/01-spatial-data.ipynb | 477 +++++++++---- ipynb/02-attribute-operations.ipynb | 203 ++---- ipynb/03-spatial-operations.ipynb | 504 ++++++-------- ipynb/04-geometry-operations.ipynb | 635 +++++++----------- ipynb/05-raster-vector.ipynb | 304 ++++----- ipynb/06-reproj.ipynb | 301 ++++----- ipynb/07-read-write.ipynb | 493 +++++++------- ipynb/08-mapping.ipynb | 578 ++++++++++++---- ipynb/README.ipynb | 235 ------- ipynb/index.ipynb | 63 +- ipynb/preface.ipynb | 211 +++++- ipynb/references.ipynb | 23 + 24 files changed, 3650 insertions(+), 3418 deletions(-) rename book_options.py => code/chapters/book_options.py (100%) rename book_options_pdf.py => code/chapters/book_options_pdf.py (100%) rename map_to_png.py => code/chapters/map_to_png.py (100%) create mode 100644 code/chapters/references.py delete mode 100644 ipynb/README.ipynb create mode 100644 ipynb/references.ipynb diff --git a/code/chapters/01-spatial-data.py b/code/chapters/01-spatial-data.py index e27a3379..92063c47 100644 --- a/code/chapters/01-spatial-data.py +++ b/code/chapters/01-spatial-data.py @@ -1,39 +1,66 @@ #!/usr/bin/env python # coding: utf-8 -# # Geographic data in Python {#sec-spatial-class} +# --- +# jupyter: python3 +# --- # +# # Geographic data in Python {#sec-spatial-class} + +# In[ ]: + + +#| echo: false +#| include: false +#| error: true +import map_to_png + + +# In[ ]: + + +#| echo: false +import book_options + + +# In[ ]: + + +#| echo: false +import book_options_pdf + + # ## Introduction # -# This chapter outlines two fundamental geographic data models --- vector and raster --- and introduces the main Python packages for working with them. +# This chapter outlines two fundamental geographic data models (vector and raster) and introduces Python packages for working with them. # Before demonstrating their implementation in Python, we will introduce the theory behind each data model and the disciplines in which they predominate. # -# The vector data model (@sec-vector-data) represents the world using points, lines, and polygons. +# The vector data model (@sec-vector-data) represents geographic entities with points, lines, and polygons. # These have discrete, well-defined borders, meaning that vector datasets usually have a high level of precision (but not necessarily accuracy). # The raster data model (@sec-raster-data), on the other hand, divides the surface up into cells of constant size. -# Raster datasets are the basis of background images used in web-mapping and have been a vital source of geographic data since the origins of aerial photography and satellite-based remote sensing devices. 
+# Raster datasets are the basis of background images used in online maps and have been a vital source of geographic data since the origins of aerial photography and satellite-based remote sensing devices.
# Rasters aggregate spatially specific features to a given resolution, meaning that they are consistent over space and scalable, with many worldwide raster datasets available.
#
# Which to use?
# The answer likely depends on your domain of application, and the datasets you have access to:
#
-# - Vector datasets and methods dominate the social sciences because human settlements and processes (e.g., transport infrastructure) tend to have discrete borders
+# - Vector datasets and methods dominate the social sciences because human settlements and processes (e.g., transport infrastructure) tend to have discrete borders
# - Raster datasets and methods dominate many environmental sciences because of the reliance on remote sensing data
#
# Python has strong support for both data models.
# We will focus on **shapely** and **geopandas** for working with geographic vector data, and **rasterio** for working with rasters.
#
-# **shapely** is a "low-level" package for working with individual vector geometry objects.
-# **geopandas** is a "high-level" package for working with geometry columns (`GeoSeries` objects), which internally contain **shapely** geometries, and vector layers (`GeoDataFrame` objects).
+# **shapely** is a 'low-level' package for working with individual vector geometry objects.
+# **geopandas** is a 'high-level' package for working with geometry columns (`GeoSeries` objects), which internally contain **shapely** geometries, and with vector layers (`GeoDataFrame` objects).
# The **geopandas** ecosystem provides a comprehensive approach for working with vector layers in Python, with many packages building on it.
#
# There are several partially overlapping packages for working with raster data, each with its own advantages and disadvantages.
-# In this book, we focus on the most prominent one: **rasterio**, which represents "simple" raster datasets with a combination of a **numpy** array, and a metadata object (`dict`) providing geographic metadata such as the coordinate system.
+# In this book, we focus on the most prominent one: **rasterio**, which represents 'simple' raster datasets with a combination of a **numpy** array and a metadata object (`dict`) providing geographic metadata such as the coordinate system.
# **xarray** is a notable alternative to **rasterio**, not covered in this book, which uses the native `xarray.Dataset` and `xarray.DataArray` classes to effectively represent complex raster datasets such as NetCDF files with multiple bands and metadata.
#
-# There is much overlap in some fields and raster and vector datasets can be used together: ecologists and demographers, for example, commonly use both vector and raster data.
+# There is much overlap in some fields, and raster and vector datasets can be used together: ecologists and demographers, for example, commonly use both vector and raster data.
# Furthermore, it is possible to convert between the two forms (see @sec-raster-vector).
-# Whether your work involves more use of vector or raster datasets, it is worth understanding the underlying data models before using them, as discussed in subsequent chapters.
+# Whether your work involves use of vector or raster datasets, it is worth understanding the underlying data models before using them, as discussed in subsequent chapters.
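#
# As a first taste of the two models, here is a minimal hedged sketch (all values are illustrative) representing the same city both ways: as a discrete **shapely** point geometry, and as a cell in a **numpy** array whose location is implied by the grid's origin and resolution rather than stored explicitly.

# In[ ]:


import numpy as np
import shapely

# Vector model: London as a point geometry, in lon/lat coordinates
london = shapely.Point(-0.1, 51.5)

# Raster model: a tiny grid of values; cell locations are derived from
# an origin and a constant cell size, not stored per cell
grid = np.arange(9).reshape(3, 3)
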
# # ## Vector data {#sec-vector-data} # @@ -44,47 +71,42 @@ # In this system, London, for example, can be represented by the coordinates `(-0.1,51.5)`. # This means that its location is -0.1 degrees east and 51.5 degrees north of the origin. # The origin, in this case, is at 0 degrees longitude (a prime meridian located at Greenwich) and 0 degrees latitude (the Equator) in a geographic ('lon/lat') CRS (@fig-vector-london, left panel). -# The same point could also be approximated in a projected CRS with 'Easting/Northing' values of `(530000, 180000)` in the British National Grid, meaning that London is located 530 $km$ East and 180 $km$ North of the origin of the CRS (@fig-vector-london, right panel). +# The same point could also be approximated in a projected CRS with 'Easting/Northing' values of `(530000,180000)` in the British National Grid, meaning that London is located 530 $km$ East and 180 $km$ North of the origin of the CRS (@fig-vector-london, right panel). # The location of National Grid's origin, in the sea beyond South West Peninsular, ensures that most locations in the UK have positive Easting and Northing values. # -# ::: {#fig-vector-london} +# ::: {#fig-vector-london layout-ncol=2} # -# ::: {.columns} -# :::: {.column width="50%"} # ![](images/vector_lonlat.png) -# :::: -# :::: {.column width="50%"} +# # ![](images/vector_projected.png) -# :::: -# ::: # # Illustration of vector (point) data in which location of London (the red X) is represented with reference to an origin (the blue circle). # The left plot represents a geographic CRS with an origin at 0° longitude and latitude. # The right plot represents a projected CRS with an origin located in the sea west of the South West Peninsula. # ::: # -# There is more to CRSs, as described in @sec-coordinate-reference-systems-intro and @sec-reproj-geo-data but, for the purposes of this section, it is sufficient to know that coordinates consist of two numbers representing the distance from an origin, usually in $x$ then $y$ dimensions. +# There is more to CRSs, as described in @sec-coordinate-reference-systems-intro and @sec-reproj-geo-data but, for the purposes of this section, it is sufficient to know that coordinates consist of two numbers representing the distance from an origin, usually in $x$ and $y$ dimensions. # # **geopandas** [@geopandas] provides classes for geographic vector data and a consistent command-line interface for reproducible geographic data analysis in Python. 
-# It also provides an interface to three mature libraries for geocomputation which, in combination, represent a strong foundation on which many geographic applications (including QGIS and R's spatial ecosystem): +# It also provides an interface to three mature libraries for geocomputation, a strong foundation on which many geographic applications are built: # # - GDAL, for reading, writing, and manipulating a wide range of geographic data formats, covered in @sec-read-write # - PROJ, a powerful library for coordinate system transformations, which underlies the content covered in @sec-reproj-geo-data # - GEOS, a planar geometry engine for operations such as calculating buffers and centroids on data with a projected CRS, covered in @sec-geometric-operations # -# Tight integration with these geographic libraries makes reproducible geocomputation possible: an advantage of using a higher level language such as Python to access these libraries is that you do not need to know the intricacies of the low level components, enabling focus on the methods rather than the implementation. +# Tight integration with these geographic libraries makes reproducible geocomputation possible: an advantage of using a higher-level language such as Python to access these libraries is that you do not need to know the intricacies of the low-level components, enabling focus on the methods rather than the implementation. # # ### Vector data classes # -# The main classes for working with geographic vector data in Python are hierarchical, meaning the highest level 'vector layer' class is composed of simpler 'geometry column' and individual 'geometry' components. +# The main classes for working with geographic vector data in Python are hierarchical, meaning that the 'vector layer' class is composed of simpler 'geometry column' and individual 'geometry' components. # This section introduces them in order, starting with the highest level class. -# For many applications, the high level vector layer class, which are essentially a data frame with geometry columns, are all that's needed. -# However, it's important to understand the structure of vector geographic objects and their component pieces for more advanced applications. +# For many applications, the vector layer class, a data frame with geometry columns, is all that's needed. +# However, it's important to understand the structure of vector geographic objects and their components for some applications and for a deep understanding. # The three main vector geographic data classes in Python are: # # - `GeoDataFrame`, a class representing vector layers, with a geometry column (class `GeoSeries`) as one of the columns # - `GeoSeries`, a class that is used to represent the geometry column in `GeoDataFrame` objects -# - `shapely` geometry objects which represent individual geometries, such as a point or a polygon +# - `shapely` geometry objects, which represent individual geometries, such as a point or a polygon in `GeoSeries` objects # # The first two classes (`GeoDataFrame` and `GeoSeries`) are defined in **geopandas**. # The third class is defined in the **shapely** package, which deals with individual geometries, and is a main dependency of the **geopandas** package. @@ -103,7 +125,7 @@ import geopandas as gpd -# We also limit the maximum number of printed rows to four, to save space, using the `'display.max_rows'` option of **pandas**. +# We also limit the maximum number of printed rows to six, to save space, using the `'display.max_rows'` option of **pandas**. 
# In[ ]: @@ -112,7 +134,7 @@ # Projects often start by importing an existing vector layer saved as a GeoPackage (`.gpkg`) file, an ESRI Shapefile (`.shp`), or other geographic file format. -# The function `read_file()` imports a GeoPackage file named `world.gpkg` located in the `data` directory of Python's working directory into a `GeoDataFrame` named `gdf`. +# The function `gpd.read_file` imports a GeoPackage file named `world.gpkg` located in the `data` directory of Python's working directory into a `GeoDataFrame` named `gdf`. # In[ ]: @@ -208,6 +230,8 @@ # Interactive maps of `GeoDataFrame` objects can be created with the `.explore` method, as illustrated in @fig-gdf-explore which was created with the following command: +# +# ::: {.content-visible when-format="html"} # In[ ]: @@ -217,7 +241,31 @@ gdf.explore() +# ::: +# ::: {.content-visible when-format="pdf"} + +# In[ ]: + + +#| eval: false +gdf.explore() + + +# In[ ]: + + +#| echo: false +#| output: false +#| error: true +map_to_png.map_to_png(gdf.explore(), 'fig-gdf-explore') + + +# ![Basic interactive map with `.explore`](images/fig-gdf-explore.png){#fig-gdf-explore} +# ::: +# # A subset of the data can be also plotted in a similar fashion. +# +# ::: {.content-visible when-format="html"} # In[ ]: @@ -227,17 +275,28 @@ gdf[gdf['name_long'] == 'Egypt'].explore() +# ::: +# ::: {.content-visible when-format="pdf"} + +# In[ ]: + + +#| eval: false +gdf[gdf['name_long'] == 'Egypt'].explore() + + # In[ ]: #| echo: false -# (Alternative) -# import hvplot.pandas -# gdf.hvplot(title='Hello world', geo=True, hover_cols=['name_long'], legend=False).opts(bgcolor='lightgray', active_tools=['wheel_zoom']) -#This way, we can also add background tiles: -# gdf.hvplot(tiles='OSM', alpha=0.5, geo=True, title='Hello world', hover_cols=['name_long'], legend=False).opts(active_tools=['wheel_zoom']) +#| output: false +#| error: true +map_to_png.map_to_png(gdf[gdf['name_long'] == 'Egypt'].explore(), 'fig-gdf-explore2') +# ![Interactive map of a `GeoDataFrame` subset](images/fig-gdf-explore2.png){#fig-gdf-explore2} +# ::: +# # ### Geometry columns {#sec-geometry-columns} # # The geometry column of class `GeoSeries` is an essential column in a `GeoDataFrame`. @@ -260,9 +319,9 @@ gdf.geometry.crs -# Many geometry operations, such as calculating the centroid, buffer, or bounding box of each feature involve just the geometry. +# Many geometry operations, such as calculating the centroid, buffer, or bounding box of each feature, involve just the geometry. # Applying this type of operation on a `GeoDataFrame` is therefore basically a shortcut to applying it on the `GeoSeries` object in the geometry column. -# For example, the two following commands return exactly the same result, a `GeoSeries` with country bounding box polygons (using the [`.envelope`](https://geopandas.org/en/stable/docs/reference/api/geopandas.GeoSeries.envelope.html) method). +# For example, the two following commands return exactly the same result, a `GeoSeries` containing bounding box polygons (using the `.envelope` method). # In[ ]: @@ -276,8 +335,8 @@ gdf.geometry.envelope -# Note that `.envelope`, and other similar operators in **geopandas** such as `.centroid` (@sec-centroids), `.buffer` (@sec-buffers) or `.convex_hull`, return only the geometry (i.e., a `GeoSeries`), not a `GeoDataFrame` with the original attribute data. 
-# In case we want the latter, we can create a copy of the `GeoDataFrame` and then "overwrite" its geometry (or, we can overwrite the geometries directly in case we do not need the original ones, as in `gdf.geometry=gdf.envelope`). +# Note that `.envelope`, and other similar operators in **geopandas** such as `.centroid` (@sec-centroids), `.buffer` (@sec-buffers), or `.convex_hull`, return only the geometry (i.e., a `GeoSeries`), not a `GeoDataFrame` with the original attribute data. +# In case we want the latter, we can create a copy of the `GeoDataFrame` and then 'overwrite' its geometry (or, we can overwrite the geometries directly in case we do not need the original ones, as in `gdf.geometry=gdf.envelope`). # In[ ]: @@ -288,8 +347,9 @@ # Another useful property of the geometry column is the geometry type, as shown in the following code. -# Note that the types of geometries contained in a geometry column (and, thus, a vector layer) are not necessarily the same for every row. -# Accordingly, the `.type` property returns a `Series` (of type `string`), rather than a single value (the same can be done with the shortcut `gdf.geom_type`). +# Note that the types of geometries contained in a geometry column (and, thus, a vector layer) are not necessarily the same for every row. +# It is possible to have multiple geometry types in a single `GeoSeries`. +# Accordingly, the `.type` property returns a `Series` (with values of type `str`, i.e., strings), rather than a single value (the same can be done with the shortcut `gdf.geom_type`). # In[ ]: @@ -297,7 +357,8 @@ gdf.geometry.type -# To summarize the occurrence of different geometry types in a geometry column, we can use the **pandas** method called `value_counts`. +# To summarize the occurrence of different geometry types in a geometry column, we can use the **pandas** `.value_counts` method. +# In this case, we see that the `gdf` layer contains only `'MultiPolygon'` geometries. # In[ ]: @@ -305,10 +366,7 @@ gdf.geometry.type.value_counts() -# It is possible to have multiple geometry types in a single `GeoSeries`. -# However, in this case, we see that the `gdf` layer contains only `'MultiPolygon'` geometries. -# -# A `GeoDataFrame` can also have multiple `GeoSeries`. +# A `GeoDataFrame` can also have multiple `GeoSeries` columns, as demonstrated in the following code section. # In[ ]: @@ -319,9 +377,11 @@ gdf -# Only one geometry column at a time is "active", in the sense that it is being accessed in operations involving the geometries (such as `.centroid`, `.crs`, etc.). -# To switch the active geometry column from one `GeoSeries` column to another, we use `set_geometry`. +# Only one geometry column at a time is 'active', in the sense that it is being accessed in operations involving the geometries (such as `.centroid`, `.crs`, etc.). +# To switch the active geometry column from one `GeoSeries` column to another, we use `.set_geometry`. # @fig-switch-to-centroids and @fig-switch-to-polygons shows interactive maps of the `gdf` layer with the `'bbox'` and `'polygon'` geometry columns activated, respectively. 
+# +# ::: {.content-visible when-format="html"} # In[ ]: @@ -332,6 +392,32 @@ gdf.explore() +# ::: +# ::: {.content-visible when-format="pdf"} + +# In[ ]: + + +#| eval: false +gdf = gdf.set_geometry('bbox') +gdf.explore() + + +# In[ ]: + + +#| echo: false +#| output: false +#| error: true +gdf = gdf.set_geometry('bbox') +map_to_png.map_to_png(gdf.explore(), 'fig-switch-to-centroids') + + +# ![Switching to the `'bbox'` geometry column in the `world` layer, and plotting it](images/fig-switch-to-centroids.png){#fig-switch-to-centroids} +# ::: +# +# ::: {.content-visible when-format="html"} + # In[ ]: @@ -341,21 +427,45 @@ gdf.explore() +# ::: +# ::: {.content-visible when-format="pdf"} + +# In[ ]: + + +#| eval: false +gdf = gdf.set_geometry('polygon') +gdf.explore() + + +# In[ ]: + + +#| echo: false +#| output: false +#| error: true +gdf = gdf.set_geometry('polygon') +map_to_png.map_to_png(gdf.explore(), 'fig-switch-to-polygons') + + +# ![Switching to the `'polygons'` geometry column in the `world` layer, and plotting it](images/fig-switch-to-polygons.png){#fig-switch-to-polygons} +# ::: +# # ### The Simple Features standard {#sec-simple-features} # # Geometries are the basic building blocks of vector layers. # Although the Simple Features standard defines about 20 types of geometries, we will focus on the seven most commonly used types: `POINT`, `LINESTRING`, `POLYGON`, `MULTIPOINT`, `MULTILINESTRING`, `MULTIPOLYGON` and `GEOMETRYCOLLECTION`. -# A useful list of possible geometry types can be found in R's **sf** package [documentation](https://r-spatial.github.io/sf/articles/sf1.html#simple-feature-geometry-types). +# A useful list of possible geometry types can be found in R's **sf** package documentation[^sf_docs]. +# +# [^sf_docs]: [https://r-spatial.github.io/sf/articles/sf1.html#simple-feature-geometry-types](https://r-spatial.github.io/sf/articles/sf1.html#simple-feature-geometry-types) # # Simple feature geometries can be represented by well-known binary (WKB) and well-known text (WKT) encodings. # WKB representations are usually hexadecimal strings easily readable for computers, and this is why GIS software and spatial databases use WKB to transfer and store geometry objects. # WKT, on the other hand, is a human-readable text markup description of Simple Features. -# -# # Both formats are exchangeable, and if we present one, we will naturally choose the WKT representation. # # The foundation of each geometry type is the point. -# A point is simply a coordinate in 2D, 3D, or 4D space such as shown in @fig-point and @fig-point2. +# A point is simply a coordinate in two-dimensional, three-dimensional, or four-dimensional space such as shown in @fig-point. # # ``` text # POINT (5 2) @@ -375,7 +485,7 @@ # ``` # # So far we have created geometries with only one geometric entity per feature. -# However, the Simple Features standard allows multiple geometries to exist within a single feature, using "multi" versions of each geometry type, as illustrated in @fig-multipoint, @fig-multilinestring, and @fig-multipolygon1. +# However, the Simple Features standard allows multiple geometries to exist within a single feature, using 'multi' versions of each geometry type, as illustrated in @fig-multipoint, @fig-multilinestring, and @fig-multipolygon1. 
# # ``` text # MULTIPOINT (5 2, 1 3, 3 4, 3 2) @@ -383,7 +493,7 @@ # MULTIPOLYGON (((1 5, 2 2, 4 1, 4 4, 1 5), (0 2, 1 2, 1 3, 0 3, 0 2))) # ``` # -# Finally, a geometry collection can contain any combination of geometries including (multi)points and linestrings (@fig-geometrycollection). +# Finally, a geometry collection can contain any combination of geometries of the other six types, such as the combination of a multipoint and linestring shown below (@fig-geometrycollection). # # ``` text # GEOMETRYCOLLECTION (MULTIPOINT (5 2, 1 3, 3 4, 3 2), @@ -392,8 +502,8 @@ # # ### Geometries {#sec-geometries} # -# Each element in the geometry column is a geometry object, of class `shapely` [@shapely]. -# For example, here is one specific geometry selected by implicit index (Canada, i.e., the 4^th^ element in `gdf`'s geometry column'). +# Each element in the geometry column (`GeoSeries`) is a geometry object of class `shapely` [@shapely]. +# For example, here is one specific geometry selected by implicit index (Canada, the 4^th^ element in `gdf`'s geometry column). # In[ ]: @@ -415,25 +525,24 @@ # In the first example (a `'Point'`) we show two types of inputs to create a geometry: a list of coordinates or a `string` in the WKT format. # In the examples for the remaining geometries we use the former approach. # -# Creating a `'Point'` geometry from a list of coordinates uses the `shapely.Point` function (@fig-point). +# Creating a `'Point'` geometry from a list of coordinates uses the `shapely.Point` function in the following expression (@fig-point). # In[ ]: #| label: fig-point -#| fig-cap: A `Point` geometry (created from a `list`) +#| fig-cap: A `Point` geometry (created either from a `list` or WKT) point = shapely.Point([5, 2]) point -# Alternatively, we can use the `shapely.from_wkt` to transform a WKT string to a `shapely` geometry object. -# Here is an example of creating the same `'Point'` geometry from WKT (@fig-point2). +# Alternatively, we can use `shapely.from_wkt` to transform a WKT string to a `shapely` geometry object. +# Here is an example of creating the same `'Point'` geometry from WKT (@fig-point). # In[ ]: -#| label: fig-point2 -#| fig-cap: A `Point` geometry (created from a WKT string) +#| output: false point = shapely.from_wkt('POINT (5 2)') point @@ -459,7 +568,7 @@ #| fig-cap: A `Polygon` geometry polygon = shapely.Polygon( [(1,5), (2,2), (4,1), (4,4), (1,5)], ## Exterior - [[(2,4), (3,4), (3,3), (2,3), (2,4)]] ## Holes + [[(2,4), (3,4), (3,3), (2,3), (2,4)]] ## Hole(s) ) polygon @@ -498,21 +607,20 @@ #| fig-cap: A `MultiPolygon` geometry multipolygon = shapely.MultiPolygon([ [[(1,5), (2,2), (4,1), (4,4), (1,5)], []], ## Polygon 1 - [[(0,2), (1,2), (1,3), (0,3), (0,2)], []] ## Polygon 2, etc. + [[(0,2), (1,2), (1,3), (0,3), (0,2)], []] ## Polygon 2, etc. ]) multipolygon -# Since the required input has four hierarchical levels, it may be more clear to create the single-part `'Polygon'` geometries in advance, using the respective function (`shapely.Polygon`), and then pass them to `shapely.MultiPolygon` (@fig-multipolygon2). (The same technique can be used with the other `shapely.Multi*` functions.) +# Since the required input has four hierarchical levels, it may be more clear to create the single-part `'Polygon'` geometries in advance, using the respective function (`shapely.Polygon`), and then pass them to `shapely.MultiPolygon` (@fig-multipolygon1). (The same technique can be used with the other `shapely.Multi*` functions.) 
# In[ ]: -#| label: fig-multipolygon2 -#| fig-cap: A `MultiPolygon` geometry +#| output: false multipolygon = shapely.MultiPolygon([ shapely.Polygon([(1,5), (2,2), (4,1), (4,4), (1,5)]), ## Polygon 1 - shapely.Polygon([(0,2), (1,2), (1,3), (0,3), (0,2)]) ## Polygon 2, etc. + shapely.Polygon([(0,2), (1,2), (1,3), (0,3), (0,2)]) ## Polygon 2, etc. ]) multipolygon @@ -528,8 +636,9 @@ geometrycollection -# `shapely` geometries act as atomic units of vector data, meaning that there is no concept of geometry *sets*: each operation accepts individual geometry object(s) as input, and retunrs an individual geometry as output. (The `GeoSeries` and `GeoDataFrame` objects, defined in **geopandas**, are used to deal with sets of `shapely` geometries, collectively) -# For example, the following expression calculates the difference between the buffered `multipolygon` (using distance of `0.2`) and itself (@fig-mpol-buffer-difference): +# `shapely` geometries act as atomic units of vector data, meaning that there is no concept of geometry *sets*: each operation accepts individual geometry object(s) as input, and returns an individual geometry as output. +# (The `GeoSeries` and `GeoDataFrame` objects, defined in **geopandas**, are used to deal with sets of `shapely` geometries, collectively.) +# For example, the following expression calculates the difference (see @sec-clipping) between the buffered (see @sec-buffers) `multipolygon` (using distance of `0.2`) and itself (@fig-mpol-buffer-difference): # In[ ]: @@ -539,7 +648,7 @@ multipolygon.buffer(0.2).difference(multipolygon) -# As demonstrated above, a `shapely` geometry object is automatically evaluated to a small image of the geometry (when using an interface capable of displaying it, such as a Jupyter Notebook). +# As demonstrated in the last few figures, a `shapely` geometry object is automatically evaluated to a small image of the geometry (when using an interface capable of displaying it, such as Jupyter Notebook). # To print the WKT string instead, we can use the `print` function: # In[ ]: @@ -558,9 +667,11 @@ list(polygon.exterior.coords) +# Also see @sec-type-transformations, where `.coords`, `.geoms`, and `.exterior` are used to transform a given `shapely` geometry to a different type (e.g., `'Polygon'` to `'MultiPoint'`). +# # ### Vector layer from scratch {#sec-vector-layer-from-scratch} # -# In the previous sections, we started with a vector layer (`GeoDataFrame`), from an existing GeoPackage file, and "decomposed" it to extract the geometry column (`GeoSeries`, @sec-geometry-columns) and separate geometries (`shapely`, see @sec-geometries). +# In the previous sections, we started with a vector layer (`GeoDataFrame`), from an existing GeoPackage file, and 'decomposed' it to extract the geometry column (`GeoSeries`, @sec-geometry-columns) and separate geometries (`shapely`, see @sec-geometries). # In this section, we will demonstrate the opposite process, constructing a `GeoDataFrame` from `shapely` geometries, combined into a `GeoSeries`. # This will help you better understand the structure of a `GeoDataFrame`, and may come in handy when you need to programmatically construct simple vector layers, such as a line between two given points. # @@ -570,13 +681,13 @@ # ![Creating a `GeoDataFrame` from scratch](images/gdf-flow.svg){#fig-gdf-flow} # # The final result, a vector layer (`GeoDataFrame`) is therefore a hierarchical structure (@fig-gdf-structure), containing the geometry column (`GeoSeries`), which in turn contains geometries (`shapely`). 
-# Each of the "internal" components can be accessed, or "extracted", which is sometimes necessary, as we will see later on. +# Each of the 'internal' components can be accessed, or 'extracted', which is sometimes necessary, as we will see later on. # -# ![Structure of a `GeoDataFrame`](images/gdf-structure.svg){#fig-gdf-structure} +# ![Structure of a `GeoDataFrame`](images/gdf-structure.svg){width=40% fig-align="center" #fig-gdf-structure} # # Non-geographic attributes may represent the name of the feature, and other attributes such as measured values, groups, etc. # To illustrate attributes, we will represent a temperature of 25°C in London on June 21st, 2023. -# This example contains a geometry (the coordinates), and three attributes with three different classes (place name, temperature and date). +# This example contains a geometry (the coordinates), and three attributes with three different classes (place name, temperature, and date). # Objects of class `GeoDataFrame` represent such data by combining the attributes (`Series`) with the simple feature geometry column (`GeoSeries`). # First, we create a point geometry, which we know how to do from @sec-geometries (@fig-point-lnd). @@ -603,7 +714,7 @@ # The geometry column is a `GeoSeries`, named `geometry`. # The other attributes (if any) may be defined using `list` or `Series` objects. # Here, for simplicity, we use the `list` option for defining the three attributes `name`, `temperature`, and `date`. -# Again, note that the `list` can be of length \>1, in case we are creating a layer with more than one feature. +# Again, note that the `list` can be of length \>1, in case we are creating a layer with more than one feature (i.e., multiple rows). # In[ ]: @@ -649,8 +760,10 @@ towns_layer -# Now, we are able to create an interactive map of the `towns_layer` object(@fig-layer-from-scratch-explore). +# Now, we are able to create an interactive map of the `towns_layer` object (@fig-layer-from-scratch-explore). # To make the points easier to see, we are customizing a fill color and size (we elaborate on `.explore` options in @sec-interactive-maps). +# +# ::: {.content-visible when-format="html"} # In[ ]: @@ -660,8 +773,30 @@ towns_layer.explore(color='red', marker_kwds={'radius': 10}) -# Spatial object can be also created from a `pandas.DataFrame` object that contains columns with coordinates. -# For that, we need to first create a `GeoSeries` object from the coordinates, and then combine it with `DataFrame` to a `GeoDataFrame` object. +# ::: +# ::: {.content-visible when-format="pdf"} + +# In[ ]: + + +#| eval: false +towns_layer.explore(color='red', marker_kwds={'radius': 10}) + + +# In[ ]: + + +#| echo: false +#| output: false +#| error: true +map_to_png.map_to_png(towns_layer.explore(color='red', marker_kwds={'radius': 10}), 'fig-layer-from-scratch-explore') + + +# ![`towns_layer`, created from scratch, visualized using `.explore`](images/fig-layer-from-scratch-explore.png){#fig-layer-from-scratch-explore} +# ::: +# +# A spatial (point) layer can be also created from a `DataFrame` object (package **pandas**) that contains columns with coordinates. +# To demonstrate, we hereby first create a `GeoSeries` object from the coordinates, and then combine it with the `DataFrame` to form a `GeoDataFrame`. # In[ ]: @@ -678,12 +813,12 @@ # The output gives the same result as previous `towns_layer`. 
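#
# A related shortcut worth knowing is `gpd.points_from_xy`, which builds the geometry array in a single call; the following hedged sketch (the example `DataFrame` and its column names are illustrative) produces an equivalent two-row point layer.

# In[ ]:


import pandas as pd
import geopandas as gpd

towns = pd.DataFrame({
    'name': ['London', 'Paris'],
    'x': [-0.1, 2.3],
    'y': [51.5, 48.9]
})
# Build the geometry column from the coordinate columns in one step
towns_gdf = gpd.GeoDataFrame(
    towns,
    geometry=gpd.points_from_xy(towns['x'], towns['y']),
    crs=4326
)
towns_gdf
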
-# This approach is particularly useful when we need to read data from a CSV file, e.g., using `pandas.read_csv`, and want to turn the resulting `DataFrame` into a `GeoDataFrame` (see another example in @sec-spatial-joining). +# This approach is particularly useful when we need to read data from a CSV file, e.g., using `pd.read_csv`, and want to turn the resulting `DataFrame` into a `GeoDataFrame` (see another example in @sec-spatial-joining). # # ### Derived numeric properties {#sec-area-length} # -# Vector layers are characterized by two essential derived numeric properties: Length (`.length`)---applicable to lines and Area (`.area`)---applicable to polygons. -# Area and length can be calculated for any data structures discussed above, either a `shapely` geometry, in which case the returned value is a number or for `GeoSeries` or `DataFrame`, in which case the returned value is a numeric `Series`. +# Vector layers are characterized by two essential derived numeric properties: *length* (`.length`)---applicable to lines, and *area* (`.area`)---applicable to polygons. +# Area and length can be calculated for any data structures discussed above, either a `shapely` geometry, in which case the returned value is a number, or for `GeoSeries` or `DataFrame`, in which case the returned value is a numeric `Series`. # In[ ]: @@ -708,7 +843,7 @@ # # To obtain meaningful length and area measurements for data in a geographic CRS, the geometries first need to be transformed to a projected CRS (see @sec-reprojecting-vector-geometries) applicable to the area of interest. # For example, the area of Slovenia can be calculated in the UTM zone 33N CRS (`crs=32633`). -# The result is in $m^2$, the units of the CRS of this dataset. +# The result is in $m^2$, the units of the UTM zone 33N CRS. # In[ ]: @@ -719,20 +854,22 @@ # ## Raster data {#sec-raster-data} # # The spatial raster data model represents the world with the continuous grid of cells (often also called pixels; @fig-raster-intro-plot1 (A)). -# This data model often refers to so-called regular grids, in which each cell has the same, constant size---and we will focus on the regular grids in this book only. +# This data model often refers to so-called regular grids, in which each cell has the same, constant size---and we will focus only on regular grids in this book. # However, several other types of grids exist, including rotated, sheared, rectilinear, and curvilinear grids (see Chapter 1 of @pebesma_spatial_2022 or Chapter 2 of @tennekes_elegant_2022). # # The raster data model usually consists of a raster header (or metadata) and a matrix (with rows and columns) representing equally spaced cells (often also called pixels; @fig-raster-intro-plot1 (A)). -# The raster header defines the coordinate reference system, the extent and the origin. -# The origin (or starting point) is frequently the coordinate of the lower-left corner of the matrix. -# The metadata defines the extent via the origin, the number of columns, the number of rows, and the cell size resolution. -# The matrix representation avoids storing explicitly the coordinates for the four corner points (in fact it only stores one coordinate, namely the origin) of each cell, as would be the case for rectangular vector polygons. +# The raster header defines the coordinate reference system, the origin and the resolution. +# The origin (or starting point) is typically the coordinate of the lower-left corner of the matrix. +# The metadata defines the origin, and the cell size, i.e., resolution. 
+# Combined with the column and row count, the extent can also be derived.
+# The matrix representation avoids storing explicitly the coordinates for the four corner points (in fact it only stores one coordinate, namely the origin) of each cell, as would be the case for rectangular vector polygons.
# This, together with map algebra (@sec-map-algebra), makes raster processing much more efficient and faster than vector data processing.
-# However, in contrast to vector data, the cell of one raster layer can only hold a single value. The value might be numeric or categorical (@fig-raster-intro-plot1 (C)).
+# However, in contrast to vector data, the cell of one raster layer can only hold a single value.
+# The cell values are numeric, representing either a continuous or a categorical variable (@fig-raster-intro-plot1 (C)).
#
# ![Raster data types: (A) cell IDs, (B) cell values, (C) a colored raster map](images/raster-intro-plot1.png){#fig-raster-intro-plot1}
#
-# Raster maps usually represent continuous phenomena such as elevation, temperature, population density or spectral data.
+# Raster maps usually represent continuous phenomena such as elevation, temperature, population density, or spectral data.
# Discrete features such as soil or land-cover classes can also be represented in the raster data model.
# Both uses of raster datasets are illustrated in @fig-raster-intro-plot2, which shows how the borders of discrete features may become blurred in raster datasets.
# Depending on the nature of the application, vector representations of discrete features may be more suitable.
@@ -745,12 +882,12 @@
# The two most notable approaches for working with rasters in Python are provided by the **rasterio** and **rioxarray** packages.
# As we will see shortly, they differ in scope and underlying data models.
# Specifically, **rasterio** represents rasters as **numpy** arrays associated with a separate object holding the spatial metadata.
-# The **rioxarray** package, a warpper of **rasterio**, however, represents rasters with **xarray** "extended" arrays, which are an extension of **numpy** array designed to hold axis labels and attributes in the same object, together with the array of raster values.
+# The **rioxarray** package, a wrapper of **rasterio**, however, represents rasters with **xarray** 'extended' arrays, an extension of **numpy** arrays designed to hold axis labels and attributes in the same object, together with the array of raster values.
# Similar approaches are provided by the less well-known **xarray-spatial** and **geowombat** packages.
-# Comparatively, **rasterio** is more well-established, but it is more low-level (which has both advantabes and distadvantages).
+# Comparatively, **rasterio** is better established, but it is more low-level (which has both advantages and disadvantages).
#
# All of the above-mentioned packages, however, are not exhaustive in the same way **geopandas** is.
-# For example, when working with **rasterio**, on the one hand, more packages may be needed to accomplish common tasks such as zonal statistics (package **rasterstats**) or calculating topographic indices (package **richdem**).
+# For example, when working with **rasterio**, more packages may be needed to accomplish common tasks such as zonal statistics (package **rasterstats**) or calculating topographic indices (package **richdem**).
#
# In the following two sections, we introduce **rasterio**, which is the raster-related package we are going to work with through the rest of the book.
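#
# Before turning to **rasterio**, the header arithmetic described above can be sketched without any package: given an origin and a resolution, a cell's coordinates follow from its row and column indices alone. The numbers below match the small example rasters created later in this chapter (see @sec-raster-from-scratch).

# In[ ]:


x_min, y_max = -1.5, 1.5  # origin: top-left corner of the grid
dx, dy = 0.5, -0.5        # resolution; dy is negative, as y decreases downwards
row, col = 2, 3           # pixel indices, counted from the top-left corner

# Coordinates of the top-left corner of the cell at (row, col)
x = x_min + col * dx
y = y_max + row * dy
x, y
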
@@ -759,11 +896,7 @@ # # To work with the **rasterio** package, we first need to import it. # Additionally, as the raster data is stored within **numpy** arrays, we import the **numpy** package and make all its functions accessible for effective data manipulation. -# -# # Finally, we import the **rasterio.plot** sub-module for its `rasterio.plot.show` function that allows for quick visualization of rasters. -# -# # In[ ]: @@ -776,18 +909,16 @@ # Rasters are typically imported from existing files. # When working with **rasterio**, importing a raster is actually a two-step process: # -# - First, we open a raster file "connection" using `rasterio.open` +# - First, we open a raster file 'connection' using `rasterio.open` # - Second, we read raster values from the connection using the `.read` method # -# This separation is analogous to basic Python functions for reading from files, such as `open` and `.readline` to read from a text file. +# This type of separation is analogous to basic Python functions for reading from files, such as `open` and `.readline` to read from a text file. # The rationale is that we do not always want to read all information from the file into memory, which is particularly important as rasters size can be larger than RAM size. -# -# # Accordingly, the second step (`.read`) is selective, meaning that the user can fine-tune the subset of values (bands, rows/columns, resolution, etc.) that are actually being read. # For example, we may want to read just one raster band rather than reading all bands. # -# In the first step, we pass a file path to the `rasterio.open` function to create a `DatasetReader` file connection. -# For this example, we use a single-band raster representing elevation in Zion National Park. +# In the first step, we pass a file path to the `rasterio.open` function to create a `DatasetReader` file connection, hereby named `src`. +# For this example, we use a single-band raster representing elevation in Zion National Park, stored in `srtm.tif`. # In[ ]: @@ -803,11 +934,12 @@ #| label: fig-rasterio-plot #| fig-cap: Basic plot of a raster, the data are coming from a **rasterio** file connection +#| out-width: 60% rasterio.plot.show(src); # The `DatasetReader` contains the raster metadata, that is, all of the information other than the raster values. -# Let us examine it with the `meta` property. +# Let's examine it with the `.meta` property. # In[ ]: @@ -815,13 +947,11 @@ src.meta -# -# # Namely, it allows us to see the following properties, which we will elaborate on below, and in later chapters: # # - `driver`---The raster file format (see @sec-data-output-raster) # - `dtype`---Data type (see @tbl-numpy-data-types) -# - `nodata`---The value being used as "No Data" flag (see @sec-data-output-raster) +# - `nodata`---The value being used as 'No Data' flag (see @sec-data-output-raster) # - Dimensions: # - `width`---Number of columns # - `height`---Number of rows @@ -831,7 +961,7 @@ # # The last item (i.e., `transform`) deserves more attention. # To position a raster in geographical space, in addition to the CRS, we must specify the raster *origin* ($x_{min}$, $y_{max}$) and resolution ($delta_{x}$, $delta_{y}$). 
-# In the transformation matrix notation, these data items are stored as follows: +# In the transformation matrix notation, assuming a regular grid, these data items are stored as follows: # # ```{text} # Affine(delta_x, 0.0, x_min, @@ -839,8 +969,9 @@ # ``` # # Note that, by convention, raster y-axis origin is set to the maximum value ($y_{max}$) rather than the minimum, and, accordingly, the y-axis resolution ($delta_{y}$) is negative. +# In other words, since the origin is in the *top*-left corner, advancing along the y-axis is done through negative steps (downwards). # -# Finally, the `.read` method of the `DatasetReader` is used to read the actual raster values. +# In the second step, the `.read` method of the `DatasetReader` is used to read the actual raster values. # Importantly, we can read: # # - All layers (as in `.read()`) @@ -849,12 +980,12 @@ # # Note that the layer indices start from `1`, contrary to the Python convention of the first index being `0`. # -# The resulting object is a **numpy** array [@numpy], with either two or three dimensions: +# The object returned by `.read` is a **numpy** array [@numpy], with either two or three dimensions: # # - *Three* dimensions, when reading more than one layer (e.g., `.read()` or `.read([1,2])`). In such case, the dimensions pattern is `(layers, rows, columns)` # - *Two* dimensions, when reading one specific layer (e.g., `.read(1)`). In such case, the dimensions pattern is `(rows, columns)` # -# Let's read the first (and only) layer from the `srtm.tif` raster, using the file connection object `src` using the `.read(1)` method. +# Let's read the first (and only) layer from the `srtm.tif` raster, using the file connection object `src` and the `.read` method. # In[ ]: @@ -862,7 +993,7 @@ src.read(1) -# The result is a two-dimensional **numpy** array in which each value represents the elevation of the corresponding pixel. +# The result is a two-dimensional **numpy** array where each value represents the elevation of the corresponding pixel. # # The relation between a **rasterio** file connection and the derived properties is summarized in @fig-rasterio-structure. # The file connection (created with `rasterio.open`) gives access to the two components of raster data: the metadata (via the `.meta` property) and the values (via the `.read` method). @@ -873,20 +1004,18 @@ # # In this section, we are going to demonstrate the creation of rasters from scratch. # We will construct two small rasters, `elev` and `grain`, which we will use in examples later in the book. -# Unlike creating a vector layer (see @sec-vector-layer-from-scratch), creating a raster from scratch is rarely needed in practice because aligning a raster with the proper spatial extent is challenging to do programmatically ("georeferencing" tools in GIS software are a better fit for the job). +# Unlike creating a vector layer (see @sec-vector-layer-from-scratch), creating a raster from scratch is rarely needed in practice because aligning a raster with the proper spatial extent is challenging to do programmatically ('georeferencing' tools in GIS software are a better fit for the job). # Nevertheless, the examples will be helpful to become more familiar with the **rasterio** data structures. 
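#
# Before doing so, here is a brief illustration of what the transformation matrix from the previous section encodes, assuming the `src` connection to `srtm.tif` is still open: multiplying an affine transform by a `(col, row)` tuple maps pixel indices to `(x, y)` coordinates.

# In[ ]:


import rasterio.transform

# Coordinates of the raster origin, i.e., the corner of the top-left pixel
print(src.transform * (0, 0))
# Center coordinates of the pixel at row 1, column 2
print(rasterio.transform.xy(src.transform, 1, 2))
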
# # Conceptually, a raster is an array combined with georeferencing information, whereas the latter comprises: # -# - A transformation matrix, linking pixel indices with coordinates in a particular coordinate system +# - A transformation matrix, containing the origin and resolution, thus linking pixel indices with coordinates in a particular coordinate system # - A CRS definition, specifying the association of that coordinate system with the surface of the earth (optional) # # Therefore, to create a raster, we first need to have an array with the values, and then supplement it with the georeferencing information. # Let's create the arrays `elev` and `grain`. # The `elev` array is a $6 \times 6$ array with sequential values from `1` to `36`. -# It can be created as follows using the `np.arange` function and `.reshape` method. -# -# +# It can be created as follows using the `np.arange` function and `.reshape` method from **numpy**. # In[ ]: @@ -895,8 +1024,8 @@ elev -# The `grain` array represents a categorical raster with values `0`, `1`, `2`, corresponding to categories "clay", "silt", "sand", respectively. -# We will create it from a specific arrangement of pixel values using the **numpy** `array` and `reshape` functions. +# The `grain` array represents a categorical raster with values `0`, `1`, `2`, corresponding to categories 'clay', 'silt', 'sand', respectively. +# We will create it from a specific arrangement of pixel values, using **numpy**'s `np.array` and `.reshape`. # In[ ]: @@ -914,8 +1043,6 @@ # Note that in both cases, we are using the `uint8` (unsigned integer in 8 bits, i.e., `0-255`) data type, which is sufficient to represent all possible values of the given rasters (see @tbl-numpy-data-types). -# -# # This is the recommended approach for a minimal memory footprint. # # What is missing now is the georeferencing information (see @sec-using-rasterio). @@ -924,7 +1051,8 @@ # - The origin ($x_{min}$, $y_{max}$) is at `-1.5,1.5` # - The raster resolution ($delta_{x}$, $delta_{y}$) is `0.5,-0.5` # -# We can add this information using [`rasterio.transform.from_origin`](rasterio.transform.from_origin), and specifying `west`, `north`, `xsize`, and `ysize` parameters. +# We can add this information using `rasterio.transform.from_origin`, and specifying `west`, `north`, `xsize`, and `ysize` parameters. +# The resulting transformation matrix object is hereby named `new_transform`. # In[ ]: @@ -963,10 +1091,10 @@ # At this point, we have two rasters, each composed of an array and related transformation matrix. # We can work with the raster using **rasterio** by: # -# - Passing the transformation matrix wherever actual raster pixel coordinates are important (such as in function `show` above) -# - Keeping in mind that any other layer we use in the analysis is in the same CRS of those coordinates +# - Passing the transformation matrix wherever actual raster pixel coordinates are important (such as in function `rasterio.plot.show` above) +# - Keeping in mind that any other layer we use in the analysis is in the same CRS # -# Finally, to export the raster for permanent storage, along with the CRS definition, we need to go through the following steps: +# Finally, to export the raster for permanent storage, along with the spatial metadata, we need to go through the following steps: # # 1. Create a raster file connection (where we set the transform and the CRS, among other settings) # 2. 
Write the array with raster values into the connection @@ -974,9 +1102,7 @@ # # Don't worry if the code below is unclear; the concepts related to writing raster data to file will be explained in @sec-data-output-raster. # For now, for completeness, and also to use these rasters in subsequent chapters without having to re-create them from scratch, we just provide the code for exporting the `elev` and `grain` rasters into the `output` directory. -# In the case of `elev`, we do it as follows with the `open`, `write`, and `close` methods of the **rasterio** package. -# -# +# In the case of `elev`, we do it as follows with the `rasterio.open`, `.write`, and `.close` functions and methods of the **rasterio** package. # In[ ]: @@ -998,7 +1124,7 @@ # Note that the CRS we (arbitrarily) set for the `elev` raster is WGS84, defined using `crs=4326` according to the EPSG code. # -# Exporting the `grain` raster is done in the same way, with the only difference being the array we write into the connection. +# Exporting the `grain` raster is done in the same way, with the only differences being the file name and the array we write into the connection. # In[ ]: @@ -1055,7 +1181,7 @@ # This can be seen in @fig-geocentric-vs-local, where the local datum is fitted to the area of Philippines, but is misaligned with most of the rest of the planet's surface. # Both datums in @fig-geocentric-vs-local are put on top of a geoid---a model of global mean sea level. # -# ![Geocentric and local geodetic datums shown on top of a geoid (in false color and the vertical exaggeration by 10,000 scale factor). Image of the geoid is adapted from the work of [@essd-11-647-2019].](https://r.geocompx.org/figures/02_datum_fig.png){#fig-geocentric-vs-local} +# ![Geocentric and local geodetic datums shown on top of a geoid (in false color and the vertical exaggeration by 10,000 scale factor). Image of the geoid is adapted from the work of [@essd-11-647-2019].](images/geocompr_02_datum_fig.png){#fig-geocentric-vs-local} # # ### Projected coordinate reference systems {#sec-projected-coordinate-reference-systems} # @@ -1098,7 +1224,7 @@ pyproj.CRS.from_epsg(4326) ## Printout of WGS84 CRS (EPSG:4326) -# A quick summary of different projections, their types, properties, and suitability can be found in "Map Projections" (1993) and at . +# A quick summary of different projections, their types, properties, and suitability can be found at . # We will expand on CRSs and explain how to project from one CRS to another in @sec-reproj-geo-data. # But, for now, it is sufficient to know: # @@ -1115,11 +1241,7 @@ zion.crs -# We can also illustrate the difference between a geographic and a projected CRS by plotting the `zion` data in both CRSs (@fig-zion-crs). Note that we are using the [`.grid`](https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.grid.html) method of **matplotlib** to draw grid lines on top of the plot. -# -# -# -# +# We can also illustrate the difference between a geographic and a projected CRS by plotting the `zion` data in both CRSs (@fig-zion-crs). Note that we are using the `.grid` method of **matplotlib** to draw grid lines on top of the plot. # In[ ]: @@ -1164,6 +1286,5 @@ # It is up to the user to determine which units the result is given in, and treat the result accordingly. # For example, if the area output was in $m^2$ and we need the result in $km^2$, then we need to divide the result by $1000^2$. 
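#
# For example, assuming the units of the `zion` layer's projected CRS are meters (as is the case for UTM zones), a minimal sketch of such a conversion is:

# In[ ]:


zion.area / 1000 ** 2  # per-feature area, converted from m^2 to km^2
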
# -# ## Exercises +# # -# ## References diff --git a/code/chapters/02-attribute-operations.py b/code/chapters/02-attribute-operations.py index 4482da49..6d9bb61f 100644 --- a/code/chapters/02-attribute-operations.py +++ b/code/chapters/02-attribute-operations.py @@ -1,23 +1,26 @@ #!/usr/bin/env python # coding: utf-8 +# --- +# jupyter: python3 +# --- +# # # Attribute data operations {#sec-attr} # # ## Prerequisites {.unnumbered} -# -# -# # In[ ]: #| echo: false -import matplotlib.pyplot as plt -import pandas as pd -pd.options.display.max_rows = 6 -pd.options.display.max_columns = 6 -pd.options.display.max_colwidth = 35 -plt.rcParams['figure.figsize'] = (5, 5) +import book_options + + +# In[ ]: + + +#| echo: false +import book_options_pdf # This chapter requires importing the following packages: @@ -38,6 +41,7 @@ #| echo: false +#| include: false import os from urllib.request import urlretrieve @@ -65,15 +69,15 @@ # # Attribute data is non-spatial information associated with geographic (geometry) data. # A bus stop provides a simple example: its position would typically be represented by latitude and longitude coordinates (geometry data), in addition to its name. -# The Elephant & Castle / New Kent Road bus stop in London, for example has coordinates of `-0.098` degrees longitude and `51.495` degrees latitude which can be represented as `POINT (-0.098 51.495)` using the Simple Feature representation described in @sec-spatial-class. +# A bus stop in London, for example, has coordinates of `-0.098` degrees longitude and `51.495` degrees latitude which can be represented as `POINT (-0.098 51.495)` using the Simple Feature representation described in @sec-spatial-class. # Attributes, such as the name of the bus stop, are the topic of this chapter. # # Another example of an attribute is the elevation value for a specific grid cell in raster data. # Unlike the vector data model, the raster data model stores the coordinate of the grid cell indirectly, meaning the distinction between attribute and spatial information is less clear. -# Think of a pixel in the 3rd row and the 4th column of a raster matrix: its spatial location is defined by its index in the matrix. +# Think of a pixel in the 3^rd^ row and the 4^th^ column of a raster matrix: its spatial location is defined by its index in the matrix. # In this case, we need to move four cells in the x direction (typically east/right on maps) and three cells in the y direction (typically south/down) from the origin. # The raster's resolution is also important as it defines the distance for each x- and y-step. -# The resolution and the origin are stored in the raster's header, which is a vital component of raster datasets which specifies how pixels relate to geographic coordinates (see also @sec-spatial-operations). +# The resolution and the origin are stored in the raster's metadata (header), which is a vital component of raster datasets which specifies how pixels relate to geographic coordinates (see also @sec-spatial-operations). # # This chapter teaches how to manipulate geographic objects based on attributes such as the names of bus stops in a vector dataset and elevations of pixels in a raster dataset. # For vector data, this means techniques such as subsetting and aggregation (see @sec-vector-attribute-subsetting and @sec-vector-attribute-aggregation). @@ -82,17 +86,17 @@ # This is good news: skills developed in this chapter are cross-transferable. # @sec-spatial-operations extends the methods presented here to the spatial world. 
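#
# As a concrete illustration of the attribute/geometry split described above, the bus stop can be sketched as a one-row vector layer (the name string is illustrative):

# In[ ]:


import shapely
import geopandas as gpd

bus_stop = gpd.GeoDataFrame(
    {'name': ['Bus stop']},                    # attribute data
    geometry=[shapely.Point(-0.098, 51.495)],  # geometry data
    crs=4326
)
bus_stop
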
# -# After a deep dive into various types of vector attribute operations in the next section, raster attribute data operations are covered in @sec-raster-subsetting, which demonstrates extracting cell values from one or more layer (raster subsetting). +# After a deep dive into various types of vector attribute operations in the next section, raster attribute data operations are covered in @sec-raster-subsetting, which demonstrates extracting cell values from one or more layers (raster subsetting). # @sec-summarizing-raster-objects provides an overview of 'global' raster operations which can be used to summarize entire raster datasets. # # ## Vector attribute manipulation {#sec-vector-attribute-manipulation} # -# As mentioned in @sec-vector-layers, vector layers (`GeoDataFrame`, from package **geopandas**) are basically extended tables (`DataFrame` from package **pandas**), the difference being that a vector layer has a geometry column. +# As mentioned in @sec-vector-layers, vector layers (`GeoDataFrame`, from package **geopandas**) are basically extended tables (`DataFrame` from package **pandas**), the only differences being the geometry column and class. # Therefore, all ordinary table-related operations from package **pandas** are supported for **geopandas** vector layers as well, as shown below. # # ### Vector attribute subsetting {#sec-vector-attribute-subsetting} # -# **pandas** supports several subsetting interfaces, though the most [recommended](https://stackoverflow.com/questions/38886080/python-pandas-series-why-use-loc) ones are `.loc`, which uses **pandas** indices, and `.iloc`, which uses (implicit) **numpy**-style numeric indices. +# **pandas** supports several subsetting interfaces, though the most recommended ones are `.loc`, which uses **pandas** indices, and `.iloc`, which uses (implicit) **numpy**-style numeric indices. # # In both cases, the method is followed by square brackets, and two indices, separated by a comma. # Each index can be: @@ -100,33 +104,19 @@ # - A specific value, as in `1` # - A `list`, as in `[0,2,4]` # - A slice, as in `0:3` -# - `:`---indicating "all" indices, as in `[:]` +# - `:`---indicating 'all' indices, as in `[:]` # -# An exception to this rule is selecting columns using a list, which we do using shorter notation, as in `df[['a','b']]`, instead of `df.loc[:, ['a','b']]`, to select columns `'a'` and `'b'` from `df`. +# An exception to this guideline is selecting columns using a list, which we do using shorter notation, as in `df[['a','b']]`, instead of `df.loc[:, ['a','b']]`, to select columns `'a'` and `'b'` from `df`. # # Here are few examples of subsetting the `GeoDataFrame` of world countries (@fig-gdf-plot). # First, we are subsetting rows by position. -# This can be done using the three following approaches, which all return the same result. -# -# -# -# -# In the expression #1, we are using the expressive notation `[0:3,:]`, meaning "rows 1,2,3, all columns". Keep in mind that indices in Python start from 0, and slices are inclusive of the start and exclusive of the end.; therefore, `0:3` means indices `0`, `1`, `2`, i.e., first three rows in this example. In expression #2, we omit the columns index, as well as the starting index, that is, `[:3]`, doing the same with less code. In expression #3, we are using the `.head` method to select the first N rows. +# In the first example, we are using `[0:3,:]`, meaning 'rows 1,2,3, all columns'. 
Keep in mind that indices in Python start from 0, and slices are inclusive of the start and exclusive of the end; therefore, `0:3` means indices `0`, `1`, `2`, i.e., first three rows in this example. +# # In[ ]: -#| eval: false -world.iloc[0:3, :] # approach #1 -world.iloc[:3] # approach #2 -world.head(3) # approach #3 - - -# In[ ]: - - -#| echo: false -world.head(3) +world.iloc[0:3, :] # Subsetting columns by position requires specifying that we want to keep all of the rows (`:`) and then the indexes of the columns we want to keep. @@ -145,7 +135,7 @@ world.iloc[0:3, 0:3] -# Subsetting columns by name is not done with the `.iloc` method, but requires specifying the column names directly in a double square bracket `[[` notation. +# Subsetting columns by name is not done with the `.iloc` method, but instead requires specifying the column names in `.loc`, or directly in a double square bracket `[[` notation. # In[ ]: @@ -153,7 +143,7 @@ world[['name_long', 'geometry']] -# To select many successive columns, we can use the `:` notation, as in `world.loc[:, 'name_long':'pop']`, which selects all columns from `name_long` to `pop` (inclusive). +# To select many successive columns, we can use the `:` (slice) notation, as in `world.loc[:, 'name_long':'pop']`, which selects all columns from `name_long` to `pop` (inclusive). # In[ ]: @@ -161,9 +151,6 @@ world.loc[:, 'name_long':'pop'] -# -# -# # Removing rows or columns is done using the `.drop` method. # We can remove specific rows by specifying their ids, e.g., dropping rows 2, 3, and 5 in the following example. @@ -181,7 +168,7 @@ world.drop(['name_long', 'continent'], axis=1) -# We can also rename columns using the [`.rename`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.rename.html) method, in which we pass a dictionary of the form `old_name:new_name` to the `columns` argument. +# We can also rename columns using the `.rename` method, in which we pass a dictionary with items of the form `old_name:new_name` to the `columns` argument. # In[ ]: @@ -192,7 +179,7 @@ # The standard **numpy** comparison operators (@tbl-comparison-operators) can be used in boolean subsetting with **pandas**/**geopandas**. # # | `Symbol` | `Name` | -# |---------------|---------------------------------| +# |:---------------:|:---------------------------------:| # | `==` | Equal to | # | `!=` | Not equal to | # | `>`, `<` | Greater/Less than | @@ -201,7 +188,7 @@ # # : Comparison operators that return boolean values (`True`/`False`). {#tbl-comparison-operators} # -# The following example demonstrates logical vectors for subsetting by creating a new `GeoDataFrame` object called `small_countries` that contains only those countries and other teritories from the `world` object whose surface area is smaller than 10,000 $km^2$. +# The following example demonstrates logical vectors for subsetting by creating a new `GeoDataFrame` object called `small_countries` that contains only those countries and other territories from the `world` object whose surface area is smaller than 10,000 $km^2$. # The first step is to create a logical vector (a `Series` object) that is `True` for countries with an area smaller than 10,000 $km^2$ and `False` otherwise. # Then, we use this vector to subset the `world` dataset, which returns a new `GeoDataFrame` object containing only the small countries. @@ -245,7 +232,9 @@ # Logical operators `&`, `|`, and `~` (@tbl-comparison-operators) can be used to combine multiple conditions. 
 # For example, here are all countries in North America or South America.
-# Keep in mind that the parentheses around each condition (here, and in analogous cases using other operators) are crucial; otherwise, due to Python's [precedence rules](https://docs.python.org/3/reference/expressions.html#operator-precedence), the `|` operator is executed before `==` and we get an error.
+# Keep in mind that the parentheses around each condition (here, and in analogous cases using other operators) are crucial; otherwise, due to Python's precedence rules[^python_precedence_rules], the `|` operator is executed before `==` and we get an error.
+#
+# [^python_precedence_rules]: [https://docs.python.org/3/reference/expressions.html#operator-precedence](https://docs.python.org/3/reference/expressions.html#operator-precedence)

 # In[ ]:

@@ -257,7 +246,7 @@
     .loc[:, ['name_long', 'continent']]


-# However, specifically, expressions combining multiple comparisons with `==` combined with `|` can be replaced with the [`.isin`](https://pandas.pydata.org/docs/reference/api/pandas.Series.isin.html) method and a `list` of values to compare with.
+# However, expressions combining multiple `==` comparisons with `|`, specifically, can be replaced with the `.isin` method and a `list` of values to compare with.
 # The advantage of `.isin` is that it results in more concise and easier-to-manage code, especially when the number of comparisons is large.
 # For example, the following expression gives the same result as above.

@@ -276,12 +265,12 @@
 # The aim is to find the `sum()` of country populations for each continent, resulting in a smaller table or vector layer (of continents).
 # Since aggregation is a form of data reduction, it can be a useful early step when working with large datasets.
 #
-# Attribute-based aggregation can be achieved using a combination of `.groupby` and `.sum`, where the former groups the data by the grouping variable(s) and the latter calculates the sum of the remaining columns.
+# Attribute-based aggregation can be achieved using a combination of `.groupby` and `.sum` (package **pandas**), where the former groups the data by the grouping variable(s) and the latter calculates the sum of the specified column(s). The `.reset_index` method moves the grouping variable into an ordinary column, rather than an index (the default), which is something we typically want to do.

 # In[ ]:


-world_agg1 = world[['continent', 'pop']].groupby('continent').sum()
+world_agg1 = world.groupby('continent')[['pop']].sum().reset_index()
 world_agg1


@@ -290,9 +279,7 @@
 # If we want to include the geometry in the aggregation result, we can use the `.dissolve` method.
 # That way, in addition to the summed population, we also get the associated geometry per continent, i.e., the union of all countries.
 # Note that we use the `by` parameter to choose which column(s) are used for grouping, and the `aggfunc` parameter to choose the aggregation function for non-geometry columns.
-# Note that the `.reset_index` method is used (here, and elsewhere in the book) to turn **pandas** and **geopandas** [*indices*](https://pandas.pydata.org/docs/reference/api/pandas.Index.html), which are automatically created for grouping variables in grouping operations such as `.dissolve`, "back" into ordinary columns, which are more appropriate in the scope of this book.
-# -# +# Again, note that the `.reset_index` method is used (here, and elsewhere in the book) to turn **pandas** and **geopandas** row *indices*, which are automatically created for grouping variables in grouping operations such as `.dissolve`, 'back' into ordinary columns, which are more appropriate in the scope of this book. # In[ ]: @@ -303,9 +290,7 @@ world_agg2 -# In this case, the resulting `world_agg2` object is a `GeoDataFrame` containing 8 features representing the continents of the world (and the open ocean) that we can plot (@fig-spatial-aggregation). The `plt.subplots` function is hereby used to control plot dimensions (to make the plot wider and narrower) (see @sec-static-styling). -# -# +# In this case, the resulting `world_agg2` object is a `GeoDataFrame` containing 8 features representing the continents of the world that we can plot (@fig-spatial-aggregation). The `plt.subplots` function is hereby used to control plot dimensions (to make the plot wider and narrower) (see @sec-static-styling). # In[ ]: @@ -323,8 +308,6 @@ # It is done by passing a dictionary to the `aggfunc` parameter, where the keys are the column names and the values are the aggregation functions. # The result is a `GeoDataFrame` object with 8 rows (one per continent) and 4 columns (one for the continent name and one for each of the three aggregated attributes). # The `rename` method is used to rename the `'name_long'` column into `'n'`, as it now expresses the count of names (i.e., the number of countries) rather than their names. -# -# # In[ ]: @@ -335,11 +318,11 @@ 'name_long': 'count', 'pop': 'sum', 'area_km2': 'sum' - }).rename(columns={'name_long': 'n'}) + }).rename(columns={'name_long': 'n'}).reset_index() world_agg3 -# Figure @fig-spatial-aggregation-different-functions visualizes the three aggregated attributes of our resulting layer `world_agg3`. +# @fig-spatial-aggregation-different-functions visualizes the three aggregated attributes of our resulting layer `world_agg3`. # In[ ]: @@ -365,9 +348,9 @@ # There are several other table-related operations that are possible, such as creating new columns or sorting the values. # In the following code example, given the `world_agg3` continent summary (@fig-spatial-aggregation-different-functions), we: # -# - drop the geometry columns, +# - drop the geometry column, # - calculate population density of each continent, -# - arrange continents by the number countries they contain, and +# - arrange continents by the number of countries each contains, and # - keep only the 3 most populous continents. # In[ ]: @@ -383,8 +366,8 @@ # ### Vector attribute joining {#sec-vector-attribute-joining} # # Combining data from different sources is a common task in data preparation. -# Joins do this by combining tables based on a shared "key" variable. -# **pandas** has a function named [`pd.merge`](https://pandas.pydata.org/docs/reference/api/pandas.merge.html) for joining `(Geo)DataFrames` based on common column(s) that follows conventions used in the database language SQL [@grolemund_r_2016]. +# Joins do this by combining tables based on a shared 'key' variable. +# **pandas** has a function named `pd.merge` for joining `(Geo)DataFrames` based on common column(s) that follows conventions used in the database language SQL [@grolemund_r_2016]. # The `pd.merge` result can be either a `DataFrame` or a `GeoDataFrame` object, depending on the inputs. # # A common type of attribute join on spatial data is to join `DataFrames` to `GeoDataFrames`. 
@@ -412,7 +395,7 @@ # The result is a `GeoDataFrame` object identical to the original `world` object, but with two new variables (`coffee_production_2016` and `coffee_production_2017`) on coffee production. # This can be plotted as a map, as illustrated (for `coffee_production_2017`) in @fig-join-coffee-production. -# Note that, here and in many other examples in later chapters, we are using a technique to plot two layers (all of the world countries outline, and coffee production with symbology) at once, which will be "formally" introduced towards the end of the book in @sec-plot-static-layers. +# Note that, here and in many other examples in later chapters, we are using a technique to plot two layers (all of the world countries outline, and coffee production with symbology) at once, which will be 'formally' introduced towards the end of the book in @sec-plot-static-layers. # # @@ -425,15 +408,13 @@ coffee_map = world_coffee.plot(ax=base, column='coffee_production_2017'); -# To work, attribute-based joins need a "key variable" in both datasets (`on` parameter of `pd.merge`). +# To work, attribute-based joins need a 'key variable' in both datasets (`on` parameter of `pd.merge`). # In the above example, both `world_coffee` and `world` DataFrames contained a column called `name_long`. # # ::: callout-note # By default, `pd.merge` uses all columns with matching names. However, it is recommended to explicitly specify the names of the columns to be used for matching, like we did in the last example. # ::: # -# -# # In case where column names are not the same, you can use `left_on` and `right_on` to specify the respective columns. # # Note that the result `world_coffee` has the same number of rows as the original dataset `world`. @@ -451,23 +432,6 @@ pd.merge(world, coffee_data, on='name_long', how='inner') -# -# -# -# -# -# -# -# # ### Creating attributes and removing spatial information {#sec-creating-attributes-and-removing-spatial-information} # # Often, we would like to create a new column based on already existing columns. @@ -496,11 +460,9 @@ # The resulting `GeoDataFrame` object has a new column called `con_reg` representing the continent and region of each country, e.g., `'South America:Americas'` for Argentina and other South America countries. -# The opposite operation, splitting one column into multiple columns based on a separator string, is done using the [`.str.split`](https://pandas.pydata.org/docs/reference/api/pandas.Series.str.split.html) method. -# As a result we go back to the previous state of two separate `continent` and `region_un` columns (only that their position is now last, since they are newly created). +# The opposite operation, splitting one column into multiple columns based on a separator string, is done using the `.str.split` method. +# As a result, we go back to the previous state of two separate `continent` and `region_un` columns (only that their position is now last, since they are newly created). # The `str.split` method returns a column of `list`s by default; to place the strings into separate `str` columns we use the `expand=True` argument. -# -# # In[ ]: @@ -519,7 +481,7 @@ world2.rename(columns={'name_long': 'name'}) -# To change all column names at once, we assign a `list` of the "new" column names into the `.columns` property. +# To change all column names at once, we assign a `list` of the 'new' column names into the `.columns` property. # The `list` must be of the same length as the number of columns (i.e., `world.shape[1]`). 
 # This is illustrated below, which outputs the same `world2` object, but with very short names.

@@ -563,10 +525,7 @@
 #
 # Raster cell values can be considered the counterpart of vector attribute values.
 # In this section, we cover operations that deal with raster values in a similar way, namely as a series of numbers.
-# This type of operations include subsetting raster values (@sec-raster-subsetting) and calculating global summaries of raster values (@sec-summarizing-raster-objects).
-#
-#
-#
+# These operations include subsetting raster values (@sec-raster-subsetting) and calculating global summaries of raster values (@sec-summarizing-raster-objects).
 #
 # ### Raster subsetting {#sec-raster-subsetting}
 #
@@ -586,7 +545,7 @@
 # In[ ]:


-elev[1, 2]  ## Value at row 2, column 3
+elev[1, 2]


 # Cell values can be modified by overwriting existing values in conjunction with a subsetting operation, e.g., `elev[1,2]=0` to set cell at row 2, column 3 of `elev` to `0`.
@@ -617,21 +576,18 @@


 # ::: callout-note
-# You can see that the above array is three-dimensional according to the number of brackets `[`, or check explicitly using `.shape` or [`.ndim`](https://numpy.org/doc/stable/reference/generated/numpy.ndarray.ndim.html).
+# You can see that the above array is three-dimensional according to the number of brackets `[`, or check explicitly using `.shape` or `.ndim`.
 # :::
 #
 # In three-dimensional arrays, we access cell values using three indices, keeping in mind that the dimension order is `(layers, rows, columns)`.
-# For example, to get the same value shown above, at row 2, column 3 (at band 1), we use `elev[0,1,2]` returns instead of `elev[1,2]`.
+# For example, to get the same value shown above, at row 2, column 3 (at band 1), we use `elev[0,1,2]` instead of `elev[1,2]`.

 # In[ ]:


-elev3d[0, 1, 2]  ## Value at band 1, row 2, column 3
+elev3d[0, 1, 2]


-#
-#
-#
 # ### Summarizing raster objects {#sec-summarizing-raster-objects}
 #
 # Global summaries of raster values can be calculated by applying **numpy** summary functions on the array with raster values, e.g., `np.mean`.
@@ -642,10 +598,8 @@
 np.mean(elev)


-# Note that "No Data"-safe functions--such as `np.nanmean`---should be used in case the raster contains "No Data" values which need to be ignored.
-# Before we can demonstrate that, we must convert the array from `int` to `float`, as `int` arrays cannot contain `np.nan` (due to [computer memory limitations](https://en.wikipedia.org/wiki/NaN#Integer_NaN)).
-#
-#
+# Note that 'No Data'-safe functions---such as `np.nanmean`---should be used in case the raster contains 'No Data' values which need to be ignored.
+# Before we can demonstrate that, we must convert the array from `int` to `float`, as `int` arrays cannot contain `np.nan` (due to computer memory limitations).

 # In[ ]:


@@ -656,9 +610,7 @@


 # Now we can insert an `np.nan` value into the array, for example to a cell located in the first row and third column.
-# (Trying to do so in the original `elev` array raises an error, because an `int` array cannot accomodate `np.nan`, as mentioned above; try it to see for yourself.)
-#
-#
+# (Doing so in the original `elev` array raises an error, because an `int` array cannot accommodate `np.nan`, as mentioned above; try it to see for yourself.)

 # In[ ]:


@@ -667,7 +619,7 @@
 elev1


-# With the `np.nan` value inplace, the summary value becomes unknown (`np.nan`).
+# With the `np.nan` value in place, the `np.mean` summary value becomes unknown (`np.nan`).
 # In[ ]:


@@ -675,7 +627,7 @@
 np.mean(elev1)


-# To get a summary of all non-missing values, we need to use the specialized **numpy** functions that ignore "No Data" values:
+# To get a summary of all non-missing values, we need to use one of the specialized **numpy** functions that ignore 'No Data' values, such as `np.nanmean`:

 # In[ ]:


@@ -684,20 +636,17 @@


 # Raster value statistics can be visualized in a variety of ways.
-# One approach is to "flatten" the raster values into a one-dimensional array (`flatten`), then use a graphical function such as [`plt.hist`](https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.hist.html) or [`plt.boxplot`](https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.boxplot.html) (from **matplotlib.pyplot**).
+# One approach is to 'flatten' the raster values into a one-dimensional array (using `.flatten`), then use a graphical function such as `plt.hist` or `plt.boxplot` (from **matplotlib.pyplot**).
 # For example, the following code section shows the distribution of values in `elev` using a histogram (@fig-raster-hist).

 # In[ ]:


 #| label: fig-raster-hist
-#| fig-cap: Distribution of cell values in continuous raster (`elev.tif`)
+#| fig-cap: Distribution of cell values in a continuous raster (`elev.tif`)
 plt.hist(elev.flatten());


-#
-#
-#
 # To summarize the distribution of values in a categorical raster, we can calculate the frequencies of unique values, and draw them using a barplot.
 # Let's demonstrate this using the small categorical raster `grain.tif`.

@@ -708,7 +657,7 @@
 grain


-# To calculate the frequency of unique values in an array, we use the [`np.unique`](https://numpy.org/doc/stable/reference/generated/numpy.unique.html) with the `return_counts=True` option.
+# To calculate the frequency of unique values in an array, we use the `np.unique` function with the `return_counts=True` option.
 # The result is a `tuple` with two corresponding arrays: the unique values, and their counts.

 # In[ ]:


@@ -718,7 +667,7 @@
 freq


-# These two arrays can be passed to the [`plt.bar`](https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.bar.html) function to draw a barplot, as shown in @fig-raster-bar.
+# These two arrays can be passed to the `plt.bar` function to draw a barplot, as shown in @fig-raster-bar.
# In[ ]: @@ -728,6 +677,5 @@ plt.bar(*freq); -# ## Exercises +# # -# ## References diff --git a/code/chapters/03-spatial-operations.py b/code/chapters/03-spatial-operations.py index ec83a62c..71915f3f 100644 --- a/code/chapters/03-spatial-operations.py +++ b/code/chapters/03-spatial-operations.py @@ -1,6 +1,10 @@ #!/usr/bin/env python # coding: utf-8 +# --- +# jupyter: python3 +# --- +# # # Spatial data operations {#sec-spatial-operations} # # ## Prerequisites {.unnumbered} @@ -9,12 +13,14 @@ #| echo: false -import matplotlib.pyplot as plt -import pandas as pd -pd.set_option('display.max_rows', 4) -pd.set_option('display.max_columns', 6) -pd.options.display.max_colwidth = 35 -plt.rcParams['figure.figsize'] = (5, 5) +import book_options + + +# In[ ]: + + +#| echo: false +import book_options_pdf # This chapter requires importing the following packages: @@ -24,6 +30,8 @@ import os import numpy as np +import matplotlib.pyplot as plt +import pandas as pd import scipy.ndimage import scipy.stats import shapely @@ -34,7 +42,7 @@ import rasterio.features -# It also relies on the following data files: +# It also relies on the following data files: # In[ ]: @@ -68,14 +76,20 @@ # ## Introduction # -# -# -# -# Spatial operations, including spatial joins between vector datasets and local and focal operations on raster datasets, are a vital part of geocomputation. This chapter shows how spatial objects can be modified in a multitude of ways based on their location and shape. Many spatial operations have a non-spatial (attribute) equivalent, so concepts such as subsetting and joining datasets demonstrated in the previous chapter are applicable here. This is especially true for vector operations: @sec-vector-attribute-manipulation on vector attribute manipulation provides the basis for understanding its spatial counterpart, namely spatial subsetting (covered in @sec-spatial-subsetting-vector). Spatial joining (@sec-spatial-joining) and aggregation (@sec-vector-spatial-aggregation) also have non-spatial counterparts, covered in the previous chapter. +# Spatial operations, including spatial joins between vector datasets and local and focal operations on raster datasets, are a vital part of geocomputation. +# This chapter shows how spatial objects can be modified in a multitude of ways based on their location and shape. Many spatial operations have a non-spatial (attribute) equivalent, so concepts such as subsetting and joining datasets demonstrated in the previous chapter are applicable here. +# This is especially true for vector operations: @sec-vector-attribute-manipulation on vector attribute manipulation provides the basis for understanding its spatial counterpart, namely spatial subsetting (covered in @sec-spatial-subsetting-vector). +# Spatial joining (@sec-spatial-joining) and aggregation (@sec-vector-spatial-aggregation) also have non-spatial counterparts, covered in the previous chapter. # -# Spatial operations differ from non-spatial operations in a number of ways, however. Spatial joins, for example, can be done in a number of ways---including matching entities that intersect with or are within a certain distance of the target dataset---while the attribution joins discussed in @sec-vector-attribute-joining in the previous chapter can only be done in one way. Different types of spatial relationship between objects, including intersects and disjoint, are described in @sec-topological-relations. 
Another unique aspect of spatial objects is distance: all spatial objects are related through space, and distance calculations can be used to explore the strength of this relationship, as described in the context of vector data in @sec-distance-relations. +# Spatial operations differ from non-spatial operations in a number of ways, however. +# Spatial joins, for example, can be done in a number of ways---including matching entities that intersect with or are within a certain distance of the target dataset---while the attribution joins discussed in @sec-vector-attribute-joining in the previous chapter can only be done in one way. +# Different types of spatial relationships between objects, including intersects and disjoints, are described in @sec-topological-relations. +# Another unique aspect of spatial objects is distance: all spatial objects are related through space, and distance calculations can be used to explore the strength of this relationship, as described in the context of vector data in @sec-distance-relations. # -# Spatial operations on raster objects include subsetting---covered in @sec-spatial-subsetting-raster---and merging several raster 'tiles' into a single object, as demonstrated in @sec-merging-rasters. Map algebra covers a range of operations that modify raster cell values, with or without reference to surrounding cell values. The concept of map algebra, vital for many applications, is introduced in @sec-map-algebra; local, focal and zonal map algebra operations are covered in sections @sec-raster-local-operations, @sec-focal-operations, and @sec-zonal-operations, respectively. Global map algebra operations, which generate summary statistics representing an entire raster dataset, and distance calculations on rasters, are discussed in Section @sec-global-operations-and-distances. In the final section (@sec-merging-rasters) the process of merging two raster datasets is discussed and demonstrated with reference to a reproducible example. +# Spatial operations on raster objects include subsetting---covered in @sec-spatial-subsetting-raster---and merging several raster 'tiles' into a single object, as demonstrated in @sec-merging-rasters. +# Map algebra covers a range of operations that modify raster cell values, with or without reference to surrounding cell values. +# The concept of map algebra, vital for many applications, is introduced in @sec-map-algebra; local, focal, and zonal map algebra operations are covered in sections @sec-raster-local-operations, @sec-focal-operations, and @sec-zonal-operations, respectively. +# Global map algebra operations, which generate summary statistics representing an entire raster dataset, and distance calculations on rasters, are discussed in Section @sec-global-operations-and-distances. # # ::: callout-note # It is important to note that spatial operations that use two spatial objects rely on both objects having the same coordinate reference system, a topic that was introduced in @sec-coordinate-reference-systems-intro and which will be covered in more depth in @sec-reproj-geo-data. @@ -83,13 +97,8 @@ # # ## Spatial operations on vector data {#sec-spatial-vec} # -# -# -# # This section provides an overview of spatial operations on vector geographic data represented as Simple Features using the **shapely** and **geopandas** # packages. -# -# # @sec-spatial-ras then presents spatial operations on raster datasets, using the **rasterio** and **scipy** packages. 
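# As a quick illustration of the same-CRS requirement mentioned in the note above (a minimal sketch using the `nz` and `nz_height` layers loaded earlier), one way to check two layers before combining them is to compare their `.crs` properties:

# In[ ]:


# Spatial operations involving both layers assume a shared CRS;
# comparing the .crs properties is a simple sanity check
nz.crs == nz_height.crs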
 #
 # ### Spatial subsetting {#sec-spatial-subsetting-vector}
 #
@@ -99,7 +108,7 @@
 #
 # Spatial subsetting is the process of taking a spatial object and returning a new object containing only features that relate in space to another object.
 # Analogous to attribute subsetting (covered in @sec-vector-attribute-subsetting), subsets of `GeoDataFrame`s can be created with the square bracket (`[`) operator using the syntax `x[y]`, where `x` is a `GeoDataFrame` from which a subset of rows/features will be returned, and `y` is a boolean `Series`.
-# The difference is, that, in spatial subsetting `y` is created based on another geometry and using one of the binary geometry relation methods, such as `.intersects` (see @sec-topological-relations), rather based on comparison based on ordinary columns.
+# The difference is that, in spatial subsetting, `y` is created based on another geometry, using one of the binary geometry relation methods, such as `.intersects` (see @sec-topological-relations), rather than on a comparison of ordinary column values.
 #
 # To demonstrate spatial subsetting, we will use the `nz` and `nz_height` layers, which contain geographic data on the 16 main regions and 101 highest points in New Zealand, respectively (@fig-spatial-subset (a)), in a projected coordinate system.
 # The following expression creates a new object, `canterbury`, representing only one region --- Canterbury.

 # In[ ]:

@@ -111,8 +120,8 @@
 canterbury


-# Then, we use the `.intersects` method evaluate, for each of the `nz_height` points, whether they intersect with Canterbury.
-# The result `canterbury_height` is a boolean `Series` with the "answers".
+# Then, we use the `.intersects` method to evaluate, for each of the `nz_height` points, whether they intersect with Canterbury.
+# The result `canterbury_height` is a boolean `Series` with the 'answers'.

 # In[ ]:

@@ -154,7 +163,7 @@
 # Like in attribute subsetting (@sec-vector-attribute-subsetting), we are using a boolean series (`sel`), of the same length as the number of rows in the filtered table (`nz_height`), created based on a condition applied to the table itself.
 # The difference is that the condition is not a comparison of attribute values, but an evaluation of a spatial relation.
-# Namely, we evaluate whether each geometry of `nz_height` intersects with `canterbury` geometry, using the `.intersects` method.
+# Namely, we evaluate whether each geometry of `nz_height` intersects with the `canterbury` geometry, using the `.intersects` method.
 #
 # Various topological relations can be used for spatial subsetting, which determine the type of spatial relationship that features in the target object must have with the subsetting object to be selected.
 # These include touches, crosses, or within, as we will see shortly in @sec-topological-relations.
@@ -189,41 +198,26 @@
 canterbury_height2.plot(ax=base, color='None', edgecolor='red');


-# In case we need to subset according to several geometries at once, e.g., find out which points intersect with both Canterbury and Southland, we can dissolve the filtering subset, using `.unary_union`, before applying the `.intersects` (or any other) operator.
+# In case we need to subset according to several geometries at once, e.g., find out which points intersect with both Canterbury and Southland, we can dissolve the filtering subset, using `.union_all`, before applying the `.intersects` (or any other) operator.
 # For example, here is how we can subset the `nz_height` points which intersect with Canterbury or Southland.
-# (Note that we are also using the `.isin` method, as demonstrated in the end of @sec-vector-attribute-subsetting.) -# -# +# (Note that we are also using the `.isin` method, as demonstrated at the end of @sec-vector-attribute-subsetting.) # In[ ]: canterbury_southland = nz[nz['Name'].isin(['Canterbury', 'Southland'])] -sel = nz_height.intersects(canterbury_southland.unary_union) +sel = nz_height.intersects(canterbury_southland.union_all()) canterbury_southland_height = nz_height[sel] canterbury_southland_height -# -# -# -# - -# In[ ]: - - -#| eval: false -#| echo: false -nz_height.overlay(canterbury_southland) - - # @fig-spatial-subset2 shows the results of the spatial subsetting of `nz_height` points by intersection with Canterbury and Southland. # In[ ]: #| label: fig-spatial-subset2 -#| fig-cap: Spatial subsetting of points by intersection with more that one polygon +#| fig-cap: Spatial subsetting of points by intersection with more than one polygon #| fig-subcap: #| - Original points (red) #| - Spatial subset based on intersection (red), geometry used for subsetting (Canterbury and Southland) (grey) @@ -237,31 +231,31 @@ canterbury_southland_height.plot(ax=base, color='None', edgecolor='red'); -# The next section further explores different types of spatial relation, also known as binary predicates (of which `.intersects` and `.disjoint` are two examples), that can be used to identify whether or not two features are spatially related. +# The next section further explores different types of spatial relations, also known as binary predicates (of which `.intersects` and `.disjoint` are two examples), that can be used to identify whether or not two features are spatially related. # # ### Topological relations {#sec-topological-relations} # # Topological relations describe the spatial relationships between objects. -# "Binary topological relationships", to give them their full name, are logical statements (in that the answer can only be `True` or `False`) about the spatial relationships between two objects defined by ordered sets of points (typically forming points, lines and polygons) in two or more dimensions [@egenhofer_mathematical_1990]. +# 'Binary topological relationships', to give them their full name, are logical statements (in that the answer can only be `True` or `False`) about the spatial relationships between two objects defined by ordered sets of points (typically forming points, lines, and polygons) in two or more dimensions [@egenhofer_mathematical_1990]. # That may sound rather abstract and, indeed, the definition and classification of topological relations is based on mathematical foundations first published in book form in 1966 [@spanier_algebraic_1995], with the field of algebraic topology continuing into the 21st century [@dieck_algebraic_2008]. # # Despite their mathematical origins, topological relations can be understood intuitively with reference to visualizations of commonly used functions that test for common types of spatial relationships. # @fig-spatial-relations shows a variety of geometry pairs and their associated relations. # The third and fourth pairs in @fig-spatial-relations (from left to right and then down) demonstrate that, for some relations, order is important: while the relations equals, intersects, crosses, touches and overlaps are symmetrical, meaning that if `x.relation(y)` is true, `y.relation(x)` will also be true, relations in which the order of the geometries are important such as contains and within are not. 
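# For example, here is a minimal sketch (using hypothetical toy geometries, separate from the sample data used elsewhere in this chapter) of this asymmetry, and of the equivalence obtained by swapping both the method and the operands:

# In[ ]:


import shapely
big = shapely.Polygon([(0, 0), (0, 2), (2, 2), (2, 0)])
small = shapely.Point(1, 1)
# contains and within are not symmetrical: reversing the operands
# of the same method changes the answer...
print(big.contains(small), small.contains(big))
# ...while x.contains(y) is equivalent to y.within(x)
print(big.contains(small) == small.within(big))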
-
-#
 #
 # ::: callout-note
-# Notice that each geometry pair has a ["DE-9IM"](https://en.wikipedia.org/wiki/DE-9IM) string such as `FF2F11212`.
+# Notice that each geometry pair has a 'DE-9IM'[^de-9im] string such as `FF2F11212`.
 # DE-9IM strings describe the dimensionality (0=points, 1=lines, 2=polygons) of the pairwise intersections of the interior, boundary, and exterior, of two geometries (i.e., nine values of 0/1/2 encoded into a string).
 # This is an advanced topic beyond the scope of this book, but one which can be useful for understanding the difference between relation types, or for defining custom types of relations.
-# See the [DE-9IM strings](https://r.geocompx.org/spatial-operations#de-9im-strings) section in Geocomputation with R [@lovelace_geocomputation_2019].
-# Also note that the **shapely** package contains the `.relate` and `.relate_pattern` [methods](https://shapely.readthedocs.io/en/stable/manual.html#de-9im-relationships), for derive and test for DE-9IM patterns, respectively.
+# See the DE-9IM strings section in Geocomputation with R [@lovelace_geocomputation_2019].
+# Also note that the **shapely** package contains the `.relate` and `.relate_pattern` methods, to derive and to test for DE-9IM patterns, respectively.
 # :::
 #
-# ![Topological relations between vector geometries, inspired by Figures 1 and 2 in Egenhofer and Herring (1990). The relations for which the `x.relation(y)` is true are printed for each geometry pair, with `x` represented in pink and `y` represented in blue. The nature of the spatial relationship for each pair is described by the Dimensionally Extended 9-Intersection Model string.](https://r.geocompx.org/04-spatial-operations_files/figure-html/relations-1.png){#fig-spatial-relations}
+# [^de-9im]: [https://en.wikipedia.org/wiki/DE-9IM](https://en.wikipedia.org/wiki/DE-9IM)
+#
+# ![Topological relations between vector geometries, inspired by Figures 1 and 2 in @egenhofer_mathematical_1990. The relations for which the `x.relation(y)` is true are printed for each geometry pair, with `x` represented in pink and `y` represented in blue. The nature of the spatial relationship for each pair is described by the Dimensionally Extended 9-Intersection Model string.](images/relations-1.png){#fig-spatial-relations}
 #
-# In **shapely**, methods testing for different types of topological relations are known as ["relationships"](https://shapely.readthedocs.io/en/stable/manual.html#relationships).
+# In **shapely**, methods testing for different types of topological relations are known as 'relationships'.
 # **geopandas** provides their wrappers (with the same method name) which can be applied on multiple geometries at once (such as `.intersects` and `.disjoint` applied on all points in `nz_height`, see @sec-spatial-subsetting-vector).
 # To see how topological relations work in practice, let's create a simple reproducible example, building on the relations illustrated in @fig-spatial-relations and consolidating knowledge of how vector geometries are represented from a previous chapter (@sec-geometry-columns and @sec-geometries).

 # In[ ]:

@@ -282,15 +276,13 @@


 # The sample dataset which we created is composed of three `GeoSeries`, named `points`, `line`, and `poly`, which are visualized in @fig-spatial-relations-geoms.
-# The last expression is a `for` loop used to add text labels (`1`, `2`, and `3`) to identify the points; we are going to explain the concepts of text annotations with **geopandas** `.plot` in @sec-plot-static-labels.
-# -# +# The last expression is a `for` loop used to add text labels (`0`, `1`, and `2`) to identify the points; we are going to explain the concepts of text annotations with **geopandas** `.plot` in @sec-plot-static-labels. # In[ ]: #| label: fig-spatial-relations-geoms -#| fig-cap: Points, line and polygon objects arranged to illustrate topological relations +#| fig-cap: Points (`points`), line (`line`), and polygon (`poly`) objects used to illustrate topological relations base = poly.plot(color='lightgrey', edgecolor='red') line.plot(ax=base, color='black', linewidth=7) points.plot(ax=base, color='none', edgecolor='black') @@ -302,7 +294,7 @@ # A simple query is: which of the points in `points` intersect in some way with polygon `poly`? -# The question can be answered by visual inspection (points 1 and 3 are touching and are within the polygon, respectively). +# The question can be answered by visual inspection (points `0` and `2` are touching and are within the polygon, respectively). # Alternatively, we can get the solution with the `.intersects` method, which reports whether or not each geometry in a `GeoSeries` (`points`) intersects with a single `shapely` geometry (`poly.iloc[0]`). # In[ ]: @@ -312,17 +304,17 @@ # The result shown above is a boolean `Series`. -# Its contents should match our intuition: positive (`True`) results are returned for the first and third point, and a negative result (`False`) for the second. +# Its contents should match our intuition: positive (`True`) results are returned for the points `0` and `2`, and a negative result (`False`) for point `1`. # Each value in this `Series` represents a feature in the first input (`points`). # -# All earlier examples in this chapter demonstrate the "many-to-one" mode of `.intersects` and analogous methods, where the relation is evaluated between each of several geometries in a `GeoSeries`/`GeoDataFrame`, and an individual `shapely` geometry. +# All earlier examples in this chapter demonstrate the 'many-to-one' mode of `.intersects` and analogous methods, where the relation is evaluated between each of several geometries in a `GeoSeries`/`GeoDataFrame`, and an individual `shapely` geometry. # A second mode of those methods (not demonstrated here) is when both inputs are `GeoSeries`/`GeoDataFrame` objects. -# In such case, a "pairwise" evaluation takes place between geometries aligned by index (`align=True`, the default) or by position (`align=False`). +# In such case, a 'pairwise' evaluation takes place between geometries aligned by index (`align=True`, the default) or by position (`align=False`). # For example, the expression `nz.intersects(nz)` returns a `Series` of 16 `True` values, indicating (unsurprisingly) that each geometry in `nz` intersects with itself. # -# A third mode is when we are interested in a "many-to-many" evaluation, i.e., obtaining a matrix of all pairwise combinations of geometries from two `GeoSeries` objects. +# A third mode is when we are interested in a 'many-to-many' evaluation, i.e., obtaining a matrix of all pairwise combinations of geometries from two `GeoSeries` objects. # At the time of writing, there is no built-in method to do this in **geopandas**. -# However, the [`.apply`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.apply.html) method can be used to repeat a "many-to-one" evaluation over all geometries in the second layer, resulting in a matrix of *pairwise* results. 
+# However, the `.apply` method (package **pandas**) can be used to repeat a 'many-to-one' evaluation over all geometries in the second layer, resulting in a matrix of *pairwise* results.
 # We will create another `GeoSeries` with two polygons, named `poly2`, to demonstrate this.

 # In[ ]:

@@ -350,11 +342,9 @@
 )


-# Now we can use to get the intersection relations matrix.
-#
-#
-# The result is a `DataFrame`, where each row represents a `points` geometry and each column represents a `poly` geometry.
-# We can see that the first point intersects with both polygons, while the second and third points intersect with one of the polygons each.
+# Now we can use `.apply` to get the intersection relations matrix.
+# The result is a `DataFrame`, where each row represents a `points` geometry and each column represents a `poly2` geometry.
+# We can see that the point `0` intersects with both polygons, while points `1` and `2` intersect with one of the polygons each.

 # In[ ]:


@@ -363,7 +353,7 @@


 # ::: callout-note
-# The [`.apply`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.apply.html) method (package **pandas**) is used to apply a function along one of the axes of a `DataFrame` (or `GeoDataFrame`).
+# The `.apply` method (package **pandas**) is used to apply a function along one of the axes of a `DataFrame` (or `GeoDataFrame`).
 # That is, we can apply a function on all rows (`axis=1`) or all columns (`axis=0`, the default).
 # When the function being applied returns a single value, the output of `.apply` is a `Series` (e.g., `.apply(len)` returns the lengths of all columns, because `len` returns a single value).
 # When the function returns a `Series`, then `.apply` returns a `DataFrame` (such as in the above example).
 # :::
 #
 # ::: callout-note
 # Since the above result, like any pairwise matrix, (1) is composed of values of the same type, and (2) has no contrasting role for rows and columns, it may be more convenient to use a plain **numpy** array to work with it.
-# In such case, we can use the [`.to_numpy`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_numpy.html) method to go from `DataFrame` to `ndarray`.
-#
-#
+# In such case, we can use the `.to_numpy` method to go from `DataFrame` to `ndarray`.

 # In[ ]:


@@ -383,7 +371,7 @@


 # :::
 #
-# The `.intersects` method returns `True` even in cases where the features just touch: intersects is a 'catch-all' topological operation which identifies many types of spatial relation, as illustrated in @fig-spatial-relations.
+# The `.intersects` method returns `True` even in cases where the features just touch: intersects is a 'catch-all' topological operation which identifies many types of spatial relations, as illustrated in @fig-spatial-relations.
 # More restrictive questions include: which points lie within the polygon, and which features are on or contain a shared boundary with it?
 # The first question can be answered with `.within`, and the second with `.touches`.

@@ -399,7 +387,7 @@
 points.touches(poly.iloc[0])


-# Note that although the first point touches the boundary polygon, it is not within it; the third point is within the polygon but does not touch any part of its border.
+# Note that although the point `0` touches the boundary polygon, it is not within it; point `2` is within the polygon but does not touch any part of its border.
 # The opposite of `.intersects` is `.disjoint`, which returns only objects that do not spatially relate in any way to the selecting object.
 # In[ ]:


 points.disjoint(poly.iloc[0])


-# Another useful type of relation is "within distance", where we detect features that intersect with the target buffered by particular distance.
+# Another useful type of relation is 'within distance', where we detect features that intersect with the target buffered by a particular distance.
 # Buffer distance determines how close target objects need to be before they are selected.
 # This can be done by literally buffering (@sec-geometries) the target geometry, and evaluating intersection (`.intersects`).
 # Another way is to calculate the distances using the `.distance` method, and then evaluate whether they are within a threshold distance.

@@ -419,11 +407,9 @@
 points.distance(poly.iloc[0]) < 0.2


-#
-#
-# Note that although the second point is more than `0.2` units of distance from the nearest vertex of `poly`, it is still selected when the distance is set to `0.2`.
-# This is because distance is measured to the nearest edge, in this case the part of the polygon that lies directly above point 2 in Figure @fig-spatial-relations.
-# We can verify the actual distance between the second point and the polygon is `0.13`, as follows.
+# Note that although point `1` is more than `0.2` units of distance from the nearest vertex of `poly`, it is still selected when the distance is set to `0.2`.
+# This is because distance is measured to the nearest edge, in this case, the part of the polygon that lies directly above point `1` in @fig-spatial-relations-geoms.
+# We can verify that the actual distance between point `1` and the polygon is `0.13`, as follows.

 # In[ ]:


 points.iloc[1].distance(poly.iloc[0])


-# This is also a good opportunity to repeat that all distance-related calculations in **gopandas** (and **shapely**) assume planar geometry, and only take into account the coordinate values. It is up to the user to make sure that all input layers are in the same projected CRS, so that this type of calculations make sense (see @sec-geometry-operations-on-projected-and-unprojected-data and @sec-when-to-reproject).
+# This is also a good opportunity to repeat that all distance-related calculations in **geopandas** (and **shapely**) assume planar geometry, and only take into account the coordinate values. It is up to the user to make sure that all input layers are in the same projected CRS, so that this type of calculation makes sense (see @sec-geometry-operations-on-projected-and-unprojected-data and @sec-when-to-reproject).
 #
 # ### Spatial joining {#sec-spatial-joining}
 #
@@ -442,8 +428,6 @@
 # The following example illustrates the process: imagine you have ten points randomly distributed across the Earth's surface and you ask, for the points that are on land, which countries are they in?
 # Implementing this idea in a reproducible example will build your geographic data handling skills and show how spatial joins work.
 # The starting point is to create points that are randomly scattered over the planar surface that represents Earth's geographic coordinates, in decimal degrees (@fig-spatial-join (a)).
-#
-#

 # In[ ]:

@@ -457,13 +441,13 @@
 random_points


-# The scenario illustrated in @fig-spatial-join shows that the `random_points` object (top left) lacks attribute data, while the world (top right) has attributes, including country names shown for a sample of countries in the legend.
+# The scenario illustrated in @fig-spatial-join shows that the `random_points` object (top left) lacks attribute data, while the world (top right) has attributes, including country names that are shown for a sample of countries in the legend.
 # Before creating the joined dataset, we use spatial subsetting to create `world_random`, which contains only countries that contain random points, to verify that the number of country names returned in the joined dataset should be four (see the top right panel of @fig-spatial-join (b)).

 # In[ ]:


-world_random = world[world.intersects(random_points.unary_union)]
+world_random = world[world.intersects(random_points.union_all())]
 world_random


@@ -505,7 +489,7 @@
 #
 # Sometimes two geographic datasets do not touch but still have a strong geographic relationship.
 # The datasets `cycle_hire` and `cycle_hire_osm` provide a good example.
-# Plotting them reeveals that they are often closely related but they do not seem to touch, as shown in @fig-cycle-hire.
+# Plotting them reveals that they are often closely related but they do not seem to touch, as shown in @fig-cycle-hire.

 # In[ ]:

@@ -539,11 +523,7 @@
 # This is when a non-overlapping join is needed.
 # Spatial join (`gpd.sjoin`) along with buffered geometries (see @sec-buffers) can be used to do that, as demonstrated below using a threshold distance of 20 $m$.
-#
-#
 # Note that we transform the data to a projected CRS (`27700`) to use real buffer distances, in meters (see @sec-geometry-operations-on-projected-and-unprojected-data).
-#
-#

 # In[ ]:

@@ -551,7 +531,11 @@
 crs = 27700
 cycle_hire_buffers = cycle_hire.copy().to_crs(crs)
 cycle_hire_buffers.geometry = cycle_hire_buffers.buffer(20)
-cycle_hire_buffers = gpd.sjoin(cycle_hire_buffers, cycle_hire_osm.to_crs(crs))
+cycle_hire_buffers = gpd.sjoin(
+    cycle_hire_buffers,
+    cycle_hire_osm.to_crs(crs),
+    how='left'
+)
 cycle_hire_buffers


@@ -559,10 +543,6 @@
 # This is because some cycle hire stations in `cycle_hire_buffers` have multiple matches in `cycle_hire_osm`.
 # To aggregate the values for the overlapping points and return the mean, we can use the aggregation methods shown in @sec-vector-attribute-aggregation, resulting in an object with the same number of rows as the target.
 # We also go back from buffers to points using the `.centroid` method.
-#
-#
-#
-#

 # In[ ]:

@@ -583,7 +563,7 @@
 #| fig-cap: Non-overlapping join
 #| fig-subcap:
 #| - Input (`cycle_hire_osm`)
-#| - Join result (`z`)
+#| - Join result (`cycle_hire_buffers`)
 #| layout-ncol: 2
 # Input
 fig, ax = plt.subplots(1, 1, figsize=(6, 3))
@@ -629,7 +609,7 @@
 # In[ ]:


-nz_height2 = nz_height2.groupby('Name')[['elevation']].mean()
+nz_height2 = nz_height2.groupby('Name')[['elevation']].mean().reset_index()
 nz_height2


@@ -642,10 +622,10 @@
 nz2


-# We now have create the `nz_height4` layer, which gives the average `nz_height` elevation value per polygon.
+# We have now created the `nz2` layer, which gives the average `nz_height` elevation value per polygon.
 # The result is shown in @fig-nz-avg-nz-height.
 # Note that the `missing_kwds` part determines the style of geometries where the symbology attribute (`elevation`) is missing, because there were no `nz_height` points overlapping with them.
-# The default is to omit them, which is usually not what we want, but with `{'color':'none','edgecolor':'black'}`, those polygons are shown with black outline and no fill.
+# The default is to omit them, which is usually not what we want, but with `{'color':'grey','edgecolor':'black'}`, those polygons are shown with black outline and grey fill. # In[ ]: @@ -656,30 +636,23 @@ column='elevation', legend=True, cmap='Blues', edgecolor='black', - missing_kwds={'color': 'none', 'edgecolor': 'black'} + missing_kwds={'color': 'grey', 'edgecolor': 'black'} ); # ### Joining incongruent layers {#sec-joining-incongruent-layers} # -# -# -# # Spatial congruence is an important concept related to spatial aggregation. # An aggregating object (which we will refer to as `y`) is congruent with the target object (`x`) if the two objects have shared borders. # Often this is the case for administrative boundary data, whereby larger units---such as Middle Layer Super Output Areas (MSOAs) in the UK, or districts in many other European countries---are composed of many smaller units. # # Incongruent aggregating objects, by contrast, do not share common borders with the target [@qiu_development_2012]. # This is problematic for spatial aggregation (and other spatial operations) illustrated in @fig-nz-and-grid: aggregating the centroid of each sub-zone will not return accurate results. -# Areal interpolation overcomes this issue by transferring values from one set of areal units to another, using a range of algorithms including simple area weighted approaches and more sophisticated approaches such as 'pycnophylactic' methods [@tobler_smooth_1979]. +# Areal interpolation overcomes this issue by transferring values from one set of areal units to another, using a range of algorithms including simple area-weighted approaches and more sophisticated approaches such as 'pycnophylactic' methods [@tobler_smooth_1979]. # -# To demonstrate joining incongruent layers, we will create a "synthetic" layer comprising a [regular grid](https://gis.stackexchange.com/questions/322589/rasterizing-polygon-grid-in-python-geopandas-rasterio) of rectangles of size $100\times100$ $km$, covering the extent of the `nz` layer. +# To demonstrate joining incongruent layers, we will create a 'synthetic' layer comprising a regular grid of rectangles of size $100\times100$ $km$, covering the extent of the `nz` layer. # This recipe can be used to create a regular grid covering any given layer (other than `nz`), at the specified resolution (`res`). -# Most of the functions have been explained in previous chapters; we leave it as an exerise for the reader to explore how the code works. -# -# -# -# +# Most of the functions have been explained in previous chapters; we leave it as an exercise for the reader to explore how the code works. # In[ ]: @@ -723,13 +696,13 @@ column='Population', edgecolor='black', legend=True, - cmap='viridis_r' + cmap='Reds' ); -# Our goal, now, is to "transfer" the `'Population'` attribute (@fig-nz-and-grid) to the rectangular grid polygons, which is an example of a join between incongruent layers. +# Our goal, now, is to 'transfer' the `'Population'` attribute (@fig-nz-and-grid) to the rectangular grid polygons, which is an example of a join between incongruent layers. # To do that, we basically need to calculate--for each `grid` cell---the weighted sum of the population in `nz` polygons coinciding with that cell. -# The weights in the weighted sum calculation are the ratios between the area of the coinciding "part" out of the entire `nz` polygon. +# The weights in the weighted sum calculation are the ratios between the area of the coinciding 'part' out of the entire `nz` polygon. 
# That is, we (inevitably) assume that the population in each `nz` polygon is equally distributed across space, and therefore a partial `nz` polygon contains the respective partial population size.
# 
# We start by calculating the entire area of each `nz` polygon, as follows, using the `.area` method (@sec-area-length).

# In[ ]:

@@ -741,8 +714,8 @@
nz


-# Next, we use the [`.overlay`](https://geopandas.org/en/stable/docs/reference/api/geopandas.GeoDataFrame.overlay.html) method to calculate the pairwise intersections between `nz` and `grid`.
-# As a result, we now have a layer where each `nz` polygon is "split" according to the `grid` polygons, hereby named `nz_grid`.
+# Next, we use the `.overlay` method to calculate the pairwise intersections between `nz` and `grid`.
+# As a result, we now have a layer where each `nz` polygon is split according to the `grid` polygons, hereafter named `nz_grid`.

# In[ ]:

@@ -772,7 +745,7 @@
nz_grid


-# The resulting layer `nz_grid`, which the `area_sub` attribute, is shown in @fig-nz-and-grid2.
+# The resulting layer `nz_grid`, with the `area_sub` attribute, is shown in @fig-nz-and-grid2.

# In[ ]:

@@ -780,12 +753,18 @@
#| label: fig-nz-and-grid2
#| fig-cap: The areas of pairwise intersections in the `nz_grid` layer
base = grid.plot(color='none', edgecolor='grey')
-nz_grid.plot(ax=base, column='area_sub', edgecolor='black', legend=True, cmap='viridis_r');
+nz_grid.plot(
+    ax=base,
+    column='area_sub',
+    edgecolor='black',
+    legend=True,
+    cmap='Reds'
+);


-# Note that each of the "intersections" still holds the `Population` attribute of its "origin" feature of `nz`, i.e., each portion of the `nz` area is associated with the original complete population count for that area.
+# Note that each of the intersections still holds the `Population` attribute of its 'origin' feature of `nz`, i.e., each portion of the `nz` area is associated with the original complete population count for that area.
# The real population size of each `nz_grid` feature, however, is smaller or equal, depending on the proportion of the original `nz` feature's area that it occupies.
-# To make the "correction", we first calculate the ratio (`area_prop`) and then multiply it by the population.
+# To make the correction, we first calculate the ratio (`area_prop`) and then multiply it by the population.
# The new (lowercase) attribute `population` now has the correct estimate of population sizes in `nz_grid`:

# In[ ]:

@@ -797,7 +776,7 @@

# What is left to be done is to sum (see @sec-vector-attribute-aggregation) the population in all parts forming the same grid cell and join (see @sec-vector-attribute-joining) them back to the `grid` layer.
-# Note that many of the grid cells have "No Data" for population, because they have no intersection with `nz` at all (@fig-nz-and-grid).
+# Note that many of the grid cells have 'No Data' for population, because they have no intersection with `nz` at all (@fig-nz-and-grid).

# In[ ]:

@@ -814,7 +793,12 @@

#| label: fig-nz-and-grid3
#| fig-cap: 'The `nz` layer and a regular grid of rectangles: final result'
-base = grid.plot(column='population', edgecolor='black', legend=True, cmap='viridis_r');
+base = grid.plot(
+    column='population',
+    edgecolor='black',
+    legend=True,
+    cmap='Reds'
+);
nz.plot(ax=base, color='none', edgecolor='grey', legend=True);


@@ -834,24 +818,19 @@
# The procedure in this section is known as an area-weighted interpolation of a spatially *extensive* (e.g., population) variable.
# In extensive interpolation, we assume that the variable of interest represents counts (such as, here, inhabitants) uniformly distributed across space.
-# In such case, each "part" of a given polygon captures the respective proportion of counts (such as, half of a region with $N$ inhabitants contains $N/2$ ihnabitants).
+# In such a case, each part of a given polygon captures the respective proportion of counts (for example, half of a region with $N$ inhabitants contains $N/2$ inhabitants).
# Accordingly, summing the parts gives the total count of the whole area.
# 
# An area-weighted interpolation of a spatially *intensive* variable (e.g., population density) is almost identical, except that we would have to calculate the weighted `.mean` rather than `.sum`, to preserve the average rather than the sum.
# In intensive interpolation, we assume that the variable of interest represents counts per unit area, i.e., density.
-# Since density is (assumed to be) uniform, any "part" of a given polygon has exactly the same density as that of the whole polygon.
+# Since density is (assumed to be) uniform, any part of a given polygon has exactly the same density as that of the whole polygon.
# Density values are therefore computed as weighted averages, rather than sums, of the parts.
-# Also see the "Area-weighted interpolation" [section](https://r-spatial.org/book/05-Attributes.html#sec-area-weighted) in @pebesma_spatial_2023.
-# 
-# 
+# Also, see the 'Area-weighted interpolation' section in @pebesma_spatial_2023.
# 
# ### Distance relations {#sec-distance-relations}
# 
-# 
-# 
-# 
# While topological relations are binary---a feature either intersects with another or does not---distance relations are continuous.
-# The distance between two objects is calculated with the [`.distance`](https://geopandas.org/en/stable/docs/reference/api/geopandas.GeoSeries.distance.html) method.
+# The distance between two objects is calculated with the `.distance` method.
# The method is applied on a `GeoSeries` (or a `GeoDataFrame`), with the argument being an individual `shapely` geometry.
# The result is a `Series` of pairwise distances.
# 
@@ -859,25 +838,21 @@
# **geopandas** uses similar syntax and mode of operation for many of its methods and functions, including:
# 
# * Numeric calculations, such as `.distance` (this section), returning numeric values
-# * Topological evaluations methods, such as `.intersects` or `.disjoint` (@sec-topological-relations), returning boolean values
+# * Topological evaluation methods, such as `.intersects` or `.disjoint` (@sec-topological-relations), returning boolean values
# * Geometry-generating methods, such as `.intersection` (@sec-clipping), returning geometries
# 
-# In all cases, the input is a `GeoSeries` and (or a `GeoDataFrame`) and a `shapely` geometry, and the output is a `Series` or `GeoSeries` of results, contrasting each geometry from the `GeoSeries` with the `shapely` geometry. The examples in this book demonstrate this, so called "many-to-one", mode of the functions.
+# In all cases, the input is a `GeoSeries` (or a `GeoDataFrame`) and a `shapely` geometry, and the output is a `Series` or `GeoSeries` of results, contrasting each geometry from the `GeoSeries` with the `shapely` geometry.
+# The examples in this book demonstrate this so-called 'many-to-one' mode of the functions.
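+# For example, given a hypothetical `GeoSeries` named `gs` and a single **shapely** geometry named `geom` (both invented here for illustration), the three patterns look as follows:
+# 
+# ```python
+# gs.distance(geom)       # numeric Series: one distance per geometry in gs
+# gs.intersects(geom)     # boolean Series: one value per geometry in gs
+# gs.intersection(geom)   # GeoSeries: one geometry per element of gs
+# ```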
# 
# All of the above-mentioned methods also have a pairwise mode, perhaps less useful and not used in the book, where we evaluate relations between pairs of geometries in two `GeoSeries`, aligned either by index or by position.
# :::
# 
-# 
-# 
-# 
-# To illustrate the `.distance` method, let's take the three highest point in New Zealand with `.sort_values` and `.iloc`.
+# To illustrate the `.distance` method, let's take the three highest points in New Zealand with `.sort_values` and `.iloc`.

# In[ ]:


-nz_highest = nz_height \
-    .sort_values(by='elevation', ascending=False) \
-    .iloc[:3, :]
+nz_highest = nz_height.sort_values(by='elevation', ascending=False).iloc[:3, :]
nz_highest

@@ -897,7 +872,7 @@
nz_highest.distance(canterbury_centroid)


-# To obtain a distance matrix, i.e., a pairwise set of distances between all combinations of features in objects `x` and `y`, we need to use the method (analogous to the way we created the `.intersects` boolean matrix in @sec-topological-relations).
+# To obtain a distance matrix, i.e., a pairwise set of distances between all combinations of features in objects `x` and `y`, we need to use the `.apply` method (analogous to the way we created the `.intersects` boolean matrix in @sec-topological-relations).
# To illustrate this, let's now take two regions in `nz`, Otago and Canterbury, represented by the object `co`.

# In[ ]:

@@ -919,37 +894,39 @@

# Note that the distance between the second and third features in `nz_height` and the second feature in `co` is zero.
-# This demonstrates the fact that distances between points and polygons refer to the distance to any part of the polygon: the second and third points in `nz_height` are in Otago, which can be verified by plotting them (two almost completly overlappling points in @fig-nz-height-and-otago).
+# This demonstrates the fact that distances between points and polygons refer to the distance to any part of the polygon: the second and third points in `nz_height` are in Otago, which can be verified by plotting them (two almost completely overlapping points in @fig-nz-height-and-otago).

# In[ ]:


#| label: fig-nz-height-and-otago
#| fig-cap: The first three `nz_height` points, and the Otago and Canterbury regions from `nz`
-base = co.plot(color='lightgrey', edgecolor='black')
-nz_height.iloc[:3, :].plot(ax=base, color='none', edgecolor='black');
+fig, ax = plt.subplots()
+co.plot(color='lightgrey', edgecolor='black', ax=ax)
+co.apply(
+    lambda x: ax.annotate(
+        text=x['Name'],
+        xy=x.geometry.centroid.coords[0],
+        ha='center'
+    ),
+    axis=1
+)
+nz_height.iloc[:3, :].plot(color='none', edgecolor='black', ax=ax);


# ## Spatial operations on raster data {#sec-spatial-ras}
# 
-# 
-# 
-# 
# This section builds on @sec-manipulating-raster-objects, which highlights various basic methods for manipulating raster datasets, to demonstrate more advanced and explicitly spatial raster operations, and uses the `elev.tif` and `grain.tif` rasters manually created in @sec-raster-from-scratch.
# 
# ### Spatial subsetting {#sec-spatial-subsetting-raster}
# 
# The previous chapter (and especially @sec-manipulating-raster-objects) demonstrated how to retrieve values associated with specific row and column combinations from a raster.
-# 
-# 
# Raster values can also be extracted by location (coordinates) and other spatial objects.
-# To use coordinates for subsetting, we can use the [`.sample`](https://rasterio.readthedocs.io/en/stable/api/rasterio.io.html#rasterio.io.DatasetReader.sample) method of a `rasterio` file connection object, combined with a list of coordinate tuples. -# The methods is demonstrated below to find the value of the cell that covers a point located at coordinates of `(0.1,0.1)` in `elev`. +# To use coordinates for subsetting, we can use the `.sample` method of a `rasterio` file connection object, combined with a list of coordinate tuples. +# The method is demonstrated below to find the value of the cell that covers a point located at coordinates of `(0.1,0.1)` in `elev`. # The returned object is a *generator*. # The rationale for returning a generator, rather than a `list`, is memory efficiency. -# The number of sampled points may be huge, in which case we would want to "generate" the values one at a time rather than all at once. -# -# +# The number of sampled points may be huge, in which case we would want to generate the values one at a time rather than all at once. # In[ ]: @@ -993,25 +970,20 @@ #| fig-cap: The `elev.tif` raster, and two points where we extract its values fig, ax = plt.subplots() rasterio.plot.show(src_elev, ax=ax) -gpd.GeoSeries([shapely.Point(0.1, 0.1)]).plot(color='black', ax=ax) -gpd.GeoSeries([shapely.Point(1.1, 1.1)]).plot(color='black', ax=ax); +gpd.GeoSeries([shapely.Point(0.1, 0.1)]) \ + .plot(color='black', edgecolor='white', markersize=50, ax=ax) +gpd.GeoSeries([shapely.Point(1.1, 1.1)]) \ + .plot(color='black', edgecolor='white', markersize=50, ax=ax); -# -# -# # ::: callout-note # We elaborate on the plotting technique used to display the points and the raster in @sec-plot-static-layers. # We will also introduce a more user-friendly and general method to extract raster values to points, using the **rasterstats** package, in @sec-extraction-to-points. # ::: # # Another common use case of spatial subsetting is using a boolean mask, based on another raster with the same extent and resolution, or the original one, as illustrated in @fig-raster-subset. -# To do that, we "erase" the values in the array of one raster, according to another corresponding "mask" raster. -# For example, let us read (@sec-using-rasterio) the `elev.tif` raster values into an array named `elev` (@fig-raster-subset (a)), -# -# -# -# +# To do that, we erase the values in the array of one raster, according to another corresponding mask raster. +# For example, let's read (@sec-using-rasterio) the `elev.tif` raster values into an array named `elev` (@fig-raster-subset (a)). # In[ ]: @@ -1034,7 +1006,7 @@ # In other words, we want to mask `elev` with `mask`. # The result will be stored in a copy named `masked_elev` (@fig-raster-subset (c)). # In the case of `elev.tif`, to be able to store `np.nan` in the array of values, we also need to convert it to `float` (see @sec-summarizing-raster-objects). -# Afterwards, masking is a matter of assigning `np.nan` into a subset defined by the mask, using the ["boolean array indexing"](https://numpy.org/doc/stable/user/basics.indexing.html#boolean-array-indexing) syntax of **numpy**. +# Afterwards, masking is a matter of assigning `np.nan` into a subset defined by the mask, using the 'boolean array indexing' syntax of **numpy**. # In[ ]: @@ -1062,7 +1034,7 @@ rasterio.plot.show(masked_elev); -# The "mask" can be create from the array itself, using condition(s). +# The mask can be created from the array itself, using condition(s). 
# That way, we can replace some values (e.g., values assumed to be wrong) with `np.nan`, such as in the following example. # In[ ]: @@ -1078,13 +1050,13 @@ # # ### Map algebra {#sec-map-algebra} # -# The term 'map algebra' was coined in the late 1970s to describe a "set of conventions, capabilities, and techniques" for the analysis of geographic raster and (although less prominently) vector data [@tomlin_map_1994]. +# The term 'map algebra' was coined in the late 1970s to describe a 'set of conventions, capabilities, and techniques' for the analysis of geographic raster and (although less prominently) vector data [@tomlin_map_1994]. # In this context, we define map algebra more narrowly, as operations that modify or summarize raster cell values, with reference to surrounding cells, zones, or statistical functions that apply to every cell. # -# Map algebra operations tend to be fast, because raster datasets only implicitly store coordinates, hence the old adage "raster is faster but vector is corrector". +# Map algebra operations tend to be fast, because raster datasets only implicitly store coordinates, hence the old adage 'raster is faster but vector is corrector'. # The location of cells in raster datasets can be calculated by using its matrix position and the resolution and origin of the dataset (stored in the raster metadata, @sec-using-rasterio). # For the processing, however, the geographic position of a cell is barely relevant as long as we make sure that the cell position is still the same after the processing. -# Additionally, if two or more raster datasets share the same extent, projection and resolution, one could treat them as matrices for the processing. +# Additionally, if two or more raster datasets share the same extent, projection, and resolution, one could treat them as matrices for the processing. # # Map algebra (or cartographic modeling with raster data) divides raster operations into four subclasses [@tomlin_geographic_1990], with each working on one or several grids simultaneously: # @@ -1093,14 +1065,14 @@ # - Zonal operations are similar to focal operations, but the surrounding pixel grid on which new values are computed can have irregular sizes and shapes (@sec-zonal-operations) # - Global or per-raster operations; that means the output cell derives its value potentially from one or several entire rasters (@sec-global-operations-and-distances) # -# This typology classifies map algebra operations by the number of cells used for each pixel processing step and the type of the output. -# For the sake of completeness, we should mention that raster operations can also be classified by discipline such as terrain, hydrological analysis, or image classification. +# This typology classifies map algebra operations by the number of cells used for each pixel processing step and the type of output. +# For the sake of completeness, we should mention that raster operations can also be classified by disciplines such as terrain, hydrological analysis, or image classification. # The following sections explain how each type of map algebra operations can be used, with reference to worked examples. # # ### Local operations {#sec-raster-local-operations} # # Local operations comprise all cell-by-cell operations in one or several layers. -# Raster algebra is a classical use case of local operations---this includes adding or subtracting values from a raster, squaring and multiplying rasters. 
+# Raster algebra is a classical use case of local operations---this includes adding or subtracting values from a raster, squaring, and multiplying rasters.
# Raster algebra also allows logical operations such as finding all raster cells that are greater than a specific value (e.g., `5` in our example below).
# Local operations are applied using the **numpy** array operations syntax, as demonstrated below.
# 
@@ -1113,7 +1085,7 @@

# Now, any element-wise array operation can be applied using **numpy** arithmetic or conditional operators and functions, comprising local raster operations in spatial analysis terminology.
-# For example `elev + elev` adds the values of `elev` to itself, resulting in a raster with double values.
+# For example, `elev + elev` adds the values of `elev` to itself, resulting in a raster with double values.

# In[ ]:

@@ -1121,11 +1093,9 @@
elev + elev


-# Note that some functions and operators automatically change the data type to accommodate the resulting values, while other operators do not, potentially resulting in overflow (i.e., incorrect values for results beyond the data type range, such as trying to accomodate values above `255` in an `int8` array).
+# Note that some functions and operators automatically change the data type to accommodate the resulting values, while other operators do not, potentially resulting in overflow (i.e., incorrect values for results beyond the data type range, such as trying to accommodate values above `255` in an `int8` array).
# For example, `elev**2` (`elev` squared) results in overflow.
-# Since the `**` operator does not automatically change the data type, leaving it as `int8`, the resulting array has incorrect values for `16**2`, `17**2`, etc., which are above `255` and therefore cannot be accomodated.
-# 
-# 
+# Since the `**` operator does not automatically change the data type, leaving it as `int8`, the resulting array has incorrect values for `16**2`, `17**2`, etc., which are above `255` and therefore cannot be accommodated.

# In[ ]:

@@ -1134,9 +1104,7 @@

# To avoid this situation, we can, for instance, transform `elev` to the standard `int64` data type, using `.astype` before applying the `**` operator.
-# That way all, results up to `36**2` (`1296`) can be easily accomodated, since the `int64` data type supports values up to `9223372036854775807` (@tbl-numpy-data-types).
-# 
-# 
+# That way, all results up to `36**2` (`1296`) can be easily accommodated, since the `int64` data type supports values up to `9223372036854775807` (@tbl-numpy-data-types).

# In[ ]:

@@ -1146,10 +1114,6 @@

# Now we get correct results.
# 
-# ::: callout-note
-# **numpy** has the special data types `np.int_` and `np.float_`, which refer to "default" `int` and `float` data types. These are platform dependent, but typically resolve to `np.int64` and `np.float64`. Furthermore, the standard Python types `int` and `float` refer to those two **numpy** types, respectively. Therefore, for example, either of the three objects `np.int64`, `np.int_` and `int` can be passed to `.astype` in the above example, with identical result. Whereas we've used the shortest one, `int`.
-# :::
-# 
# @fig-raster-local-operations demonstrates the result of the last two examples (`elev+elev` and `elev.astype(int)**2`), and two other ones (`np.log(elev)` and `elev>5`).
# In[ ]:


@@ -1169,8 +1133,8 @@
rasterio.plot.show(elev > 5, cmap='Oranges');


-# Another good example of local operations is the classification of intervals of numeric values into groups such as grouping a digital elevation model into low (class `1`), middle (class `2`) and high elevations (class `3`).
-# Here, we assign the raster values in the ranges `0`--`12`, `12`--`24` and `24`--`36` are reclassified to take values `1`, `2` and `3`, respectively.
+# Another good example of local operations is the classification of intervals of numeric values into groups such as grouping a digital elevation model into low (class `1`), middle (class `2`) and high (class `3`) elevations.
+# Here, the raster values in the ranges `0`--`12`, `12`--`24`, and `24`--`36` are reclassified to take values `1`, `2`, and `3`, respectively.

# In[ ]:

@@ -1196,10 +1160,12 @@
rasterio.plot.show(recl, cmap='Oranges');


-# The calculation of the [Normalized Difference Vegetation Index (NDVI)](https://en.wikipedia.org/wiki/Normalized_difference_vegetation_index) is a well-known local (pixel-by-pixel) raster operation.
+# The calculation of the Normalized Difference Vegetation Index (NDVI)[^ndvi] is a well-known local (pixel-by-pixel) raster operation.
# It returns a raster with values between `-1` and `1`; positive values indicate the presence of living plants (mostly \> `0.2`).
-# NDVI is calculated from red and near-infrared (NIR) bands of remotely sensed imagery, typically from satellite systems such as Landsat or Sentinel 2.
-# Vegetation absorbs light heavily in the visible light spectrum, and especially in the red channel, while reflecting NIR light, which is emulated in the NVDI formula (@eq-ndvi).
+# NDVI is calculated from red and near-infrared (NIR) bands of remotely sensed imagery, typically from satellite systems such as Landsat or Sentinel-2.
+# Vegetation absorbs light heavily in the visible light spectrum, and especially in the red channel, while reflecting NIR light, which is emulated in the NDVI formula (@eq-ndvi),
+# 
+# [^ndvi]: [https://en.wikipedia.org/wiki/Normalized_difference_vegetation_index](https://en.wikipedia.org/wiki/Normalized_difference_vegetation_index)
# 
# $$
# NDVI=\frac{NIR-Red} {NIR+Red}
# $$ {#eq-ndvi}
# 
# , where $NIR$ is the near-infrared band and $Red$ is the red band.
# 
# Let's calculate NDVI for the multispectral Landsat satellite file (`landsat.tif`) of the Zion National Park.
+# The file `landsat.tif` contains surface reflectance values (range `0`-`1`) in the blue, green, red, and near-infrared (NIR) bands.
# We start by reading the file and extracting the NIR and red bands, which are the fourth and third bands, respectively.
# Next, we apply the formula to calculate the NDVI values.

# In[ ]:


+#| warning: false
landsat = src_landsat.read()
nir = landsat[3]
red = landsat[2]
ndvi = (nir-red)/(nir+red)


-# We also convert values \>`1` to "No Data".
-# 
-# 

-# In[ ]:


-ndvi[ndvi>1] = np.nan


# When plotting an RGB image with `rasterio.plot.show`, the function assumes that values are in the range `[0,1]` for floats, or `[0,255]` for integers (otherwise clipped), and the order of bands is RGB.
-# To "prepare" the multi-band raster for `rasterio.plot.show`, we therefore reverse the order of the first three bands (to go from B-G-R-NIR to R-G-B), using the `[:3]` slice to select the first three bands and then the `[::-1]` slice to reverse the bands order, and divide by the raster maximum to set the maximum value to `1`.
+# To prepare the multi-band raster for `rasterio.plot.show`, we therefore reverse the order of the first three bands (to go from B-G-R-NIR to R-G-B), using the `[:3]` slice to select the first three bands and then the `[::-1]` slice to reverse the band order, and divide by the raster maximum to set the maximum value to `1`.
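+# In schematic form, this preparation amounts to the following (a sketch only, using the `landsat` array read above):
+# 
+# ```python
+# rgb = landsat[:3][::-1]  # select the first three bands (B-G-R), then reverse to R-G-B
+# rasterio.plot.show(rgb / landsat.max());  # divide by the raster maximum so values are within [0,1]
+# ```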
-# -# +# To prepare the multi-band raster for `rasterio.plot.show`, we, therefore, reverse the order of the first three bands (to go from B-G-R-NIR to R-G-B), using the `[:3]` slice to select the first three bands and then the `[::-1]` slice to reverse the bands order, and divide by the raster maximum to set the maximum value to `1`. # In[ ]: @@ -1246,8 +1202,8 @@ # The default is to start from the beginning, go to the end, and use steps of `1`. # Otherwise, `start` is inclusive and `end` is exclusive, whereas negative `step` values imply going backwards starting from the end. # Also, always keep in mind that Python indices start from `0`. -# When subsetting two- or three-dimensional objects, indices for each dimension are separated by commas, where either index can be set to `:` meaning "all values". -# The last dimensions can also be omitted implying `:`, e.g., to subset the first three bands from a three-dimensional array `a` we can use either `a[:3,:,:]` or `a[:3]` +# When subsetting two- or three-dimensional objects, indices for each dimension are separated by commas, where either index can be set to `:` meaning 'all values'. +# The last dimensions can also be omitted implying `:`, e.g., to subset the first three bands from a three-dimensional array `a` we can use either `a[:3,:,:]` or `a[:3]`. # # In the above example: # @@ -1272,23 +1228,23 @@ # ### Focal operations {#sec-focal-operations} # -# While local functions operate on one cell at time (though possibly from multiple layers), focal operations take into account a central (focal) cell and its neighbors. -# The neighborhood (also named kernel, filter or moving window) under consideration is typically of $3 \times 3$ cells (that is, the central cell and its eight surrounding neighbors), but can take on any other (not necessarily rectangular) shape as defined by the user. +# While local functions operate on one cell at a time (though possibly from multiple layers), focal operations take into account a central (focal) cell and its neighbors. +# The neighborhood (also named kernel, filter, or moving window) under consideration is typically of $3 \times 3$ cells (that is, the central cell and its eight surrounding neighbors), but can take on any other (not necessarily rectangular) shape as defined by the user. # A focal operation applies an aggregation function to all cells within the specified neighborhood, uses the corresponding output as the new value for the central cell, and moves on to the next central cell (@fig-focal-filter). # Other names for this operation are spatial filtering and convolution [@burrough_principles_2015]. 
# 
-# ![Input raster (left) and resulting output raster (right) due to a focal operation---finding the minimum value in $3 \times 3$ moving windows.](https://r.geocompx.org/figures/04_focal_example.png){#fig-focal-filter}
+# ![Input raster (left) and resulting output raster (right) due to a focal operation---finding the minimum value in $3 \times 3$ moving windows.](images/04_focal_example.png){#fig-focal-filter}
# 
-# In Python, the [**scipy.ndimage**](https://docs.scipy.org/doc/scipy/tutorial/ndimage.html) [@scipy] package has a comprehensive collection of [functions](https://docs.scipy.org/doc/scipy/reference/ndimage.html#filters) to perform filtering of **numpy** arrays, such as:
+# In Python, the **scipy.ndimage** [@scipy] package has a comprehensive collection of functions to perform filtering of **numpy** arrays, such as:
# 
-# - [`scipy.ndimage.minimum_filter`](https://docs.scipy.org/doc/scipy/reference/generated/scipy.ndimage.minimum_filter.html)
-# - [`scipy.ndimage.maximum_filter`](https://docs.scipy.org/doc/scipy/reference/generated/scipy.ndimage.maximum_filter.html)
-# - [`scipy.ndimage.uniform_filter`](https://docs.scipy.org/doc/scipy/reference/generated/scipy.ndimage.uniform_filter.html) (i.e., mean filter)
-# - [`scipy.ndimage.median_filter`](https://docs.scipy.org/doc/scipy/reference/generated/scipy.ndimage.median_filter.html) etc.
+# - `scipy.ndimage.minimum_filter`,
+# - `scipy.ndimage.maximum_filter`,
+# - `scipy.ndimage.uniform_filter` (i.e., mean filter),
+# - `scipy.ndimage.median_filter`, etc.
# 
-# In this group of functions, we define the shape of the moving window with either one of `size`---a single number (e.g., `3`), or tuple (e.g., `(3,3)`), implying a filter of those dimensions or `footprint`---a boolean array, representing both the window shape and the identity of elements being included
+# In this group of functions, we define the shape of the moving window with either `size`---a single number (e.g., `3`) or a tuple (e.g., `(3,3)`) implying a filter of those dimensions---or `footprint`---a boolean array representing both the window shape and the identity of the elements being included.
# 
-# In addition to specific built-in filters, `convolve`---applies the sum function after multiplying by a custom `weights` array and `generic_filter`---makes it possible to pass any custom function, where the user can specify any type of custom window-based calculation.
+# In addition to the specific built-in filters, `convolve` applies the sum function after multiplying by a custom `weights` array, while `generic_filter` makes it possible to pass any custom function, where the user can specify any type of custom window-based calculation.
# 
# For example, here we apply the minimum filter with window size of `3` on `elev`.
# As a result, we now have a new array `elev_min`, where each value is the minimum in the corresponding $3 \times 3$ neighborhood in `elev`.

# In[ ]:

@@ -1301,12 +1257,12 @@

# Special care should be given to the edge pixels -- how should they be calculated?
-# The **scipy.ndimage** filtering functions give several options through the `mode` parameter (see the documentation of any filtering function, such as [scipy.ndimage.median_filter](https://docs.scipy.org/doc/scipy/reference/generated/scipy.ndimage.median_filter.html), for the definition of each mode): `reflect` (the default), `constant`, `nearest`, `mirror`, `wrap`.
+# The **scipy.ndimage** filtering functions give several options through the `mode` parameter (see the documentation of any filtering function, such as `scipy.ndimage.median_filter`, for the definition of each mode): `reflect` (the default), `constant`, `nearest`, `mirror`, `wrap`. # Sometimes artificially extending raster edges is considered unsuitable. -# In other words, we may wish the resulting raster to contain pixel values with "complete" windows only, for example to have a uniform sample size or because values in all directions matter (such as in topographic calculations). +# In other words, we may wish the resulting raster to contain pixel values with 'complete' windows only, for example, to have a uniform sample size or because values in all directions matter (such as in topographic calculations). # There is no specific option *not* to extend edges in **scipy.ndimage**. # However, to get the same effect, the edges of the filtered array can be assigned with `np.nan`, in a number of rows and columns according to filter size. -# For example, when using a filter of `size=3`, the outermost "layer" of pixels may be assigned with `np.nan`, reflecting the fact that these pixels have incomplete $3 \times 3$ neighborhoods: +# For example, when using a filter of `size=3`, the outermost 'layer' of pixels may be assigned with `np.nan`, reflecting the fact that these pixels have incomplete $3 \times 3$ neighborhoods (@fig-focal-filter): # In[ ]: @@ -1322,12 +1278,9 @@ # # Focal functions or filters play a dominant role in image processing. # For example, low-pass or smoothing filters use the mean function to remove extremes. -# By contrast, high-pass filters accentuate features. -# The line detection Laplace and Sobel filters might serve as an example here. -# -# +# By contrast, high-pass filters, often created with custom neighborhood weights, accentuate features. # -# In the case of categorical data, we can replace the mean with the mode, which is the most common value. +# In the case of categorical data, we can replace the mean with the mode, i.e., the most common value. # To demonstrate applying a mode filter, let's read the small sample categorical raster `grain.tif`. # In[ ]: @@ -1337,9 +1290,7 @@ grain -# There is no built-in filter function for a mode filter in **scipy.ndimage**, but we can use the [`scipy.ndimage.generic_filter`](https://docs.scipy.org/doc/scipy/reference/generated/scipy.ndimage.generic_filter.html) function along with a custom filtering function, internally utilizing [`scipy.stats.mode`](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.mode.html). -# -# +# There is no built-in filter function for a mode filter in **scipy.ndimage**, but we can use the `scipy.ndimage.generic_filter` function along with a custom filtering function, internally utilizing `scipy.stats.mode`. # In[ ]: @@ -1361,9 +1312,7 @@ # # Terrain processing is another important application of focal operations. # Such functions are provided by multiple Python packages, including the general purpose **xarray** package, and more specialized packages such as **richdem** and **pysheds**. 
-# 
-# 
-# Useful terrain [metrics](https://richdem.readthedocs.io/en/latest/python_api.html?highlight=TerrainAttribute#richdem.TerrainAttribute) include:
+# Useful terrain metrics include:
# 
# - Slope, measured in units of percent, degrees, or radians [@horn_1981]
# - Aspect, meaning each cell's downward slope direction [@horn_1981]
# 
# For example, each of these, as well as other terrain metrics, can be computed with the **richdem** package.
# 
# ::: callout-note
-# Terrain metrics are essentially focal filters with customized functions. Using `scipy.ndimage.generic_filter`, along with such custom functions, is an option for those who would like to calculate terrain metric through coding by hand and/or limiting their code dependencies. For example, the [How Aspect works](https://pro.arcgis.com/en/pro-app/latest/tool-reference/spatial-analyst/how-aspect-works.htm) and [How Slope works](https://pro.arcgis.com/en/pro-app/latest/tool-reference/spatial-analyst/how-slope-works.htm) pages from the ArcGIS Pro documentation provide exlanations and formulas of the required funtions for aspect and slope metrics (@fig-raster-slope), respectively, which can be translated to **numpy**-based functions to be used in `scipy.ndimage.generic_filter` to calculate those metrics.
+# Terrain metrics are essentially focal filters with customized functions.
+# Using `scipy.ndimage.generic_filter`, along with such custom functions, is an option for those who would like to calculate terrain metrics through coding by hand and/or to limit their code dependencies.
+# For example, the *How Aspect works*[^how_aspect_works] and *How Slope works*[^how_slope_works] pages from the ArcGIS Pro documentation provide explanations and formulas of the required functions for aspect and slope metrics (@fig-raster-slope), respectively, which can be translated to **numpy**-based functions to be used in `scipy.ndimage.generic_filter` to calculate those metrics.
# :::
# 
+# [^how_aspect_works]: [https://pro.arcgis.com/en/pro-app/latest/tool-reference/spatial-analyst/how-aspect-works.htm](https://pro.arcgis.com/en/pro-app/latest/tool-reference/spatial-analyst/how-aspect-works.htm)
+# 
+# [^how_slope_works]: [https://pro.arcgis.com/en/pro-app/latest/tool-reference/spatial-analyst/how-slope-works.htm](https://pro.arcgis.com/en/pro-app/latest/tool-reference/spatial-analyst/how-slope-works.htm)
+# 
-# Another extremely fast, memory-efficient, and concise, alternative, is to the use the GDAL program called [`gdaldem`](https://gdal.org/programs/gdaldem.html).
+# Another extremely fast, memory-efficient, and concise alternative is to use the GDAL program called `gdaldem`.
# `gdaldem` can be used to calculate slope, aspect, and other terrain metrics through a single command, accepting an input file path and exporting the result to a new file.
# This is our first example in the book of a situation where it may be worthwhile to leave the Python environment and utilize a GDAL program directly, rather than through its wrappers (such as **rasterio** and other Python packages), whether to access a computational algorithm not easily accessible in a Python package, or for GDAL's memory-efficiency and speed benefits.
# 
# ::: callout-note
-# GDAL contains a collection of over 40 [programs](https://gdal.org/programs/index.html), mostly aimed at raster processing. These include programs for fundamental operations, such as [`gdal_translate`](https://gdal.org/programs/gdal_translate.html#gdal-translate) (convert between raster file formats), [`gdalwarp`](https://gdal.org/programs/gdalwarp.html#gdalwarp) (raster reprojection), [`gdal_rasterize`](https://gdal.org/programs/gdal_rasterize.html#gdal-rasterize) (rasterize vector features), and [`gdal_merge.py`](https://gdal.org/programs/gdal_merge.html#gdal-merge) (raster mosaic), as well as numerous miscellaneous programs. In this book, we use **rasterio** for the above-mentioned operations, although the GDAL programs are a good alternative for those who are more comfortable with the command line. However, we do use two GDAL programs for tasks that are lacking in **rasterio** and not well-implemented in other Python packages: `gdaldem` (this section), and `gdal_contour` (@sec-raster-to-contours).
+# GDAL contains a collection of over 40 programs, mostly aimed at raster processing. These include programs for fundamental operations, such as:
+# 
+# * `gdal_translate`---convert between raster file formats
+# * `gdalwarp`---raster reprojection
+# * `gdal_rasterize`---rasterize vector features
+# * `gdal_merge.py`---raster mosaic
+# 
+# In this book, we use **rasterio** for the above-mentioned operations, although the GDAL programs are a good alternative for those who are more comfortable with the command line. However, we do use two GDAL programs for tasks that are lacking in **rasterio** and not well-implemented in other Python packages: `gdaldem` (this section), and `gdal_contour` (@sec-raster-to-contours).
# :::
-# 
-# 
# 
# GDAL, along with all of its programs, should be available in your Python environment, since GDAL is a dependency of **rasterio**.
# The following example, which should be run from the command line, takes the `srtm_32612.tif` raster (which we are going to create in @sec-reprojecting-raster-geometries, therefore it is in the `'output'` directory), calculates slope (in decimal degrees, between `0` and `90`), and exports the result to a new file `srtm_32612_slope.tif`.

# In[ ]:

@@ -1397,7 +1357,7 @@

# Here we ran the `gdaldem` command through `os.system`, in order to remain in the Python environment, even though we are calling an external program.
-# You can also run the standalone command in the command line interface you are using, such as the Anaconda Prompt:
+# Alternatively, you can run the standalone command in the command line interface you are using, such as the Anaconda Prompt:
# 
# ```{sh}
# gdaldem slope output/srtm_32612.tif output/srtm_32612_slope.tif
# ```
@@ -1414,13 +1374,7 @@

# @fig-raster-slope shows the results, using our more familiar plotting methods from **rasterio**.
-# The code section is relatively long due to the workaround to create a color key (see @sec-plot-symbology) and removing "No Data" flag values from the arrays so that the color key does not include them. Also note that we are using one of **matplotlib**'s the [cyclic color scales](https://matplotlib.org/stable/users/explain/colors/colormaps.html#cyclic) (`'twilight'`) when plotting aspect (@fig-raster-slope (c)).
-# 
-# 
-# 
-# 
-# 
-# 
+# The code section is relatively long due to the workaround to create a color key (see @sec-plot-symbology) and removing 'No Data' flag values from the arrays so that the color key does not include them. Also note that we are using one of **matplotlib**'s cyclic color scales (`'twilight'`) when plotting aspect (@fig-raster-slope (c)).
# In[ ]:


@@ -1460,14 +1414,14 @@

# Just like focal operations, zonal operations apply an aggregation function to multiple raster cells.
# However, a second raster, usually with categorical values, defines the zonal filters (or 'zones') in the case of zonal operations, as opposed to a predefined neighborhood window in the case of focal operations presented in the previous section.
# Consequently, raster cells defining the zonal filter do not necessarily have to be neighbors.
-# Our `grain.tif` raster is a good example, as illustrated in @fig-rasterio-plot-elev: different grain sizes are spread irregularly throughout the raster.
+# Our `grain.tif` raster is a good example, as illustrated in @fig-rasterio-plot-grain: different grain sizes are spread irregularly throughout the raster.
# Finally, the result of a zonal operation is a summary table grouped by zone, which is why this operation is also known as zonal statistics in the GIS world.
# This is in contrast to focal operations (@sec-focal-operations) which return a raster object.
# 
# To demonstrate, let's get back to the `grain.tif` and `elev.tif` rasters.
# To calculate zonal statistics, we use the arrays with raster values, which we already imported earlier.
# Our intention is to calculate the average (or any other summary function, for that matter) of *elevation* in each zone defined by *grain* values.
-# To do that, first we first obtain the unique values defining the zones using [`np.unique`](https://numpy.org/doc/stable/reference/generated/numpy.unique.html).
+# To do that, we first obtain the unique values defining the zones using `np.unique`.

# In[ ]:


np.unique(grain)


-# Now, we can use [dictionary comprehension](https://docs.python.org/3/tutorial/datastructures.html#dictionaries) to "split" the `elev` array into separate one-dimensional arrays with values per `grain` group, with keys being the unique `grain` values.
-# 
-# 
+# Now, we can use dictionary comprehension (see note below) to split the `elev` array into separate one-dimensional arrays with values per `grain` group, with keys being the unique `grain` values.

# In[ ]:

@@ -1487,7 +1439,7 @@

# ::: callout-note
-# [List comprehension](https://docs.python.org/3/tutorial/datastructures.html#list-comprehensions) and dictionary comprehension are concise ways to create a `list` or a `dict`, respectively, from an iterable object.
+# *List comprehension* and *dictionary comprehension* are concise ways to create a `list` or a `dict`, respectively, from an iterable object.
# Both are, conceptually, a concise syntax to replace `for` loops where we iterate over an object and return a same-length object with the results.
# Here are minimal examples of list and dictionary comprehension, respectively, to demonstrate the idea:

@@ -1519,19 +1471,15 @@

# In the first case, one can calculate the distance from each cell to specific target cells or vector geometries.
# For example, one might want to compute the distance to the nearest coast (see @sec-distance-to-nearest-geometry).
# We might also want to consider topography; that is, we are not only interested in the pure distance but would also like to avoid crossing mountain ranges when going to the coast.
-# To do so, we can weight the distance with elevation so that each additional altitudinal meter "prolongs" the Euclidean distance (this is beyond the scope of the book).
-# 
-# 
+# To do so, we can weight the distance with elevation so that each additional altitudinal meter 'prolongs' the Euclidean distance (this is beyond the scope of the book).
# Visibility and viewshed computations also belong to the family of global operations (also beyond the scope of the book).
-# 
-# 
# 
# ### Map algebra counterparts in vector processing
# 
# Many map algebra operations have a counterpart in vector processing [@liu_essential_2009].
-# Computing a distance raster (global operation) while only considering a maximum distance (logical focal operation) is the equivalent to a vector buffer operation (@sec-buffers).
+# Computing a distance raster (global operation) while only considering a maximum distance (logical focal operation) is the equivalent of a vector buffer operation (@sec-buffers).
# Reclassifying raster data (either local or zonal function depending on the input) is equivalent to dissolving vector data (@sec-geometry-unions).
-# Overlaying two rasters (local operation), where one contains "No Data" values representing a mask, is similar to vector clipping (Section @sec-clipping).
+# Overlaying two rasters (local operation), where one contains 'No Data' values representing a mask, is similar to vector clipping (@sec-clipping).
# Quite similar to spatial clipping is intersecting two layers (@sec-spatial-subsetting-vector, @sec-joining-incongruent-layers).
# The difference is that these two layers (vector or raster) simply share an overlapping area.
# However, be careful with the wording.
@@ -1543,13 +1491,12 @@
# 
# Suppose we would like to compute the NDVI (see @sec-raster-local-operations), and additionally want to compute terrain attributes from elevation data for observations within a study area.
# Such computations rely on remotely sensed information.
-# The corresponding source imagery is often divided into scenes covering a specific spatial extent (i.e., "tiles"), and frequently, a study area covers more than one scene.
-# Then, we would need to merge (also known as "mosaic") the scenes covered by our study area.
-# In case when all scenes are "aligned" (i.e., share the same origin and resolution), this can be thought of as simply gluing them into one big raster; otherwise, all scenes should be resampled (see @sec-raster-resampling) to the grid defined by the first scene.
+# The corresponding source imagery is often divided into scenes covering a specific spatial extent (i.e., tiles), and frequently, a study area covers more than one scene.
+# Then, we would need to merge (also known as mosaic) the scenes covering our study area.
+# If all scenes are aligned (i.e., share the same origin and resolution), this can be thought of as simply gluing them into one big raster; otherwise, all scenes need to be resampled (see @sec-raster-resampling) to the same grid (e.g., the one defined by the first scene).
# 
-# For example, let us merge digital elevation data from two SRTM elevation tiles, for Austria (`'aut.tif'`) and Switzerland (`'ch.tif'`).
-# Merging can be done using function `rasterio.merge.merge`, which accepts a `list` of raster file connections, and returns the new `ndarray` and a "transform", representing the resulting mosaic.
-# 
+# For example, let's merge digital elevation data from two SRTM elevation tiles, for Austria (`'aut.tif'`) and Switzerland (`'ch.tif'`).
+# Merging can be done using the function `rasterio.merge.merge`, which accepts a `list` of raster file connections and returns the new `ndarray` and the corresponding transform object, representing the resulting mosaic.

# In[ ]:

@@ -1586,7 +1533,7 @@
rasterio.plot.show(out_image, transform=out_transform);


-# By default in `rasterio.merge.merge` (`method='first'`), areas of overlap retain the value of the *first* raster.
+# By default in `rasterio.merge.merge`, areas of overlap retain the value of the *first* raster (`method='first'`).
# Other possible methods are: 
# 
# - `'last'`---Value of the last raster
@@ -1596,13 +1543,12 @@
# When dealing with non-overlapping tiles, such as `aut.tif` and `ch.tif` (above), the `method` argument has no practical effect.
# However, it becomes relevant when we want to combine spectral imagery from scenes that were taken on different dates.
# The above four options for `method` do not cover the commonly required scenario when we would like to compute the *mean* value---for example, to calculate a seasonal average NDVI image from a set of partially overlapping satellite images (such as Landsat).
-# An alternative worflow to `rasterio.merge.merge`, for calculating a mosaic as well as "averaging" any overlaps, is to go through two steps:
+# An alternative workflow to `rasterio.merge.merge`, for calculating a mosaic as well as averaging any overlaps, is to go through two steps:
# 
-# - Resampling all scenes into a common "global" grid (@sec-raster-resampling), thereby producing a series of "matching" rasters (with the area surrounding each scene set as "No Data")
-# - Averaging the rasters through raster algebra (@sec-raster-local-operations), using `np.mean(m,axis=0)` or `np.nanmean(m,axis=0)` (depending whether we prefer to ignore "No Data" or not), where `m` is the multi-band array, which would return a single-band array of averages
+# - Resampling all scenes into a common 'global' grid (@sec-raster-resampling), thereby producing a series of matching rasters (with the area surrounding each scene set as 'No Data')
+# - Averaging the rasters through raster algebra (@sec-raster-local-operations), using `np.mean(m,axis=0)` or `np.nanmean(m,axis=0)` (depending on whether we prefer to ignore 'No Data' or not), where `m` is the multi-band array, which would return a single-band array of averages
# 
-# ## Exercises
+# 
# 
-# ## References

diff --git a/code/chapters/04-geometry-operations.py b/code/chapters/04-geometry-operations.py
index 12743eca..d0803111 100644
--- a/code/chapters/04-geometry-operations.py
+++ b/code/chapters/04-geometry-operations.py
@@ -1,6 +1,10 @@
 #!/usr/bin/env python
 # coding: utf-8
 
+# ---
+# jupyter: python3
+# ---
+
 # # Geometry operations {#sec-geometric-operations}
 # 
 # ## Prerequisites {.unnumbered}
@@ -9,14 +13,20 @@


#| echo: false
-import matplotlib.pyplot as plt
-import pandas as pd
-pd.options.display.max_rows = 6
-pd.options.display.max_columns = 6
-pd.options.display.max_colwidth = 35
-plt.rcParams['figure.figsize'] = (5, 5)
+import book_options


+# ::: {.content-visible when-format="pdf"}

# In[ ]:


+#| echo: false
+import book_options_pdf


+# :::
+# 
# This chapter requires importing the following packages:

# In[ ]:


import sys
import numpy as np
+import matplotlib.pyplot as plt
+import pandas as pd
import shapely
import geopandas as gpd
import topojson as tp
@@ -58,7 +70,7 @@
# 
# @sec-geo-ras covers geometric transformations on raster objects.
# This involves changing the size and number of the underlying pixels, and assigning them new values. -# It teaches how to change the extent and the origin of a raster "manually" (@sec-extent-and-origin), how to change the resolution in fixed "steps" through aggregation and disaggregation (@sec-raster-agg-disagg), and finally how to resample a raster into any existing template, which is the most general and often most practical approach (@sec-raster-resampling). +# It teaches how to change the extent and the origin of a raster manually (@sec-extent-and-origin), how to change the resolution in fixed steps through aggregation and disaggregation (@sec-raster-agg-disagg), and finally how to resample a raster into any existing template, which is the most general and often most practical approach (@sec-raster-resampling). # These operations are especially useful if one would like to align raster datasets from diverse sources. # Aligned raster objects share a one-to-one correspondence between pixels, allowing them to be processed using map algebra operations (@sec-raster-local-operations). # @@ -73,10 +85,10 @@ # # ### Simplification {#sec-simplification} # -# Simplification is a process for generalization of vector objects (lines and polygons) usually for use in smaller scale maps. -# Another reason for simplifying objects is to reduce the amount of memory, disk space and network bandwidth they consume: it may be wise to simplify complex geometries before publishing them as interactive maps. -# The **geopandas** package provides the [`.simplify`](https://geopandas.org/en/stable/docs/reference/api/geopandas.GeoSeries.simplify.html) method, which uses the GEOS implementation of the Douglas-Peucker algorithm to reduce the vertex count. -# `.simplify` uses the `tolerance` to control the level of generalization in map units [@douglas_algorithms_1973]. +# Simplification is a process for generalization of vector objects (lines and polygons) usually for use in smaller-scale maps. +# Another reason for simplifying objects is to reduce the amount of memory, disk space, and network bandwidth they consume: it may be wise to simplify complex geometries before publishing them as interactive maps. +# The **geopandas** package provides the `.simplify` method, which uses the GEOS implementation of the Douglas-Peucker algorithm to reduce the vertex count. +# `.simplify` uses `tolerance` to control the level of generalization in map units [@douglas_algorithms_1973]. # # For example, a simplified geometry of a `'LineString'` geometry, representing the river Seine and tributaries, using tolerance of `2000` meters, can be created using the `seine.simplify(2000)` command (@fig-simplify-lines). @@ -108,10 +120,6 @@ # This is illustrated using `us_states`, representing the contiguous United States. # As we show in @sec-reproj-geo-data, for many calculations **geopandas** (through **shapely**, and, ultimately, GEOS) assumes that the data is in a projected CRS and this could lead to unexpected results when applying distance-related operators. # Therefore, the first step is to project the data into some adequate projected CRS, such as US National Atlas Equal Area (EPSG:`9311`) (on the left in Figure @fig-simplify-polygons), using `.to_crs` (@sec-reprojecting-vector-geometries). -# -# -# -# # In[ ]: @@ -128,20 +136,17 @@ # A limitation with `.simplify`, however, is that it simplifies objects on a per-geometry basis. 
-# This means the "topology" is lost, resulting in overlapping and "holey" areal units as illustrated in @fig-simplify-polygons (b).
+# This means the topology is lost, resulting in overlapping and 'holey' areal units as illustrated in @fig-simplify-polygons (b).
# The `.toposimplify` method from package **topojson** provides an alternative that overcomes this issue.
-# By [default](https://mattijn.github.io/topojson/example/settings-tuning.html#simplify_algorithm) it uses the Douglas-Peucker algorithm like the `.simplify` method.
-# However, another algorithm, known as Visvalingam-Whyatt, which overcomes some limitations of the Douglas-Peucker algorithm [@visvalingam_line_1993], is also available in `.toposimplify`.
+# The main advantage of `.toposimplify` is that it is topologically 'aware': it simplifies the combined borders of the polygons (rather than each polygon on its own), thus ensuring that the overlap is maintained.
# The following code chunk uses `.toposimplify` to simplify `us_states9311`.
-# Note that, when using the **topojson** package, we first need to calculate a "topology" object, using function `tp.Topology`, and then apply the sumplification function, such as `.toposimplify`, to obtain a simplified layer.
+# Note that, when using the **topojson** package, we first need to calculate a topology object, using the function `tp.Topology`, and then apply the simplification function, such as `.toposimplify`, to obtain a simplified layer.
# We are also using the `.to_gdf` method to return a `GeoDataFrame`.
-# 
-# 

# In[ ]:


+#| warning: false
topo = tp.Topology(us_states9311, prequantize=False)
us_states_simp2 = topo.toposimplify(100000).to_gdf()

@@ -167,7 +172,7 @@
# 
# Centroid operations identify the center of geographic objects.
# Like statistical measures of central tendency (including mean and median definitions of 'average'), there are many ways to define the geographic center of an object.
-# All of them create single point representations of more complex vector objects.
+# All of them create single-point representations of more complex vector objects.
# 
# The most commonly used centroid operation is the geographic centroid.
# This type of centroid operation (often referred to as 'the centroid') represents the center of mass in a spatial object (think of balancing a plate on your finger).
@@ -192,7 +197,7 @@
seine_pos = seine.representative_point()


-# The centroids and points in surface are illustrated in @fig-centroid-pnt-on-surface.
+# The centroids and points on surface are illustrated in @fig-centroid-pnt-on-surface.

# In[ ]:


@@ -203,6 +208,7 @@
#| fig-subcap:
#| - New Zealand
#| - Seine
+
# New Zealand
base = nz.plot(color='white', edgecolor='lightgrey')
nz_centroid.plot(ax=base, color='None', edgecolor='black')
@@ -223,19 +229,27 @@
# 
# @fig-buffers illustrates buffers of two different sizes (5 and 50 $km$) surrounding the river Seine and tributaries.
# These buffers were created with commands below, using the `.buffer` method, applied to a `GeoSeries` or `GeoDataFrame`.
-# The `.buffer` method requires one important argument: the buffer distance, provided in the units of the CRS, in this case, meters (@fig-buffers).
+# The `.buffer` method requires one important argument: the buffer distance, provided in the units of the CRS, in this case, meters.
+
+# In[ ]:
+
+
+seine_buff_5km = seine.buffer(5000)
+seine_buff_50km = seine.buffer(50000)
+
+
+# The results are shown in @fig-buffers.

# In[ ]:


#| label: fig-buffers
-#| fig-cap: Buffers around the Seine dataset of 5 km (left) and 50 km (right). Note the colors, which reflect the fact that one buffer is created per geometry feature.
+#| fig-cap: Buffers around the Seine dataset of 5 $km$ and 50 $km$. Note the colors, which reflect the fact that one buffer is created per geometry feature.
#| layout-ncol: 2
#| fig-subcap: 
#|   - 5 $km$ buffer
#|   - 50 $km$ buffer
-seine_buff_5km = seine.buffer(5000)
-seine_buff_50km = seine.buffer(50000)
+
seine_buff_5km.plot(color='none', edgecolor=['c', 'm', 'y']);
seine_buff_50km.plot(color='none', edgecolor=['c', 'm', 'y']);

@@ -258,7 +272,7 @@
seine_buff_5km


-# Alternative option is to add a secondary geometry column directly to the original `GeoDataFrame`.
+# An alternative option is to add a secondary geometry column directly to the original `GeoDataFrame`.

# In[ ]:


@@ -267,7 +281,7 @@
seine


-# You can then switch to either geometry column (i.e., make it the "active" geometry column) using `.set_geometry`, as in:
+# You can then switch to either geometry column (i.e., make it 'active') using `.set_geometry`, as in:

# In[ ]:


@@ -287,17 +301,15 @@
# ### Affine transformations {#sec-affine-transformations}
# 
# Affine transformations include, among others, shifting (translation), scaling, and rotation, or any combination of these.
-# They preserves lines and parallelism, by angles and lengths are not necessarily preserved.
+# They preserve lines and parallelism, but angles and lengths are not necessarily preserved.
# These transformations are an essential part of geocomputation.
# For example, shifting is needed for label placement, scaling is used in non-contiguous area cartograms, and many affine transformations are applied when reprojecting or improving the geometry that was created based on a distorted or wrongly projected map.
# 
# The **geopandas** package implements affine transformations for objects of classes `GeoSeries` and `GeoDataFrame`.
-# In both cases, the method is applied on the `GeoSeries` part, returning a just the `GeoSeries` of transformed geometries.
-# 
-# 
+# In both cases, the method is applied on the `GeoSeries` part, returning just the `GeoSeries` of transformed geometries.
# 
-# Affine transformations of `GeoSeries` can be done using the [`.affine_transform`](https://geopandas.org/en/stable/docs/reference/api/geopandas.GeoSeries.affine_transform.html) method, which is a wrapper around the `shapely.affinity.affine_transform` function.
-# As [documented](https://shapely.readthedocs.io/en/stable/manual.html#shapely.affinity.affine_transform), a 2D affine transformation requires a six-parameter list `[a,b,d,e,xoff,yoff]` which represents the following equations for transforming the coordinates (@eq-affine1 and @eq-affine2)/
+# Affine transformations of `GeoSeries` can be done using the `.affine_transform` method, which is a wrapper around the `shapely.affinity.affine_transform` function.
+# A two-dimensional affine transformation requires a six-parameter list `[a,b,d,e,xoff,yoff]` which represents @eq-affine1 and @eq-affine2 for transforming the coordinates.
# # $$ # x' = a x + b y + x_\mathrm{off} @@ -307,16 +319,13 @@ # y' = d x + e y + y_\mathrm{off} # $$ {#eq-affine2} # -# There are also simplified `GeoSeries` [methods](https://geopandas.org/en/stable/docs/user_guide/geometric_manipulations.html#affine-transformations) for specific scenarios, such as: +# There are also simplified `GeoSeries` methods for specific scenarios, such as: # # - `.translate(xoff=0.0, yoff=0.0)` # - `.scale(xfact=1.0, yfact=1.0, origin='center')` # - `.rotate(angle, origin='center', use_radians=False)` -# - `.skew(angle, origin='center', use_radians=False)` # -# For example, *shifting* only requires the $x_{off}$ and $y_{off}$, using [`.translate`](https://geopandas.org/en/stable/docs/reference/api/geopandas.GeoSeries.translate.html). -# -# +# For example, *shifting* only requires the $x_{off}$ and $y_{off}$, using `.translate`. # The code below shifts the y-coordinates of `nz` by 100 $km$ to the north, but leaves the x-coordinates untouched. # In[ ]: @@ -327,19 +336,15 @@ # ::: callout-note -# **shapely**, and consequently **geopandas**, operations, typically [ignore](https://shapely.readthedocs.io/en/stable/manual.html#geometric-objects) the z-dimension of geometries in operations. For example, `shapely.LineString([(0,0,0),(0,0,1)]).length` returns `0` (and not `1`), since `.length` ignores the z-dimension. In this book (like in most real-world spatial analysis applications), we deal only with two-dimensional geometries. +# **shapely**, and consequently **geopandas**, operations, typically ignore the z-dimension (if there is one) of geometries in operations. For example, `shapely.LineString([(0,0,0),(0,0,1)]).length` returns `0` (and not `1`), since `.length` ignores the z-dimension. This is not an issue in this book (and in most real-world spatial analysis applications), since we are dealing only with two-dimensional geometries. # ::: # # Scaling enlarges or shrinks objects by a factor, and can be applied either globally or locally. # Global scaling increases or decreases all coordinates values in relation to the origin coordinates, while keeping all geometries topological relations intact. -# **geopandas** implements local scaling using the [`.scale`](https://geopandas.org/en/stable/docs/reference/api/geopandas.GeoSeries.scale.html) method. -# -# +# **geopandas** implements scaling using the `.scale` method. # Local scaling treats geometries independently and requires points around which geometries are going to be scaled, e.g., centroids. # In the example below, each geometry is shrunk by a factor of two around the centroids (@fig-affine-transformations (b)). # To achieve that, we pass the `0.5` and `0.5` scaling factors (for x and y, respectively), and the `'centroid'` option for the point of origin. -# -# # In[ ]: @@ -348,13 +353,11 @@ nz_scale -# ::: callout-note # When setting the `origin` in `.scale`, other than `'centroid'` it is possible to use `'center'`, for the bounding box center, or specific point coordinates, such as `(0,0)`. -# ::: # -# Rotating the geometries can be done using the [`.rotate`](https://geopandas.org/en/stable/docs/reference/api/geopandas.GeoSeries.rotate.html) method. +# Rotating the geometries can be done using the `.rotate` method. # When rotating, we need to specify the rotation angle (positive values imply clockwise rotation) and the `origin` points (using the same options as in `scale`). -# For example, the following expression rotates `nz` by 30 degrees counter-clockwise, around the geometry centroids. 
+# For example, the following expression rotates `nz` by $30\degree$ counter-clockwise, around the geometry centroids. # In[ ]: @@ -363,13 +366,13 @@ nz_rotate -# @fig-affine-transformations shows the original layer `nz`, and the shifting, scaling and rotation results. +# @fig-affine-transformations shows the original layer `nz`, and the shifting, scaling, and rotation results. # In[ ]: #| label: fig-affine-transformations -#| fig-cap: 'Illustrations of affine transformations: shift, scale and rotate' +#| fig-cap: 'Affine transformations of the `nz` layer: shift, scale, and rotate' #| layout-ncol: 3 #| fig-subcap: #| - Shift @@ -386,13 +389,10 @@ nz_rotate.plot(ax=base, color='red', edgecolor='darkgrey'); -# -# -# # ### Pairwise geometry-generating operations {#sec-clipping} # # Spatial clipping is a form of spatial subsetting that involves changes to the geometry columns of at least some of the affected features. -# Clipping can only apply to features more complex than points: lines, polygons and their 'multi' equivalents. +# Clipping can only apply to features more complex than points: lines, polygons, and their 'multi' equivalents. # To illustrate the concept we will start with a simple example: two overlapping circles with a center point one unit away from each other and a radius of one (@fig-overlapping-circles). # In[ ]: @@ -405,7 +405,7 @@ shapely.GeometryCollection([x, y]) -# Imagine you want to select not one circle or the other, but the space covered by both x and y. +# Imagine you want to select not one circle or the other, but the space covered by both `x` and `y`. # This can be done using the `.intersection` method from **shapely**, illustrated using objects named `x` and `y` which represent the left- and right-hand circles (@fig-intersection). # In[ ]: @@ -423,7 +423,7 @@ #| label: fig-difference -#| fig-cap: Difference between `x` and `y` (namely, `x` "minus" `y`) +#| fig-cap: Difference between `x` and `y` (namely, `x` 'minus' `y`) x.difference(y) @@ -449,13 +449,13 @@ # The **geopandas** package, as is often the case, contains wrappers of these **shapely** functions to be applied to multiple, or pairwise, use cases. # For example, applying either of the pairwise methods on a `GeoSeries` or `GeoDataFrame`, combined with a `shapely` geometry, returns the pairwise (many-to-one) results (which is analogous to other operators, like `.intersects` or `.distance`, see @sec-spatial-subsetting-vector and @sec-distance-relations, respectively). # -# Let's demonstrate the "many-to-one" scenario by calculating the difference between each geometry in a `GeoSeries` and a "fixed" `shapely` geometry. -# To create the latter, let's take `x` and combine it with itself translated (@sec-affine-transformations) to a distance of `1` and `2` units "upwards" on the y-axis. +# Let's demonstrate the 'many-to-one' scenario by calculating the difference between each geometry in a `GeoSeries` and a fixed `shapely` geometry. +# To create the latter, let's take `x` and combine it with itself translated (@sec-affine-transformations) to a distance of `1` and `2` units 'upwards' on the y-axis. 

# In[ ]:


-geom1 = gpd.GeoSeries([x])
+geom1 = gpd.GeoSeries(x)
geom2 = geom1.translate(0, 1)
geom3 = geom1.translate(0, 2)
geom = pd.concat([geom1, geom2, geom3])
@@ -468,13 +468,14 @@

#| label: fig-geom-intersection
-#| fig-cap: A `GeoSeries` with three circles, and a `shapely` geometry that we will "subtract" from it (in red)
+#| fig-cap: A `GeoSeries` with three circles (in grey), and a `shapely` geometry that we will subtract from it (in red)
+
fig, ax = plt.subplots()
-geom.plot(color='lightgrey', edgecolor='black', ax=ax)
+geom.plot(color='#00000030', edgecolor='black', ax=ax)
gpd.GeoSeries(y).plot(color='#FF000040', edgecolor='black', ax=ax);


-# Now, using `.intersection` automatically applies the **shapely** method of the same name on each geometry in `geom`, returning a new `GeoSeries`, which we name `geom_inter_y`, with the pairwise "intersections".
+# Now, using `.intersection` automatically applies the **shapely** method of the same name on each geometry in `geom`, returning a new `GeoSeries`, which we name `geom_inter_y`, with the pairwise intersections.
# Note the empty third geometry (can you explain the meaning of this result?).

# In[ ]:

@@ -492,23 +493,24 @@

#| label: fig-geom-intersection2
#| fig-cap: The output `GeoSeries`, after subtracting a `shapely` geometry using `.intersection`
-geom_inter_y.plot(color='lightgrey', edgecolor='black');
+geom_inter_y.plot(color='#00000030', edgecolor='black');


-# The `.overlay` method (see @sec-joining-incongruent-layers) further extends this technique, making it possible to apply "many-to-many" pairwise geometry generations between all pairs of two `GeoDataFrame`s.
+# The `.overlay` method (see @sec-joining-incongruent-layers) further extends this technique, making it possible to apply 'many-to-many' pairwise geometry generations between all pairs of two `GeoDataFrame`s.
# The output is a new `GeoDataFrame` with the pairwise outputs, plus the attributes of both inputs which were the inputs of the particular pairwise output geometry.
-# See the ["Set operations with overlay"](https://geopandas.org/en/stable/docs/user_guide/set_operations.html) article in the **geopandas** documentation for examples of `.overlay`.
+# Also see the *Set operations with overlay*[^set_ops_w_overlay] article in the **geopandas** documentation for examples of `.overlay`.
+# 
+# [^set_ops_w_overlay]: [https://geopandas.org/en/stable/docs/user_guide/set_operations.html](https://geopandas.org/en/stable/docs/user_guide/set_operations.html)
# 
# ### Subsetting vs. clipping {#sec-subsetting-vs-clipping}
# 
# In the last two chapters we have introduced two types of spatial operators: boolean, such as `.intersects` (@sec-spatial-subsetting-vector), and geometry-generating, such as `.intersection` (@sec-clipping).
# Here, we illustrate the difference between them.
# We do this using the specific scenario of subsetting points by polygons, where (unlike in other cases) both methods can be used for the same purpose, giving the same result.
-# 
# 
# To illustrate the point, we will subset points that cover the bounding box of the circles `x` and `y` from @fig-overlapping-circles.
-# Some points will be inside just one circle, some will be inside both and some will be inside neither.
-# The following code sections generates the sample data for this section, a simple random distribution of points within the extent of circles `x` and `y`, resulting in output illustrated in @fig-random-points.
+# Some points will be inside just one circle, some will be inside both, and some will be inside neither.
+# The following code sections generate the sample data for this section, a simple random distribution of points within the extent of circles `x` and `y`, resulting in output illustrated in @fig-random-points.

# We create the sample points in three steps.
# First, we figure out the bounds where random points are to be generated.

# In[ ]:

@@ -532,7 +534,7 @@
coords


-# Third, we transform the list of coordinates into a `list` of `shapely` points and then to a `GeoSeries`.
+# Third, we transform the list of coordinates into a `list` of `shapely` points, and then to a `GeoSeries`.

# In[ ]:

@@ -547,15 +549,15 @@

#| label: fig-random-points
-#| fig-cap: Randomly distributed points within the bounding box enclosing circles x and y. The point that intersects with both objects x and y are highlighted.
+#| fig-cap: Randomly distributed points within the bounding box enclosing circles `x` and `y`
base = pnt.plot(color='none', edgecolor='black')
-gpd.GeoSeries([x]).plot(ax=base, color='none', edgecolor='darkgrey');
-gpd.GeoSeries([y]).plot(ax=base, color='none', edgecolor='darkgrey');
+gpd.GeoSeries(x).plot(ax=base, color='none', edgecolor='darkgrey');
+gpd.GeoSeries(y).plot(ax=base, color='none', edgecolor='darkgrey');


# Now, we can get back to our question: how to subset the points to only return the point that intersects with both `x` and `y`?
# The code chunks below demonstrate two ways to achieve the same result.
-# In the first approach, we can calculate a boolean `Series`, evaluating whether each point of `pnt` intersects with the intersection of `x` and `y` (see @sec-spatial-subsetting-vector) and then use it to subset `pnt` to get the result `pnt1`.
+# In the first approach, we can calculate a boolean `Series`, evaluating whether each point of `pnt` intersects with the intersection of `x` and `y` (see @sec-spatial-subsetting-vector), and then use it to subset `pnt` to get the result `pnt1`.

# In[ ]:

@@ -566,7 +568,7 @@


# In the second approach, we can also find the intersection between the input points represented by `pnt`, using the intersection of `x` and `y` as the subsetting/clipping object.
-# Since the second argument is an individual `shapely` geometry (`x.intersection(y)`), we get "pairwise" intersections of each `pnt` with it (see @sec-clipping):
+# Since the second argument is an individual `shapely` geometry (`x.intersection(y)`), we get 'pairwise' intersections of each `pnt` with it (see @sec-clipping):

# In[ ]:

@@ -583,12 +585,12 @@

#| label: fig-intersection-points
#| fig-cap: Randomly distributed points within the bounding box enclosing circles x and y. The point that intersects with both objects x and y is highlighted.
base = pnt.plot(color='none', edgecolor='black')
-gpd.GeoSeries([x]).plot(ax=base, color='none', edgecolor='darkgrey');
-gpd.GeoSeries([y]).plot(ax=base, color='none', edgecolor='darkgrey');
+gpd.GeoSeries(x).plot(ax=base, color='none', edgecolor='darkgrey');
+gpd.GeoSeries(y).plot(ax=base, color='none', edgecolor='darkgrey');
pnt2.plot(ax=base, color='red');


-# The only difference between the two approaches is that `.intersection` returns all "intersections", even if they are empty.
+# The only difference between the two approaches is that `.intersection` returns all intersections, even if they are empty.
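+# Which of the pairwise results are empty can be checked through the `.is_empty` property of the `GeoSeries` (a quick illustrative check, not part of the original workflow):
+
+# In[ ]:
+
+
+# Boolean Series marking which pairwise intersection results are empty
+pnt2.is_empty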
# When these are filtered out, `pnt2` becomes identical to `pnt1`: # In[ ]: @@ -598,14 +600,8 @@ pnt2 -# -# # The example above is rather contrived and provided for educational rather than applied purposes. -# However, we encourage the reader to reproduce the results to deepen your understanding for handling geographic vector objects in Python. -# -# -# -# +# However, we encourage the reader to reproduce the results to deepen your understanding of handling geographic vector objects in Python. # # ### Geometry unions {#sec-geometry-unions} # @@ -626,7 +622,7 @@ #| label: fig-dissolve -#| fig-cap: "Spatial aggregation on contiguous polygons, illustrated by aggregating the population of 49 US states into 4 regions, with population represented by color. Note the operation automatically dissolves boundaries between states." +#| fig-cap: 'Spatial aggregation on contiguous polygons, illustrated by aggregating the population of 49 US states into 4 regions, with population represented by color. Note the operation automatically dissolves boundaries between states.' #| layout-ncol: 2 #| fig-subcap: #| - 49 States @@ -640,9 +636,9 @@ # What is happening with the geometries here? -# Behind the scenes, `.dissolve` combines the geometries and dissolve the boundaries between them using the [`.unary_union`](https://geopandas.org/en/stable/docs/reference/api/geopandas.GeoSeries.unary_union.html#geopandas.GeoSeries.unary_union) method per group. -# This is demonstrated in the code chunk below which creates a united western US using the standalone `unary_union` operation. -# Note that the result is a `shapely` geometry, as the individual attributes are "lost" as part of dissolving (@fig-dissolve2). +# Behind the scenes, `.dissolve` combines the geometries and dissolves the boundaries between them using the `.union_all` method per group. +# This is demonstrated in the code chunk below which creates a united western US using the standalone `.union_all` operation. +# Note that the result is a `shapely` geometry, as the individual attributes are 'lost' as part of dissolving (@fig-dissolve2). # In[ ]: @@ -650,11 +646,11 @@ #| label: fig-dissolve2 #| fig-cap: Western US us_west = us_states[us_states['REGION'] == 'West'] -us_west_union = us_west.geometry.unary_union +us_west_union = us_west.geometry.union_all() us_west_union -# To dissolve two (or more) groups of a `GeoDataFrame` into one geometry, we can either (a) use a combined condition or (b) concatenate the two separate subsets and then dissove using `.unary_union`. +# To dissolve two (or more) groups of a `GeoDataFrame` into one geometry, we can either (a) use a combined condition or (b) concatenate the two separate subsets and then dissolve using `.union_all`. # In[ ]: @@ -662,11 +658,11 @@ # Approach 1 sel = (us_states['REGION'] == 'West') | (us_states['NAME'] == 'Texas') texas_union = us_states[sel] -texas_union = texas_union.geometry.unary_union +texas_union = texas_union.geometry.union_all() # Approach 2 us_west = us_states[us_states['REGION'] == 'West'] texas = us_states[us_states['NAME'] == 'Texas'] -texas_union = pd.concat([us_west, texas]).unary_union +texas_union = pd.concat([us_west, texas]).union_all() # The result is identical in both cases, shown in @fig-dissolve3. @@ -681,27 +677,24 @@ # ### Type transformations {#sec-type-transformations} # -# -# -# -# Transformation of geometries, from one type to another, also known as "geometry casting", is often required to facilitate spatial analysis. 
+# Transformation of geometries, from one type to another, also known as 'geometry casting', is often required to facilitate spatial analysis.
# Either the **geopandas** or the **shapely** packages can be used for geometry casting, depending on the type of transformation, and the way that the input is organized (whether an individual geometry, or a vector layer).
# Therefore, the exact expression(s) depend on the specific transformation we are interested in.
# 
-# In general, you need to figure out the required input of the respective construstor function according to the "destination" geometry (e.g., `shapely.LineString`, etc.), then reshape the input of the "source" geometry into the right form to be passed to that function.
+# In general, you need to figure out the required input of the respective constructor function according to the 'destination' geometry (e.g., `shapely.LineString`, etc.), then reshape the input of the source geometry into the right form to be passed to that function.
# Or, when available, you can use a wrapper from **geopandas**.
# 
-# In this section we demonstrate several common scenarios.
+# In this section, we demonstrate several common scenarios.
# We start with transformations of individual geometries from one type to another, using **shapely** methods:
# 
# * `'MultiPoint'` to `'LineString'` (@fig-type-transform-linestring)
# * `'MultiPoint'` to `'Polygon'` (@fig-type-transform-polygon)
# * `'LineString'` to `'MultiPoint'` (@fig-type-transform-multipoint2)
-# * `'LineString'` to `'Polygon'` (@fig-type-transform-polygon2)
+# * `'Polygon'` to `'MultiPoint'` (@fig-type-transform-polygon2)
# * `'Polygon'`s to `'MultiPolygon'` (@fig-type-transform-multipolygon)
# * `'MultiPolygon'`s to `'Polygon'`s (@fig-type-transform-multipolygon1, @fig-type-transform-multipolygon2)
# 
-# Then, we move on and demonstrate casting workflows on `GeoDataFrame`s, where we have further considerations, such as keeping track of geometry attributes, and the possibility of dissolving, rather than just combining, geometries. As we will see, these are done either by "manually" applying **shapely** methods on all geometries in the given layer, or using **geopandas** wrapper methods which do it automatically:
+# Then, we move on and demonstrate casting workflows on `GeoDataFrame`s, where we have further considerations, such as keeping track of geometry attributes, and the possibility of dissolving, rather than just combining, geometries. As we will see, these are done either by manually applying **shapely** methods on all geometries in the given layer, or using **geopandas** wrapper methods which do it automatically:
# 
# * `'MultiLineString'` to `'LineString'`s (using `.explode`) (@fig-multilinestring-to-linestring)
# * `'LineString'` to `'MultiPoint'`s (using `.apply`) (@fig-linestring-to-multipoint)
@@ -722,10 +715,8 @@


# A `'LineString'` can be created using `shapely.LineString` from a `list` of points.
-# Thus, a `'MultiPoint'` can be converted to a `'LineString'` by extracting the individual points into a `list`, then passing them to `shapely.LineString` (@fig-type-transform-linestring).
-# The `.geoms` property, mentioned in @sec-geometries, give access to the indivudual parts that comprise a multi-part geometry; it is one of the **shapely** access methods to internal parts of a geometry.
-# 
-# 
+# Thus, a `'MultiPoint'` can be converted to a `'LineString'` by passing its points, as a `list`, to `shapely.LineString` (@fig-type-transform-linestring).
+# The `.geoms` property, mentioned in @sec-geometries, gives access to the individual parts that comprise a multi-part geometry, as an iterable object similar to a `list`; it is one of the **shapely** access methods to internal parts of a geometry.

# In[ ]:

@@ -745,7 +736,7 @@
#| label: fig-type-transform-polygon
#| fig-cap: A `'Polygon'` created from the `'MultiPoint'` in @fig-type-transform-multipoint
-polygon = shapely.Polygon([[p.x, p.y] for p in multipoint.geoms])
+polygon = shapely.Polygon(multipoint.geoms)
polygon

@@ -767,7 +758,7 @@
gpd.GeoSeries(polygon).plot();


-# Conversion from `'MultiPoint'` to `'LineString'` (@fig-type-transform-linestring) is a common operation that creates a line object from ordered point observations, such as GPS measurements or geotagged media.
+# Conversion from `'MultiPoint'` to `'LineString'`, shown above (@fig-type-transform-linestring), is a common operation that creates a line object from ordered point observations, such as GPS measurements or geotagged media.
# This allows spatial operations such as the length of the path traveled.
# Conversion from `'MultiPoint'` or `'LineString'` to `'Polygon'` (@fig-type-transform-polygon) is often used to calculate an area, for example from the set of GPS measurements taken around a lake or from the corners of a building lot.
# 
@@ -791,7 +782,7 @@
shapely.MultiPoint(polygon.exterior.coords)


-# Using these methods, we can transform between `'Point'`, `'LineString'`, and `'Polygon'` geometries, assuming there is a sufficient number of points (at least two to form a line, and at least three to form a polygon).
+# Using these methods, we can transform between `'Point'`, `'LineString'`, and `'Polygon'` geometries, assuming there is a sufficient number of points (at least two for a line, and at least three for a polygon).
# When dealing with multi-part geometries using **shapely**, we can:
# 
# - Access single-part geometries (e.g., each `'Polygon'` in a `'MultiPolygon'` geometry) using `.geoms[i]`, where `i` is the index of the geometry
@@ -812,13 +803,13 @@
multipolygon


-# Now, here is how we can get back the `'Polygon'` part 1 (@fig-type-transform-multipolygon1):
+# Given `multipolygon`, here is how we can get back the `'Polygon'` part 1 (@fig-type-transform-multipolygon1):

# In[ ]:


#| label: fig-type-transform-multipolygon1
-#| fig-cap: The 1^st^ "part" extracted from the `'MultiPolygon'` in @fig-type-transform-multipolygon
+#| fig-cap: The 1^st^ part extracted from the `'MultiPolygon'` in @fig-type-transform-multipolygon
multipolygon.geoms[0]

@@ -829,18 +820,14 @@

#| label: fig-type-transform-multipolygon2
-#| fig-cap: The 2^nd^ "part" extracted from the `'MultiPolygon'` in @fig-type-transform-multipolygon
+#| fig-cap: The 2^nd^ part extracted from the `'MultiPolygon'` in @fig-type-transform-multipolygon
multipolygon.geoms[1]


-# However, dealing with multi-part geometries can be easier with **geopandas**. Thanks to the fact that geometries in a `GeoDataFrame` are associated with attributes, we can keep track of the origin of each geometry: duplicating the attributes when going from multi-part to single-part (using `.explode`, see below), or "collapsing" the attributes through aggregation when going from single-part to multi-part (using `.dissolve`, see @sec-geometry-unions).
-# 
-# 
+# However, dealing with multi-part geometries can be easier with **geopandas**. 
Thanks to the fact that geometries in a `GeoDataFrame` are associated with attributes, we can keep track of the origin of each geometry: duplicating the attributes when going from multi-part to single-part (using `.explode`, see below), or 'collapsing' the attributes through aggregation when going from single-part to multi-part (using `.dissolve`, see @sec-geometry-unions).
# 
# Let's demonstrate going from multi-part to single-part (@fig-multilinestring-to-linestring) and then back to multi-part (@sec-geometry-unions), using a small line layer.
-# 
-# 
# As input, we will create a `'MultiLineString'` geometry composed of three lines (@fig-type-transform-multilinestring3).

# In[ ]:

@@ -860,7 +847,7 @@
# In[ ]:


-geom = gpd.GeoSeries([ml])
+geom = gpd.GeoSeries(ml)
geom

@@ -875,8 +862,8 @@

# You can imagine it as a road or river network.
# The above layer `dat` has only one row that defines all the lines.
-# This restricts the number of operations that can be done, for example it prevents adding names to each line segment or calculating lengths of single lines.
-# Using **shapely** methods with which we are already familiar with (see above), the individual single-part geometries (i.e., the "parts") can be accessed through the `.geoms` property.
+# This restricts the number of operations that can be done; for example, it prevents adding names to each line segment or calculating lengths of single lines.
+# Using **shapely** methods with which we are already familiar (see above), the individual single-part geometries (i.e., the 'parts') can be accessed through the `.geoms` property.

# In[ ]:


list(ml.geoms)


-# However, specifically for the "multi-part to single part" type transformation scenarios, there is also a method called `.explode`, which can convert an entire multi-part `GeoDataFrame` to a single-part one.
+# However, specifically for the 'multi-part to single part' type transformation scenarios, there is also a method called `.explode`, which can convert an entire multi-part `GeoDataFrame` to a single-part one.
# The advantage is that the original attributes (such as `id`) are retained, so that we can keep track of the original multi-part geometry properties that each part came from.
# The `index_parts=True` argument also lets us keep track of the original multipart geometry indices, and part indices, named `level_0` and `level_1`, respectively.

# In[ ]:


+#| warning: false
dat1 = dat.explode(index_parts=True).reset_index()
dat1


# For example, here we see that all `'LineString'` geometries came from the same multi-part geometry (`level_0`=`0`), which had three parts (`level_1`=`0`,`1`,`2`).
-# @fig-multilinestring-to-linestring demonstrates the effect of `.explode` in converting a layer with multi-part geometries into a layer with single part geometries.
+# @fig-multilinestring-to-linestring demonstrates the effect of `.explode` in converting a layer with multi-part geometries into a layer with single-part geometries.
# In[ ]: #| label: fig-multilinestring-to-linestring -#| fig-cap: Transformation a `'MultiLineString'` layer with one feature, into a `'LineString'` layer with three features, using `.explode` +#| fig-cap: Transformation of a `'MultiLineString'` layer with one feature, into a `'LineString'` layer with three features, using `.explode` #| layout-ncol: 2 #| fig-subcap: #| - MultiLineString layer #| - LineString layer, after applying `.explode` -dat.plot(column='id'); -dat1.plot(column='level_1'); +dat.plot(column='id', linewidth=7); +dat1.plot(column='level_1', linewidth=7); # As a side-note, let's demonstrate how the above **shapely** casting methods can be translated to **geopandas**. # Suppose that we want to transform `dat1`, which is a layer of type `'LineString'` with three features, to a layer of type `'MultiPoint'` (also with three features). # Recall that for a single geometry, we use the expression `shapely.MultiPoint(x.coords)`, where `x` is a `'LineString'` (@fig-type-transform-multipoint2). -# When dealing with a `GeoDataFrame`, we wrap the conversion into `.apply`, to apply it on all geometries: +# When dealing with a `GeoDataFrame`, we wrap the conversion into `.apply`, to apply it to all geometries: # In[ ]: @@ -930,17 +918,17 @@ #| label: fig-linestring-to-multipoint -#| fig-cap: Transformation a `'LineString'` layer with three features, into a `'MultiPoint'` layer (also with three features), using `.apply` and **shapely** methods +#| fig-cap: Transformation of a `'LineString'` layer with three features, into a `'MultiPoint'` layer (also with three features), using `.apply` and **shapely** methods #| layout-ncol: 2 #| fig-subcap: #| - LineString layer #| - MultiPoint layer -dat1.plot(column='level_1'); -dat2.plot(column='level_1'); +dat1.plot(column='level_1', linewidth=7); +dat2.plot(column='level_1', markersize=50); -# The opposite transformation, i.e., "single-part to multi-part", is achieved using the `.dissolve` method (which we are already familiar with, see @sec-geometry-unions). -# For example, here is how we can get back to the `'MultiLineString'` geometry: +# The opposite transformation, i.e., 'single-part to multi-part', is achieved using the `.dissolve` method (which we are already familiar with, see @sec-geometry-unions). +# For example, here is how we can get from the `'LineString'` layer with three features back to the `'MultiLineString'` layer with one feature (since, in this case, there is just one group): # In[ ]: @@ -948,7 +936,7 @@ dat1.dissolve(by='id').reset_index() -# The next code chunk is another example, dissolving the `nz` north and south parts into `'MultiPolygon'` geometries. +# The next code chunk is another example, dissolving the 16 polygons in `nz` into two geometries of the north and south parts (i.e., the two `'Island'` groups). # In[ ]: @@ -960,9 +948,9 @@ # Note that `.dissolve` not only combines single-part into multi-part geometries, but also dissolves any internal borders. -# So, in fact, the result may be single-part (in case when all parts touch each other, unlike in `nz`). +# So, in fact, the resulting geometries may be single-part (in case when all parts touch each other, unlike in `nz`). # If, for some reason, we want to combine geometries into multi-part *without* dissolving, we can fall back to the **pandas** `.agg` method (custom table aggregation), supplemented with a **shapely** function specifying how exactly we want to transform each group of geometries into a new single geometry. 
-# In the following example, for instance, we collect all `'Polygon'` and `'MultiPolygon'` parts of `nz` into a single `'MultiPolygon'` geometry with many separate parts (i.e., without dissolving), per group (`Island`).
+# In the following example, for instance, we collect all `'Polygon'` and `'MultiPolygon'` parts of `nz` into a single `'MultiPolygon'` geometry with many separate parts (i.e., without dissolving), per group.

# In[ ]:


@@ -979,7 +967,7 @@
nz_dis2


-# The difference between the last two results (with and without dissolving, respectively) is not evident in the printout: in both cases we got a layer with two features of type `'MultiPolygon'`.
+# The difference between the last two results `nz_dis1` and `nz_dis2` (with and without dissolving, respectively) is not evident in the printout: in both cases we got a layer with two features of type `'MultiPolygon'`.
# However, in the first case internal borders were dissolved, while in the second case they were not.
# This is illustrated in @fig-combine-geoms:

# In[ ]:


#| label: fig-combine-geoms
-#| fig-cap: Combining New Zealand geometries into one, for each island, with and witout dissolving
+#| fig-cap: Combining New Zealand geometries into one, for each island, with and without dissolving
#| layout-ncol: 2
#| fig-subcap: 
#|   - Dissolving (using the **geopandas** `.dissolve` method)
@@ -1000,12 +988,9 @@
# 
# ## Geometric operations on raster data {#sec-geo-ras}
# 
-# 
-# 
-# 
-# Geometric raster operations include the shift, flipping, mirroring, scaling, rotation or warping of images.
-# These operations are necessary for a variety of applications including georeferencing, used to allow images to be overlaid on an accurate map with a known CRS [@liu_essential_2009].
-# A variety of georeferencing techniques exist, including:
+# Geometric raster operations include the shift, flipping, mirroring, scaling, rotation, or warping of images.
+# These operations are necessary for a variety of applications including georeferencing, used to allow images to be overlaid on an accurate map with a known CRS [@liu_essential_2009].
+# A variety of georeferencing techniques exist, including:
# 
# * Georectification based on known ground control points
# * Orthorectification, which also accounts for local topography
@@ -1013,14 +998,14 @@
# 
# Python is rather unsuitable for the first two points since these often require manual intervention, which is why they are usually done with the help of dedicated GIS software.
# On the other hand, aligning several images is possible in Python, and this section shows, among other things, how to do so.
-# This often includes changing the extent, the resolution and the origin of an image.
-# A matching projection is of course also required but is already covered @sec-reprojecting-raster-geometries.
+# This often includes changing the extent, the resolution, and the origin of an image.
+# A matching projection is of course also required but is already covered in @sec-reprojecting-raster-geometries.
# 
# In any case, there are other reasons to perform a geometric operation on a single raster image.
# For instance, a common reason for aggregating a raster is to decrease run-time or save disk space.
# Of course, this approach is only recommended if the task at hand allows a coarser resolution of raster data.
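+# 
+# As a rough, illustrative calculation (with hypothetical dimensions, not taken from this chapter's datasets), aggregating a raster by a factor of 5 reduces the pixel count, and thus the uncompressed size in memory, by a factor of about 25:

+# In[ ]:


+rows = cols = 12_000                             # hypothetical raster dimensions
+mb = rows * cols * 4 / 1e6                       # size of a float32 raster, in MB
+mb_agg5 = (rows // 5) * (cols // 5) * 4 / 1e6    # size after aggregation by a factor of 5
+mb, mb_agg5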
# -# ### Geometric intersections {#sec-raster-geometric-intersections} +# # # # @@ -1028,117 +1013,70 @@ # # # -# In @sec-spatial-subsetting-raster we have shown how to extract values from a raster overlaid by coordinates or by a matching boolean mask. -# A different case is when the area of interest is defined by any general (possibly non-matching) raster B, to retrieve a spatial output of a (smaller) subset of raster A we can: +# +# # -# - Extract the bounding box polygon of B (hereby, `clip`) -# - Mask and crop A (hereby, `elev.tif`) using B (@sec-raster-cropping) +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# +# # -# For example, suppose that we want to get a subset of the `elev.tif` raster using another, smaller, raster. -# To demonstrate this, let's create (see @sec-raster-from-scratch) that smaller raster, hereby named `clip`. -# First, we need to create a $3 \times 3$ array of raster values. - -# In[ ]: - - -clip = np.array([1] * 9).reshape(3, 3) -clip - - -# Then, we define the transformation matrix, in such a way that `clip` intersects with `elev.tif` (@fig-raster-intersection). - -# In[ ]: - - -new_transform = rasterio.transform.from_origin( - west=0.9, - north=0.45, - xsize=0.3, - ysize=0.3 -) -new_transform - - -# Now, for subsetting, we will derive a `shapely` geometry representing the `clip` raster extent, using [`rasterio.transform.array_bounds`](https://rasterio.readthedocs.io/en/latest/api/rasterio.transform.html#rasterio.transform.array_bounds). - -# In[ ]: - - -bbox = rasterio.transform.array_bounds( - clip.shape[1], # columns - clip.shape[0], # rows - new_transform -) -bbox - - -# The four numeric values can be transformed into a rectangular `shapely` geometry using `shapely.box` (@fig-raster-clip-bbox). - -# In[ ]: - - -#| label: fig-raster-clip-bbox -#| fig-cap: '`shapely` geometry derived from a clipping raster bounding box coordinates, a preliminary step for geometric intersection between two rasters' -bbox = shapely.box(*bbox) -bbox - - -# @fig-raster-intersection shows the alignment of `bbox` and `elev.tif`. - -# In[ ]: - - -#| label: fig-raster-intersection -#| fig-cap: The `elev.tif` raster, and the extent of another (smaller) raster `clip` which we use to subset it -fig, ax = plt.subplots() -rasterio.plot.show(src_elev, ax=ax) -gpd.GeoSeries([bbox]).plot(color='none', ax=ax); - - -# From here on, subsetting can be done using masking and cropping, just like with any vector layer other than `bbox`, regardless whether it is rectangular or not. -# We elaborate on masking and cropping in @sec-raster-cropping (check that section for details about `rasterio.mask.mask`), but, for completeness, here is the code for the last step of masking and cropping: - -# In[ ]: - - -out_image, out_transform = rasterio.mask.mask( - src_elev, - [bbox], - crop=True, - all_touched=True, - nodata=0 -) - - -# The resulting subset array `out_image` contains all pixels intersecting with `clip` *pixels* (not necessarily with the centroids!). -# However, due to the `all_touched=True` argument, those pixels which intersect with `clip`, but their centroid does not, retain their original values (e.g., `17`, `23`) rather than turned into "No Data" (e.g., `0`). - -# In[ ]: - - -out_image - - -# Therefore, in our case, subset `out_image` dimensions are $2 \times 2$ (@fig-raster-intersection2; also see @fig-raster-intersection). 
-
-# In[ ]:
-
-
-#| label: fig-raster-intersection2
-#| fig-cap: The resulting subset of the `elev.tif` raster
-fig, ax = plt.subplots()
-rasterio.plot.show(out_image, transform=out_transform, ax=ax)
-gpd.GeoSeries([bbox]).plot(color='none', ax=ax);
-
-
# ### Extent and origin {#sec-extent-and-origin}
# 
-# When merging or performing map algebra on rasters, their resolution, projection, origin and/or extent have to match.
+# When merging or performing map algebra on rasters, their resolution, projection, origin, and/or extent have to match.
# Otherwise, how should we add the values of one raster with a resolution of `0.2` decimal degrees to a second raster with a resolution of `1` decimal degree?
# The same problem arises when we would like to merge satellite imagery from different sensors with different projections and resolutions.
# We can deal with such mismatches by aligning the rasters.
# Typically, raster alignment is done through resampling---that way, it is guaranteed that the rasters match exactly (@sec-raster-resampling).
-# However, sometimes it can be useful to modify raster placement and extent "manually", by adding or removing rows and columns, or by modifying the origin, that is, slightly shifting the raster.
+# However, sometimes it can be useful to modify raster placement and extent manually, by adding or removing rows and columns, or by modifying the origin, that is, slightly shifting the raster.
# Sometimes, there are reasons other than alignment with a second raster for manually modifying raster extent and placement.
# For example, it may be useful to add extra rows and columns to a raster prior to focal operations, so that it is easier to operate on the edges.
# 
@@ -1152,7 +1090,7 @@
r


-# To pad an `ndarray`, we can use the [`np.pad`](https://numpy.org/doc/stable/reference/generated/numpy.pad.html) function.
+# To pad an `ndarray`, we can use the `np.pad` function.
# The function accepts an array, and a tuple of the form `((rows_top,rows_bottom),(columns_left, columns_right))`.
# Also, we can specify the value that's being used for padding with `constant_values` (e.g., `18`).
# For example, here we pad `r` with one extra row and two extra columns, on both sides, resulting in the array `r_pad`:
@@ -1166,11 +1104,9 @@
r_pad


-# 
-# 
# However, for `r_pad` to be used in any spatial operation, we also have to update its transformation matrix.
# Whenever we add extra columns on the left, or extra rows on top, the raster *origin* changes.
-# To reflect this fact, we have to take to "original" origin and add the required multiple of pixel widths or heights (i.e., raster resolution steps).
+# To reflect this fact, we have to take the 'original' origin and add the required multiple of pixel widths or heights (i.e., raster resolution steps).
# The transformation matrix of a raster is accessible from the raster file metadata (@sec-raster-from-scratch) or, as a shortcut, through the `.transform` property of the raster file connection.
# For example, the next code chunk shows the transformation matrix of `elev.tif`.
@@ -1238,7 +1174,7 @@

# We can shift a raster origin not just when padding, but in any other use case, just by changing its transformation matrix.
# The effect is that the raster is going to be shifted (which is analogous to `.translate` for shifting a vector layer, see @sec-affine-transformations).
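+# As a compact sketch of the same idea (an alternative to recomputing the origin step by step, which we do next), the existing transform can be composed with a translation in CRS units, using the `Affine` class that **rasterio** re-exports from the **affine** package:

+# In[ ]:


+#| eval: false
+# Compose a translation (in CRS units) with the existing transform; the result
+# maps the same pixels to coordinates shifted by (dx, dy). Assumes 'src_elev'
+# is the connection to elev.tif opened earlier in the chapter.
+dx, dy = -0.25, 0.25   # hypothetical shift
+shifted_transform = rasterio.Affine.translation(dx, dy) * src_elev.transform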
-# Manually shifting a raster to arbitrary distance is rarely needed in real-life scenarios, but it is useful to know how to do it at least for better understanding the concept of *raster origin*.
+# Manually shifting a raster to an arbitrary distance is rarely needed in real-life scenarios, but it is useful to know how to do it at least for a better understanding of the concept of *raster origin*.
# As an example, let's shift the origin of `elev.tif` by `(-0.25,0.25)`.
# First, we need to calculate the new origin.

# In[ ]:

@@ -1251,8 +1187,6 @@


# To shift the origin in other directions we should change the two operators (`-`, `+`) accordingly.
-# 
-# 
# 
# Then, same as when padding (see above), we create an updated transformation matrix.

# In[ ]:

@@ -1274,7 +1208,7 @@

#| label: fig-raster-shift-origin2
-#| fig-cap: The padded `elev.tif` raster (@fig-raster-shift-origin) further shifted by `(0.25,0.25)`, and the extent of the original `elev.tif` raster (in red)
+#| fig-cap: The `elev.tif` raster shifted by `(0.25,0.25)`, and its original extent (in red)
fig, ax = plt.subplots()
rasterio.plot.show(r, transform=new_transform, cmap='Greys', ax=ax)
elev_bbox.plot(color='none', edgecolor='red', ax=ax);

@@ -1282,19 +1216,17 @@
# ### Aggregation and disaggregation {#sec-raster-agg-disagg}
# 
-# Raster datasets vary based on their resolution, from high resolution datasets that enable individual trees to be seen, to low resolution datasets covering large swaths of the Earth.
+# Raster datasets vary based on their resolution, from high-resolution datasets that enable individual trees to be seen, to low-resolution datasets covering large swaths of the Earth.
# Raster datasets can be transformed to either decrease (aggregate) or increase (disaggregate) their resolution, for a number of reasons.
# For example, aggregation can be used to reduce computational resource requirements of raster storage and subsequent steps, while disaggregation can be used to match other datasets, or to add detail.
-# As an example, we here change the spatial resolution of `dem.tif` by a factor of `5` (@fig-raster-aggregate).
# 
# ::: callout-note
# Raster aggregation is, in fact, a special case of raster resampling (see @sec-raster-resampling), where the target raster grid is aligned with the original raster, only with coarser pixels.
-# Conversely, raster resampling is the general case where the new grid is not necessarily an aggregation of the original one, but any other type of grid (such as a rotated and/or shifted one, etc.).
+# Conversely, raster resampling is the general case where the new grid is not necessarily an aggregation of the original one, but any other type of grid (i.e., shifted and/or having increased/reduced resolution, by any factor).
# :::
# 
-# To aggregate a raster using **rasterio**, we go through [two steps](https://rasterio.readthedocs.io/en/stable/topics/resampling.html):
+# As an example, we here change the spatial resolution of `dem.tif` by a factor of `5` (@fig-raster-aggregate).
+# To aggregate a raster using **rasterio**, we go through two steps:
# 
# - Reading the raster values (using `.read`) into an `out_shape` that is different from the original `.shape`
# - Updating the `transform` according to `out_shape`
@@ -1318,10 +1250,10 @@
# To aggregate, instead of reading the raster values the usual way, as in `src.read(1)`, we can specify `out_shape` to read the values into a different shape.
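+# Schematically, the reading step follows this pattern (not run here; `new_rows` and `new_cols` are placeholders for the target shape, which we calculate next):

+# In[ ]:


+#| eval: false
+# Read band 1 into a coarser shape, averaging the original pixels
+# that fall inside each new (larger) pixel
+r = src.read(
+    1,
+    out_shape=(new_rows, new_cols),
+    resampling=rasterio.enums.Resampling.average
+)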
# Here, we calculate a new shape which is downscaled by a factor of `5`, i.e., the number of rows and columns is multiplied by `0.2`. -# We must truncate any "partial" rows and columns, e.g., using `int`. -# Each new pixel is now obtained, or "resampled", from $\sim 5 \times 5 = \sim 25$ "old" raster values. +# We must truncate any partial rows and columns, e.g., using `int`. +# Each new pixel is now obtained, or resampled, from $\sim 5 \times 5 = \sim 25$ 'old' raster values. # It is crucial to choose an appropriate *resampling method* through the `resampling` parameter. -# Here we use [`rasterio.enums.Resampling.average`](https://rasterio.readthedocs.io/en/stable/api/rasterio.enums.html#rasterio.enums.Resampling), i.e., the new "large" pixel value is the average of all coinciding small pixels, which makes sense for our elevation data in `dem.tif`. +# Here we use `rasterio.enums.Resampling.average`, i.e., the new 'large' pixel value is the average of all coinciding small pixels, which makes sense for our elevation data in `dem.tif`. # See @sec-raster-resampling for a list of other available methods. # In[ ]: @@ -1345,9 +1277,6 @@ r.shape -# -# -# # What's left to be done is the second step, to update the transform, taking into account the change in raster shape. # This can be done as follows, using `.transform.scale`. @@ -1392,8 +1321,6 @@ # Then we can create a new file (`dem_agg5.tif`) in writing mode, and write the values from the aggregated array `r` into the 1^st^ band of the file (see @sec-data-output-raster for a detailed explanation of writing raster files with **rasterio**). -# -# # In[ ]: @@ -1404,15 +1331,15 @@ # ::: callout-note -# The `**` syntax in Python is known as variable-length ["*keyword* arguments"](https://docs.python.org/3/glossary.html#term-argument). +# The `**` syntax in Python is known as variable-length '*keyword* arguments'. # It is used to pass a dictionary of numerous `parameter:argument` pairs to named arguments of a function. -# In `rasterio.open` writing mode, the "keyword arguments" syntax often comes in handy, because, instead of specifying each and every property of a new file, we pass a (modified) `.meta` dictionary based on another, template, raster. +# In `rasterio.open` writing mode, the 'keyword arguments' syntax often comes in handy, because, instead of specifying each and every property of a new file, we pass a (modified) `.meta` dictionary based on another, template, raster. # # Technically, keep in mind that the expression: # ``` # rasterio.open('out.tif', 'w', **dst_kwargs) # ``` -# where `dst_kwargs` is a `dict` of the following form (typically coming from a template raster, possibly with few "updated" properties using `.update`, see above): +# where `dst_kwargs` is a `dict` of the following form (typically coming from a template raster, possibly with few updated properties using `.update`, see above): # ``` # {'driver': 'GTiff', # 'dtype': 'float32', @@ -1430,13 +1357,11 @@ # ... # ) # ``` -# "*Positional* arguments" is a related technique; see note in @sec-reprojecting-raster-geometries. +# *Positional* arguments is a related technique; see note in @sec-reprojecting-raster-geometries. # ::: # # The opposite operation, namely disaggregation, is when we increase the resolution of raster objects. # Either of the supported resampling methods (see @sec-raster-resampling) can be used. 
-
-# 
# However, since we are not actually summarizing information but transferring the value of a large pixel into multiple small pixels, it makes sense to use either:
# 
# - Nearest neighbor resampling (`rasterio.enums.Resampling.nearest`), when we want to keep the original values as-is, since modifying them would be incorrect (such as in categorical rasters)
@@ -1480,13 +1405,13 @@


# The original raster `dem.tif` was already quite detailed, so it would be difficult to see any difference when plotting it along with the disaggregation result.
# A zoom-in of a small section of the rasters works better.
-# @fig-raster-disaggregate allows us to see the top-left corner of the original raster and the disaggregated one, demonstrating the increase in the number of pixels through disaggregation.
+# @fig-raster-disaggregate shows the top-left corners of the original raster and the disaggregated one, demonstrating the increase in the number of pixels through disaggregation.

# In[ ]:


#| label: fig-raster-disaggregate
-#| fig-cap: Disaggregating a raster by a factor of 5, using bilinear tresampling. Only the a small portion (top-left corner) of the rasters is shown, to zoom-in and demonstrate the effect of disaggregation.
+#| fig-cap: Disaggregating a raster by a factor of 5, using bilinear resampling. Only a small portion (top-left corner) of the rasters is shown, to zoom-in and demonstrate the effect of disaggregation.
#| layout-ncol: 2
#| fig-subcap: 
#|   - Original
@@ -1507,22 +1432,20 @@
# There are several methods for estimating values for a raster with different resolutions/origins (@fig-raster-resample).
# The main resampling methods include:
# 
-# - Nearest neighbor: assigns the value of the nearest cell of the original raster to the cell of the target one. This is a fast simple technique that is usually suitable for resampling categorical rasters
-# - Bilinear interpolation: assigns a weighted average of the four nearest cells from the original raster to the cell of the target one. This is the fastest method that is appropriate for continuous rasters
-# - Cubic interpolation: uses values of the 16 nearest cells of the original raster to determine the output cell value, applying third-order polynomial functions. Used for continuous rasters and results in a smoother surface compared to bilinear interpolation, but is computationally more demanding
-# - Cubic spline interpolation: also uses values of the 16 nearest cells of the original raster to determine the output cell value, but applies cubic splines (piecewise third-order polynomial functions). Used for continuous rasters
-# - Lanczos windowed sinc resampling: uses values of the 36 nearest cells of the original raster to determine the output cell value. Used for continuous rasters
+# - Nearest neighbor---assigns the value of the nearest cell of the original raster to the cell of the target one. This is a fast, simple technique that is usually suitable for resampling categorical rasters
+# - Bilinear interpolation---assigns a weighted average of the four nearest cells from the original raster to the cell of the target one. This is the fastest method that is appropriate for continuous rasters
+# - Cubic interpolation---uses values of the 16 nearest cells of the original raster to determine the output cell value, applying third-order polynomial functions. 
Used for continuous rasters and results in a smoother surface compared to bilinear interpolation, but is computationally more demanding +# - Cubic spline interpolation---also uses values of the 16 nearest cells of the original raster to determine the output cell value, but applies cubic splines (piecewise third-order polynomial functions). Used for continuous rasters +# - Lanczos windowed sinc resampling---uses values of the 36 nearest cells of the original raster to determine the output cell value. Used for continuous rasters # - Additionally, we can use straightforward summary methods, taking into account all pixels that coincide with the target pixel, such as average (@fig-raster-aggregate), minimum, maximum (@fig-raster-resample), median, mode, and sum # # The above explanation highlights that only nearest neighbor resampling is suitable for categorical rasters, while all remaining methods can be used (with different outcomes) for continuous rasters. # -# -# -# With **rasterio**, resampling can be done using the [`rasterio.warp.reproject`](https://rasterio.readthedocs.io/en/stable/api/rasterio.warp.html#rasterio.warp.reproject) function . +# With **rasterio**, resampling can be done using the `rasterio.warp.reproject` function. # To clarify this naming convention, note that raster *reprojection* is not fundamentally different from *resampling*---the difference is just whether the target grid is in the same CRS as the origin (resampling) or in a different CRS (reprojection). # In other words, reprojection is *resampling* into a grid that is in a different CRS. # Accordingly, both resampling and reprojection are done using the same function `rasterio.warp.reproject`. -# We will demonstrate reprojection using `rasterio.warp.reproject` later in @sec-reprojecting-raster-geometries. +# We will demonstrate *reprojection* using `rasterio.warp.reproject` later in @sec-reprojecting-raster-geometries. # # The information required for `rasterio.warp.reproject`, whether we are resampling or reprojecting, is: # @@ -1531,25 +1454,24 @@ # # Importantly, `rasterio.warp.reproject` can work with file connections, such as a connection to an output file in write (`'w'`) mode. # This makes the function efficient for large rasters. -# -# # # The target and destination CRS are straightforward to specify, depending on our choice. -# The source transform is also available, e.g., through the `.transform` property of the source file connection. +# The source transform is also readily available, through the `.transform` property of the source file connection. # The only complicated part is to figure out the *destination transform*. # When resampling, the transform is typically derived either from a *template* raster, such as an existing raster file that we would like our origin raster to match, or from a numeric specification of our target grid (see below). # Otherwise, when the exact grid is not of importance, we can simply aggregate or disaggregate our raster as shown above (@sec-raster-agg-disagg). -# (Note that when *reprojecting*, the target transform is not as straightforward to figure out, therefore we further use the `rasterio.warp.calculate_default_transform` function to compute it, as will be shown in @sec-reprojecting-raster-geometries.) +# (Note that when *reprojecting*, the target transform is more difficult to figure out, therefore we further use the `rasterio.warp.calculate_default_transform` function to compute it, as will be shown in @sec-reprojecting-raster-geometries.) 
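+# 
+# Incidentally, the full set of methods supported by the installed **rasterio** version can be listed programmatically, by iterating over the `rasterio.enums.Resampling` enumeration (a quick illustrative check):

+# In[ ]:


+# Names of all resampling methods defined in rasterio.enums.Resampling
+[method.name for method in rasterio.enums.Resampling]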
#
# Finally, the resampling method is specified through the `resampling` parameter of `rasterio.warp.reproject`.
# The default is nearest neighbor resampling.
-# However, as mentioned above, you should be aware of the distiction between resampling methods, and choose the appropriate one according to the data type (continuous/categorical), the input and output resolution, and resampling purposes.
-# Possible arguments for [`resampling`](https://rasterio.readthedocs.io/en/stable/api/rasterio.enums.html#rasterio.enums.Resampling) include:
+# However, as mentioned above, you should be aware of the distinction between resampling methods, and choose the appropriate one according to the data type (continuous/categorical), the input and output resolution, and resampling purposes.
+# Possible arguments for `resampling` include:
#
# - `rasterio.enums.Resampling.nearest`---Nearest neighbor
# - `rasterio.enums.Resampling.bilinear`---Bilinear
# - `rasterio.enums.Resampling.cubic`---Cubic
# - `rasterio.enums.Resampling.lanczos`---Lanczos windowed
+# - `rasterio.enums.Resampling.average`---Average
# - `rasterio.enums.Resampling.mode`---Mode, i.e., most common value
# - `rasterio.enums.Resampling.min`---Minimum
# - `rasterio.enums.Resampling.max`---Maximum
@@ -1557,7 +1479,7 @@
# - `rasterio.enums.Resampling.sum`---Sum
#
# Let's demonstrate resampling into a destination grid which is specified through numeric constraints, such as the extent and resolution.
-# These could have been specified manually (such as here), or obtained from a template raster metadata that we would like to match.
+# Again, these could have been specified manually (such as here), or obtained from the metadata of a template raster that we would like to match.
# Note that the resolution of the destination grid is \~10 times more coarse (300 $m$) than the original resolution of `dem.tif` (\~30 $m$) (@fig-raster-resample).

# In[ ]:

@@ -1584,10 +1506,10 @@
dst_transform

-# Again, note that in case we needed to resample into a grid specified by an existing "template" raster, we could skip this step and simply read the transform from the template file, as in `rasterio.open('template.tif').transform`.
+# In case we needed to resample into a grid specified by an existing template raster, we could have skipped this step and simply read the transform from the template file, as in `rasterio.open('template.tif').transform`.
#
-# Now, we can move on to creating the destination file connection.
-# For that, we also have to know the raster dimensions that can be derived from the extent and the resolution.
+# We can move on to creating the destination file connection.
+# For that, we also have to know the raster dimensions, which can be derived from the extent and the resolution.

# In[ ]:

@@ -1598,7 +1520,7 @@

# Now we can create the destination file connection.
-# We are using the same metadata as the source file, except for the dimensions and the transform, which are going to be different and reflecting the resampling process.
+# We are using the same metadata as the source file, except for the dimensions and the transform, which are going to be different and reflect the resampling process.

# In[ ]:

@@ -1613,7 +1535,7 @@

# Finally, we reproject using the function `rasterio.warp.reproject`.
-# Note that the source and destination are specified using [`rasterio.band`](https://rasterio.readthedocs.io/en/latest/api/rasterio.html#rasterio.band) applied on either the file connection, reflecting the fact that we operate on a specific layer of the rasters.
+# Note that the source and destination are specified using `rasterio.band` applied on both file connections, reflecting the fact that we operate on a specific layer of the rasters. # The resampling method being used here is nearest neighbor resampling (`rasterio.enums.Resampling.nearest`). # In[ ]: @@ -1630,9 +1552,7 @@ ) -# In the end, we close the file connection and create a new file `output/dem_resample_nearest.tif` with the resampling result (@fig-raster-resample). -# -# +# In the end, we close the file connection, thus finalizing the new file `output/dem_resample_nearest.tif` with the resampling result (@fig-raster-resample). # In[ ]: @@ -1642,13 +1562,10 @@ # Here is another code section just to demonstrate a different resampling method, the maximum resampling, i.e., every new pixel gets the maximum value of all the original pixels it coincides with (@fig-raster-resample). # Note that all arguments in the `rasterio.warp.reproject` function call are identical to the previous example, except for the `resampling` method. -# -# # In[ ]: -#| eval: false dst = rasterio.open('output/dem_resample_maximum.tif', 'w', **dst_kwargs) rasterio.warp.reproject( source=rasterio.band(src, 1), @@ -1668,12 +1585,12 @@ #| label: fig-raster-resample -#| fig-cap: Visual comparison of the original raster and two different resampling methods' +#| fig-cap: The original raster `dem.tif` and two different resampling method results #| layout-ncol: 3 #| fig-subcap: -#| - Input -#| - Nearest neighbor -#| - Maximum +#| - Input +#| - Nearest neighbor +#| - Maximum # Input fig, ax = plt.subplots(figsize=(4,4)) rasterio.plot.show(src, ax=ax); @@ -1685,6 +1602,4 @@ rasterio.plot.show(rasterio.open('output/dem_resample_maximum.tif'), ax=ax); -# ## Exercises -# -# ## References +# diff --git a/code/chapters/05-raster-vector.py b/code/chapters/05-raster-vector.py index c72a1ef1..87f849d9 100644 --- a/code/chapters/05-raster-vector.py +++ b/code/chapters/05-raster-vector.py @@ -1,6 +1,10 @@ #!/usr/bin/env python # coding: utf-8 +# --- +# jupyter: python3 +# --- +# # # Raster-vector interactions {#sec-raster-vector} # # ## Prerequisites {.unnumbered} @@ -9,17 +13,17 @@ #| echo: false -import matplotlib.pyplot as plt -import pandas as pd -pd.options.display.max_rows = 6 -pd.options.display.max_columns = 6 -pd.options.display.max_colwidth = 35 -plt.rcParams['figure.figsize'] = (5, 5) +import book_options + + +# In[ ]: + + +#| echo: false +import book_options_pdf # This chapter requires importing the following packages: -# -# # In[ ]: @@ -28,6 +32,7 @@ import math import numpy as np import matplotlib.pyplot as plt +import pandas as pd import shapely import geopandas as gpd import rasterio @@ -64,13 +69,13 @@ # - Extracting raster values using different types of vector data (Section @sec-raster-extraction) # - Raster-vector conversion (@sec-rasterization and @sec-spatial-vectorization) # -# These concepts are demonstrated using data from in previous chapters, to understand their potential real-world applications. +# These concepts are demonstrated using data from previous chapters, to understand their potential real-world applications. # # ## Raster masking and cropping {#sec-raster-cropping} # # Many geographic data projects involve integrating data from many different sources, such as remote sensing images (rasters) and administrative boundaries (vectors). # Often the extent of input raster datasets is larger than the area of interest. 
-# In this case raster *masking*, *cropping*, or both, are useful for unifying the spatial extent of input data (@fig-raster-crop (b) and (c), and the following two examples, illustrate the difference between masking and cropping). +# In this case, raster *masking*, *cropping*, or both, are useful for unifying the spatial extent of input data (@fig-raster-crop (b) and (c), and the following two examples, illustrate the difference between masking and cropping). # Both operations reduce object memory use and associated computational resources for subsequent analysis steps, and may be a necessary preprocessing step before creating attractive maps involving raster data. # # We will use two layers to illustrate raster cropping: @@ -80,8 +85,6 @@ # # Both target and cropping objects must have the same projection. # Since it is easier and more precise to reproject vector layers, compared to rasters, we use the following expression to reproject (@sec-reprojecting-vector-geometries) the vector layer `zion` into the CRS of the raster `src_srtm`. -# -# # In[ ]: @@ -89,7 +92,7 @@ zion = zion.to_crs(src_srtm.crs) -# To mask the image, i.e., convert all pixels which do not intersect with the `zion` polygon to "No Data", we use the [`rasterio.mask.mask`](https://rasterio.readthedocs.io/en/stable/api/rasterio.mask.html#rasterio.mask.mask) function. +# To mask the image, i.e., convert all pixels which do not intersect with the `zion` polygon to 'No Data', we use the `rasterio.mask.mask` function. # # In[ ]: @@ -103,11 +106,9 @@ ) -# Note that we need to choose and specify a "No Data" value, within the valid range according to the data type. +# Note that we need to choose and specify a 'No Data' value, within the valid range according to the data type. # Since `srtm.tif` is of type `uint16` (how can we check?), we choose `9999` (a positive integer that is guaranteed not to occur in the raster). -# Also note that **rasterio** does not directly support **geopandas** data structures, so we need to pass a "collection" of **shapely** geometries: a `GeoSeries` (see above) or a `list` of **shapely** geometries (see next example) both work. -# -# +# Also note that **rasterio** does not directly support **geopandas** data structures, so we need to pass a 'collection' of **shapely** geometries: a `GeoSeries` (see above) or a `list` of **shapely** geometries (see next example) both work. # The output consists of two objects. # The first one is the `out_image` array with the masked values. @@ -128,9 +129,9 @@ # Note that masking (without cropping!) does not modify the raster extent. # Therefore, the new transform is identical to the original (`src_srtm.transform`). # -# Unfortunately, the `out_image` and `out_transform` objects do not contain any information indicating that `9999` represents "No Data". +# Unfortunately, the `out_image` and `out_transform` objects do not contain any information indicating that `9999` represents 'No Data'. # To associate the information with the raster, we must write it to file along with the corresponding metadata. -# For example, to write the masked raster to file, we first need to modify the "No Data" setting in the metadata. +# For example, to write the masked raster to file, we first need to modify the 'No Data' setting in the metadata. # In[ ]: @@ -150,7 +151,7 @@ new_dataset.close() -# Now we can re-import the raster and check that the "No Data" value is correctly set. +# Now we can re-import the raster and check that the 'No Data' value is correctly set. 
# In[ ]: @@ -159,7 +160,7 @@ # The `.meta` property contains the `nodata` entry. -# Now, any relevant operation (such as plotting, see @fig-raster-crop (b)) will take "No Data" into account. +# Now, any relevant operation (such as plotting, see @fig-raster-crop (b)) will take 'No Data' into account. # In[ ]: @@ -169,22 +170,22 @@ # The related operation, cropping, reduces the raster extent to the extent of the vector layer: # -# - To just crop, *without* masking, we can derive the bounding box polygon of the vector layer, and then crop using that polygon, also combined with `crop=True` (@fig-raster-crop (c)) -# - To crop *and* mask, we can use `rasterio.mask.mask`, same as above for masking, just setting `crop=True` instead of the default `crop=False` (@fig-raster-crop (d)) +# * To crop *and* mask, we can use `rasterio.mask.mask`, same as above for masking, while setting `crop=True` (@fig-raster-crop (d)) +# * To just crop, *without* masking, we can derive the bounding box polygon of the vector layer, and then crop using that polygon, also combined with `crop=True` (@fig-raster-crop (c)) # -# For the example of cropping only, the extent polygon of `zion` can be obtained as a `shapely` geometry object using the `.unary_union.envelope` property(@fig-zion-bbox). +# For the example of cropping only, the extent polygon of `zion` can be obtained as a `shapely` geometry object using `.union_all().envelope`(@fig-zion-bbox). # In[ ]: #| label: fig-zion-bbox #| fig-cap: Bounding box `'Polygon'` geometry of the `zion` layer -bb = zion.unary_union.envelope +bb = zion.union_all().envelope bb # The extent can now be used for masking. -# Here, we are also using the `all_touched=True` option so that pixels partially overlapping with the extent are also included in the output. +# Here, we are also using the `all_touched=True` option, so that pixels which are partially overlapping with the extent are also included in the output. # In[ ]: @@ -198,20 +199,13 @@ ) -# In the case of cropping, there is no particular reason to write the result to file for easier plotting, such as in the other two examples, since there are no "No Data" values (@fig-raster-crop (c)). +# In the case of cropping, there is no particular reason to write the result to file for easier plotting, such as in the other two examples, since there are no 'No Data' values (@fig-raster-crop (c)). # # ::: callout-note # As mentioned above, **rasterio** functions typically accept vector geometries in the form of `lists` of `shapely` objects. `GeoSeries` are conceptually very similar, and also accepted. However, even an individual geometry has to be in a `list`, which is why we pass `[bb]`, and not `bb`, in the above `rasterio.mask.mask` function call (the latter would raise an error). # ::: # -# -# -# Finally, the third example is where we perform crop both and mask operations, using `rasterio.mask.mask` with `crop=True`. -# -# -# -# -# +# Finally, the third example is where we perform both crop and mask operations, using `rasterio.mask.mask` with `crop=True` passing `zion.geometry`. # In[ ]: @@ -224,7 +218,7 @@ ) -# When writing the result to file, it is here crucial to update the transform and dimensions, since they were modified as a result of cropping. +# When writing the result to a file, it is here crucial to update the transform and dimensions, since they were modified as a result of cropping. 
# Also note that `out_image_mask_crop` is a three-dimensional array (even though it has one band in this case), so the numbers of rows and columns are in `.shape[1]` and `.shape[2]` (rather than `.shape[0]` and `.shape[1]`), respectively.

# In[ ]:

@@ -252,10 +246,9 @@
src_srtm_mask_crop = rasterio.open('output/srtm_masked_cropped.tif')

-out_image_mask_crop.shape

-# @fig-raster-crop shows the original raster, and the all of the masked and cropped results.
+# @fig-raster-crop shows the original raster, and the three masking and/or cropping results.

# In[ ]:

@@ -293,8 +286,8 @@
#
# In the following examples, we use a package called **rasterstats**, which is specifically aimed at extracting raster values:
#
-# - To *points* (@sec-extraction-to-points) or to *lines* (@sec-extraction-to-lines), via the [`rasterstats.point_query`](https://pythonhosted.org/rasterstats/rasterstats.html#rasterstats.point_query) function
-# - To *polygons* (@sec-extraction-to-polygons), via the [`rasterstats.zonal_stats`](https://pythonhosted.org/rasterstats/rasterstats.html#rasterstats.zonal_stats) function
+# * To *points* (@sec-extraction-to-points) or to *lines* (@sec-extraction-to-lines), via the `rasterstats.point_query` function
+# * To *polygons* (@sec-extraction-to-polygons), via the `rasterstats.zonal_stats` function
#
# ### Extraction to points {#sec-extraction-to-points}
#

@@ -305,10 +298,10 @@

#| label: fig-zion-points
-#| fig-cap: 30 point locations within the Zion National Park, with elevation in the background
+#| fig-cap: 30 point locations within the Zion National Park, with elevation in the background
fig, ax = plt.subplots()
rasterio.plot.show(src_srtm, ax=ax)
-zion_points.plot(ax=ax, color='black');
+zion_points.plot(ax=ax, color='black', edgecolor='white');

# The following expression extracts elevation values from `srtm.tif` according to `zion_points`, using `rasterstats.point_query`.

# In[ ]:

@@ -325,11 +318,11 @@
)

-# The first two arguments are the vector layer and the array with rastetr values.
-# The `nodata` and `affine` arguments are used to align the array values into the CRS, and to correctly treat "No Data" flags.
-# Finally, the `interpolate` argument controls the way that the cell values are asigned to the point; `interpolate='nearest'` typically makes more sense, as opposed to the other option `interpolate='bilinear'` which is the default.
+# The first two arguments are the vector layer and the array with raster values.
+# The `nodata` and `affine` arguments are used to align the array values into the CRS, and to correctly treat 'No Data' flags.
+# Finally, the `interpolate` argument controls the way that the cell values are assigned to the point; `interpolate='nearest'` typically makes more sense, as opposed to the other option `interpolate='bilinear'` which is the default.
#
-# Alternatively, we can pass a raster file path to `rasterstats.point_query`, in which case `nodata` and `affine` are not necessary, as the function can understand those properties from the raster file.
+# Alternatively, we can pass a raster file path to `rasterstats.point_query`, in which case `nodata` and `affine` are not necessary, as the function can understand those properties directly from the raster file.

# In[ ]:

@@ -341,10 +334,7 @@
)

-#
-#
-#
-# The resulting object is a `list` of raster values, corresponding to `zion_points`.
+# Either way, the resulting object is a `list` of raster values, corresponding to `zion_points`.
# For example, here are the elevations of the first five points.
# In[ ]: @@ -353,12 +343,6 @@ result1[:5] -# In[ ]: - - -result2[:5] - - # To get a `GeoDataFrame` with the original points geometries (and other attributes, if any), as well as the extracted raster values, we can assign the extraction result into a new column. # As you can see, both approaches give the same result. @@ -370,9 +354,6 @@ zion_points -# -# -# # The function supports extracting from just one raster band at a time. # When passing an array, we can read the required band (as in, `.read(1)`, `.read(2)`, etc.). # When passing a raster file path, we can set the band using the `band_num` argument (the default being `band_num=1`). @@ -383,7 +364,7 @@ # The typical line extraction algorithm is to extract one value for each raster cell touched by a line. # However, this particular approach is not recommended to obtain values along the transects, as it is hard to get the correct distance between each pair of extracted raster values. # -# For line extraction, a better approach is to split the line into many points (at equal distances along the line) and then extract the values for these points using the "extraction to points" technique (@sec-extraction-to-points). +# For line extraction, a better approach is to split the line into many points (at equal distances along the line) and then extract the values for these points using the 'extraction to points' technique (@sec-extraction-to-points). # To demonstrate this, the code below creates (see @sec-vector-data for recap) `zion_transect`, a straight line going from northwest to southeast of the Zion National Park. # In[ ]: @@ -416,8 +397,9 @@ print(zion_transect_utm) -# Next, we need to calculate the distances, along the line, where points are going to be generated, using [`np.arange`](https://numpy.org/doc/stable/reference/generated/numpy.arange.html). -# This is a numeric sequence starting at `0`, going up to line `.length`, in steps of `250` ($m$). +# Next, we need to calculate the distances, along the line, where points are going to be generated. +# We do this using `np.arange`. +# The result is a numeric sequence starting at `0`, going up to line `.length`, in steps of `250` ($m$). # In[ ]: @@ -426,18 +408,20 @@ distances[:7] ## First 7 distance cutoff points -# The distances cutoffs are used to sample ("interpolate") points along the line. -# The **shapely** [`.interpolate`](https://shapely.readthedocs.io/en/stable/manual.html#object.interpolate) method is used to generate the points, which then are reprojected back to the geographic CRS of the raster (EPSG:`4326`). +# The distance cutoffs are used to sample ('interpolate') points along the line. +# The **shapely** `.interpolate` method is used to generate the points, which then are reprojected back to the geographic CRS of the raster (EPSG:`4326`). # In[ ]: -zion_transect_pnt = [zion_transect_utm.interpolate(distance) for distance in distances] -zion_transect_pnt = gpd.GeoSeries(zion_transect_pnt, crs=32612).to_crs(src_srtm.crs) +#| code-overflow: wrap +zion_transect_pnt = [zion_transect_utm.interpolate(d) for d in distances] +zion_transect_pnt = gpd.GeoSeries(zion_transect_pnt, crs=32612) \ + .to_crs(src_srtm.crs) zion_transect_pnt -# Finally, we extract the elevation values for each point in our transect and combine the information with `zion_transect_pnt` (after "promoting" it to a `GeoDataFrame`, to accommodate extra attributes), using the point extraction method shown earlier (@sec-extraction-to-points). 
+# Finally, we extract the elevation values for each point in our transect and combine the information with `zion_transect_pnt` (after 'promoting' it to a `GeoDataFrame`, to accommodate extra attributes), using the point extraction method shown earlier (@sec-extraction-to-points).
# We also attach the respective distance cutoff points `distances`.

# In[ ]:

@@ -482,7 +466,7 @@
# ### Extraction to polygons {#sec-extraction-to-polygons}
#
# The final type of geographic vector object for raster extraction is polygons.
-# Like lines, polygons tend to return many raster values per polygon.
+# Like lines, polygons tend to return many raster values per vector geometry.
# For continuous rasters (@fig-raster-extract-to-polygon (a)), we typically want to generate summary statistics for raster values per polygon, for example to characterize a single region or to compare many regions.

# The generation of raster summary statistics, by polygons, is demonstrated in the code below using `rasterstats.zonal_stats`, which creates a list of summary statistics (in this case a list of length 1, since there is just one polygon).

@@ -513,17 +497,17 @@
# Because there is only one polygon in the example, a `DataFrame` with a single row is returned.
# However, if `zion` was composed of more than one polygon, we would accordingly get more rows in the `DataFrame`.
-# The result provides useful summaries, for example that the maximum height in the park is around `2661` $m$ above see level.
+# The result provides useful summaries, for example that the maximum height in the park is `2661` $m$ above sea level.
#
# Note the `stats` argument, where we determine what type of statistics are calculated per polygon.
-# Possible values other than `'mean'`, `'min'`, `'max'` are:
+# Possible values other than `'mean'`, `'min'`, and `'max'` are:
#
-# - `'count'`---The number of valid (i.e., excluding "No Data") pixels
-# - `'nodata'`---The number of pixels with 'No Data"
+# - `'count'`---The number of valid (i.e., excluding 'No Data') pixels
+# - `'nodata'`---The number of pixels with 'No Data'
# - `'majority'`---The most frequently occurring value
# - `'median'`---The median value
#
-# See the [documentation](https://pythonhosted.org/rasterstats/manual.html#statistics) of `rasterstats.zonal_stats` for the complete list.
+# See the documentation of `rasterstats.zonal_stats` for the complete list.
# Additionally, the `rasterstats.zonal_stats` function accepts user-defined functions for calculating any custom statistics.
#
# To count occurrences of categorical raster values within polygons (@fig-raster-extract-to-polygon (b)), we can use masking (@sec-raster-cropping) combined with `np.unique`, as follows.

@@ -535,13 +519,13 @@
    src_nlcd,
    zion.geometry.to_crs(src_nlcd.crs),
    crop=False,
-    nodata=9999
+    nodata=src_nlcd.nodata
)
counts = np.unique(out_image, return_counts=True)
counts

-# According to the result, for example, pixel value `2` ("Developed" class) appears in `4205` pixels within the Zion polygon.
+# According to the result, for example, the value `2` ('Developed' class) appears in `4205` pixels within the Zion polygon.
#
# @fig-raster-extract-to-polygon illustrates the two types of raster extraction to polygons described above.

# In[ ]:

@@ -569,48 +553,47 @@
#
# ## Rasterization {#sec-rasterization}
#
-#
-#
-#
# Rasterization is the conversion of vector objects into their representation in raster objects.
# Usually, the output raster is used for quantitative analysis (e.g., analysis of terrain) or modeling.
# As we saw in @sec-spatial-class, the raster data model has some characteristics that make it conducive to certain methods. # Furthermore, the process of rasterization can help simplify datasets because the resulting values all have the same spatial resolution: rasterization can be seen as a special type of geographic data aggregation. # -# The **rasterio** package contains the [`rasterio.features.rasterize`](https://rasterio.readthedocs.io/en/stable/api/rasterio.features.html#rasterio.features.rasterize) function for doing this work. -# To make it happen, we need to have the "template" grid definition, i.e., the "template" raster defining the extent, resolution and CRS of the output, in the `out_shape` (the output dimensions) and `transform` (the transformation matrix) arguments of `rasterio.features.rasterize`. +# The **rasterio** package contains the `rasterio.features.rasterize` function for doing this work. +# To make it happen, we need to have the 'template' grid definition, i.e., the 'template' raster defining the extent, resolution and CRS of the output, in the `out_shape` (the output dimensions) and `transform` (the transformation matrix) arguments of `rasterio.features.rasterize`. # In case we have an existing template raster, we simply need to query its `.shape` and `.transform`. # On the other hand, if we need to create a custom template, e.g., covering the vector layer extent with specified resolution, there is some extra work to calculate both of these objects (see next example). # # As for the vector geometries and their associated values, the `rasterio.features.rasterize` function requires the input vector shapes in the form of an iterable object of `geometry,value` pairs, where: # # - `geometry` is the given geometry (**shapely** geometry object) -# - `value` is the value to be "burned" into pixels coinciding with the geometry (`int` or `float`) +# - `value` is the value to be 'burned' into pixels coinciding with the geometry (`int` or `float`) # # Furthermore, we define how to deal with multiple values burned into the same pixel, using the `merge_alg` parameter. -# The default `merge_alg=rasterio.enums.MergeAlg.replace` means that "later" values replace "earlier" ones, i.e., the pixel gets the "last" burned value. +# The default `merge_alg=rasterio.enums.MergeAlg.replace` means that 'later' values replace 'earlier' ones, i.e., the pixel gets the 'last' burned value. # The other option `merge_alg=rasterio.enums.MergeAlg.add` means that burned values are summed, i.e., the pixel gets the sum of all burned values. # # When rasterizing lines and polygons, we also have the choice between two pixel-matching algorithms. -# The default, `all_touched=False`, implies pixels that are selected by [Bresenham's line algorithm](https://en.wikipedia.org/wiki/Bresenham%27s_line_algorithm) (for lines) or pixels whose center is within the polygon (for polygons). +# The default, `all_touched=False`, implies pixels that are selected by Bresenham's line algorithm[^bresenham] (for lines) or pixels whose center is within the polygon (for polygons). # The other option `all_touched=True`, as the name suggests, implies that all pixels intersecting with the geometry are matched. # -# Finally, we can set the `fill` value, which is the value that "unaffected" pixels get, with `fill=0` being the default. 
+# [^bresenham]: [https://en.wikipedia.org/wiki/Bresenham%27s_line_algorithm](https://en.wikipedia.org/wiki/Bresenham%27s_line_algorithm) +# +# Finally, we can set the `fill` value, which is the value that 'unaffected' pixels get, with `fill=0` being the default. # # How the `rasterio.features.rasterize` function works with all of these various parameters will be made clear in the next examples. # -# The geographic resolution of the "template" raster has a major impact on the results: if it is too low (cell size is too large), the result may miss the full geographic variability of the vector data; if it is too high, computational times may be excessive. +# The geographic resolution of the 'template' raster has a major impact on the results: if it is too low (cell size is too large), the result may miss the full geographic variability of the vector data; if it is too high, computational times may be excessive. # There are no simple rules to follow when deciding an appropriate geographic resolution, which is heavily dependent on the intended use of the results. -# Often the target resolution is imposed on the user, for example when the output of rasterization needs to be aligned to the existing raster. +# Often the target resolution is imposed on the user, for example when the output of rasterization needs to be aligned to an existing raster. # # Depending on the input data, rasterization typically takes one of two forms which we demonstrate next: # # - in *point* rasterization (@sec-rasterizing-points), we typically choose how to treat multiple points: either to summarize presence/absence, point count, or summed attribute values (@fig-rasterize-points) -# - in *line* and *polygon* rasterization (@sec-rasterizing-lines-and-polygons), there are typically no such "overlaps" and we simply "burn" attribute values, or fixed values, into pixels coinciding with the given geometries (@fig-rasterize-lines-polygons) +# - in *line* and *polygon* rasterization (@sec-rasterizing-lines-and-polygons), there are typically no such 'overlaps' and we simply 'burn' attribute values, or fixed values, into pixels coinciding with the given geometries (@fig-rasterize-lines-polygons) # # ### Rasterizing points {#sec-rasterizing-points} # -# To demonstrate point rasterization, we will prepare a "template" raster that has the same extent and CRS as the input vector data `cycle_hire_osm_projected` (a dataset on cycle hire points in London, illustrated in @fig-rasterize-points (a)) and a spatial resolution of 1000 $m$. +# To demonstrate point rasterization, we will prepare a 'template' raster that has the same extent and CRS as the input vector data `cycle_hire_osm_projected` (a dataset on cycle hire points in London, illustrated in @fig-rasterize-points (a)) and a spatial resolution of 1000 $m$. # To do that, we first take our point layer and transform it to a projected CRS. # In[ ]: @@ -648,7 +631,7 @@ # Finally, we are ready to rasterize. -# As mentioned abover, point rasterization can be a very flexible operation: the results depend not only on the nature of the template raster, but also on the pixel "activation" method, namely the way we deal with multiple points matching the same pixel. +# As mentioned above point rasterization can be a very flexible operation: the results depend not only on the nature of the template raster, but also on the pixel 'activation' method, namely the way we deal with multiple points matching the same pixel. 
# # To illustrate this flexibility, we will try three different approaches to point rasterization (@fig-rasterize-points (b)-(d)). # First, we create a raster representing the presence or absence of cycle hire points (known as presence/absence rasters). @@ -656,10 +639,6 @@ # In the **rasterio** framework, we use the `rasterio.features.rasterize` function, which requires an iterable object of `geometry,value` pairs. # In this first example, we transform the point `GeoDataFrame` into a `list` of `shapely` geometries and the (fixed) value of `1`, using list comprehension as follows. # The first five elements of the `list` are hereby printed to illustrate its structure. -# -# -# -# # In[ ]: @@ -668,11 +647,9 @@ g[:5] -# The list of `geometry,value` pairs is passed to `rasterio.features.rasterize`, along with the `shape` and `transform` which define the raster template. -# The result `ch_raster1` is an `ndarray` with the burned values of `1` where the pixel coincides with at least one point, and `0` in "unaffected" pixels. -# Note that `merge_alg=rasterio.enums.MergeAlg.replace` (the default) is used here, which means that a pixel get `1` when one or more point fall in it, or keeps the original `0` value otherwise. -# -# +# The list of `geometry,value` pairs is passed to `rasterio.features.rasterize`, along with the `out_shape` and `transform` which define the raster template. +# The result `ch_raster1` is an `ndarray` with the burned values of `1` where the pixel coincides with at least one point, and `0` in 'unaffected' pixels. +# Note that `merge_alg=rasterio.enums.MergeAlg.replace` (the default) is used here, which means that a pixel get `1` when one or more points fall in it, or keeps the original `0` value otherwise. # In[ ]: @@ -689,9 +666,6 @@ # To do that, we use the fixed value of `1` (same as in the last example), but this time combined with the `merge_alg=rasterio.enums.MergeAlg.add` argument. # That way, multiple values burned into the same pixel are *summed*, rather than replaced keeping last (which is the default). # The new output, `ch_raster2`, shows the number of cycle hire points in each grid cell. -# -# -# # In[ ]: @@ -708,10 +682,8 @@ # The cycle hire locations have different numbers of bicycles described by the capacity variable, raising the question, what is the capacity in each grid cell? # To calculate that, in our third point rasterization variant we sum the field (`'capacity'`) rather than the fixed values of `1`. -# This requires using a more complex list comprehension expression, where we also (1) extract both geometries and the attribute of interest, and (2) filter out "No Data" values, which can be done as follows. +# This requires using a more complex list comprehension expression, where we also (1) extract both geometries and the attribute of interest, and (2) filter out 'No Data' values, which can be done as follows. # You are invited to run the separate parts to see how this works; the important point is that, in the end, we get the list `g` with the `geometry,value` pairs to be burned, only that the `value` is now variable, rather than fixed, among points. -# -# # In[ ]: @@ -779,16 +751,16 @@ california -# Second, we "cast" the polygon into a `'MultiLineString'` geometry, using the [`.boundary`](https://geopandas.org/en/stable/docs/reference/api/geopandas.GeoSeries.boundary.html) property that `GeoSeries` have. +# Second, we 'cast' the polygon into a `'MultiLineString'` geometry, using the `.boundary` property that `GeoSeries` and `DataFrame`s have. 
# In[ ]: -california_borders = california.geometry.boundary +california_borders = california.boundary california_borders -# Third, we create the `transform` and `shape` describing our template raster, with a resolution of a `0.5` degree, using the same approach as in @sec-rasterizing-points. +# Third, we create the `transform` and `shape` describing our template raster, with a resolution of `0.5` degree, using the same approach as in @sec-rasterizing-points. # In[ ]: @@ -811,7 +783,7 @@ # When considering line or polygon rasterization, one useful additional argument is `all_touched`. # By default it is `False`, but when changed to `True`---all cells that are touched by a line or polygon border get a value. # Line rasterization with `all_touched=True` is demonstrated in the code below (@fig-rasterize-lines-polygons, left). -# We are also using `fill=np.nan` to set "background" values as "No Data". +# We are also using `fill=np.nan` to set 'background' values to 'No Data'. # In[ ]: @@ -821,11 +793,12 @@ out_shape=shape, transform=transform, all_touched=True, - fill=np.nan + fill=np.nan, + dtype=np.float64 ) -# Compare it to a polygon rasterization, with `all_touched=False` (the default), which selects only raster cells whose centroids are inside the selector polygon, as illustrated in @fig-rasterize-lines-polygons (right). +# Compare it to polygon rasterization, with `all_touched=False` (the default), which selects only raster cells whose centroids are inside the selector polygon, as illustrated in @fig-rasterize-lines-polygons (right). # In[ ]: @@ -834,14 +807,13 @@ [(g, 1) for g in california.geometry], out_shape=shape, transform=transform, - fill=np.nan + fill=np.nan, + dtype=np.float64 ) # To illustrate which raster pixels are actually selected as part of rasterization, we also show them as points. # This also requires the following code section to calculate the points, which we explain in @sec-spatial-vectorization. -# -# # In[ ]: @@ -884,7 +856,7 @@ # ## Spatial vectorization {#sec-spatial-vectorization} # # Spatial vectorization is the counterpart of rasterization (@sec-rasterization). -# It involves converting spatially continuous raster data into spatially discrete vector data such as points, lines or polygons. +# It involves converting spatially continuous raster data into spatially discrete vector data such as points, lines, or polygons. # There are three standard methods to convert a raster to a vector layer, which we cover next: # # - Raster to polygons (@sec-raster-to-polygons)---converting raster cells to rectangular polygons, representing pixel areas @@ -895,19 +867,15 @@ # # ### Raster to polygons {#sec-raster-to-polygons} # -# The [`rasterio.features.shapes`](https://rasterio.readthedocs.io/en/stable/api/rasterio.features.html#rasterio.features.shapes) gives access to raster pixels as polygon geometries, along with the associated raster values. +# The `rasterio.features.shapes` gives access to raster pixels as polygon geometries, along with the associated raster values. # The returned object is a generator (see note in @sec-spatial-subsetting-raster), yielding `geometry,value` pairs. -# -# # # For example, the following expression returns a generator named `shapes`, referring to the pixel polygons. 
# In[ ]:


shapes = rasterio.features.shapes(rasterio.band(src_grain, 1))
shapes


@@ -919,7 +887,7 @@
pol = list(shapes)


-# Each element in `pol` is a `tuple` of length 2, containing the GeoJSON-like `dict`---representing the polygon geometry and the value of the pixel(s)---which comprise the polygon.
+# Each element in `pol` is a `tuple` of length 2, containing the GeoJSON-like `dict` representing the polygon geometry, and the value of the pixel(s) which comprise the polygon.
# For example, here is the first element of `pol`.

# In[ ]:


pol[0]


-#
-#
-#
# ::: callout-note
-# Note that, when transforming a raster cell into a polygon, five coordinate pairs need to be kept in memory to represent its geometry (explaining why rasters are often fast compared with vectors!).
+# Note that, when transforming a raster cell into a polygon, five coordinate pairs need to be kept in memory to represent its geometry (explaining why rasters are often fast compared with vectors!).
# :::
#
# To transform the `list` coming out of `rasterio.features.shapes` into the familiar `GeoDataFrame`, we need a few more steps of data reshaping.
-# First, we apply the [`shapely.geometry.shape`](https://shapely.readthedocs.io/en/stable/manual.html#shapely.geometry.shape) function to go from a `list` of GeoJSON-like `dict`s to a `list` of `shapely` geometry objects.
+# First, we apply the `shapely.geometry.shape` function to go from a `list` of GeoJSON-like `dict`s to a `list` of `shapely` geometry objects.
# The `list` can then be converted to a `GeoSeries` (see @sec-vector-layer-from-scratch).
-#
-#

# In[ ]:


geom = [shapely.geometry.shape(i[0]) for i in pol]
geom = gpd.GeoSeries(geom, crs=src_grain.crs)
geom


-# The values can also be extracted from the `rasterio.features.shapes` and turned into a corresponding `Series`.
+# The values can also be extracted from the `rasterio.features.shapes` result and turned into a corresponding `Series`.

# In[ ]:

@@ -980,11 +943,11 @@

# As highlighted using `edgecolor='black'`, neighboring pixels sharing the same raster value are dissolved into larger polygons.
# The `rasterio.features.shapes` function unfortunately does not offer a way to avoid this type of dissolving.
-# One [suggestion](https://gis.stackexchange.com/questions/455980/vectorizing-all-pixels-as-separate-polygons-using-rasterio#answer-456251) is to add unique values between `0` and `0.9999` to all pixels, convert to polygons, and then get back to the original values using [`np.floor`](https://numpy.org/doc/stable/reference/generated/numpy.floor.html).
+# One [suggestion](https://gis.stackexchange.com/questions/455980/vectorizing-all-pixels-as-separate-polygons-using-rasterio#answer-456251) is to add unique values between `0` and `0.9999` to all pixels, convert to polygons, and then get back to the original values using `np.floor`.
#
# ### Raster to points {#sec-raster-to-points}
#
-# To transform a raster to points, we can use the [`rasterio.transform.xy`](https://rasterio.readthedocs.io/en/latest/api/rasterio.transform.html#rasterio.transform.xy).
+# To transform a raster to points, we can use the `rasterio.transform.xy` function.
# As the name suggests, the function accepts row and column indices, and transforms them into x- and y-coordinates (using the raster's transformation matrix).
# For example, the coordinates of the top-left pixel can be calculated passing the `(row,col)` indices of `(0,0)`.
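# As a minimal sketch of such a call (assuming the `src_elev` file connection used below; note that, by default, the returned coordinates refer to the cell *center*, i.e., `offset='center'`):

# In[ ]:


rasterio.transform.xy(src_elev.transform, 0, 0)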
@@ -1008,7 +971,7 @@ # ::: # # To generalize the above expression to calculate the coordinates of *all* pixels, we first need to generate a grid of all possible row/column index combinations. -# This can be done using [`np.meshgrid`](https://numpy.org/doc/stable/reference/generated/numpy.meshgrid.html), as follows. +# This can be done using `np.meshgrid`, as follows. # In[ ]: @@ -1066,8 +1029,8 @@ pnt -# This "high-level" workflow, like many other **rasterio**-based workflows covered in the book, is a commonly used one but lacking from the package itself. -# From the user perspective, it may be a good idea to wrap the workflow into a function (e.g., `raster_to_points(src)`, returning a `GeoDataFrame`), to be re-used whenever we need it. +# This 'high-level' workflow, like many other **rasterio**-based workflows covered in the book, is a commonly used one but lacking from the package itself. +# From the user's perspective, it may be a good idea to wrap the workflow into a function (e.g., `raster_to_points(src)`, returning a `GeoDataFrame`), to be re-used whenever we need it. # # @fig-raster-to-points shows the input raster and the resulting point layer. @@ -1086,17 +1049,17 @@ rasterio.plot.show(src_elev, ax=ax); # Points fig, ax = plt.subplots() -pnt.plot(column='value', legend=True, ax=ax) -rasterio.plot.show(src_elev, cmap='Greys', ax=ax); +pnt.plot(column='value', legend=True, edgecolor='black', ax=ax) +rasterio.plot.show(src_elev, alpha=0, ax=ax); -# Note that "No Data" pixels can be filtered out from the conversion, if necessary (see @sec-distance-to-nearest-geometry). +# Note that 'No Data' pixels can be filtered out from the conversion, if necessary (see @sec-distance-to-nearest-geometry). # # ### Raster to contours {#sec-raster-to-contours} # -# Another common type of spatial vectorization is the creation of contour lines representing lines of continuous height or temperatures (*isotherms*), for example. +# Another common type of spatial vectorization is the creation of contour lines, representing lines of continuous height or temperatures (*isotherms*), for example. # We will use a real-world digital elevation model (DEM) because the artificial raster `elev.tif` produces parallel lines (task for the reader: verify this and explain why this happens). -# Plotting contour lines is straightforward, using the `contour=True` option of `rasterio.plot.show` (@fig-raster-contours1). +# *Plotting* contour lines is straightforward, using the `contour=True` option of `rasterio.plot.show` (@fig-raster-contours1). # In[ ]: @@ -1114,12 +1077,12 @@ ); -# Unfortunately, `rasterio` does not provide any way of extracting the contour lines in the form of a vector layer, for uses other than plotting. +# Unfortunately, **rasterio** does not provide any way of extracting the contour lines in the form of a vector layer, for uses other than plotting. # # There are two possible workarounds: # -# 1. Using `gdal_contour` on the [command line](https://gdal.org/programs/gdal_contour.html) (see below), or through its Python interface [**osgeo**](https://gis.stackexchange.com/questions/360431/how-can-i-create-contours-from-geotiff-and-python-gdal-rasterio-etc-into-sh) -# 2. Writing a custom function to export contour coordinates generated by, e.g., [**matplotlib**](https://www.tutorialspoint.com/how-to-get-coordinates-from-the-contour-in-matplotlib) or [**skimage**](https://gis.stackexchange.com/questions/268331/how-can-i-extract-contours-from-a-raster-with-python) +# 1. 
Using `gdal_contour` on the command line (see below), or through its Python interface **osgeo** +# 2. Writing a custom function to export contour coordinates generated by, e.g., **matplotlib** or **skimage** # # We demonstrate the first approach, using `gdal_contour`. # Although we deviate from the Python-focused approach towards more direct interaction with GDAL, the benefit of `gdal_contour` is the proven algorithm, customized to spatial data, and with many relevant options. @@ -1160,25 +1123,23 @@ # ## Distance to nearest geometry {#sec-distance-to-nearest-geometry} # -# Calculating a raster of distances to the nearest geometry is an example of a "global" raster operation (@sec-global-operations-and-distances). +# Calculating a raster of distances to the nearest geometry is an example of a 'global' raster operation (@sec-global-operations-and-distances). # To demonstrate it, suppose that we need to calculate a raster representing the distance to the nearest coast in New Zealand. -# This example also wraps many of the concepts introduced in this chapter and in previous chapter, such as raster aggregation (@sec-raster-agg-disagg), raster conversion to points (@sec-raster-to-points), and rasterizing points (@sec-rasterizing-points). +# This example also wraps many of the concepts introduced in this chapter and in previous chapters, such as raster aggregation (@sec-raster-agg-disagg), raster conversion to points (@sec-raster-to-points), and rasterizing points (@sec-rasterizing-points). # -# For the coastline, we will dissolve the New Zealand administrative division polygon layer and "extract" the boundary as a `'MultiLineString'` geometry. +# For the coastline, we will dissolve the New Zealand administrative division polygon layer and 'extract' the boundary as a `'MultiLineString'` geometry (@fig-nz-coastline). Note that `.dissolve(by=None)` (@sec-vector-attribute-aggregation) calls `.union_all` on all geometries (i.e., aggregates everything into one group), which is what we want to do here. # In[ ]: -coastline = gpd.GeoSeries(nz.unary_union, crs=nz.crs) \ - .to_crs(src_nz_elev.crs) \ - .boundary +#| label: fig-nz-coastline +#| fig-cap: New Zealand coastline geometry +coastline = nz.dissolve().to_crs(src_nz_elev.crs).boundary.iloc[0] coastline -# For a "template" raster, we will aggregate the New Zealand DEM, in the `nz_elev.tif` file, to 5 times coarser resolution. -# The code section below follows the aggeregation example in @sec-raster-agg-disagg. -# -# +# For a 'template' raster, we will aggregate the New Zealand DEM, in the `nz_elev.tif` file, to 5 times coarser resolution. +# The code section below follows the aggregation example in @sec-raster-agg-disagg. # In[ ]: @@ -1206,11 +1167,11 @@ #| label: fig-raster-distances1 -#| fig-cap: Template with cell IDs to calculate distance to nearest geometry +#| fig-cap: Template to calculate distance to nearest geometry (coastlines, in red) fig, ax = plt.subplots() rasterio.plot.show(r, transform=new_transform, ax=ax) -gpd.GeoSeries(coastline).plot(ax=ax, edgecolor='black'); +gpd.GeoSeries(coastline).plot(ax=ax, edgecolor='red'); # To calculate the actual distances, we must convert each pixel to a vector (point) geometry. @@ -1252,16 +1213,13 @@ image = rasterio.features.rasterize( distances, out_shape=r.shape, - dtype=np.float_, + dtype=np.float64, transform=new_transform, fill=np.nan ) image -# -# -# # The final result, a raster of distances to the nearest coastline, is shown in @fig-raster-distances2. 
# In[ ]: @@ -1271,7 +1229,7 @@ #| fig-cap: Distance to nearest coastline in New Zealand fig, ax = plt.subplots() rasterio.plot.show(image, transform=new_transform, ax=ax) -gpd.GeoSeries(coastline).plot(ax=ax, edgecolor='black'); +gpd.GeoSeries(coastline).plot(ax=ax, edgecolor='red'); -# ## Exercises +# diff --git a/code/chapters/06-reproj.py b/code/chapters/06-reproj.py index 575132bd..49985254 100644 --- a/code/chapters/06-reproj.py +++ b/code/chapters/06-reproj.py @@ -1,6 +1,10 @@ #!/usr/bin/env python # coding: utf-8 +# --- +# jupyter: python3 +# --- +# # # Reprojecting geographic data {#sec-reproj-geo-data} # # ## Prerequisites {.unnumbered} @@ -9,12 +13,14 @@ #| echo: false -import matplotlib.pyplot as plt -import pandas as pd -pd.options.display.max_rows = 6 -pd.options.display.max_columns = 6 -pd.options.display.max_colwidth = 35 -plt.rcParams['figure.figsize'] = (5, 5) +import book_options + + +# In[ ]: + + +#| echo: false +import book_options_pdf # This chapter requires importing the following packages: @@ -34,14 +40,12 @@ import rasterio.warp -# -# -# # It also relies on the following data files: # In[ ]: +#| warning: false src_srtm = rasterio.open('data/srtm.tif') src_nlcd = rasterio.open('data/nlcd.tif') zion = gpd.read_file('data/zion.gpkg') @@ -55,49 +59,40 @@ # This chapter builds on that knowledge and goes further. # It demonstrates how to set and transform geographic data from one CRS to another and, furthermore, highlights specific issues that can arise due to ignoring CRSs that you should be aware of, especially if your data is stored with lon/lat coordinates. # -# -# -# # It is important to know if your data is in a projected or geographic coordinate system, and the consequences of this for geometry operations. # However, if you know the CRS of your data and the consequences for geometry operations (covered in the next section), CRSs should just work behind the scenes: people often suddenly need to learn about CRSs when things go wrong. # Having a clearly defined project CRS that all project data is in, plus understanding how and why to use different CRSs, can ensure that things do not go wrong. # Furthermore, learning about coordinate systems will deepen your knowledge of geographic datasets and how to use them effectively. # -# This chapter teaches the fundamentals of CRSs, demonstrates the consequences of using different CRSs (including what can go wrong), and how to "reproject" datasets from one coordinate system to another. +# This chapter teaches the fundamentals of CRSs, demonstrates the consequences of using different CRSs (including what can go wrong), and how to 'reproject' datasets from one coordinate system to another. # In the next section we introduce CRSs in Python, followed by @sec-querying-and-setting-coordinate-systems which shows how to get and set CRSs associated with spatial objects. # @sec-geometry-operations-on-projected-and-unprojected-data demonstrates the importance of knowing what CRS your data is in with reference to a worked example of creating buffers. # We tackle questions of when to reproject and which CRS to use in @sec-when-to-reproject and @sec-which-crs-to-use, respectively. -# Finally, we cover reprojecting vector and raster objects in @sec-reprojecting-vector-geometries and @sec-reprojecting-raster-geometries and modifying map projections in @sec-custom-map-projections. 
+# Finally, we cover reprojecting vector and raster objects in @sec-reprojecting-vector-geometries and @sec-reprojecting-raster-geometries and using custom projections in @sec-custom-map-projections. # # ## Coordinate Reference Systems {#sec-coordinate-reference-systems} # -# Most modern geographic tools that require CRS conversions, including Python packages and desktop GIS software such as QGIS, interface with [PROJ](https://proj.org/), an open source C++ library that "transforms coordinates from one coordinate reference system (CRS) to another". +# Most modern geographic tools that require CRS conversions, including Python packages and desktop GIS software such as QGIS, interface with PROJ, an open source C++ library that 'transforms coordinates from one coordinate reference system (CRS) to another'. # CRSs can be described in many ways, including the following: # -# - Simple, yet potentially ambiguous, statements, such as, "it's in lon/lat coordinates" -# - Formalized, yet now outdated, 'proj-strings' such as `+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs` -# - With an identifying 'authority:code' text string such as `EPSG:4326` +# - Simple, yet potentially ambiguous, statements, such as 'it's in lon/lat coordinates' +# - Formalized, yet now outdated, 'proj-strings', such as `+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs` +# - With an identifying 'authority:code' text string, such as `EPSG:4326` # # Each refers to the same thing: the 'WGS84' coordinate system that forms the basis of Global Positioning System (GPS) coordinates and many other datasets. # But which one is correct? # -# The short answer is that the third way to identify CRSs is correct: `EPSG:4326` is understood by **geopandas** and **rasterio** packages covered in this book, plus many other software projects for working with geographic data including [QGIS](https://docs.qgis.org/3.22/en/docs/user_manual/working_with_projections/working_with_projections.html) and [PROJ](https://proj.org/development/quickstart.html). +# The short answer is that the third way to identify CRSs is correct: `EPSG:4326` is understood by **geopandas** and **rasterio** packages covered in this book, plus many other software projects for working with geographic data including QGIS and PROJ. # `EPSG:4326` is future-proof. -# Furthermore, although it is machine readable, unlike the proj-string representation `EPSG:4326` is short, easy to remember and highly 'findable' online (searching for `EPSG:4326` yields a dedicated page on the website [epsg.io](https://epsg.io/4326), for example). -# The more concise identifier `4326` is also understood by **geopandas** and **rasterio**, but we recommend the more explicit `AUTHORITY:CODE` representation to prevent ambiguity and to provide context. -# -# +# Furthermore, although it is machine readable, unlike the proj-string representation `EPSG:4326` is short, easy to remember and highly 'findable' online (searching for `EPSG:4326` yields a dedicated page on the website epsg.io[^epsgio], for example). +# The more concise identifier `4326` is also understood by **geopandas** and **rasterio**. +# +# [^epsgio]: [https://epsg.io/4326](https://epsg.io/4326) # -# The longer answer is that none of the three descriptions are sufficient, and more detail is needed for unambiguous CRS handling and transformations: due to the complexity of CRSs, it is not possible to capture all relevant information about them in such short text strings. 
+# The longer answer is that none of the three descriptions is sufficient, and more detail is needed for unambiguous CRS handling and transformations: due to the complexity of CRSs, it is not possible to capture all relevant information about them in such short text strings. # For this reason, the Open Geospatial Consortium (OGC, which also developed the Simple Features specification that the **geopandas** package implements) developed an open standard format for describing CRSs that is called WKT (Well Known Text). -# -# -# This is detailed in a [100+ page document](https://portal.opengeospatial.org/files/18-010r7) that "defines the structure and content of a text string implementation of the abstract model for coordinate reference systems described in ISO 19111:2019" [@opengeospatialconsortium_wellknown_2019]. -# -# -# The [WKT representation](https://en.wikipedia.org/wiki/Well-known_text_representation_of_coordinate_reference_systems) of the WGS84 CRS, which has the identifier `EPSG:4326` is as follows. -# -# +# This is detailed in a 100+ page document that 'defines the structure and content of a text string implementation of the abstract model for coordinate reference systems described in ISO 19111:2019' [@opengeospatialconsortium_wellknown_2019]. +# The WKT representation of the WGS84 CRS, which has the identifier `EPSG:4326` is as follows. # In[ ]: @@ -106,11 +101,9 @@ print(crs.to_wkt(pretty=True)) -# -# -# The output of the command shows how the CRS identifier (also known as a Spatial Reference Identifier or [SRID](https://postgis.net/workshops/postgis-intro/projection.html)) works: it is simply a look-up, providing a unique identifier associated with a more complete WKT representation of the CRS. +# The output of the command shows how the CRS identifier (also known as a Spatial Reference Identifier, or SRID) works: it is simply a look-up, providing a unique identifier associated with a more complete WKT representation of the CRS. # This raises the question: what happens if there is a mismatch between the identifier and the longer WKT representation of a CRS? -# On this point Open Geospatial Consortium [@opengeospatialconsortium_wellknown_2019] is clear, the verbose WKT representation takes precedence over the [identifier](https://docs.opengeospatial.org/is/18-010r7/18-010r7.html#37): +# On this point Open Geospatial Consortium [@opengeospatialconsortium_wellknown_2019] is clear, the verbose WKT representation takes precedence over the identifier: # # > Should any attributes or values given in the cited identifier be in conflict with attributes or values given explicitly in the WKT description, the WKT values shall prevail. # @@ -128,25 +121,21 @@ # WKT strings are exhaustive, detailed, and precise, allowing for unambiguous CRSs storage and transformations. # They contain all relevant information about any given CRS, including its datum and ellipsoid, prime meridian, projection, and units. -# -# # # Recent PROJ versions (6+) still allow use of proj-strings to define coordinate operations, but some proj-string keys (`+nadgrids`, `+towgs84`, `+k`, `+init=epsg:`) are either no longer supported or are discouraged. -# -# # Additionally, only three datums (i.e., WGS84, NAD83, and NAD27) can be directly set in proj-string. -# Longer explanations of the evolution of CRS definitions and the PROJ library can be found in [@bivand_progress_2021], Chapter 2 of [@pebesma_spatial_2022], and a [blog post by Floris Vanderhaeghe](https://inbo.github.io/tutorials/tutorials/spatial_crs_coding/). 
+# Longer explanations of the evolution of CRS definitions and the PROJ library can be found in [@bivand_progress_2021], Chapter 2 of [@pebesma_spatial_2022], and a blog post by Floris Vanderhaeghe[^floris_blog]. +# +# [^floris_blog]: [https://inbo.github.io/tutorials/tutorials/spatial_crs_coding/](https://inbo.github.io/tutorials/tutorials/spatial_crs_coding/) # # ::: callout-note -# As outlined in the [PROJ documentation](https://proj.org/development/reference/cpp/cpp_general.html), there are different versions of the WKT CRS format including WKT1 and two variants of WKT2, the latter of which (WKT2, 2018 specification) corresponds to the ISO 19111:2019 [@opengeospatialconsortium_wellknown_2019]. +# As outlined in the PROJ documentation, there are different versions of the WKT CRS format including WKT1 and two variants of WKT2, the latter of which (WKT2, 2018 specification) corresponds to the ISO 19111:2019 [@opengeospatialconsortium_wellknown_2019]. # ::: -# -# # # ## Querying and setting coordinate systems {#sec-querying-and-setting-coordinate-systems} # # Let's see how CRSs are stored in Python spatial objects and how they can be queried and set. -# First we will look at getting and setting CRSs in vector geographic data objects. +# First, we will look at getting and setting CRSs in vector geographic data objects. # Consider the `GeoDataFrame` object named `world`, imported from a file `world.gpkg` that represents countries worldwide. # Its CRS can be retrieved using the `.crs` property. @@ -182,8 +171,7 @@ # In[ ]: -world.crs.axis_info[0].unit_name -world.crs.axis_info[1].unit_name +world.crs.axis_info[0].unit_name, world.crs.axis_info[1].unit_name # `AUTHORITY` and `CODE` strings may be obtained with the `.to_authority()` method. @@ -194,12 +182,6 @@ world.crs.to_authority() -# -# -# -# -# -# # In cases when a coordinate reference system (CRS) is missing or the wrong CRS is set, the `.set_crs` method can be used on a `GeoSeries` or a `GeoDataFrame` to set it. # The CRS can be specified using an EPSG code as the first argument. # In case the object already has a different CRS definition, we must also specify `allow_override=True` to replace it (otherwise we get an error). @@ -225,8 +207,6 @@ # Replacing the CRS definition for a **rasterio** file connection is typically not necessary, because it is not considered in any operation; only the transformation matrix and coordinates are. # One exception is when writing the raster, in which case we need to construct the metadata of the raster file to be written, and therein specify the CRS anyway (@sec-raster-from-scratch). # However, if we, for some reason, need to change the CRS definition in the file connection metadata, we can do that when opening the file in `r+` (reading and writing) mode. -# -# # To demonstrate, we will create a copy of the `nlcd.tif` file, named `nlcd2.tif`, # In[ ]: @@ -245,7 +225,7 @@ # ::: callout-note -# The `rasterio.open` function `mode`s generally follows Python's standard [file connection](https://docs.python.org/3/tutorial/inputoutput.html#reading-and-writing-files) modes, with possible arguments being `'r'` (read), `'w'` (write), `'r+'` (read/write), and `'w+'` (write/read) (the `'a'` "append" mode is irrelevant for raster files). In the book, and in general, the most commonly used modes are `'r'` (read) and `'w'` (write). `'r+'`, used in the last example, means 'read/write'. 
Unlike with `'w'`, `'r+'` does not delete the existing content on open, making `'r+'` suitable for making changes in an existing file (such as in the last example, where we replaced the CRS).
+# The `rasterio.open` function's `mode` argument generally follows Python's standard file connection modes, with possible values being `'r'` (read), `'w'` (write), `'r+'` (read/write), and `'w+'` (write/read) (the `'a'` 'append' mode is irrelevant for raster files). In the book, and in general, the most commonly used modes are `'r'` (read) and `'w'` (write). `'r+'`, used in the last example, means 'read/write'. Unlike with `'w'`, `'r+'` does not delete the existing content on open, making `'r+'` suitable for making changes in an existing file (such as here, replacing the CRS).
 # :::
 #
 # To replace the definition with a new one, such as `EPSG:3857`, we can use the `.crs` method, as shown below.
@@ -267,7 +247,7 @@

 # Importantly, the `.set_crs` (for vector layers) or the assignment to `.crs` (for rasters), as shown above, do not alter coordinates' values or geometries.
 # Their role is only to set a metadata information about the object CRS.
-# Consequently, the objects we created, `world3`, `world4`, and `src_nlcd2` are "incorrect", in the sense that the geometries are in fact given in a different CRS than specified in the associated CRS definition.
+# Consequently, the objects we created, `world3`, `world4`, and `src_nlcd2`, are 'incorrect', in the sense that the geometries are in fact given in a different CRS than specified in the associated CRS definition.
 #
 # In some cases, the CRS of a geographic object is unknown, as is the case in the London dataset created in the code chunk below, building on the example of London introduced in @sec-vector-layer-from-scratch.
@@ -289,10 +269,8 @@

 # This implies that **geopandas** does not know what the CRS is and is unwilling to guess.
-# Unless a CRS is manually specified or is loaded from a source that has CRS metadata, **geopandas** does not make any explicit assumptions about which coordinate systems, other than to say "I don't know".
-# This behavior makes sense given the diversity of available CRSs but differs from some approaches, such as the GeoJSON file format specification, which makes the simplifying [assumption](https://datatracker.ietf.org/doc/html/rfc7946#section-4) that all coordinates have a lon/lat CRS: `EPSG:4326`.
-#
-#
+# Unless a CRS is manually specified or is loaded from a source that has CRS metadata, **geopandas** does not make any explicit assumptions about which coordinate system is used, other than to say 'I don't know'.
+# This behavior makes sense given the diversity of available CRSs but differs from some approaches, such as the GeoJSON file format specification, which makes the simplifying assumption that all coordinates have a lon/lat CRS: `EPSG:4326`.
 #
 # A CRS can be added to `GeoSeries` or `GeoDataFrame` objects using the `.set_crs` method, as mentioned above.
@@ -302,9 +280,6 @@

 lnd_layer = lnd_layer.set_crs(4326)


-#
-#
-#
 # When working with **geopandas** and **rasterio**, datasets without a specified CRS are not an issue in most workflows, since only the coordinates are considered.
 # It is up to the user to make sure that, when working with more than one layer, all of the coordinates are given in the same CRS (whether specified or not).
 # When exporting the results, though, it is important to keep the CRS definition in place, because other software typically *do* use, and require, the CRS definition in calculation.
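+#
+# A defensive pattern in such workflows is therefore to check for a missing CRS definition before exporting or combining layers, and to label (not transform) the data when it is absent; a minimal sketch, reusing the `lnd_layer` object from above:
+
+# In[ ]:
+
+
+#| eval: false
+if lnd_layer.crs is None:
+    lnd_layer = lnd_layer.set_crs(4326)  # sets metadata only; coordinate values are unchanged
+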
@@ -313,7 +288,7 @@

 # ## Geometry operations on projected and unprojected data {#sec-geometry-operations-on-projected-and-unprojected-data}
 #
 # The **geopandas** package, through its dependency **shapely**, assumes planar geometry and works with distance/area values assumed to be in CRS units.
-# In fact, the CRS definition is typically ignored, and the respective functions (such as in plotting and distance calculations) are applied on the "bare" **shapely** geometries.
+# In fact, the CRS definition is typically ignored, and the respective functions (such as in plotting and distance calculations) are applied on the 'bare' **shapely** geometries.
 # Accordingly, it is crucial to make sure that:
 #
 # - Geometric calculations are only applied in projected CRS
@@ -369,12 +344,12 @@
 uk = world[world['name_long'] == 'United Kingdom']
 uk_proj = uk.to_crs(27700)
 # Around projected point
-base = uk_proj.plot(color='none', edgecolor='darkgrey')
-lnd_layer_proj_buff.plot(color='lightgrey', edgecolor='black', ax=base)
+base = uk_proj.plot(color='none', edgecolor='darkgrey', linewidth=0.5)
+lnd_layer_proj_buff.plot(color='grey', edgecolor='black', alpha=0.5, ax=base)
 lnd_layer_proj.plot(color='red', ax=base);
 # Around point in lon/lat
-base = uk.plot(color='none', edgecolor='darkgrey')
-lnd_layer_buff.plot(color='lightgrey', edgecolor='black', ax=base)
+base = uk.plot(color='none', edgecolor='darkgrey', linewidth=0.5)
+lnd_layer_buff.plot(color='grey', edgecolor='black', alpha=0.5, ax=base)
 lnd_layer.plot(color='red', ax=base);
@@ -392,7 +367,7 @@

 # ## When to reproject? {#sec-when-to-reproject}
 #
 # The previous section showed how to set the CRS manually, with an expression such as `lnd_layer.set_crs(4326)`.
-# In real world applications, however, CRSs are usually set automatically when data is read-in.
+# In real-world applications, however, CRSs are usually set automatically when data is read in.
 # Thus, in many projects the main CRS-related task is to transform objects, from one CRS into another.
 # But when should data be transformed?
 # And into which CRS?
@@ -400,7 +375,7 @@
 # However, there are some general principles provided in this section that can help you decide.
 #
 # First, it's worth considering when to transform.
-# In some cases transformation to a geographic CRS is essential, such as when publishing data online (for example, a Leaflet-based map using Python package [**folium**](https://python-visualization.github.io/folium/latest/)).
+# In some cases, transformation to a geographic CRS is essential, such as when publishing data online (for example, a Leaflet-based map using Python package **folium**).
 # Another case is when two objects with different CRSs must be compared or combined, as shown when we try to find the distance between two objects with different CRSs.

 # In[ ]:


 lnd_layer.distance(lnd_layer_proj)


-# Here, we got a meaningless result, and a warning.
+# Here, we got a meaningless distance value of `559715`, and a warning.
 #
 # To make the `lnd_layer` and `lnd_layer_proj` objects geographically comparable, one of them must be transformed into the CRS of the other.
 # But which CRS to use?
 # The answer depends on context: many projects, especially those involving web mapping, require outputs in `EPSG:4326`, in which case it is worth transforming the projected object.
 # If, however, the project requires geometric calculations, implying planar geometry, e.g., calculating buffers (@sec-geometry-operations-on-projected-and-unprojected-data), it is necessary to transform data with a geographic CRS into an equivalent object with a projected CRS, such as the British National Grid (`EPSG:27700`).
 # That is the subject of @sec-which-crs-to-use.
-#
-#
 #
 # ## Which CRS to use? {#sec-which-crs-to-use}
 #
-# The question of which CRS is tricky, and there is rarely a "right" answer: "There exist no all-purpose projections, all involve distortion when far from the center of the specified frame" [@bivand_applied_2013].
+# The question of which CRS to use is tricky, and there is rarely a 'right' answer: 'There exist no all-purpose projections, all involve distortion when far from the center of the specified frame' [@bivand_applied_2013].
 # Additionally, you should not be attached just to one projection for every task.
 # It is possible to use one projection for some part of the analysis, another projection for a different part, and even some other for visualization.
 # Always try to pick the CRS that serves your goal best!
 #
-# When selecting *geographic* CRSs, the answer is often [WGS84](https://en.wikipedia.org/wiki/World_Geodetic_System#A_new_World_Geodetic_System:_WGS_84).
+# When selecting *geographic* CRSs, the answer is often WGS84.
 # It is used not only for web mapping, but also because GPS datasets and thousands of raster and vector datasets are provided in this CRS by default.
 # WGS84 is the most common CRS in the world, so it is worth knowing its EPSG code: `4326`.
-# This "magic number" can be used to convert objects with unusual projected CRSs into something that is widely understood.
+# This 'magic number' can be used to convert objects with unusual projected CRSs into something that is widely understood.
 #
 # What about when a *projected* CRS is required?
-# In some cases, it is not something that we are free to decide: "often the choice of projection is made by a public mapping agency" [@bivand_applied_2013].
+# In some cases, it is not something that we are free to decide: 'often the choice of projection is made by a public mapping agency' [@bivand_applied_2013].
 # This means that when working with local data sources, it is likely preferable to work with the CRS in which the data was provided, to ensure compatibility, even if the official CRS is not the most accurate.
 # The example of London was easy to answer because the British National Grid (with its associated EPSG code `27700`) is well known, and the original dataset (`lnd_layer`) already had that CRS.
 #
-# A commonly used default is Universal Transverse Mercator ([UTM](https://en.wikipedia.org/wiki/Universal_Transverse_Mercator_coordinate_system)), a set of CRSs that divides the Earth into 60 longitudinal wedges and 20 latitudinal segments.
+# A commonly used default is Universal Transverse Mercator (UTM), a set of CRSs that divide the Earth into 60 longitudinal wedges and 20 latitudinal segments.
 # The transverse Mercator projection used by UTM CRSs is conformal but distorts areas and distances with increasing severity with distance from the center of the UTM zone.
-# Documentation from the GIS software Manifold therefore suggests restricting the longitudinal extent of projects using UTM zones to 6 degrees from the central meridian (source: [manifold.net](http://www.manifold.net/doc/mfd9/universal_transverse_mercator_projection.htm)).
+# Documentation from the GIS software Manifold therefore suggests restricting the longitudinal extent of projects using UTM zones to 6 degrees from the central meridian[^manifold_recommendation]. # Therefore, we recommend using UTM only when your focus is on preserving angles for a relatively small area! # -# Almost every place on Earth has a UTM code, such as `'60H'` which refers, amoung others, to northern New Zealand. +# [^manifold_recommendation]: [http://www.manifold.net/doc/mfd9/universal_transverse_mercator_projection.htm](http://www.manifold.net/doc/mfd9/universal_transverse_mercator_projection.htm) +# +# Almost every place on Earth has a UTM code, such as `'60H'` which refers, among others, to northern New Zealand. # UTM EPSG codes run sequentially from `32601` to `32660` for northern hemisphere locations and from `32701` to `32760` for southern hemisphere locations. # # To show how the system works, let's create a function, `lonlat2UTM` to calculate the EPSG code associated with any point on the planet. @@ -466,7 +441,7 @@ def lonlat2UTM(lon, lat): lonlat2UTM(174.7, -36.9) -# Here is another example for London (where we "unpack" the coordinates of the 1^st^ geometry in `lnd_layer` into the `lonlat2UTM` function arguments). +# Here is another example for London (where we 'unpack' the coordinates of the 1^st^ geometry in `lnd_layer` into the `lonlat2UTM` function arguments). # In[ ]: @@ -481,15 +456,17 @@ def lonlat2UTM(lon, lat): # In cases where an appropriate CRS is not immediately clear, the choice of CRS should depend on the properties that are most important to preserve in the subsequent maps and analysis. # All CRSs are either equal-area, equidistant, conformal (with shapes remaining unchanged), or some combination of compromises of those (@sec-projected-coordinate-reference-systems). # Custom CRSs with local parameters can be created for a region of interest and multiple CRSs can be used in projects when no single CRS suits all tasks. -# "Geodesic calculations" can provide a fall-back if no CRSs are appropriate (see ). +# 'Geodesic calculations' can provide a fall-back if no CRSs are appropriate[^proj_geodesic]. # Regardless of the projected CRS used, the results may not be accurate for geometries covering hundreds of kilometers. 
 #
+# [^proj_geodesic]: [https://proj.org/geodesic.html](https://proj.org/geodesic.html)
+#
 # When deciding on a custom CRS, we recommend the following:
 #
-# - A Lambert azimuthal equal-area ([LAEA](https://en.wikipedia.org/wiki/Lambert_azimuthal_equal-area_projection)) projection for a custom local projection (set latitude and longitude of origin to the center of the study area), which is an equal-area projection at all locations but distorts shapes beyond thousands of kilometers
-# - Azimuthal equidistant ([AEQD](https://en.wikipedia.org/wiki/Azimuthal_equidistant_projection)) projections for a specifically accurate straight-line distance between a point and the center point of the local projection
-# - Lambert conformal conic ([LCC](https://en.wikipedia.org/wiki/Lambert_conformal_conic_projection)) projections for regions covering thousands of kilometers, with the cone set to keep distance and area properties reasonable between the secant lines
-# - Stereographic ([STERE](https://en.wikipedia.org/wiki/Stereographic_projection)) projections for polar regions, but taking care not to rely on area and distance calculations thousands of kilometers from the center
+# - A Lambert azimuthal equal-area (LAEA) projection for a custom local projection (set latitude and longitude of origin to the center of the study area), which is an equal-area projection at all locations but distorts shapes beyond thousands of kilometers
+# - Azimuthal equidistant (AEQD) projections for a specifically accurate straight-line distance between a point and the center point of the local projection
+# - Lambert conformal conic (LCC) projections for regions covering thousands of kilometers, with the cone set to keep distance and area properties reasonable between the secant lines
+# - Stereographic (STERE) projections for polar regions, but taking care not to rely on area and distance calculations thousands of kilometers from the center
 #
 # One possible approach to automatically select a projected CRS specific to a local dataset is to create an azimuthal equidistant (AEQD) projection for the center-point of the study area.
 # This involves creating a custom CRS (with no EPSG code) with units of meters based on the center point of a dataset.

@@ -506,7 +483,7 @@ def lonlat2UTM(lon, lat):

 # Reprojecting vectors thus consists of transforming the coordinates of these points, which form the vertices of lines and polygons.
 #
 # @sec-geometry-operations-on-projected-and-unprojected-data contains an example in which a `GeoDataFrame` had to be transformed into an equivalent object, with a different CRS, to calculate the distance between two objects.
-# Reprojection of vector layers is done using the [.to_crs](https://geopandas.org/en/stable/docs/reference/api/geopandas.GeoDataFrame.to_crs.html) method.
+# Reprojection of vector layers is done using the `.to_crs` method.

 # In[ ]:

@@ -523,27 +500,10 @@ def lonlat2UTM(lon, lat):

 # It may come as a surprise that `lnd_layer` and `lnd_layer2` are just over 2 $km$ apart!
-# The difference in location between the two points is not due to imperfections in the transforming operation (which is in fact very accurate) but the low precision of the manually-created coordinates that created `lnd_layer` and `lnd_layer_proj`.
-#
-#
-#
-#
-#
-#
-#
-#
-#
-#
-#
-#
-#
-#
-#
-#
-#
+# The difference in location between the two points is not due to imperfections in the transforming operation (which is in fact very accurate) but to the low precision of the manually specified coordinates when creating `lnd_layer` and `lnd_layer_proj`.
 #
 # Reprojecting to a different CRS is also demonstrated below using `cycle_hire_osm`, a point layer that represents 'docking stations' where you can hire bicycles in London.
-# The contents of the CRS object associated with a given geometry column is changed when the object's CRS is transformed.
+# The contents of the CRS object associated with a given geometry column are changed when the object's CRS is transformed.
 # In the code chunk below, we create a new version of `cycle_hire_osm` with a projected CRS.

 # In[ ]:


@@ -553,8 +513,8 @@
 cycle_hire_osm_projected.crs


-# The resulting object has a new CRS with an EPSG code `27700`.
-# But how to find out more details about this EPSG code, or any code?
+# The resulting object has a new CRS, identified by the EPSG code `27700`.
+# How can we find out more details about this EPSG code, or any other code?
 # One option is to search for it online.
 # Another option is to create a standalone CRS object within the Python environment (using `pyproj.CRS.from_string` or `pyproj.CRS.from_epsg`, see @sec-coordinate-reference-systems), and then query its properties, such as `.name` and `.to_wkt()`.

 # In[ ]:


@@ -565,10 +525,7 @@
 crs_lnd_new.name, crs_lnd_new.to_wkt()


-#
-#
-#
-# The result shows that the EPSG code `27700` represents the British National Grid, a result that could have been found by searching online for "[EPSG 27700](https://www.google.com/search?q=CRS+27700)".
+# The result shows that the EPSG code `27700` represents the British National Grid, a result that could have been found by searching online for 'EPSG 27700'.
 #
 # ## Reprojecting raster geometries {#sec-reprojecting-raster-geometries}
 #
 # The projection concepts described in the previous section apply equally to rasters.
 # However, there are important differences in reprojection of vectors and rasters: transforming a vector object involves changing the coordinates of every vertex, but this does not apply to raster data.
 # Rasters are composed of rectangular cells of the same size (expressed by map units, such as degrees or meters), so it is usually impracticable to transform coordinates of pixels separately.
 # Raster reprojection involves creating a new raster object in the destination CRS, often with a different number of columns and rows than the original.
-# The attributes must subsequently be re-estimated, allowing the new pixels to be "filled" with appropriate values.
+# The attributes must subsequently be re-estimated, allowing the new pixels to be 'filled' with appropriate values.
 # In other words, raster reprojection can be thought of as two separate spatial operations: a vector reprojection of the raster extent to another CRS (@sec-reprojecting-vector-geometries), and computation of new pixel values through resampling (@sec-raster-resampling).
 # Due to this additional complexity, in most cases when both raster and vector data are used, it is better to avoid reprojecting rasters and reproject vectors instead.
 #
 # ::: callout-note
 # Reprojection of the regular rasters is also known as warping.
-# Additionally, there is a second similar operation called "transformation".
+# Additionally, there is a second similar operation called 'transformation'.
# Instead of resampling all of the values, it leaves all values intact but recomputes new coordinates for every raster cell, changing the grid geometry. # For example, it could convert the input raster (a regular grid) into a curvilinear grid. # The **rasterio**, like common raster file formats (such as GeoTIFF), does not support curvilinear grids. -# The **xarray** package, for instance, can be used to [work with](https://docs.xarray.dev/en/stable/examples/multidimensional-coords.html) curvilinear grids. +# The **xarray** package, for instance, can be used to work with curvilinear grids. # ::: -# -# # # The raster reprojection process is done using two functions from the `rasterio.warp` sub-package: # -# 1. `rasterio.warp.calculate_default_transform`: [`calculate_default_transform`](https://rasterio.readthedocs.io/en/latest/api/rasterio.warp.html#rasterio.warp.calculate_default_transform), is used to calculate the new transformation matrix in the destination CRS, according to the source raster dimensions and bounds. +# 1. `rasterio.warp.calculate_default_transform`, used to calculate the new transformation matrix in the destination CRS, according to the source raster dimensions and bounds. # Alternatively, the destination transformation matrix can be obtained from an existing raster; this is common practice when we need to align one raster with another, for instance to be able to combine them in raster algebra operations (@sec-raster-local-operations) (see below) -# 2. `rasterio.warp.reproject`: introduced in @sec-raster-resampling, calculates cell values in the destination grid, using the user-selected resampling method (such as nearest neighbor, or bilinear) +# 2. `rasterio.warp.reproject`, introduced in @sec-raster-resampling, calculates cell values in the destination grid, using the user-selected resampling method (such as nearest neighbor, or bilinear) # # Let's take a look at two examples of raster transformation: using categorical and continuous data. # Land cover data are usually represented by categorical maps. @@ -634,6 +589,13 @@ def lonlat2UTM(lon, lat): src_nlcd.height, *src_nlcd.bounds ) + + +# Here is the result. + +# In[ ]: + + dst_transform @@ -649,11 +611,8 @@ def lonlat2UTM(lon, lat): dst_height -# -# -# # ::: callout-note -# The `*` syntax in Python is known as variable-length ["*positional* arguments"](https://docs.python.org/3/glossary.html#term-argument). +# The `*` syntax in Python is known as variable-length '*positional* arguments'. # It is used to pass a `list` or `tuple` (or other iterables object) to positional arguments of a function. # # For example, in the last code block, `*`, in `*src_nlcd.bounds`, is used to unpack `src_nlcd.bounds` (an iterable of length 4) to four separate arguments (`left`, `bottom`, `right`, and `top`), which `rasterio.warp.calculate_default_transform` requires in that order. @@ -680,18 +639,16 @@ def lonlat2UTM(lon, lat): # src_nlcd.bounds[3] # ) # ``` -# "*Keyword* arguments" is a related technique; see note in @sec-raster-agg-disagg. +# '*Keyword* arguments' is a related technique; see note in @sec-raster-agg-disagg. # ::: # # -# Recall from @sec-raster-resampling that resampling using `rasterio.warp.reproject` can take place directly into a "destination" raster file connection. +# Recall from @sec-raster-resampling that resampling using `rasterio.warp.reproject` can take place directly into a 'destination' raster file connection. 
 # Therefore, our next step is to create the metadata file used for writing the reprojected raster to file.
 # For convenience, we are taking the metadata of the source raster (`src_nlcd.meta`), making a copy (`dst_kwargs`), and then updating those specific properties that need to be changed.
-# Note that the reprojection process typically creates "No Data" pixels, even when there were none in the input raster, since the raster orientation changes and the edges need to be "filled" to get back a rectangular extent.
-# For example, a reprojected raster may appear as a "tilted" rectangle, inside a larger straight rectangular extent, whereas the margins around the tilted rectangle are inevitably filled with "No Data" (e.g., the white stripes surrounding the edges in @fig-raster-reproject-nlcd (b) are "No Data" pixels created as a result of reprojection).
-#
-#
-# We need to specify a "No Data" value of our choice, if there is no existing definition, or keep the existing source raster "No Data" setting, such as `255` in this case.
+# Note that the reprojection process typically creates 'No Data' pixels, even when there were none in the input raster, since the raster orientation changes and the edges need to be 'filled' to get back a rectangular extent.
+# For example, a reprojected raster may appear as a 'tilted' rectangle, inside a larger straight rectangular extent, whereas the margins around the tilted rectangle are inevitably filled with 'No Data' (e.g., the white stripes surrounding the edges in @fig-raster-reproject-nlcd (b) are 'No Data' pixels created as a result of reprojection).
+# We need to specify a 'No Data' value of our choice, if there is no existing definition, or keep the existing source raster 'No Data' setting, such as `255` in this case.

 # In[ ]:


@@ -708,7 +665,7 @@

 # Now, we are ready to create the reprojected raster.
 # Here, reprojection takes place between two file connections, meaning that the raster value arrays are not being read into memory at once.
-# It is also possible to reproject into an in-memory `ndarray` object, see the [documentation](https://rasterio.readthedocs.io/en/latest/api/rasterio.warp.html#rasterio.warp.reproject).
+# (It is also possible to reproject into an in-memory `ndarray` object.)
 #
 # To write the reprojected raster, we first create a destination file connection `dst_nlcd`, pointing at the output file path of our choice (`'output/nlcd_4326.tif'`), using the updated metadata object created earlier (`dst_kwargs`):

 # In[ ]:

@@ -719,11 +676,11 @@

 # Then, we use the `rasterio.warp.reproject` function to calculate and write the reprojection result into the `dst_nlcd` file connection.
-#

 # In[ ]:


+#| output: false
 rasterio.warp.reproject(
     source=rasterio.band(src_nlcd, 1),
     destination=rasterio.band(dst_nlcd, 1),
@@ -735,9 +692,9 @@
 )


-# Note---like in the example in @sec-raster-resampling---that the `source` and `destination` accept a "band" object, created using `rasterio.band`.
+# Note---like in the example in @sec-raster-resampling---that the `source` and `destination` accept a 'band' object, created using `rasterio.band`.
 # In this case, there is just one band.
-# If there were more bands, we would have to repeat the procedure for each band, using `i` instead of `1` inside a [loop](https://rasterio.readthedocs.io/en/latest/topics/reproject.html#reprojecting-a-geotiff-dataset).
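+# For a raster with more than one band, the same call could be wrapped in a loop over the band indices; a minimal sketch (not executed here; `dst_crs` stands for the destination CRS object used in this example):
+
+# In[ ]:
+
+
+#| eval: false
+for i in range(1, src_nlcd.count + 1):  # band indices in rasterio start at 1
+    rasterio.warp.reproject(
+        source=rasterio.band(src_nlcd, i),
+        destination=rasterio.band(dst_nlcd, i),
+        src_transform=src_nlcd.transform,
+        src_crs=src_nlcd.crs,
+        dst_transform=dst_transform,
+        dst_crs=dst_crs,
+        resampling=rasterio.enums.Resampling.nearest
+    )
+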
+# In the loop sketched above, `i` replaces the fixed band index `1`, repeating the procedure for each band.
 # Finally, we close the file connection so that the data are actually written.

 # In[ ]:

@@ -761,7 +718,7 @@
 src_nlcd_4326.meta


-# Examining the unique raster values tells us that the new raster has the same categories, plus the value `255` representing "No Data":
+# Examining the unique raster values tells us that the new raster has the same categories, plus the value `255` representing 'No Data':

 # In[ ]:

@@ -793,11 +750,11 @@

 # In the above example, we automatically calculated an optimal (i.e., most information preserving) destination grid using `rasterio.warp.calculate_default_transform`.
 # This is appropriate when there are no specific requirements for the destination raster spatial properties.
 # Namely, we are not required to obtain a specific origin and resolution, but just wish to preserve the raster values as much as possible.
-# To do that, `rasterio.warp.calculate_default_transform` "tries" to keep the extent and resolution of the destination raster as similar as possible to the source.
-# In other situations, however, we need to reproject a raster into a specific "template", so that it corresponds, for instance, with other rasters we use in the analysis.
-# In the following code examples, we reproject the `nlcd.tif` raster, again, but this time using the `nlcd_4326.tif` reprojection result as the "template" to demonstrate this alternative workflow.
+# To do that, `rasterio.warp.calculate_default_transform` 'tries' to keep the extent and resolution of the destination raster as similar as possible to the source.
+# In other situations, however, we need to reproject a raster into a specific 'template', so that it corresponds, for instance, with other rasters we use in the analysis.
+# In the following code examples, we reproject the `nlcd.tif` raster, again, but this time using the `nlcd_4326.tif` reprojection result as the 'template' to demonstrate this alternative workflow.
 #
-# First, we create a connection to our "template" raster to read its metadata.
+# First, we create a connection to our 'template' raster to read its metadata.

 # In[ ]:

@@ -806,9 +763,7 @@
 template.meta


-# Then, we create a write-mode connection to our destination raster, using this exact metadata, meaning that as the resampling result is going to have identical properties as the "template".
-#
-#
+# Then, we create a write-mode connection to our destination raster, using this exact metadata, meaning that the resampling result is going to have properties identical to those of the 'template'.

 # In[ ]:

@@ -844,10 +799,10 @@
 np.all(d)


-# The difference is that in the first example we calculated the template automatically, using `rasterio.warp.calculate_default_transform`, while in the second example we used an existing raster as the "template".
+# The difference is that in the first example we calculated the template automatically, using `rasterio.warp.calculate_default_transform`, while in the second example we used an existing raster as the 'template'.
 #
-# Importantly, when the template raster has much more "coarse" resolution than the source raster, the `rasterio.enums.Resampling.average` (for continuous rasters) or `rasterio.enums.Resampling.mode` (for categorical rasters) resampling methods should be used, instead of `rasterio.enums.Resampling.nearest`.
-# Otherwise, much of the data will be lost, as the "nearest" method can capture one pixel value only for each destination raster pixel. +# Importantly, when the template raster has much more 'coarse' resolution than the source raster, the `rasterio.enums.Resampling.average` (for continuous rasters) or `rasterio.enums.Resampling.mode` (for categorical rasters) resampling methods should be used, instead of `rasterio.enums.Resampling.nearest`. +# Otherwise, much of the data will be lost, as the 'nearest' method can capture one-pixel value only for each destination raster pixel. # # Reprojecting continuous rasters (with numeric or, in this case, integer values) follows an almost identical procedure. # This is demonstrated below with `srtm.tif` from the Shuttle Radar Topography Mission (SRTM), which represents height in meters above sea level (elevation) with the WGS84 CRS. @@ -855,8 +810,8 @@ def lonlat2UTM(lon, lat): # We will reproject this dataset into a projected CRS, but not with the nearest neighbor method. # Instead, we will use the bilinear method which computes the output cell value based on the four nearest cells in the original raster. # The values in the projected dataset are the distance-weighted average of the values from these four cells: the closer the input cell is to the center of the output cell, the greater its weight. -# The following code section create a text string representing WGS 84 / UTM zone 12N, and reproject the raster into this CRS, using the bilinear method. -# The code is practically the same, except for changing the source and destination file names, and replacing `rasterio.enums.Resampling.nearest` with `rasterio.enums.Resampling.bilinear`. +# The following code section creates a text string representing WGS 84 / UTM zone 12N, and reprojects the raster into this CRS, using the bilinear method. +# The code is practically the same as in the first example in this section, except for changing the source and destination file names, and replacing `rasterio.enums.Resampling.nearest` with `rasterio.enums.Resampling.bilinear`. # In[ ]: @@ -911,22 +866,20 @@ def lonlat2UTM(lon, lat): # @sec-which-crs-to-use mentioned reasons for using custom CRSs, and provided several possible approaches. # Here, we show how to apply these ideas in Python. # -# One is to take an existing WKT definition of a CRS, modify some of its elements, and then use the new definition for reprojecting, using the reprojection methods shown above for vector layers (@sec-reprojecting-vector-geometries) and rasters (@sec-reprojecting-raster-geometries). -# For example, let's transforms the `zion.gpkg` vector layer to a custom azimuthal equidistant (AEQD) CRS. +# One approach is to take an existing WKT definition of a CRS, modify some of its elements, and then use the new definition for reprojecting, using the reprojection methods shown above for vector layers (@sec-reprojecting-vector-geometries) and rasters (@sec-reprojecting-raster-geometries). +# For example, let's transform the `zion.gpkg` vector layer to a custom azimuthal equidistant (AEQD) CRS. # Using a custom AEQD CRS requires knowing the coordinates of the center point of a dataset in degrees (geographic CRS). 
# In our case, this information can be extracted by calculating the centroid of the `zion` layer transformed into WGS84: # In[ ]: -lon, lat = zion.to_crs(4326).unary_union.centroid.coords[0] +lon, lat = zion.to_crs(4326).union_all().centroid.coords[0] lon, lat # Next, we can use the obtained lon/lat coordinates in `coords` to update the WKT definition of the azimuthal equidistant (AEQD) CRS seen below. # Notice that we modified just two values below---`"Central_Meridian"` to the longitude and `"Latitude_Of_Origin"` to the latitude of our centroid. -# -# # In[ ]: @@ -945,7 +898,7 @@ def lonlat2UTM(lon, lat): # ::: callout-note -# The above expression uses the so-called ["f-strings"](https://docs.python.org/3/tutorial/inputoutput.html#tut-f-strings) syntax, which is one of several Python techniques to embed values inside a string (as alternatives to concatenating with `+`). +# The above expression uses the so-called 'f-strings' syntax, which is one of several Python techniques to embed values inside a string (as alternatives to concatenating with `+`). # For example, given: # ``` # x = 5 @@ -969,15 +922,17 @@ def lonlat2UTM(lon, lat): zion_aeqd = zion.to_crs(my_wkt) -# Custom projections can also be made interactively, for example, using the [Projection Wizard](https://projectionwizard.org/#) web application [@savric_projection_2016]. +# Custom projections can also be made interactively, for example, using the Projection Wizard[^projection_wizard] web application [@savric_projection_2016]. # This website allows you to select a spatial extent of your data and a distortion property, and returns a list of possible projections. # The list also contains WKT definitions of the projections that you can copy and use for reprojections. -# See Open Geospatial Consortium ([2019](https://r.geocompx.org/references.html#ref-opengeospatialconsortium_wellknown_2019)) for details on creating custom CRS definitions with WKT strings. +# See Open Geospatial Consortium [@opengeospatialconsortium_wellknown_2019] for details on creating custom CRS definitions with WKT strings. +# +# [^projection_wizard]: [https://projectionwizard.org/#](https://projectionwizard.org/#) # # PROJ strings can also be used to create custom projections, accepting the limitations inherent to projections, especially of geometries covering large geographic areas, as mentioned in @sec-coordinate-reference-systems. -# Many projections have been developed and can be set with the `+proj=` element of PROJ strings, with dozens of projects described in detail on the [PROJ website](https://proj.org/operations/projections/index.html) alone. +# Many projections have been developed and can be set with the `+proj=` element of PROJ strings, with dozens of projections described in detail on the PROJ website alone. # -# When mapping the world while preserving area relationships the Mollweide projection, illustrated in @fig-mollweide, is a popular and often sensible choice [@jenny_guide_2017]. +# When mapping the world while preserving area relationships, the Mollweide projection, illustrated in @fig-mollweide, is a popular and often sensible choice [@jenny_guide_2017]. # To use this projection, we need to specify it using the proj-string element, `'+proj=moll'`, in the `.to_crs` method: # In[ ]: @@ -989,7 +944,7 @@ def lonlat2UTM(lon, lat): # It is often desirable to minimize distortion for all spatial properties (area, direction, distance) when mapping the world. 
-# One of the most popular projections to achieve this is [Winkel tripel](http://www.winkel.org/other/Winkel%20Tripel%20Projections.htm) (`'+proj=wintri'`), illustrated in @fig-wintri.
+# One of the most popular projections to achieve this is Winkel tripel (`'+proj=wintri'`), illustrated in @fig-wintri.

 # In[ ]:


@@ -999,7 +954,7 @@
 world.to_crs('+proj=wintri').plot(color='none', edgecolor='black');


-# Moreover, proj-string parameters can be modified in most CRS definitions, for example the center of the projection can be adjusted using the `+lon_0` and `+lat_0` parameters.
+# Moreover, proj-string parameters can be modified in most CRS definitions; for example, the center of the projection can be adjusted using the `+lon_0` and `+lat_0` parameters.
 # The below code transforms the coordinates to the Lambert azimuthal equal-area projection centered on the longitude and latitude of New York City (@fig-azimuthal-equal-area).

 # In[ ]:

@@ -1011,8 +966,9 @@
 .plot(color='none', edgecolor='black');


-# More information on CRS modifications can be found in the [Using PROJ](https://proj.org/usage/index.html) documentation.
+# More information on CRS modifications can be found in the Using PROJ documentation[^using_proj].
+#
+# [^using_proj]: [https://proj.org/usage/index.html](https://proj.org/usage/index.html)
 #
-# ## Exercises
+#
 #
-# ## References
diff --git a/code/chapters/07-read-write.py b/code/chapters/07-read-write.py
index da655085..c3a25f62 100644
--- a/code/chapters/07-read-write.py
+++ b/code/chapters/07-read-write.py
@@ -9,12 +9,23 @@


 #| echo: false
-import matplotlib.pyplot as plt
-import pandas as pd
-pd.options.display.max_rows = 6
-pd.options.display.max_columns = 6
-pd.options.display.max_colwidth = 35
-plt.rcParams['figure.figsize'] = (5, 5)
+#| include: false
+#| error: true
+import map_to_png


+# In[ ]:


+#| echo: false
+import book_options


+# In[ ]:


+#| echo: false
+import book_options_pdf


 # This chapter requires importing the following packages:

@@ -25,8 +36,10 @@
 import urllib.request
 import zipfile
 import numpy as np
+import matplotlib.pyplot as plt
+import pandas as pd
 import shapely
-import fiona
+import pyogrio
 import geopandas as gpd
 import rasterio
 import rasterio.plot
@@ -51,48 +64,65 @@
 # Taken together, these processes of input/output can be referred to as data I/O.
 #
 # Geographic data I/O is often done with few lines of code at the beginning and end of projects.
-# It is often overlooked as a simple one step process.
+# It is often overlooked as a simple one-step process.
 # However, mistakes made at the outset of projects (e.g., using an out-of-date or in some way faulty dataset) can lead to large problems later down the line, so it is worth putting considerable time into identifying which datasets are available, where they can be found and how to retrieve them.
-# These topics are covered in @sec-retrieving-open-data, which describes various geoportals, which collectively contain many terabytes of data, and how to use them.
-# To further ease data access, a number of packages for downloading geographic data have been developed, as described in @sec-geographic-data-packages.
+# These topics are covered in @sec-retrieving-open-data, which describes several geoportals that collectively contain many terabytes of data, and how to use them.
+# To further ease data access, a number of packages for downloading geographic data have been developed, as demonstrated in @sec-geographic-data-packages.
 #
 # There are many geographic file formats, each of which has pros and cons, described in @sec-file-formats.
 # The process of reading and writing files efficiently is covered in Sections @sec-data-input and @sec-data-output, respectively.
 #
 # ## Retrieving open data {#sec-retrieving-open-data}
 #
-# A vast and ever-increasing amount of geographic data is available on the internet, much of which is free to access and use (with appropriate credit given to its providers).[^07-read-write-plot-1]
+# A vast and ever-increasing amount of geographic data is available on the internet, much of which is free to access and use (with appropriate credit given to its providers)[^07-read-write-plot-1].
 # In some ways there is now too much data, in the sense that there are often multiple places to access the same dataset.
 # Some datasets are of poor quality.
 # In this context, it is vital to know where to look, so the first section covers some of the most important sources.
-# Various 'geoportals' (web services providing geospatial datasets such as [Data.gov](https://catalog.data.gov/dataset?metadata_type=geospatial)) are a good place to start, providing a wide range of data but often only for specific locations (as illustrated in the updated [Wikipedia page](https://en.wikipedia.org/wiki/Geoportal) on the topic).
+# Various 'geoportals' (web services providing geospatial datasets, such as Data.gov[^data_gov]) are a good place to start, providing a wide range of data but often only for specific locations (as illustrated in the updated Wikipedia page[^wiki_geoportal] on the topic).
 #
 # [^07-read-write-plot-1]: For example, visit <https://freegisdata.rtwilson.com/> for a vast list of websites with freely available geographic datasets.
+# [^data_gov]: <https://catalog.data.gov/dataset?metadata_type=geospatial>
+# [^wiki_geoportal]: <https://en.wikipedia.org/wiki/Geoportal>
 #
 # Some global geoportals overcome this issue.
-# The [GEOSS portal](http://www.geoportal.org/) and the [Copernicus Open Access Hub](https://scihub.copernicus.eu/), for example, contain many raster datasets with global coverage.
-# A wealth of vector datasets can be accessed from the [SEDAC](http://sedac.ciesin.columbia.edu/) portal run by the National Aeronautics and Space Administration (NASA) and the European Union's [INSPIRE geoportal](http://inspire-geoportal.ec.europa.eu/), with global and regional coverage.
+# The GEOSS portal[^geoss_portal] and the Copernicus Data Space Ecosystem[^copernicus], for example, contain many raster datasets with global coverage.
+# A wealth of vector datasets can be accessed from the SEDAC[^sedac] portal run by the National Aeronautics and Space Administration (NASA) and the European Union's INSPIRE geoportal[^inspire_geoportal], with global and regional coverage.
+#
+# [^geoss_portal]: <http://www.geoportal.org/>
+# [^copernicus]: <https://dataspace.copernicus.eu/>
+# [^sedac]: <http://sedac.ciesin.columbia.edu/>
+# [^inspire_geoportal]: <http://inspire-geoportal.ec.europa.eu/>
 #
-# Most geoportals provide a graphical interface allowing datasets to be queried based on characteristics such as spatial and temporal extent, the United States Geological Survey's [EarthExplorer](https://earthexplorer.usgs.gov/) being a prime example.
+# Most geoportals provide a graphical interface allowing datasets to be queried based on characteristics such as spatial and temporal extent, the United States Geological Survey's EarthExplorer[^earthexplorer] and NASA's EarthData Search[^earthdata_search] being prime examples.
 # Exploring datasets interactively on a browser is an effective way of understanding available layers.
 # From reproducibility and efficiency perspectives, downloading data is, however, best done with code.
-# Downloads can be initiated from the command line using a variety of techniques, primarily via URLs and APIs (see the [Sentinel API](https://scihub.copernicus.eu/twiki/do/view/SciHubWebPortal/APIHubDescription), for example).
-# Files hosted on static URLs can be downloaded with the following method, as illustrated in the code chunk below which accesses the [Natural Earth Data](https://www.naturalearthdata.com/) website to download the world airports layer zip file and to extract the contained ESRI Shapefile.
+# Downloads can be initiated from the command line using a variety of techniques, primarily via URLs and APIs (see the Sentinel API[^sentinel_api], for example).
+#
+# [^earthexplorer]: <https://earthexplorer.usgs.gov/>
+# [^earthdata_search]: <https://search.earthdata.nasa.gov/>
+# [^sentinel_api]: <https://scihub.copernicus.eu/twiki/do/view/SciHubWebPortal/APIHubDescription>
+#
+# Files hosted on static URLs can be downloaded with the following method, as illustrated in the code chunk below which accesses the Natural Earth Data[^natural_earth_data] website to download the world airports layer zip file and to extract the contained ESRI Shapefile.
 # Note that the download code is complicated by the fact that the server checks the `User-agent` header of the request, basically to make sure that the download takes place through a browser.
 # To overcome this, we add a header corresponding to a request coming from a browser (such as Firefox) in our code.
-#
-#
+#
+# [^natural_earth_data]: <https://www.naturalearthdata.com/>

 # In[ ]:


 #| eval: false
 # Set URL+filename
-url = 'https://www.naturalearthdata.com/http//www.naturalearthdata.com/download/10m/cultural/ne_10m_airports.zip'
+url = 'https://www.naturalearthdata.com/http//www.naturalearthdata.com/'
+url += 'download/10m/cultural/ne_10m_airports.zip'
 filename = 'output/ne_10m_airports.zip'
 # Download
 opener = urllib.request.build_opener()
-opener.addheaders = [('User-agent', 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/116.0')]
+opener.addheaders = [(
+    'User-agent',
+    'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) ' +
+    'Gecko/20100101 Firefox/116.0'
+)]
 urllib.request.install_opener(opener)
 urllib.request.urlretrieve(url, filename)
 # Extract
@@ -121,23 +151,11 @@

 # ## Geographic data packages {#sec-geographic-data-packages}
 #
-# Many Python packages have been developed for accessing geographic data, two of which are presented in @tbl-data-packages and demonstrated below.
+# Several Python packages have been developed for accessing geographic data, two of which are demonstrated below.
 # These provide interfaces to one or more spatial libraries or geoportals and aim to make data access even quicker from the command line.
-#
-#
-#
-# | Package | Description |
-# |-------------|---------------------------------------------------------------------------------------------------|
-# | **cartopy** | Download layers from [Natural Earth Data](https://www.naturalearthdata.com/downloads/) |
-# | **osmnx** | Access to [OpenStreetMap](https://www.openstreetmap.org/) data and conversion to spatial networks |
-#
-# : Selected Python packages for geographic data retrieval {#tbl-data-packages}
-#
-# Each data package has its own syntax for accessing data.
-# This diversity is demonstrated in the subsequent code chunks, which show how to get data using the packages from @tbl-data-packages.
 #
 # Administrative borders are often useful in spatial analysis.
-# These can be accessed with the [`cartopy.io.shapereader.natural_earth`](https://scitools.org.uk/cartopy/docs/latest/reference/generated/cartopy.io.shapereader.natural_earth.html) function from the **cartopy** package [@cartopy].
+# These can be accessed with the `cartopy.io.shapereader.natural_earth` function from the **cartopy** package [@cartopy].
 # For example, the following code loads the `'admin_2_counties'` dataset of US counties into a `GeoDataFrame`.

 # In[ ]:

@@ -162,25 +180,23 @@

 counties.plot();


-# Note that @fig-ne-counties x-axis spans the entire range of longitues, between `-180` and `180`, since the Aleutian Islands county (which is small and difficult to see on the map) crosses the [International Date Line](https://en.wikipedia.org/wiki/International_Date_Line).
-#
-#
-#
+# Note that the @fig-ne-counties x-axis spans the entire range of longitudes, between `-180` and `180`, since the Aleutian Islands county (which is small and difficult to see on the map) crosses the International Date Line.
 #
-# Other layers can be accessed the same way.
-# You need to specify the `resolution`, `category`, and `name` of the requested dataset in [Natural Earth Data](https://www.naturalearthdata.com/downloads/), then run the `cartopy.io.shapereader.natural_earth`, which downloads the file(s) and returns the path, and read the file into the Python environment, e.g., using `gpd.read_file`.
-# This is an alternative approach to "directly" downloading files as shown earlier (@sec-retrieving-open-data).
+# Other layers from Natural Earth can be accessed the same way.
+# You need to specify the `resolution`, `category`, and `name` of the requested dataset in Natural Earth Data, then run `cartopy.io.shapereader.natural_earth`, which downloads the file(s) and returns the path, and finally read the file into the Python environment, e.g., using `gpd.read_file`.
+# This is an alternative approach to 'directly' downloading files as shown earlier (@sec-retrieving-open-data).
 #
 # The second example uses the **osmnx** package [@osmnx] to find parks from the OpenStreetMap (OSM) database.
-# As illustrated in the code-chunk below, OpenStreetMap data can be obtained using the `ox.features.features_from_place` function.
+# As illustrated in the code chunk below, OpenStreetMap data can be obtained using the `ox.features.features_from_place` function.
 # The first argument is a string which is geocoded to a polygon (the `ox.features.features_from_bbox` and `ox.features.features_from_polygon` can also be used to query a custom area of interest).
-# The second argument specifies the OSM [tag(s)](https://wiki.openstreetmap.org/wiki/Map_features), selecting which OSM elements we're interested in (parks, in this case), represented by key-value pairs.
-#
-#
+# The second argument specifies the OSM tag(s)[^osm_tags], selecting which OSM elements we're interested in (parks, in this case), represented by key-value pairs.
+#
+# [^osm_tags]: <https://wiki.openstreetmap.org/wiki/Map_features>

 # In[ ]:


+#| warning: false
 parks = ox.features.features_from_place(
     query='leeds uk',
     tags={'leisure': 'park'}
@@ -188,7 +204,9 @@

 # The result is a `GeoDataFrame` with the parks in Leeds.
-# Now, we can plots the geometries with the `name` property in the tooltips using `explore` (@fig-ox-features).
+# Now, we can plot the geometries with the `name` property in the tooltips using `explore` (@fig-ox-features).
+#
+# ::: {.content-visible when-format="html"}

 # In[ ]:

@@ -198,76 +216,77 @@

 parks[['name', 'geometry']].explore()

-# It should be noted that the **osmnx** package downloads OSM data from the [Overpass API](https://wiki.openstreetmap.org/wiki/Overpass_API), which is rate limited and therefore unsuitable for queries covering very large areas.
-# To overcome this limitation, you can download OSM data extracts, such as in Shapefile format from [Geofabrik](https://download.geofabrik.de/), and then load them from the file into the Python environment.
-#
-#
+# :::
+# ::: {.content-visible when-format="pdf"}

 # In[ ]:


+#| eval: false
+parks[['name', 'geometry']].explore()


+# In[ ]:


+#| echo: false
+#| output: false
+#| error: true
+map_to_png.map_to_png(parks[['name', 'geometry']].explore(), 'fig-ox-features')


+# ![Parks in Leeds, based on OpenStreetMap data, downloaded using package **osmnx**](images/fig-ox-features.png){#fig-ox-features}
+# :::
+#
+# It should be noted that the **osmnx** package downloads OSM data from the Overpass API[^overpass_api], which is rate limited and therefore unsuitable for queries covering very large areas.
+# To overcome this limitation, you can download OSM data extracts, such as in Shapefile format from Geofabrik[^geofabrik], and then load them from the file into the Python environment.
 #
+# [^overpass_api]: <https://wiki.openstreetmap.org/wiki/Overpass_API>
+# [^geofabrik]: <https://download.geofabrik.de/>
+#
-# OpenStreetMap is a vast global database of crowd-sourced data, is growing daily, and has a wider ecosystem of tools enabling easy access to the data, from the [Overpass turbo](https://overpass-turbo.eu/) web service for rapid development and testing of OSM queries to [osm2pgsql](https://osm2pgsql.org/) for importing the data into a PostGIS database.
+# OpenStreetMap is a vast global database of crowd-sourced data, is growing daily, and has a wider ecosystem of tools enabling easy access to the data, from the Overpass turbo[^overpass_turbo] web service for rapid development and testing of OSM queries to `osm2pgsql` for importing the data into a PostGIS database.
 # Although the quality of datasets derived from OSM varies, the data source and wider OSM ecosystems have many advantages: they provide datasets that are available globally, free of charge, and constantly improving thanks to an army of volunteers.
-# Using OSM encourages 'citizen science' and contributions back to the digital commons (you can start editing data representing a part of the world you know well at [www.openstreetmap.org](https://www.openstreetmap.org/)).
-#
-#
-#
-#
-#
-#
-#
-#
-#
-#
-#
+# Using OSM encourages 'citizen science' and contributions back to the digital commons (you can start editing data representing a part of the world you know well at <https://www.openstreetmap.org/>).
+#
+# [^overpass_turbo]: <https://overpass-turbo.eu/>
 #
 # One way to obtain spatial information is to perform geocoding---transform a description of a location, usually an address, into a set of coordinates.
 # This is typically done by sending a query to an online service and getting the location as a result.
 # Many such services exist that differ in the used method of geocoding, usage limitations, costs, or API key requirements.
-# [Nominatim](https://nominatim.openstreetmap.org/ui/about.html) is a well-known free service, based on OpenStreetMap data, and there are many other free and commercial geocoding services.
+# Nominatim[^nominatim] is a well-known free service, based on OpenStreetMap data, and there are many other free and commercial geocoding services.
 #
-# **geopandas** provides the [`gpd.tools.geocode`](https://geopandas.org/en/stable/docs/reference/api/geopandas.tools.geocode.html), which can geocode addresses to a `GeoDataFrame`.
+# [^nominatim]: <https://nominatim.openstreetmap.org/ui/about.html>
+#
+# **geopandas** provides the `gpd.tools.geocode` function, which can geocode addresses to a `GeoDataFrame`.
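+# As the next sentence explains, the geocoding itself is delegated to the **geopy** package; which `provider` values are accepted can be listed directly (a minimal sketch):
+
+# In[ ]:
+
+
+import geopy.geocoders
+sorted(geopy.geocoders.SERVICE_TO_GEOCODER)[:5]  # a few of the available provider names
+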
-# 
-# 
-# 
-# The example below searches for [John Snow blue plaque](https://en.m.wikipedia.org/wiki/John_Snow_(public_house)) coordinates located on a building in the Soho district of London.
+# The example below searches for John Snow blue plaque[^john_snow_blue_plaque] coordinates located on a building in the Soho district of London.
 # The result is a `GeoDataFrame` with the address we passed to `gpd.tools.geocode`, and the detected point location.
-# 
-# 
+# 
+# [^john_snow_blue_plaque]: <https://en.m.wikipedia.org/wiki/John_Snow_(public_house)>

# In[ ]:


-result = gpd.tools.geocode('54 Frith St, London W1D 4SJ, UK')
+result = gpd.tools.geocode('54 Frith St, London W1D 4SJ, UK', timeout=10)
 result


-# Importantly, (1) we can pass a `list` of multiple addresses instead of just one, resulting in a `GeoDataFrame` with corresponding multiple rows, and (2) "No Results" responses are represented by `POINT EMPTY` geometries, as shown in the following example.
+# Importantly, (1) we can pass a `list` of multiple addresses instead of just one, resulting in a `GeoDataFrame` with corresponding multiple rows, and (2) 'No Results' responses are represented by `POINT EMPTY` geometries, as shown in the following example.

# In[ ]:


-result = gpd.tools.geocode([
-    '54 Frith St, London W1D 4SJ, UK', 
-    'abcdefghijklmnopqrstuvwxyz'
-])
+result = gpd.tools.geocode(
+    ['54 Frith St, London W1D 4SJ, UK', 'abcdefghijklmnopqrstuvwxyz'], 
+    timeout=10
+)
 result


-# 
-# 
-# 
-# 
-# 
-# 
-# 
-# 
-# 
-# 
-# 
-# 
-# 
 # The result is visualized in @fig-ox-geocode using the `.explore` function.
 # We are using the `marker_kwds` parameter of `.explore` to make the marker larger (see @sec-interactive-styling).
+# 
+# ::: {.content-visible when-format="html"}

# In[ ]:

@@ -277,28 +296,45 @@
 result.iloc[[0]].explore(color='red', marker_kwds={'radius':20})


-# 
-# 
-# 
-# 
+# :::
+# ::: {.content-visible when-format="pdf"}

# In[ ]:


+#| eval: false
+result.iloc[[0]].explore(color='red', marker_kwds={'radius':20})


+# In[ ]:


+#| echo: false
+#| output: false
+#| error: true
+map_to_png.map_to_png(result.iloc[[0]].explore(color='red', marker_kwds={'radius':20}), 'fig-ox-geocode')


+# ![Specific address in London, geocoded into a `GeoDataFrame`](images/fig-ox-geocode.png){#fig-ox-geocode}
+# :::
 # 
 # ## File formats {#sec-file-formats}
 # 
 # Geographic datasets are usually stored as files or in spatial databases.
-# File formats usually can either store vector or raster data, while spatial databases such as [PostGIS](https://postgis.net/) can store both.
-# The large variety of file formats may seem bewildering, but there has been much consolidation and standardization since the beginnings of GIS software in the 1960s when the first widely distributed program ([SYMAP](https://news.harvard.edu/gazette/story/2011/10/the-invention-of-gis/)) for spatial analysis was created at Harvard University [@coppock_history_1991].
+# File formats can usually store either vector or raster data, while spatial databases such as PostGIS can store both.
+# The large variety of file formats may seem bewildering, but there has been much consolidation and standardization since the beginnings of GIS software in the 1960s, when SYMAP, the first widely distributed program for spatial analysis, was created at Harvard University [@coppock_history_1991].
# -# GDAL (which originally was pronounced as "goo-dal", with the double "o" making a reference to object-orientation), the Geospatial Data Abstraction Library, has resolved many issues associated with incompatibility between geographic file formats since its release in 2000. +# GDAL (which originally was pronounced as 'goo-dal', with the double 'o' making a reference to object-orientation), the Geospatial Data Abstraction Library, has resolved many issues associated with incompatibility between geographic file formats since its release in 2000. # GDAL provides a unified and high-performance interface for reading and writing of many raster and vector data formats. # Many open and proprietary GIS programs, including GRASS, ArcGIS and QGIS, use GDAL behind their GUIs for doing the legwork of ingesting and spitting out geographic data in appropriate formats. -# Most Pyhton packages for working with spatial data, including **geopandas** and **rasterio** used in this book, also rely on GDAL for importing and exporting spatial data files. +# Most Python packages for working with spatial data, including **geopandas** and **rasterio** used in this book, also rely on GDAL for importing and exporting spatial data files. # # GDAL provides access to more than 200 vector and raster data formats. -# @tbl-file-formats presents some basic information about selected and often used spatial file formats. +# @tbl-file-formats presents some basic information about selected and often-used spatial file formats. # # | Name | Extension | Info | Type | Model | # |-------------------|------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------------------------------|----------------| -# | ESRI Shapefile | `.shp` (the main file) | Popular format consisting of at least three files. No support for: files \> 2GB;mixed types; names \> 10 chars; cols \> 255. | Vector | Partially open | +# | ESRI Shapefile | `.shp` (the main file) | Popular format consisting of at least three files. No support for: files \> 2GB; mixed types; names \> 10 chars; cols \> 255. | Vector | Partially open | # | GeoJSON | `.geojson` | Extends the JSON exchange format by including a subset of the simple feature representation; mostly used for storing coordinates in longitude and latitude; it is extended by the TopoJSON format. | Vector | Open | # | KML | `.kml` | XML-based format for spatial visualization, developed for use with Google Earth. Zipped KML file forms the KMZ format. | Vector | Open | # | GPX | `.gpx` | XML schema created for exchange of GPS data. | Vector | Open | @@ -307,23 +343,21 @@ # | Arc ASCII | `.asc` | Text format where the first six lines represent the raster header, followed by the raster cell values arranged in rows and columns. | Raster | Open | # | SQLite/SpatiaLite | `.sqlite` | Standalone relational database, SpatiaLite is the spatial extension of SQLite. | Vector and raster | Open | # | ESRI FileGDB | `.gdb` | Spatial and nonspatial objects created by ArcGIS. Allows: multiple feature classes; topology. Limited support from GDAL. 
| Vector and raster | Proprietary | -# | GeoPackage | `.gpkg` | Lightweight database container based on SQLite allowing an easy and platform-independent exchange of geodata | Vector and (very limited) raster | Open | +# | GeoPackage | `.gpkg` | Lightweight database container based on SQLite allowing an easy and platform-independent exchange of geodata. | Vector and (very limited) raster | Open | # -# : Commonly used spatial data file formats {#tbl-file-formats} +# : Commonly used spatial data file formats {#tbl-file-formats tbl-colwidths="[23, 13, 54, 15, 15]"} # -# An important development ensuring the standardization and open-sourcing of file formats was the founding of the Open Geospatial Consortium ([OGC](http://www.opengeospatial.org/)) in 1994. +# An important development ensuring the standardization and open-sourcing of file formats was the founding of the Open Geospatial Consortium (OGC) in 1994. # Beyond defining the Simple Features data model (see @sec-simple-features), the OGC also coordinates the development of open standards, for example as used in file formats such as KML and GeoPackage. -# -# # Open file formats of the kind endorsed by the OGC have several advantages over proprietary formats: the standards are published, ensure transparency and open up the possibility for users to further develop and adjust the file formats to their specific needs. # -# ESRI Shapefile is the most popular vector data exchange format; however, it is not an fully open format (though its specification is open). +# ESRI Shapefile is the most popular vector data exchange format; however, it is not a fully open format (though its specification is open). # It was developed in the early 1990s and, from a modern standpoint, has a number of limitations. # First of all, it is a multi-file format, which consists of at least three files. # It also only supports 255 columns, its column names are restricted to ten characters and the file size limit is 2 GB. # Furthermore, ESRI Shapefile does not support all possible geometry types, for example, it is unable to distinguish between a polygon and a multipolygon. # Despite these limitations, a viable alternative had been missing for a long time. -# In 2014, [GeoPackage](https://www.geopackage.org/) emerged, and seems to be a more than suitable replacement candidate for ESRI Shapefile. +# In 2014, GeoPackage emerged, and seems to be a more than suitable replacement candidate for ESRI Shapefile. # GeoPackage is a format for exchanging geospatial information and an OGC standard. # This standard describes the rules on how to store geospatial information in a tiny SQLite container. # Hence, GeoPackage is a lightweight spatial database container, which allows the storage of vector and raster data but also of non-spatial data and extensions. @@ -333,60 +367,52 @@ # It allows spatial information, such as the CRS definition and the transformation matrix (see @sec-using-rasterio), to be embedded within a TIFF file. # Similar to ESRI Shapefile, this format was firstly developed in the 1990s, but as an open format. # Additionally, GeoTIFF is still being expanded and improved. -# One of the most significant recent addition to the GeoTIFF format is its variant called COG (Cloud Optimized GeoTIFF). +# One of the most significant recent additions to the GeoTIFF format is its variant called COG (Cloud Optimized GeoTIFF). 
# Raster objects saved as COGs can be hosted on HTTP servers, so other people can read only parts of the file without downloading the whole file (@sec-input-raster). # # There is also a plethora of other spatial data formats that we do not explain in detail or mention in @tbl-file-formats due to the book limits. -# If you need to use other formats, we encourage you to read the GDAL documentation about [vector](https://gdal.org/drivers/vector/index.html) and [raster](https://gdal.org/drivers/raster/index.html) drivers. +# If you need to use other formats, we encourage you to read the GDAL documentation about vector and raster drivers. # Additionally, some spatial data formats can store other data models (types) than vector or raster. -# It includes LAS and LAZ formats for storing lidar point clouds, and NetCDF and HDF for storing multidimensional arrays. +# Two examples are LAS and LAZ formats for storing lidar point clouds, and NetCDF and HDF for storing multidimensional arrays. # -# Finally, spatial data is also often stored using tabular (non-spatial) text formats, including CSV files or Excel spreadsheets. +# Finally, spatial data are also often stored using tabular (non-spatial) text formats, including CSV files or Excel spreadsheets. # This can be convenient to share spatial (point) datasets with people who, or software that, struggle with spatial data formats. # If necessary, the table can be converted to a point layer (see examples in @sec-vector-layer-from-scratch and @sec-spatial-joining). # # ## Data input (I) {#sec-data-input} # -# Executing commands such as `geopandas.read_file` (the main function we use for loading vector data) or `rasterio.open`+`.read` (the main group of functions used for loading raster data) silently sets off a chain of events that reads data from files. +# Executing commands such as `gpd.read_file` (the main function we use for loading vector data) or `rasterio.open`+`.read` (the main group of functions used for loading raster data) silently sets off a chain of events that reads data from files. # Moreover, there are many Python packages containing a wide range of geographic data or providing simple access to different data sources. # All of them load the data into the Python environment or, more precisely, assign objects to your workspace, stored in RAM and accessible within the Python session. # The latter is the most straightforward approach, suitable when RAM is not a limiting factor. # For large vector layers and rasters, partial reading may be required. # For vector layers, we will demonstrate how to read subsets of vector layers, filtered by attributes or by location (@sec-input-vector). # For rasters, we already showed earlier in the book how the user can choose which specific bands to read (@sec-using-rasterio), or read resampled data to a lower resolution (@sec-raster-agg-disagg). -# In this section, we also show how to read specific rectangular extents ("windows") from a raster file (@sec-input-raster). -# -# +# In this section, we also show how to read specific rectangular extents ('windows') from a raster file (@sec-input-raster). # # ### Vector data {#sec-input-vector} # # Spatial vector data comes in a wide variety of file formats. # Most popular representations such as `.shp`, `.geojson`, and `.gpkg` files can be imported and exported with **geopandas** functions `read_file` and `to_file` (covered in @sec-data-output), respectively. 
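+# For instance, a minimal round trip might look as follows (a sketch: `data/world.gpkg` is the file used throughout this chapter, while the output path `output/world_copy.gpkg` is a hypothetical choice of ours).

+# In[ ]:


+#| eval: false
+world = gpd.read_file('data/world.gpkg')   # import a GeoPackage into a GeoDataFrame
+world.to_file('output/world_copy.gpkg')    # export it back to a (new) GeoPackage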
# -# **geopandas** uses GDAL to read and write data, via **fiona** (the [default](https://github.com/geopandas/geopandas/issues/2217)) or **pyogrio** packages (a recently developed alternative to **fiona**, which will become the default in the future, see [note](https://geopandas.org/en/stable/docs/user_guide/io.html) in "Reading and writing files" tutorial). -# -# -# -# -# After **fiona** is imported, the command `fiona.supported_drivers` can be used to list drivers available to GDAL, including whether they can (`'r'`), append (`'a'`), or write (`'w'`) data, or all three. +# **geopandas** uses GDAL to read and write data, via **pyogrio** since `geopandas` version `1.0.0` (previously via **fiona**). +# After **pyogrio** is imported, `pyogrio.list_drivers` can be used to list drivers available to GDAL, including whether they can read (`'r'`), append (`'a'`), or write (`'w'`) data, or all three. # In[ ]: #| eval: false -fiona.supported_drivers +pyogrio.list_drivers() # ``` -# {'DXF': 'rw', -# 'CSV': 'raw', -# ... -# 'TopoJSON': 'r', -# 'LIBKML': 'r'} +# {'PCIDSK': 'rw', +# 'PDS4': 'rw', +# ... +# 'AVCE00': 'r', +# 'HTTP': 'r'} # ``` # -# Other, less common, drivers can be ["activated"](https://geopandas.org/en/stable/docs/user_guide/io.html) by manually supplementing `fiona.supported_drivers`. -# # The first argument of the **geopandas** versatile data import function `gpd.read_file` is `filename`, which is typically a string, but can also be a file connection. # The content of a string could vary between different drivers. # In most cases, as with the ESRI Shapefile (`.shp`) or the GeoPackage format (`.gpkg`), the `filename` argument would be a path or a URL to an actual file, such as `geodata.gpkg`. @@ -410,7 +436,7 @@ # Some vector formats, such as GeoPackage, can store multiple data layers. # By default, `gpd.read_file` reads the first layer of the file specified in `filename`. # However, using the `layer` argument you can specify any other layer. -# To list the available layers, we can use function `fiona.listlayers` or `pyogrio.list_layers`. +# To list the available layers, we can use function `gpd.list_layers` (or `pyogrio.list_layers`). # # The `gpd.read_file` function also allows for reading just parts of the file into RAM with two possible mechanisms. # The first one is related to the `where` argument, which allows specifying what part of the data to read using an SQL `WHERE` expression. @@ -424,18 +450,25 @@ tanzania -# If you do not know the names of the available columns, a good approach is to just read one row of the data using the `rows` argument, which can be used to read the first N rows, then use the `.columns` property to examine the column names: +# If you do not know the names of the available columns, a good approach is to read the layer metadata using `pyogrio.read_info`. The resulting object contains, among other properties, the column names (`fields`) and data types (`dtypes`): + +# In[ ]: + + +info = pyogrio.read_info('data/world.gpkg') +info['fields'] + # In[ ]: -gpd.read_file('data/world.gpkg', rows=1).columns +info['dtypes'] # The second mechanism uses the `mask` argument to filter data based on intersection with an existing geometry. # This argument expects a geometry (`GeoDataFrame`, `GeoSeries`, or `shapely` geometry) representing the area where we want to extract the data. # Let's try it using a small example---we want to read polygons from our file that intersect with the buffer of 50,000 $m$ of Tanzania's borders. 
-# To do it, we need to transform the geometry to a projected CRS (such as `EPSG:32736`), prepare our "filter" by creating the buffer (@sec-buffers), and transform back to the original CRS to be used as a mask (@fig-read-shp-query (a)). +# To do it, we need to transform the geometry to a projected CRS (such as `EPSG:32736`), prepare our 'filter' by creating the buffer (@sec-buffers), and transform back to the original CRS to be used as a mask (@fig-read-shp-query (a)). # In[ ]: @@ -443,7 +476,7 @@ tanzania_buf = tanzania.to_crs(32736).buffer(50000).to_crs(4326) -# Now, we can pass the "filter" geometry `tanzania_buf` to the `mask` argument of `gpd.read_file`. +# Now, we can pass the 'filter' geometry `tanzania_buf` to the `mask` argument of `gpd.read_file`. # In[ ]: @@ -466,14 +499,18 @@ # Using 'where' fig, ax = plt.subplots() tanzania.plot(ax=ax, color='lightgrey', edgecolor='grey') -tanzania.apply(lambda x: ax.annotate(text=x['name_long'], - xy=x.geometry.centroid.coords[0], ha='center'), axis=1); +tanzania.apply( + lambda x: ax.annotate(text=x['name_long'], + xy=x.geometry.centroid.coords[0], ha='center'), axis=1 +); # Using 'mask' fig, ax = plt.subplots() tanzania_neigh.plot(ax=ax, color='lightgrey', edgecolor='grey') tanzania_buf.plot(ax=ax, color='none', edgecolor='red') -tanzania_neigh.apply(lambda x: ax.annotate(text=x['name_long'], - xy=x.geometry.centroid.coords[0], ha='center'), axis=1); +tanzania_neigh.apply( + lambda x: ax.annotate(text=x['name_long'], + xy=x.geometry.centroid.coords[0], ha='center'), axis=1 +); # A different, `gpd.read_postgis`, function can be used to read a vector layer from a PostGIS database. @@ -487,6 +524,7 @@ #| label: fig-cycle_hire_xy-layer #| fig-cap: The `cycle_hire_xy.csv` table transformed to a point layer +#| warning: false cycle_hire = pd.read_csv('data/cycle_hire_xy.csv') geom = gpd.points_from_xy(cycle_hire['X'], cycle_hire['Y'], crs=4326) geom = gpd.GeoSeries(geom) @@ -494,7 +532,7 @@ cycle_hire_xy.plot(); -# Instead of columns describing 'XY' coordinates, a single column can also contain the geometry information, not necessarily points but possible any other geometry type. +# Instead of columns describing 'XY' coordinates, a single column can also contain the geometry information, not necessarily points but possibly any other geometry type. # Well-known text (WKT), well-known binary (WKB), and GeoJSON are examples of formats used to encode geometry in such a column. # For instance, the `world_wkt.csv` file has a column named `'WKT'`, representing polygons of the world's countries (in WKT format). # When importing the CSV file into a `DataFrame`, the `'WKT'` column is interpreted just like any other string column. @@ -506,19 +544,16 @@ world_wkt -# To convert it to a `GeoDataFrame`, we can apply the `shapely.from_wkt` function (@sec-geometries) on the WKT strings, to convert them into `shapely` geometries (also see note about the `.apply` method in @sec-topological-relations). +# To convert it to a `GeoDataFrame`, we can apply the `gpd.GeoSeries.from_wkt` function (which is analogous to `shapely`'s `shapely.from_wkt`, see @sec-geometries) on the WKT strings, to convert the series of WKT strings into a `GeoSeries` with the geometries. # In[ ]: -world_wkt['geometry'] = world_wkt['WKT'].apply(shapely.from_wkt) +world_wkt['geometry'] = gpd.GeoSeries.from_wkt(world_wkt['WKT']) world_wkt = gpd.GeoDataFrame(world_wkt) world_wkt -# -# -# # The resulting layer is shown in @fig-world_wkt-layer. 
# In[ ]: @@ -526,40 +561,21 @@ #| label: fig-world_wkt-layer #| fig-cap: The `world_wkt.csv` table transformed to a polygon layer +#| warning: false world_wkt.plot(); -# -# -# -# -# -# -# -# -# -# # As a final example, we will show how **geopandas** also reads KML files. # A KML file stores geographic information in XML format---a data format for the creation of web pages and the transfer of data in an application-independent way [@nolan_xml_2014]. # Here, we access a KML file from the web. -# First, if necessary, we may need to "activate" the `KML` driver, which is not always available by default (just one of these expressions should be sufficient, depending on your system). - -# In[ ]: - - -fiona.supported_drivers['KML'] = 'r' -fiona.supported_drivers['LIBKML'] = 'r' - - +# # The sample KML file `KML_Samples.kml` contains more than one layer. -# -# # In[ ]: u = 'https://developers.google.com/kml/documentation/KML_Samples.kml' -fiona.listlayers(u) +gpd.list_layers(u) # We can choose, for instance, the first layer `'Placemarks'` and read it, using `gpd.read_file` with an additional `layer` argument. @@ -591,7 +607,9 @@ # In[ ]: -src = rasterio.open('https://zenodo.org/record/5774954/files/clm_snow.prob_esacci.dec_p.90_500m_s0..0cm_2000..2012_v2.0.tif') +url = 'https://zenodo.org/record/5774954/files/' +url += 'clm_snow.prob_esacci.dec_p.90_500m_s0..0cm_2000..2012_v2.0.tif' +src = rasterio.open(url) src @@ -601,7 +619,7 @@ # This is very useful when working with large datasets hosted online from resource-constrained computing environments such as laptops. # # For example, we can read a specified rectangular extent of the raster. -# With **rasterio**, this is done using the so-called [windowed reading](https://rasterio.readthedocs.io/en/latest/topics/windowed-rw.html) capabilities. +# With **rasterio**, this is done using the so-called *windowed reading* capabilities. # Note that, with windowed reading, we import just a subset of the raster extent into an `ndarray` covering any partial extent. # Windowed reading is therefore memory- (and, in this case, bandwidth-) efficient, since it avoids reading the entire raster into memory. # It can also be considered an alternative pathway to *cropping* (@sec-raster-cropping). @@ -618,8 +636,8 @@ ymax=70 -# Using the extent coordinates along with the raster transformation matrix, we create a window object, using the [`rasterio.windows.from_bounds`](https://rasterio.readthedocs.io/en/stable/api/rasterio.windows.html#rasterio.windows.from_bounds) function. -# This function basically "translates" the extent from coordinates, to row/column ranges. +# Using the extent coordinates along with the raster transformation matrix, we create a window object, using the `rasterio.windows.from_bounds` function. +# This function basically 'translates' the extent from coordinates, to row/column ranges. # In[ ]: @@ -634,9 +652,6 @@ w -# -# -# # Now we can read the partial array, according to the specified window `w`, by passing it to the `.read` method. # In[ ]: @@ -724,9 +739,7 @@ world.to_file('output/world.gpkg') -# Instead of overwriting the file, we could add new rows to the file with `mode='a'` ("append" mode, as opposed to the default `mode='w'` for the "write" mode). -# -# +# Instead of overwriting the file, we could add new rows to the file with `mode='a'` ('append' mode, as opposed to the default `mode='w'` for the 'write' mode). # Appending is supported by several spatial formats, including GeoPackage. 
# In[ ]:


@@ -736,11 +749,12 @@
 world.to_file('output/w_many_features.gpkg', mode='a')


-# Now, `w_many_features.gpkg` contains a polygonal layer named `world` with two "copies" of each country (that is 177×2=354 features, whereas the `world` layer has 177 features).
+# Now, `w_many_features.gpkg` contains a polygonal layer named `world` with two 'copies' of each country (that is 177×2=354 features, whereas the `world` layer has 177 features).

# In[ ]:


+#| warning: false
 gpd.read_file('output/w_many_features.gpkg').shape


@@ -753,8 +767,8 @@
 world.to_file('output/w_many_layers.gpkg', layer='world2')


-# In this case, `w_many_layers.gpkg` has two "layers": `w_many_layers` (same as the file name, when `layer` is unspecified) and `world2`.
-# Incidentally, the contents of the two layers is identical, but this does not have to be.
+# In this case, `w_many_layers.gpkg` has two 'layers': `w_many_layers` (same as the file name, when `layer` is unspecified) and `world2`.
+# Incidentally, the contents of the two layers are identical, but this does not have to be.
 # Each layer from such a file can be imported separately using the `layer` argument of `gpd.read_file`.

# In[ ]:

@@ -777,15 +791,15 @@
 # - `height`---Number of rows
 # - `width`---Number of columns
 # - `count`---Number of bands
-# - `nodata`---The value which represents "No Data", if any
+# - `nodata`---The value which represents 'No Data', if any
 # - `dtype`---The raster data type, one of **numpy** types supported by the `driver` (e.g., `np.int64`) (see @tbl-numpy-data-types)
 # - `crs`---The CRS, e.g., using an EPSG code (such as `4326`)
 # - `transform`---The transform matrix
 # - `compress`---A compression method to apply, such as `'lzw'`. This is optional and most useful for large rasters. Note that, at the time of writing, this [does not work well](https://gis.stackexchange.com/questions/404738/why-does-rasterio-compression-reduces-image-size-with-single-band-but-not-with-m) for writing multiband rasters
 # 
-# ```{note}
-# Note that `'GTiff` (GeoTIFF, `.tif`), which is the recommended driver, [supports](https://gdal.org/drivers/raster/gtiff.html) just some of the possible **numpy** data types (see @tbl-numpy-data-types). Importantly, it does not support `np.int64`, the default `int` type. The recommendation in such case it to use `np.int32` (if the range is sufficient), or `np.float64`.
-# ```
+# ::: callout-note
+# Note that `'GTiff'` (GeoTIFF, `.tif`), which is the recommended driver, supports just some of the possible **numpy** data types (see @tbl-numpy-data-types). Importantly, it does not support `np.int64`, the default `int` type. The recommendation in such cases is to use `np.int32` (if the range is sufficient), or `np.float64`.
+# :::
 # 
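+# For example, a minimal sketch (ours, not part of the original chapter) of casting an array to a GeoTIFF-supported type before writing:

+# In[ ]:


+#| eval: false
+a = np.array([[1, 2], [3, 4]])  # dtype defaults to a 64-bit integer on most platforms
+a = a.astype(np.int32)          # np.int32 is supported by the GTiff driver
+a.dtype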
 # Once the file connection with the right metadata is ready, we do the actual writing using the `.write` method of the file connection.
 # If there are several bands we may execute the `.write` method several times, as in `.write(a,n)`, where `a` is a two-dimensional array representing a single band, and `n` is the band index (starting from `1`, see below).
@@ -795,11 +809,9 @@
 # 
 # Most of the properties are either straightforward to choose, based on our aims (e.g., `driver`, `crs`, `compress`, `nodata`), or directly derived from the array with the raster values itself (e.g., `height`, `width`, `count`, `dtype`).
 # The most complicated property is the `transform`, which specifies the raster origin and resolution.
-# The `transform` is typically either obtained from an existing raster (serving as a "template"), created from scratch based on manually specified origin and resolution values (e.g., using `rasterio.transform.from_origin`), or calculated automatically (e.g., using `rasterio.warp.calculate_default_transform`), as shown in previous chapters.
+# The `transform` is typically either obtained from an existing raster (serving as a 'template'), created from scratch based on manually specified origin and resolution values (e.g., using `rasterio.transform.from_origin`), or calculated automatically (e.g., using `rasterio.warp.calculate_default_transform`), as shown in previous chapters.
 # 
 # Earlier in the book, we have already demonstrated five common scenarios of writing rasters, covering the above-mentioned considerations:
-# 
-# 
 # 
 # - Creating from scratch (@sec-raster-from-scratch)---we created and wrote two rasters from scratch by associating the `elev` and `grain` arrays with an arbitrary spatial extent. The custom arbitrary transformation matrix was created using `rasterio.transform.from_origin`
 # - Aggregating (@sec-raster-agg-disagg)---we wrote an aggregated raster, by resampling from an existing raster file, then updating the transformation matrix using `.transform.scale`
@@ -841,7 +853,7 @@
 new_transform


-# Then, we establish the writing-mode file connection to `r.tif`, which will be eithe created or overwritten.
+# Then, we establish the writing-mode file connection to `r.tif`, which will be either created or overwritten.

# In[ ]:

@@ -878,18 +890,18 @@
 # These expressions, taken together, create a new file `output/r.tif`, which is a $2 \times 2$ raster, having a 2 decimal degree resolution, with the top-left corner placed over London.
 # 
-# To make the picture of raster export complete, there are three important concepts we have not covered yet: array and raster data types, writing multiband rasters, and handling "No Data" values.
+# To make the picture of raster export complete, there are three important concepts we have not covered yet: array and raster data types, writing multiband rasters, and handling 'No Data' values.
 # 
 # Arrays (i.e., `ndarray` objects defined in package **numpy**) are used to store raster values when reading them from file, using `.read` (@sec-using-rasterio).
 # All values in an array are of the same type, whereas the **numpy** package supports numerous numeric data types of various precision (and, accordingly, memory footprint).
 # Raster formats, such as GeoTIFF, support (a subset of) exactly the same data types as **numpy**, which means that reading a raster file uses as little RAM as possible.
-# The most useful types for raster data, and thir support in GeoTIFF are summarized in @tbl-numpy-data-types.
+# The most useful types for raster data, and their support in GeoTIFF, are summarized in @tbl-numpy-data-types.
# # | Data type | Description | GeoTIFF | # |-----------|----------------------------------------------------------------------|:--------:| -# | `int8` | Integer in a single byte (`-128` to `127`) | + | +# | `int8` | Integer in a single byte (`-128` to `127`) | | # | `int16` | Integer in 16 bits (`-32768` to `32767`) | + | -# | `int32` | Integer in 32 bits (`-2147483648` to `2147483647`) | | +# | `int32` | Integer in 32 bits (`-2147483648` to `2147483647`) | + | # | `int64` | Integer in 64 bits (`-9223372036854775808` to `9223372036854775807`) | | # | `uint8` | Unsigned integer in 8 bits (`0` to `255`) | + | # | `uint16` | Unsigned integer in 16 bits (`0` to `65535`) | + | @@ -928,9 +940,6 @@ # These code sections demonstrate the agreement between GeoTIFF (and other file formats) data types, which are universal and understood by many programs and programming languages, and the corresponding `ndarray` data types which are defined by **numpy** (@tbl-numpy-data-types). # -# -# -# # Writing multiband rasters is similar to writing single-band rasters, only that we need to: # # - Define a number of bands other than `count=1`, according to the number of bands we are going to write @@ -956,7 +965,7 @@ dst_kwds -# Finally, we can create a file connection using the updated metadata, write the values of the three bands, and close the connection (note that we are switching to the "keyword argument" syntax of Python function calls here; see note in @sec-raster-agg-disagg). +# Finally, we can create a file connection using the updated metadata, write the values of the three bands, and close the connection (note that we are switching to the 'keyword argument' syntax of Python function calls here; see note in @sec-raster-agg-disagg). # In[ ]: @@ -970,16 +979,16 @@ # As a result, a three-band raster named `r3.tif` is created. # -# Rasters often contain "No Data" values, representing missing data, for example, unreliable measurements due to clouds or pixels outside of the photographed extent. -# In a **numpy** `ndarray` object, "No Data" values may be represented by the special `np.nan` value. +# Rasters often contain 'No Data' values, representing missing data, for example, unreliable measurements due to clouds or pixels outside of the photographed extent. +# In a **numpy** `ndarray` object, 'No Data' values may be represented by the special `np.nan` value. # However, due to computer memory limitations, only arrays of type `float` can contain `np.nan`, while arrays of type `int` cannot. -# For `int` rasters containing "No Data", we typically mark missing data with a specific value beyond the valid range (e.g., `-9999`). -# The missing data "flag" definition is stored in the file (set through the `nodata` property of the file connection, see above). -# When reading an `int` raster with "No Data" back into Python, we need to be aware of the flag, if any. +# For `int` rasters containing 'No Data', we typically mark missing data with a specific value beyond the valid range (e.g., `-9999`). +# The missing data 'flag' definition is stored in the file (set through the `nodata` property of the file connection, see above). +# When reading an `int` raster with 'No Data' back into Python, we need to be aware of the flag, if any. # Let's demonstrate it through examples. # # We will start with the simpler case, rasters of type `float`. -# Since `float` arrays may contain the "native" value `np.nan`, representing "No Data" is straightforward. 
+# Since `float` arrays may contain the 'native' value `np.nan`, representing 'No Data' is straightforward. # For example, suppose that we have a `float` array of size $2 \times 2$ containing one `np.nan` value. # In[ ]: @@ -995,7 +1004,7 @@ r.dtype -# When writing this type of array to a raster file, we do not need to specify any particular `nodata` "flag" value. +# When writing this type of array to a raster file, we do not need to specify any particular `nodata` 'flag' value. # In[ ]: @@ -1030,7 +1039,7 @@ rasterio.open('output/r_nodata_float.tif').read() -# Now, conversely, suppose that we have an `int` array with missing data, where the "missing" value must inevitably be marked using a specific `int` "flag" value, such as `-9999` (remember that we can't store `np.nan` in an `int` array!). +# Now, conversely, suppose that we have an `int` array with missing data, where the 'missing' value must inevitably be marked using a specific `int` 'flag' value, such as `-9999` (remember that we can't store `np.nan` in an `int` array!). # In[ ]: @@ -1045,7 +1054,7 @@ r.dtype -# When writing the array to file, we must specify `nodata=-9999` to keep track of our "No Data" flag. +# When writing the array to file, we must specify `nodata=-9999` to keep track of our 'No Data' flag. # In[ ]: @@ -1074,7 +1083,7 @@ # If you try to open the file in GIS software, such as QGIS, you will see the missing data interpreted (e.g., the pixel shown as blank), meaning that the software is aware of the flag. -# However, reading the data back into Python reproduces an `int` array with `-9999`, due to the limitation of `int` arrays stated before/ +# However, reading the data back into Python reproduces an `int` array with `-9999`, due to the limitation of `int` arrays stated before. # In[ ]: @@ -1084,8 +1093,8 @@ r -# The Python user must therefore be mindful of "No Data" `int` rasters, for example to avoid interpreting the value `-9999` literally. -# For instance, if we "forget" about the `nodata` flag, the literal calculation of the `.mean` would incorrectly include the value `-9999`. +# The Python user must therefore be mindful of 'No Data' `int` rasters, for example to avoid interpreting the value `-9999` literally. +# For instance, if we 'forget' about the `nodata` flag, the literal calculation of the `.mean` would incorrectly include the value `-9999`. # In[ ]: @@ -1093,7 +1102,7 @@ r.mean() -# There are two basic ways to deal with the situation: either converting the raster to `float`, or using a "No Data" mask. +# There are two basic ways to deal with the situation: either converting the raster to `float`, or using a 'No Data' mask. # The first approach, simple and particularly relevant for small rasters where memory constraints are irrelevant, is to go from `int` to `float`, to gain the ability of the natural `np.nan` representation. # Here is how we can do this with `r_nodata_int.tif`. # We detect the missing data flag, convert the raster to `float`, then assign `np.nan` into the cells that are supposed to be missing. @@ -1107,7 +1116,7 @@ r -# From there on, we deal with `np.nan` the usual way, such as using `np.nanmean` to calculate the mean excluding "No Data". +# From there on, we deal with `np.nan` the usual way, such as using `np.nanmean` to calculate the mean excluding 'No Data'. 
# In[ ]:


 np.nanmean(r)


-# The second approach is to read the values into a so-called ["masked" array](https://numpy.org/doc/stable/reference/maskedarray.generic.html#what-is-a-masked-array), using the argument `masked=True` of the `.read` method.
-# A masked array can be thought of as an extended `ndarray`, with two components: `.data` (the values) and `.mask` (a corresponding boolean array marking "No Data" values).
+# The second approach is to read the values into a so-called *'masked' array*, using the argument `masked=True` of the `.read` method.
+# A masked array can be thought of as an extended `ndarray`, with two components: `.data` (the values) and `.mask` (a corresponding boolean array marking 'No Data' values).

# In[ ]:

@@ -1126,7 +1135,7 @@

 # Complete treatment of masked arrays is beyond the scope of this book.
-# However, the basic idea is that many **numpy** operations "honor" the mask, so that the user does not have to keep track of the way that "No Data" values are marked, similarly to the natural `np.nan` representation and regardless of the data type.
+# However, the basic idea is that many **numpy** operations 'honor' the mask, so that the user does not have to keep track of the way that 'No Data' values are marked, similarly to the natural `np.nan` representation and regardless of the data type.
 # For example, the `.mean` of a masked array ignores the value `-9999`, because it is masked, taking into account just the valid values `1`, `2`, and `4`.

# In[ ]:

@@ -1138,12 +1147,8 @@

 # Switching to `float` and assigning `np.nan` is the simpler approach, since that way we can keep working with the familiar `ndarray` data structure for all raster types, whether `int` or `float`.
 # Nevertheless, learning how to work with masked arrays can be beneficial when we have good reasons to keep our raster data in `int` arrays (for example, due to RAM limits) and still perform operations that take missing values into account.
 # 
-# 
-# 
-# 
-# Finally, keep in mind that, confusingly, `float` rasters may represent "No Data" using a specific "flag" (such as `-9999.0`), instead, or in addition to (!), the native `np.nan` representation.
+# Finally, keep in mind that, confusingly, `float` rasters may represent 'No Data' using a specific 'flag' (such as `-9999.0`), instead of, or in addition to (!), the native `np.nan` representation.
 # In such cases, the same considerations shown for `int` apply to `float` rasters as well.
 # 
-# ## Exercises
+# 
 # 
-# ## References
diff --git a/code/chapters/08-mapping.py b/code/chapters/08-mapping.py
index a9b14efe..82a03b3c 100644
--- a/code/chapters/08-mapping.py
+++ b/code/chapters/08-mapping.py
@@ -9,12 +9,23 @@
 
 
 #| echo: false
-import matplotlib.pyplot as plt
-import pandas as pd
-pd.options.display.max_rows = 6
-pd.options.display.max_columns = 6
-pd.options.display.max_colwidth = 35
-plt.rcParams['figure.figsize'] = (5, 5)
+#| include: false
+#| error: true
+import map_to_png


+# In[ ]:


+#| echo: false
+import book_options


+# In[ ]:


+#| echo: false
+import book_options_pdf


# In[ ]:

@@ -64,9 +75,7 @@
 # ## Introduction
 # 
-# 
 # 
-# 
 # 
 # 
 # A satisfying and important aspect of geographic research is communicating the results.
@@ -82,7 +91,7 @@
 # Historic examples include maps of buildings and land ownership in the Old Babylonian dynasty more than 3000 years ago and Ptolemy's world map in his masterpiece Geography nearly 2000 years ago [@talbert_ancient_2014].
# # Map making has historically been an activity undertaken only by, or on behalf of, the elite. -# This has changed with the emergence of open source mapping software such as mapping packages in Python, R, and other languages, and the "print composer" in QGIS, which enable anyone to make high-quality maps, enabling "citizen science". +# This has changed with the emergence of open-source mapping software such as mapping packages in Python, R, and other languages, and the 'print composer' in QGIS, which enable anyone to make high-quality maps, enabling 'citizen science'. # Maps are also often the best way to present the findings of geocomputational research in a way that is accessible. # Map making is therefore a critical part of geocomputation and its emphasis not only on describing, but also changing the world. # @@ -90,27 +99,21 @@ # Other, more advanced uses of these methods, were also encountered in subsequent chapters, when demonstrating the various outputs we got. # In this chapter, we provide a comprehensive summary of the most useful workflows of these two methods for creating static maps (@sec-static-maps). # Static maps can be easily shared and viewed (whether digitally or in print), however they can only convey as much information as a static image can. -# Interactive maps provide much more flexibilty in terms of user experience and amount of information, however they often require more work to design and effectively share. +# Interactive maps provide much more flexibility in terms of user experience and amount of information, however they often require more work to design and effectively share. # Thus, in @sec-interactive-maps, we move on to elaborate on the `.explore` method for creating interactive maps, which was also briefly introduced earlier in @sec-vector-layers. # # ## Static maps {#sec-static-maps} # -# -# -# # Static maps are the most common type of visual output from geocomputation. # For example, we have been using `.plot` and `rasterio.plot.show` throughout the book, to display **geopandas** and **rasterio** geocomputation results, respectively. -# In this section we systematically review and elaborate on the various properties that can be customized when using those functions. +# In this section, we systematically review and elaborate on the various properties that can be customized when using those functions. # # A static map is basically a digital image. # When stored in a file, standard formats include `.png` and `.pdf` for graphical raster and vector outputs, respectively. # Thanks to their simplicity, static maps can be shared in a wide variety of ways: in print, through files sent by e-mail, embedded in documents and web pages, etc. # # Nevertheless, there are many aesthetic considerations when making a static map, and there is also a wide variety of ways to create static maps using novel presentation methods. -# This is the focus of the field of [cartography](https://en.wikipedia.org/wiki/Cartography), and beyond the scope of this book. -# -# -# +# This is the focus of the field of cartography, and beyond the scope of this book. # # # @@ -161,9 +164,7 @@ # The next example uses `markersize` to get larger points (@fig-basic-plot-markersize). 
-# It also demonstrates how to control the overall [figure size](https://matplotlib.org/stable/gallery/subplots_axes_and_figures/figure_size_units.html), such as $4 \times 4$ $in$ in this case, using [`plt.subplots`](https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.subplots.html) to initialize the plot and its `figsize` parameter to specify dimensions.
-# 
-# 
+# It also demonstrates how to control the overall figure size, such as $4 \times 4$ $in$ in this case, using `plt.subplots` to initialize the plot and its `figsize` parameter to specify dimensions.

# In[ ]:

@@ -176,7 +177,7 @@

 # ::: callout-note
 # As you have probably noticed throughout the book, the `plt.subplots` function is used to initialize a **matplotlib** plot layout, possibly also specifying image size (e.g., @fig-basic-plot-markersize) and multi-panel layout (e.g., @fig-faceted-map).
-# The returned value is a `tuple` of [`Figure`](https://matplotlib.org/stable/api/figure_api.html#matplotlib.figure.Figure) and [`Axes`](https://matplotlib.org/stable/api/_as_gen/matplotlib.axes.Axes.html#matplotlib.axes.Axes) objects, which we conventionally unpack to variables named `fig` and `ax`.
+# The returned value is a `tuple` of `Figure` and `Axes` objects, which we conventionally unpack to variables named `fig` and `ax`.
 # These two variables represent the entire figure, and the elements of individual sub-figures, respectively.
 # 
 # For our purposes in this book, we have been using just the `ax` object, passing it to the `ax` parameter in further function calls, in order to add subsequent layers (e.g., @fig-plot-raster-and-vector) or other elements (e.g., @fig-plot-symbology-colors-r-scale) into the same panel.
@@ -192,8 +193,6 @@
 # - `column`---a column name
 # - `legend`---whether to show a legend
 # - `cmap`---color map, a.k.a. color scale, a palette from which the colors are sampled
-# 
-# 
 # 
 # For example, @fig-plot-symbology shows the `nz` polygons colored according to the `'Median_income'` attribute (column), with a legend.
@@ -206,11 +205,14 @@


 # The default color scale which you see in @fig-plot-symbology is `cmap='viridis'`.
-# The `cmap` ("color map") argument can be used to specify one of countless color scales.
-# A first safe choice is often the [ColorBrewer](https://colorbrewer2.org/#type=sequential&scheme=BuGn&n=3) collection of color scales, specifically designed for mapping.
+# The `cmap` ('color map') argument can be used to specify one of countless color scales.
+# A first safe choice is often the ColorBrewer[^colorbrewer] collection of color scales, specifically designed for mapping.
 # Any color scale can be reversed, using the `_r` suffix.
-# Finally, other color scales are available: see the **matplotlib** [colormaps article](https://matplotlib.org/stable/tutorials/colors/colormaps.html) for details.
-# The following code sections demonstrates three color scale specifications other than the default (@fig-plot-symbology-colors).
+# Finally, other color scales are available: see the **matplotlib** colormaps article[^matplotlib_colormaps] for details.
+# The following code section demonstrates three color-scale specifications other than the default (@fig-plot-symbology-colors).
+# 
+# [^colorbrewer]: <https://colorbrewer2.org/#type=sequential&scheme=BuGn&n=3>
+# [^matplotlib_colormaps]: <https://matplotlib.org/stable/tutorials/colors/colormaps.html>

# In[ ]:

@@ -227,9 +229,6 @@
 nz.plot(column='Median_income', legend=True, cmap='plasma');
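+# As noted above, any of these scales can be reversed with the `_r` suffix; a quick sketch (ours, not among the book's figures):

+# In[ ]:


+#| eval: false
+nz.plot(column='Median_income', legend=True, cmap='viridis_r');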
-# 
-# 
-# 
 # Categorical symbology is also supported, such as when `column` points to an `str` attribute.
 # For categorical variables, it makes sense to use a qualitative color scale, such as `'Set1'` from ColorBrewer.
 # For example, the following expression sets symbology according to the `'Island'` column (@fig-plot-symbology-categorical).
@@ -270,11 +269,8 @@


 # Unfortunately, there is no built-in option to display a legend in `rasterio.plot.show`.
-# The following [workaround](https://stackoverflow.com/questions/61327088/rio-plot-show-with-colorbar), reverting to **matplotlib** methods, can be used to acheive it instead (@fig-plot-symbology-colors-r-scale).
-# Basically, the code reverts to the **matplotlib** [`.colorbar`](https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.colorbar.html) method to add a legend, using the [`plt.imshow`](https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.imshow.html) function that draws an image of a **numpy** array (which `rasterio.plot.show` is a wrapper of).
-# 
-# 
-# 
+# The following workaround, reverting to **matplotlib** methods, can be used to achieve it instead (@fig-plot-symbology-colors-r-scale).
+# Basically, the code reverts to the **matplotlib** `.colorbar` method to add a legend, using the `plt.imshow` function that draws an image of a **numpy** array (which `rasterio.plot.show` is a wrapper of).

# In[ ]:

@@ -291,7 +287,7 @@
 # 
 # Labels are often useful to annotate maps and identify the location of specific features.
 # GIS software, as opposed to **matplotlib**, has specialized algorithms for label placement, e.g., to avoid overlaps between adjacent labels.
-# Furthermore, editing in graphical editing software is sometimes used for fine tuning of label placement.
+# Furthermore, editing in graphical editing software is sometimes used for fine-tuning of label placement.
 # Nevertheless, simple labels added within the Python environment can be a good starting point, both for interactive exploration and sharing analysis results.
 # 
 # To demonstrate it, suppose that we have a layer `nz1` of regions comprising the New Zealand southern Island.

# In[ ]:


 nz1 = nz[nz['Island'] == 'South']


-# To add a label in **matplotlib**, we use the [`.annotate`](https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.annotate.html) method where the important arguments are the label string and the placement (a `tuple` of the form `(x,y)`).
+# To add a label in **matplotlib**, we use the `.annotate` method, where the important arguments are the label string and the placement (a `tuple` of the form `(x,y)`).
 # When labeling vector layers, we typically want to add numerous labels, based on (one or more) attributes of each feature.
 # To do that, we can run a `for` loop, or use the `.apply` method, to pass the label text and the coordinates of each feature to `.annotate`.
 # In the following example, we use the `.apply` method to pass the region name (`'Name'` attribute) and the geometry centroid coordinates, for each region, to `.annotate`.
-# We are also using `ha`, short for `horizontalalignment`, with `'center'` (other options are `'right'` and `'left'`, see [Text properties and layout](https://matplotlib.org/stable/users/explain/text/text_props.html) reference for **matplotlib**) (@fig-labels-polygon).
-# 
-# 
+# We are also using `ha`, short for `horizontalalignment`, with `'center'` (other options are `'right'` and `'left'`) (@fig-labels-polygon).

# In[ ]:

@@ -340,7 +334,7 @@

 # Then, we again use `.apply`, combined with `.annotate`, to add the text labels.
# The main difference compared to the previous example (@fig-labels-polygon) is that we are directly passing the geometry coordinates (`.geometry.coords[0]`), since the geometries are points rather than polygons. -# We are also using the `weight='bold'` argument to use bold font (see [Text properties and layout](https://matplotlib.org/stable/users/explain/text/text_props.html) reference for **matplotlib**) for list of other options) (@fig-labels-points1). +# We are also using the `weight='bold'` argument to use bold font (@fig-labels-points1). # # @@ -362,11 +356,9 @@ ); -# It should be noted that sometimes we wish to add text labels "manually", one by one, rather than use a loop or `.apply`. +# It should be noted that sometimes we wish to add text labels 'manually', one by one, rather than use a loop or `.apply`. # For example, we may want to add labels of specific locations not stored in a layer, or to have control over the specific properties of each label. -# To add text labels manually, we can run the `.annotate` expressions one at a time, as shown in the code section below recreating the last result with the "manual" approach (@fig-labels-points2). -# -# +# To add text labels manually, we can run the `.annotate` expressions one at a time, as shown in the code section below recreating the last result with the 'manual' approach (@fig-labels-points2). # In[ ]: @@ -381,7 +373,7 @@ # ### Layers {#sec-plot-static-layers} # -# To display more than one layer in the same static map, we need to: +# To display more than one layer in the same static map, we can: # # 1. Store the first plot in a variable (e.g., `base`) # 2. Pass it as the `ax` argument of any subsequent plot(s) (e.g., `ax=base`) @@ -397,15 +389,28 @@ nz_height.plot(ax=base, color='red'); +# Alternatively (see note in @sec-static-styling), we can: +# +# 1. Initialize the plot using `fig,ax=plt.subplots()` +# 2. Pass `ax` to any subsequent plot + +# In[ ]: + + +#| label: fig-two-layers2 +#| fig-cap: Plotting two layers, `nz` (polygons) and `nz_height` (points), using `plt.subplots` +fig, ax = plt.subplots() +nz.plot(ax=ax, color='none') +nz_height.plot(ax=ax, color='red'); + + # We can combine rasters and vector layers in the same plot as well, which we already used earlier in the book, for example when explaining masking and cropping (@fig-raster-crop). # The technique is to initialize a plot with `fig,ax=plt.subplots()`, then pass `ax` to any of the separate plots, making them appear together. 
-# -# # # For example, @fig-plot-raster-and-vector demonstrated plotting a raster with increasingly complicated additions: # # - Panel (a) shows a raster (New Zealand elevation) and a vector layer (New Zealand administrative division) -# - Panel (b) shows the raster with a buffer of 22.2 $km$ around the dissolved administrative borders, representing New Zealand's [territorial waters](https://en.wikipedia.org/wiki/Territorial_waters) (see @sec-global-operations-and-distances) +# - Panel (b) shows the raster with a buffer of 22.2 $km$ around the dissolved administrative borders, representing New Zealand's territorial waters (see @sec-global-operations-and-distances) # - Panel (c) shows the raster with two vector layers: the territorial waters (in red) and elevation measurement points (in yellow) # In[ ]: @@ -425,7 +430,7 @@ # Raster + computed vector layer fig, ax = plt.subplots(figsize=(5, 5)) rasterio.plot.show(nz_elev, ax=ax) -gpd.GeoSeries(nz.unary_union, crs=nz.crs) \ +gpd.GeoSeries(nz.union_all(), crs=nz.crs) \ .to_crs(nz_elev.crs) \ .buffer(22200) \ .exterior \ @@ -433,7 +438,7 @@ # Raster + two vector layers fig, ax = plt.subplots(figsize=(5, 5)) rasterio.plot.show(nz_elev, ax=ax) -gpd.GeoSeries(nz.unary_union, crs=nz.crs) \ +gpd.GeoSeries(nz.union_all(), crs=nz.crs) \ .to_crs(nz_elev.crs) \ .buffer(22200) \ .exterior \ @@ -441,13 +446,8 @@ nz_height.to_crs(nz_elev.crs).plot(ax=ax, color='yellow'); -# -# -# -# -# # ::: callout-note -# Note that the drawing order of layers is not necessarily according to the order of expressions, in the code, but according to layer *type*. For example, by [default](https://matplotlib.org/stable/gallery/misc/zorder_demo.html) line layers are drawn on top of point layers. To override the default plotting order, we can use the `zorder` argument of `.plot`. Layers with higher `zorder` values will be drawn on top. For example, the following would draw `layer2` on top of `layer1` (regaredless of their types). +# Note that the drawing order of layers is not necessarily according to the order of expressions, in the code, but according to layer *type*. For example, by default line layers are drawn on top of point layers. To override the default plotting order, we can use the `zorder` argument of `.plot`. Layers with higher `zorder` values will be drawn on top. For example, the following would draw `layer2` on top of `layer1` (regaredless of their types). # # ```python # base = layer1.plot(zorder=1) @@ -457,13 +457,15 @@ # # ### Basemaps # -# Basemaps, or background layers, are often useful to provide context to the displayed layers (which are in the "foreground"). +# Basemaps, or background layers, are often useful to provide context to the displayed layers (which are in the 'foreground'). # Basemaps are ubiquitous in interactive maps (see @sec-interactive-maps). # However, they are often useful in static maps too. # -# Basemaps can be added to **geopandas** static plots using the [**contextily**](https://contextily.readthedocs.io/en/latest/index.html) package. -# A preliminary step is to convert our layers to `EPSG:3857` (["Web Mercator"](https://en.wikipedia.org/wiki/Web_Mercator_projection)), to be in agreement with the basemaps, which are typically provided in this CRS. +# Basemaps can be added to **geopandas** static plots using the **contextily** package. +# A preliminary step is to convert our layers to `EPSG:3857` ('Web Mercator'), to be in agreement with the basemaps, which are typically provided in this CRS[^reproject_tiles]. 
# For example, let's take the small `"Nelson"` polygon from `nz`, and reproject it to `3857`.
+# 
+# [^reproject_tiles]: Another option is to reproject the tiles to match the CRS of the foreground layers; this is a less commonly used workflow, as it may lead to a distorted appearance of the background layer.

# In[ ]:


# To add a basemap, we use the `contextily.add_basemap` function, similarly to the way we added multiple layers (@sec-plot-static-layers).
-# The default basemap is "OpenStreetMap".
+# The default basemap is 'OpenStreetMap'.
 # You can specify a different basemap using the `source` parameter, with one of the values in `cx.providers` (@fig-basemap).

# In[ ]:


 #| label: fig-basemap
-#| fig-cap: Adding a basemap to a static map, using `contextily`
-#| layout-ncol: 3
+#| fig-cap: Adding a basemap to a static map, using **contextily**
+#| layout-ncol: 2
 #| fig-subcap: 
 #|   - "'OpenStreetMap' basemap"
 #|   - "'CartoDB Positron' basemap"
@@ -494,9 +496,12 @@
 cx.add_basemap(ax, source=cx.providers.CartoDB.Positron);


-# Check out the [gallery](https://xyzservices.readthedocs.io/en/stable/gallery.html) for more possible basemaps.
-# Custom basemaps (such as from your own raster tile server) can be also specified using a [URL](https://contextily.readthedocs.io/en/latest/providers_deepdive.html#Manually-specifying-a-provider).
-# Finally, you may read the [Adding a background map to plots](https://geopandas.org/en/stable/gallery/plotting_basemap_background.html) tutorial for more examples.
+# Check out the gallery[^xyzservices_gallery] for more possible basemaps.
+# Custom basemaps (such as from your own raster tile server) can also be specified using a URL.
+# Finally, you may read the *Adding a background map to plots*[^basemaps_tutorial] tutorial for more examples.
+# 
+# [^xyzservices_gallery]: <https://xyzservices.readthedocs.io/en/stable/gallery.html>
+# [^basemaps_tutorial]: <https://geopandas.org/en/stable/gallery/plotting_basemap_background.html>
 # 
 # ### Faceted maps {#sec-faceted-maps}
 # 
@@ -513,8 +518,6 @@
 # We may want to plot them all in a faceted map, that is, four small maps of `nz` with the different variables.
 # To do that, we initialize the plot with the expected number of panels, such as `ncols=len(vars)` if we wish to have one row and four columns, and then go over the variables in a `for` loop, each time plotting `vars[i]` into the `ax[i]` panel (@fig-faceted-map).
-# 
-# 

# In[ ]:

@@ -527,16 +530,16 @@
     ax[i].set_title(vars[i])


-# In case we prefer a specific layout, rather than one row or one column, we can initialize the required number or rows and columns, as in `plt.subplots(nrows,ncols)`, "flatten" `ax`, so that the facets are still accessible using a single index `ax[i]` (rather than the default `ax[i][j]`), and plot into `ax[i]`.
+# In case we prefer a specific layout, rather than one row or one column, we can initialize the required number of rows and columns, as in `plt.subplots(nrows,ncols)`, 'flatten' `ax`, so that the facets are still accessible using a single index `ax[i]` (rather than the default `ax[i][j]`), and plot into `ax[i]`.
 # For example, here is how we can reproduce the last plot, this time in a $2 \times 2$ layout, instead of a $1 \times 4$ layout (@fig-faceted-map2).
-# One more modification we are doing here is hiding the axis ticks and labels, to make the map less "crowded", using `ax[i].xaxis.set_visible(False)` (and same for `.yaxis`).
+# One more modification we are doing here is hiding the axis ticks and labels, to make the map less 'crowded', using `ax[i].xaxis.set_visible(False)` (and same for `.yaxis`).

# In[ ]:


 #| label: fig-faceted-map2
-#| fig-cap: 2D layout in a faceted map, using a `for` loop
-fig, ax = plt.subplots(ncols=2, nrows=int(len(vars)/2), figsize=(6, 6))
+#| fig-cap: Two-dimensional layout in a faceted map, using a `for` loop
+fig, ax = plt.subplots(nrows=int(len(vars)/2), ncols=2, figsize=(6, 6))
 ax = ax.flatten()
 for i in range(len(vars)):
     nz.plot(ax=ax[i], column=vars[i], legend=True)
@@ -545,7 +548,7 @@
     ax[i].yaxis.set_visible(False)
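+# One optional tweak for crowded facet layouts (a suggestion of ours, not used in the book's figures) is **matplotlib**'s `Figure.tight_layout` method, which adjusts the padding between panels:

+# In[ ]:


+#| eval: false
+fig.tight_layout()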
# In[ ]:


#| label: fig-faceted-map2
-#| fig-cap: 2D layout in a faceted map, using a `for` loop
-fig, ax = plt.subplots(ncols=2, nrows=int(len(vars)/2), figsize=(6, 6))
+#| fig-cap: Two-dimensional layout in a faceted map, using a `for` loop
+fig, ax = plt.subplots(nrows=int(len(vars)/2), ncols=2, figsize=(6, 6))
ax = ax.flatten()
for i in range(len(vars)):
    nz.plot(ax=ax[i], column=vars[i], legend=True)
@@ -545,7 +548,7 @@
    ax[i].yaxis.set_visible(False)


-# It is also possible to "manually" specify the properties of each panel, and which row/column it goes in (e.g., @fig-spatial-aggregation-different-functions).
+# It is also possible to 'manually' specify the properties of each panel, and which row/column it goes in (e.g., @fig-spatial-aggregation-different-functions).
# This can be useful when the various panels have different components, or even completely different types of plots (e.g., @fig-zion-transect), making automation with a `for` loop less applicable.
# For example, here is a plot similar to @fig-faceted-map2, but specifying each panel using a separate expression instead of using a `for` loop (@fig-faceted-map3).

@@ -569,10 +572,8 @@
#
# ### Exporting {#sec-exporting-static-maps}
#
-# Static maps can be exported to a file using the [`matplotlib.pyplot.savefig`](https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.savefig.html) function.
-# For example, the following code section recreates fig-two-layers, but this time the last expression saves the image to a JPG image named `plot_geopandas.jpg`.
-#
-#
+# Static maps can be exported to a file using the `matplotlib.pyplot.savefig` function.
+# For example, the following code section recreates @fig-two-layers, but this time the last expression saves the image to a JPG file named `plot_geopandas.jpg`.

# In[ ]:

@@ -609,28 +610,25 @@
plt.savefig('output/plot_rasterio2.svg', dpi=300)


-#
-#
# ## Interactive maps {#sec-interactive-maps}
#
-#
-#
-#
# While static maps can enliven geographic datasets, interactive maps can take them to a new level.
# Interactivity can take many forms, the most common and useful of which is the ability to pan around and zoom into any part of a geographic dataset overlaid on a 'web map' to show context.
# Less advanced interactivity levels include popups which appear when you click on different features, a kind of interactive label.
-# More advanced levels of interactivity include the ability to tilt and rotate maps, and the provision of "dynamically linked" sub-plots which automatically update when the user pans and zooms [@pezanowski_senseplace3_2018].
+# More advanced levels of interactivity include the ability to tilt and rotate maps, and the provision of 'dynamically linked' sub-plots which automatically update when the user pans and zooms [@pezanowski_senseplace3_2018].
#
# The most important type of interactivity, however, is the display of geographic data on interactive or 'slippy' web maps.
-# Significant features of web maps are that (1) they eventually comprise static HTML files, easily shared and accessed by a wide audience, and (2) they can "grab" content (e.g., basemaps) or use services from other locations on the internet, that way providing detailed context without much requiring much effort from the person who created the map.
-# The most popular approaches for web mapping, in Python and elsewhere, are based on the [Leaflet](https://leafletjs.com/) JavaScript library [@dorman2020introduction].
-# The [**folium**](https://python-visualization.github.io/folium/latest/) Python package provides an extensive interface to create customized web maps based on Leaflet; it is recommended for highly-custimized maps.
+# Significant features of web maps are that (1) they eventually comprise static HTML files, easily shared and accessed by a wide audience, and (2) they can 'grab' content (e.g., basemaps) or use services from other locations on the internet, thereby providing detailed context without requiring much effort from the person who created the map.
+# The most popular approaches for web mapping, in Python and elsewhere, are based on the Leaflet JavaScript library [@dorman2020introduction].
+# The **folium** Python package provides an extensive interface to create customized web maps based on Leaflet; it is recommended for highly customized maps.
# However, the **geopandas** wrapper `.explore`, introduced in @sec-vector-layers, can be used in a wide range of scenarios and is often sufficient.
# This is what we cover in this section.
#
# ### Minimal example
#
# An interactive map of a `GeoSeries` or `GeoDataFrame` can be created with `.explore` (@sec-vector-layers).
+#
+# ::: {.content-visible when-format="html"}

# In[ ]:

@@ -640,6 +638,28 @@
nz.explore()


+# :::
+# ::: {.content-visible when-format="pdf"}
+
+# In[ ]:
+
+
+#| eval: false
+nz.explore()
+
+
+# In[ ]:
+
+
+#| echo: false
+#| output: false
+#| error: true
+map_to_png.map_to_png(nz.explore(), 'fig-explore')
+
+
+# ![Minimal example of an interactive vector layer plot with `.explore`](images/fig-explore.png){#fig-explore}
+# :::
+#
# ### Styling {#sec-interactive-styling}
#
# The `.explore` method has a `color` parameter which affects both the fill and outline color.
@@ -655,7 +675,9 @@
# - `fillColor`---Fill color
# - `fillOpacity`---Fill opacity (from `0` to `1`)
#
-# For example, here is how we can set green fill color and 30% opaque black outline of `nz` polygons in `.explore` (@fig-explore-styling-polygons):
+# For example, here is how we can set a green fill color and a 30% opaque black outline for the `nz` polygons in `.explore` (@fig-explore-styling-polygons).
+#
+# ::: {.content-visible when-format="html"}

# In[ ]:

@@ -665,18 +687,42 @@
nz.explore(color='green', style_kwds={'color':'black', 'opacity':0.3})


+# :::
+# ::: {.content-visible when-format="pdf"}
+
+# In[ ]:
+
+
+#| eval: false
+nz.explore(color='green', style_kwds={'color':'black', 'opacity':0.3})
+
+
+# In[ ]:
+
+
+#| echo: false
+#| output: false
+#| error: true
+map_to_png.map_to_png(nz.explore(color='green', style_kwds={'color':'black', 'opacity':0.3}), 'fig-explore-styling-polygons')
+
+
+# ![Styling of polygons in `.explore`](images/fig-explore-styling-polygons.png){#fig-explore-styling-polygons}
+# :::
+#
# The `dict` passed to `marker_kwds` controls the way that points are displayed:
#
-# - `radius`---Curcle radius (in $m$ for `circle`, see below) or in pixels (for `circle_marker`)
+# - `radius`---Circle radius, in $m$ for `circle` (see below), or in pixels for `circle_marker`
# - `fill`---Whether to draw fill (for `circle` or `circle_marker`)
#
-# Additionally, for points, we can set the `marker_type`, to one of:
+# Accordingly, for points, we can set the `marker_type` to one of:
#
# - `'marker'`---A PNG image of a marker
# - `'circle'`---A vector circle with radius specified in $m$
# - `'circle_marker'`---A vector circle with radius specified in pixels (the default)
#
-# For example, the following expression draws `'circe_marker`' points with 20 pixel radius, green fill, and black outline (@fig-explore-styling-points).
+# For example, the following expression draws `'circle_marker'` points with 20-pixel radius, green fill, and black outline (@fig-explore-styling-points).
+#
+# ::: {.content-visible when-format="html"}

# In[ ]:

@@ -690,8 +736,40 @@
)


+# :::
+# ::: {.content-visible when-format="pdf"}
+
+# In[ ]:
+
+
+#| eval: false
+nz_height.explore(
+    color='green',
+    style_kwds={'color':'black', 'opacity':0.5, 'fillOpacity':0.1},
+    marker_kwds={'radius':20}
+)
+
+
+# In[ ]:
+
+
+#| echo: false
+#| output: false
+#| error: true
+map_to_png.map_to_png(nz_height.explore(
+    color='green',
+    style_kwds={'color':'black', 'opacity':0.5, 'fillOpacity':0.1},
+    marker_kwds={'radius':20}
+), 'fig-explore-styling-points')
+
+
+# ![Styling of points in `.explore` (using `circle_marker`)](images/fig-explore-styling-points.png){#fig-explore-styling-points}
+# :::
+#
# @fig-explore-styling-points2 demonstrates the `'marker_type'` option.
-# Note that the above-mentioned styling properties (other then `opacity`) are not applicable when using `marker_type='marker'`, because the markers are fixed PNG images.
+# Note that the above-mentioned styling properties (other than `opacity`) are not applicable when using `marker_type='marker'`, because the markers are fixed PNG images.
+#
+# ::: {.content-visible when-format="html"}

# In[ ]:

@@ -701,12 +779,33 @@
nz_height.explore(marker_type='marker')


-#
-#
+# :::
+# ::: {.content-visible when-format="pdf"}
+
+# In[ ]:
+
+
+#| eval: false
+nz_height.explore(marker_type='marker')
+
+
+# In[ ]:
+
+
+#| echo: false
+#| output: false
+#| error: true
+map_to_png.map_to_png(nz_height.explore(marker_type='marker'), 'fig-explore-styling-points2')
+
+
+# ![Styling of points in `.explore` (using `marker`)](images/fig-explore-styling-points2.png){#fig-explore-styling-points2}
+# :::
#
# ### Layers
#
# To display multiple layers, one on top of another, with `.explore`, we use the `m` argument, which stands for the previous map (@fig-explore-layers).
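Schematically, the chaining pattern is as follows, with placeholder layer names (in the spirit of the `zorder` sketch above); the book's concrete example appears next.

```python
# Schematic only: layer1..layer3 stand for arbitrary GeoDataFrames
m = layer1.explore()   # create a folium map from the first layer
layer2.explore(m=m)    # draw the second layer onto the same map
layer3.explore(m=m)    # further layers are added the same way; 'm' holds the result
```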
+#
+# ::: {.content-visible when-format="html"}

# In[ ]:

@@ -717,10 +816,36 @@
nz_height.explore(m=m, color='red')


-# One of the advantages of interactive maps is the ability to turn layers "on" and "off".
-# This capability is implemented in [`folium.LayerControl`](https://python-visualization.github.io/folium/latest/user_guide/ui_elements/layer_control.html#LayerControl) from package **folium**, which the **geopandas** `.explore` method is a wrapper of.
+# :::
+# ::: {.content-visible when-format="pdf"}
+
+# In[ ]:
+
+
+#| eval: false
+m = nz.explore()
+nz_height.explore(m=m, color='red')
+
+
+# In[ ]:
+
+
+#| echo: false
+#| output: false
+#| error: true
+m = nz.explore()
+map_to_png.map_to_png(nz_height.explore(m=m, color='red'), 'fig-explore-layers')
+
+
+# ![Displaying multiple layers in an interactive map with `.explore`](images/fig-explore-layers.png){#fig-explore-layers}
+# :::
+#
+# One of the advantages of interactive maps is the ability to turn layers 'on' and 'off'.
+# This capability is implemented in `folium.LayerControl` from package **folium**, of which the **geopandas** `.explore` method is a wrapper.
# For example, this is how we can add a layer control for the `nz` and `nz_height` layers (@fig-explore-layers-controls).
-# Note the `name` properties, used to specify layer names in the control, and the `collapsed` property, used to specify whether the control is fully visible at all times (`False`), or on mouse hover (`True`, the default).
+# Note the `name` properties, used to specify layer names in the control, and the `collapsed` property, used to specify whether the control is fully visible at all times (`False`), or only on mouse hover (`True`, the default).
+#
+# ::: {.content-visible when-format="html"}

# In[ ]:

@@ -733,10 +858,40 @@
m


+# :::
+# ::: {.content-visible when-format="pdf"}
+
+# In[ ]:
+
+
+#| eval: false
+m = nz.explore(name='Polygons (adm. areas)')
+nz_height.explore(m=m, color='red', name='Points (elevation)')
+folium.LayerControl(collapsed=False).add_to(m)
+m
+
+
+# In[ ]:
+
+
+#| echo: false
+#| output: false
+#| error: true
+m = nz.explore(name='Polygons (adm. areas)')
+nz_height.explore(m=m, color='red', name='Points (elevation)')
+folium.LayerControl(collapsed=False).add_to(m)
+map_to_png.map_to_png(m, 'fig-explore-layers-controls')
+
+
+# ![Displaying multiple layers, with a layer control, in an interactive map with `.explore`](images/fig-explore-layers-controls.png){#fig-explore-layers-controls}
+# :::
+#
# ### Symbology {#sec-explore-symbology}
#
# Symbology can be specified in `.explore` using similar arguments as in `.plot` (@sec-plot-symbology).
# For example, @fig-explore-symbology is an interactive version of @fig-plot-symbology-colors (a).
+#
+# ::: {.content-visible when-format="html"}

# In[ ]:

@@ -746,18 +901,74 @@
nz.explore(column='Median_income', legend=True, cmap='Reds')


+# :::
+# ::: {.content-visible when-format="pdf"}
+
+# In[ ]:
+
+
+#| eval: false
+nz.explore(column='Median_income', legend=True, cmap='Reds')
+
+
+# In[ ]:
+
+
+#| echo: false
+#| output: false
+#| error: true
+map_to_png.map_to_png(nz.explore(column='Median_income', legend=True, cmap='Reds'), 'fig-explore-symbology')
+
+
+# ![Symbology in an interactive map of a vector layer, created with `.explore`](images/fig-explore-symbology.png){#fig-explore-symbology}
+# :::
+#
# Fixed styling (@sec-interactive-styling) can be combined with symbology settings.
# For example, polygon outline colors in @fig-explore-symbology are styled according to `'Median_income'`; however, this layer has overlapping outlines, and their color is arbitrarily set according to the order of features (top-most features), which may be misleading and confusing.
# To specify fixed outline colors (e.g., black), we can use the `color` and `weight` properties of `style_kwds` (@fig-explore-symbology2):
+#
+# ::: {.content-visible when-format="html"}

# In[ ]:


#| label: fig-explore-symbology2
#| fig-cap: 'Symbology combined with fixed styling in `.explore`'
-nz.explore(column='Median_income', legend=True, cmap='Reds', style_kwds={'color':'black', 'weight': 0.5})
+nz.explore(
+    column='Median_income',
+    legend=True,
+    cmap='Reds',
+    style_kwds={'color':'black', 'weight': 0.5}
+)
+
+
+# :::
+# ::: {.content-visible when-format="pdf"}
+
+# In[ ]:
+
+
+#| eval: false
+nz.explore(
+    column='Median_income',
+    legend=True,
+    cmap='Reds',
+    style_kwds={'color':'black', 'weight': 0.5}
+)
+
+
+# In[ ]:
+
+#| echo: false
+#| output: false
+#| error: true
+map_to_png.map_to_png(nz.explore(column='Median_income', legend=True, cmap='Reds', style_kwds={'color':'black', 'weight': 0.5}), 'fig-explore-symbology2')
+
+
+# ![Symbology combined with fixed styling in `.explore`](images/fig-explore-symbology2.png){#fig-explore-symbology2}
+# :::
+#
# ### Basemaps
#
# The basemap in `.explore` can be specified using the `tiles` argument.
@@ -767,8 +978,10 @@
# - `'CartoDB positron'`
# - `'CartoDB dark_matter'`
#
-# Other basemaps are available through the **xyzservices** package, which needs to be installed (see `xyzservices.providers` for a list), or using a custom tile server URL.
+# Other basemaps are available through the **xyzservices** package (see `xyzservices.providers` for a list), or using a custom tile server URL.
# For example, the following expression displays the `'CartoDB positron'` tiles in an `.explore` map (@fig-explore-basemaps).
+#
+# ::: {.content-visible when-format="html"}

# In[ ]:

@@ -778,11 +991,35 @@
nz.explore(tiles='CartoDB positron')


+# :::
+# ::: {.content-visible when-format="pdf"}
+
+# In[ ]:
+
+
+#| eval: false
+nz.explore(tiles='CartoDB positron')
+
+
+# In[ ]:
+
+
+#| echo: false
+#| output: false
+#| error: true
+map_to_png.map_to_png(nz.explore(tiles='CartoDB positron'), 'fig-explore-basemaps')
+
+
+# ![Specifying the basemap in `.explore`](images/fig-explore-basemaps.png){#fig-explore-basemaps}
+# :::
+#
# ### Exporting
#
# An interactive map can be exported to an HTML file using the `.save` method of the `map` object.
-# The HTML file can then be shared with other people, or published on a server and shared through a URL.
-# A good free option for publishing a web map is through [GitHub Pages](https://pages.github.com/).
+# The HTML file can then be shared with other people, or published on a server and shared through a URL[^leaflet_size].
+# A good free option for publishing a web map is through GitHub Pages.
+#
+# [^leaflet_size]: The GeoJSON representation of the data is embedded in the HTML file, which means that the file size can get large, and the web map may become unusable due to browser performance limitations.
#
# For example, here is how we can export the map shown in @fig-explore-layers-controls to a file named `map.html`.
@@ -796,10 +1033,8 @@ m.save('output/map.html') -# +# # # # -# ## Exercises -# -# ## References +# diff --git a/book_options.py b/code/chapters/book_options.py similarity index 100% rename from book_options.py rename to code/chapters/book_options.py diff --git a/book_options_pdf.py b/code/chapters/book_options_pdf.py similarity index 100% rename from book_options_pdf.py rename to code/chapters/book_options_pdf.py diff --git a/map_to_png.py b/code/chapters/map_to_png.py similarity index 100% rename from map_to_png.py rename to code/chapters/map_to_png.py diff --git a/code/chapters/references.py b/code/chapters/references.py new file mode 100644 index 00000000..8469162c --- /dev/null +++ b/code/chapters/references.py @@ -0,0 +1,7 @@ +#!/usr/bin/env python +# coding: utf-8 + +# # References {.unnumbered} +# +# ::: {#refs} +# ::: diff --git a/ipynb/01-spatial-data.ipynb b/ipynb/01-spatial-data.ipynb index 3e87c2b7..c277be77 100644 --- a/ipynb/01-spatial-data.ipynb +++ b/ipynb/01-spatial-data.ipynb @@ -4,39 +4,80 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Geographic data in Python {#sec-spatial-class}\n", + "---\n", + "jupyter: python3\n", + "---\n", "\n", + "# Geographic data in Python {#sec-spatial-class}" + ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "#| echo: false\n", + "#| include: false\n", + "#| error: true\n", + "import map_to_png" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "#| echo: false\n", + "import book_options" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "#| echo: false\n", + "import book_options_pdf" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ "## Introduction\n", "\n", - "This chapter outlines two fundamental geographic data models --- vector and raster --- and introduces the main Python packages for working with them.\n", + "This chapter outlines two fundamental geographic data models (vector and raster) and introduces Python packages for working with them.\n", "Before demonstrating their implementation in Python, we will introduce the theory behind each data model and the disciplines in which they predominate.\n", "\n", - "The vector data model (@sec-vector-data) represents the world using points, lines, and polygons.\n", + "The vector data model (@sec-vector-data) represents geographic entities with points, lines, and polygons.\n", "These have discrete, well-defined borders, meaning that vector datasets usually have a high level of precision (but not necessarily accuracy).\n", "The raster data model (@sec-raster-data), on the other hand, divides the surface up into cells of constant size.\n", - "Raster datasets are the basis of background images used in web-mapping and have been a vital source of geographic data since the origins of aerial photography and satellite-based remote sensing devices.\n", + "Raster datasets are the basis of background images used in online maps and have been a vital source of geographic data since the origins of aerial photography and satellite-based remote sensing devices.\n", "Rasters aggregate spatially specific features to a given resolution, meaning that they are consistent over space and scalable, with many worldwide raster datasets available.\n", "\n", "Which to use?\n", "The answer likely depends on your domain of application, and the datasets you have access to:\n", "\n", - "- Vector 
datasets and methods dominate the social sciences because human settlements and processes (e.g., transport infrastructure) tend to have discrete borders \n",
+    "- Vector datasets and methods dominate the social sciences because human settlements and processes (e.g., transport infrastructure) tend to have discrete borders\n",
    "- Raster datasets and methods dominate many environmental sciences because of the reliance on remote sensing data\n",
    "\n",
    "Python has strong support for both data models.\n",
    "We will focus on **shapely** and **geopandas** for working with geographic vector data, and **rasterio** for working with rasters.\n",
    "\n",
-    "**shapely** is a \"low-level\" package for working with individual vector geometry objects.\n",
-    "**geopandas** is a \"high-level\" package for working with geometry columns (`GeoSeries` objects), which internally contain **shapely** geometries, and vector layers (`GeoDataFrame` objects).\n",
+    "**shapely** is a 'low-level' package for working with individual vector geometry objects.\n",
+    "**geopandas** is a 'high-level' package for working with geometry columns (`GeoSeries` objects), which internally contain **shapely** geometries, and with vector layers (`GeoDataFrame` objects).\n",
    "The **geopandas** ecosystem provides a comprehensive approach for working with vector layers in Python, with many packages building on it.\n",
    "\n",
    "There are several partially overlapping packages for working with raster data, each with its own advantages and disadvantages.\n",
-    "In this book, we focus on the most prominent one: **rasterio**, which represents \"simple\" raster datasets with a combination of a **numpy** array, and a metadata object (`dict`) providing geographic metadata such as the coordinate system.\n",
+    "In this book, we focus on the most prominent one: **rasterio**, which represents 'simple' raster datasets with a combination of a **numpy** array, and a metadata object (`dict`) providing geographic metadata such as the coordinate system.\n",
    "**xarray** is a notable alternative to **rasterio** not covered in this book which uses native `xarray.Dataset` and `xarray.DataArray` classes to effectively represent complex raster datasets such as NetCDF files with multiple bands and metadata.\n",
    "\n",
-    "There is much overlap in some fields and raster and vector datasets can be used together: ecologists and demographers, for example, commonly use both vector and raster data.\n",
+    "There is much overlap in some fields, and raster and vector datasets can be used together: ecologists and demographers, for example, commonly use both vector and raster data.\n",
    "Furthermore, it is possible to convert between the two forms (see @sec-raster-vector).\n",
-    "Whether your work involves more use of vector or raster datasets, it is worth understanding the underlying data models before using them, as discussed in subsequent chapters.\n",
+    "Whether your work involves use of vector or raster datasets, it is worth understanding the underlying data models before using them, as discussed in subsequent chapters.\n",
    "\n",
    "## Vector data {#sec-vector-data}\n",
    "\n",
@@ -47,47 +88,42 @@
    "In this system, London, for example, can be represented by the coordinates `(-0.1,51.5)`.\n",
    "This means that its location is -0.1 degrees east and 51.5 degrees north of the origin.\n",
    "The origin, in this case, is at 0 degrees longitude (a prime meridian located at Greenwich) and 0 degrees latitude (the Equator) in a geographic ('lon/lat') CRS (@fig-vector-london, left panel).\n",
-    "The same point could also be approximated in a projected CRS with 'Easting/Northing' values of `(530000, 180000)` in the British National Grid, meaning that London is located 530 $km$ East and 180 $km$ North of the origin of the CRS (@fig-vector-london, right panel).\n",
+    "The same point could also be approximated in a projected CRS with 'Easting/Northing' values of `(530000,180000)` in the British National Grid, meaning that London is located 530 $km$ East and 180 $km$ North of the origin of the CRS (@fig-vector-london, right panel).\n",
    "The location of National Grid's origin, in the sea beyond South West Peninsula, ensures that most locations in the UK have positive Easting and Northing values.\n",
    "\n",
-    "::: {#fig-vector-london}\n",
+    "::: {#fig-vector-london layout-ncol=2}\n",
    "\n",
-    "::: {.columns}\n",
-    ":::: {.column width=\"50%\"}\n",
    "![](images/vector_lonlat.png)\n",
-    "::::\n",
-    ":::: {.column width=\"50%\"}\n",
+    "\n",
    "![](images/vector_projected.png)\n",
-    "::::\n",
-    ":::\n",
    "\n",
    "Illustration of vector (point) data in which the location of London (the red X) is represented with reference to an origin (the blue circle). \n",
    "The left plot represents a geographic CRS with an origin at 0° longitude and latitude. \n",
    "The right plot represents a projected CRS with an origin located in the sea west of the South West Peninsula.\n",
    ":::\n",
    "\n",
-    "There is more to CRSs, as described in @sec-coordinate-reference-systems-intro and @sec-reproj-geo-data but, for the purposes of this section, it is sufficient to know that coordinates consist of two numbers representing the distance from an origin, usually in $x$ then $y$ dimensions.\n",
+    "There is more to CRSs, as described in @sec-coordinate-reference-systems-intro and @sec-reproj-geo-data, but for the purposes of this section, it is sufficient to know that coordinates consist of two numbers representing the distance from an origin, usually in $x$ and $y$ dimensions.\n",
    "\n",
    "**geopandas** [@geopandas] provides classes for geographic vector data and a consistent command-line interface for reproducible geographic data analysis in Python.\n",
-    "It also provides an interface to three mature libraries for geocomputation which, in combination, represent a strong foundation on which many geographic applications (including QGIS and R's spatial ecosystem):\n",
+    "It also provides an interface to three mature libraries for geocomputation, a strong foundation on which many geographic applications are built:\n",
    "\n",
    "- GDAL, for reading, writing, and manipulating a wide range of geographic data formats, covered in @sec-read-write\n",
    "- PROJ, a powerful library for coordinate system transformations, which underlies the content covered in @sec-reproj-geo-data\n",
    "- GEOS, a planar geometry engine for operations such as calculating buffers and centroids on data with a projected CRS, covered in @sec-geometric-operations\n",
    "\n",
-    "Tight integration with these geographic libraries makes reproducible geocomputation possible: an advantage of using a higher level language such as Python to access these libraries is that you do not need to know the intricacies of the low level components, enabling focus on the methods rather than the implementation.\n",
+    "Tight integration with these geographic libraries makes reproducible geocomputation possible: an advantage of using a higher-level language such as Python to access these libraries is that you do not need to know the intricacies of the low-level components, enabling focus on the methods rather than the implementation.\n",
    "\n",
    "### Vector data classes\n",
    "\n",
-    "The main classes for working with geographic vector data in Python are hierarchical, meaning the highest level 'vector layer' class is composed of simpler 'geometry column' and individual 'geometry' components.\n",
+    "The main classes for working with geographic vector data in Python are hierarchical, meaning that the 'vector layer' class is composed of simpler 'geometry column' and individual 'geometry' components.\n",
    "This section introduces them in order, starting with the highest level class.\n",
-    "For many applications, the high level vector layer class, which are essentially a data frame with geometry columns, are all that's needed.\n",
-    "However, it's important to understand the structure of vector geographic objects and their component pieces for more advanced applications.\n",
+    "For many applications, the vector layer class, a data frame with geometry columns, is all that's needed.\n",
+    "However, it's important to understand the structure of vector geographic objects and their components, both for more advanced applications and for a deeper understanding.\n",
    "The three main vector geographic data classes in Python are:\n",
    "\n",
    "- `GeoDataFrame`, a class representing vector layers, with a geometry column (class `GeoSeries`) as one of the columns\n",
    "- `GeoSeries`, a class that is used to represent the geometry column in `GeoDataFrame` objects\n",
-    "- `shapely` geometry objects which represent individual geometries, such as a point or a polygon\n",
+    "- `shapely` geometry objects, which represent individual geometries, such as a point or a polygon, in `GeoSeries` objects\n",
    "\n",
    "The first two classes (`GeoDataFrame` and `GeoSeries`) are defined in **geopandas**.\n",
    "The third class is defined in the **shapely** package, which deals with individual geometries, and is a main dependency of the **geopandas** package.\n",
@@ -114,7 +150,7 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "We also limit the maximum number of printed rows to four, to save space, using the `'display.max_rows'` option of **pandas**."
+    "We also limit the maximum number of printed rows to six, to save space, using the `'display.max_rows'` option of **pandas**."
   ]
  },
  {
@@ -131,7 +167,7 @@
   "metadata": {},
   "source": [
    "Projects often start by importing an existing vector layer saved as a GeoPackage (`.gpkg`) file, an ESRI Shapefile (`.shp`), or other geographic file format.\n",
-    "The function `read_file()` imports a GeoPackage file named `world.gpkg` located in the `data` directory of Python's working directory into a `GeoDataFrame` named `gdf`."
+    "The function `gpd.read_file` imports a GeoPackage file named `world.gpkg` located in the `data` directory of Python's working directory into a `GeoDataFrame` named `gdf`."
   ]
  },
  {
@@ -272,7 +308,9 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "Interactive maps of `GeoDataFrame` objects can be created with the `.explore` method, as illustrated in @fig-gdf-explore which was created with the following command:"
+    "Interactive maps of `GeoDataFrame` objects can be created with the `.explore` method, as illustrated in @fig-gdf-explore, which was created with the following command:\n",
+    "\n",
+    "::: {.content-visible when-format=\"html\"}"
   ]
  },
  {
@@ -290,7 +328,42 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "A subset of the data can be also plotted in a similar fashion."
+ ":::\n", + "::: {.content-visible when-format=\"pdf\"}" + ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "#| eval: false\n", + "gdf.explore()" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "#| echo: false\n", + "#| output: false\n", + "#| error: true\n", + "map_to_png.map_to_png(gdf.explore(), 'fig-gdf-explore')" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![Basic interactive map with `.explore`](images/fig-gdf-explore.png){#fig-gdf-explore}\n", + ":::\n", + "\n", + "A subset of the data can be also plotted in a similar fashion.\n", + "\n", + "::: {.content-visible when-format=\"html\"}" ] }, { @@ -304,16 +377,32 @@ "execution_count": null, "outputs": [] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + ":::\n", + "::: {.content-visible when-format=\"pdf\"}" + ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "#| eval: false\n", + "gdf[gdf['name_long'] == 'Egypt'].explore()" + ], + "execution_count": null, + "outputs": [] + }, { "cell_type": "code", "metadata": {}, "source": [ "#| echo: false\n", - "# (Alternative)\n", - "# import hvplot.pandas\n", - "# gdf.hvplot(title='Hello world', geo=True, hover_cols=['name_long'], legend=False).opts(bgcolor='lightgray', active_tools=['wheel_zoom']) \n", - "#This way, we can also add background tiles:\n", - "# gdf.hvplot(tiles='OSM', alpha=0.5, geo=True, title='Hello world', hover_cols=['name_long'], legend=False).opts(active_tools=['wheel_zoom']) " + "#| output: false\n", + "#| error: true\n", + "map_to_png.map_to_png(gdf[gdf['name_long'] == 'Egypt'].explore(), 'fig-gdf-explore2')" ], "execution_count": null, "outputs": [] @@ -322,6 +411,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ + "![Interactive map of a `GeoDataFrame` subset](images/fig-gdf-explore2.png){#fig-gdf-explore2}\n", + ":::\n", + "\n", "### Geometry columns {#sec-geometry-columns}\n", "\n", "The geometry column of class `GeoSeries` is an essential column in a `GeoDataFrame`.\n", @@ -360,9 +452,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Many geometry operations, such as calculating the centroid, buffer, or bounding box of each feature involve just the geometry.\n", + "Many geometry operations, such as calculating the centroid, buffer, or bounding box of each feature, involve just the geometry.\n", "Applying this type of operation on a `GeoDataFrame` is therefore basically a shortcut to applying it on the `GeoSeries` object in the geometry column.\n", - "For example, the two following commands return exactly the same result, a `GeoSeries` with country bounding box polygons (using the [`.envelope`](https://geopandas.org/en/stable/docs/reference/api/geopandas.GeoSeries.envelope.html) method)." + "For example, the two following commands return exactly the same result, a `GeoSeries` containing bounding box polygons (using the `.envelope` method)." 
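A sketch of the two equivalent calls, assuming `gdf` as loaded above:

```python
gdf.envelope           # on the GeoDataFrame (delegates to the active geometry column)
gdf.geometry.envelope  # on the geometry column directly: the same GeoSeries
```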
] }, { @@ -387,8 +479,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Note that `.envelope`, and other similar operators in **geopandas** such as `.centroid` (@sec-centroids), `.buffer` (@sec-buffers) or `.convex_hull`, return only the geometry (i.e., a `GeoSeries`), not a `GeoDataFrame` with the original attribute data.\n", - "In case we want the latter, we can create a copy of the `GeoDataFrame` and then \"overwrite\" its geometry (or, we can overwrite the geometries directly in case we do not need the original ones, as in `gdf.geometry=gdf.envelope`)." + "Note that `.envelope`, and other similar operators in **geopandas** such as `.centroid` (@sec-centroids), `.buffer` (@sec-buffers), or `.convex_hull`, return only the geometry (i.e., a `GeoSeries`), not a `GeoDataFrame` with the original attribute data.\n", + "In case we want the latter, we can create a copy of the `GeoDataFrame` and then 'overwrite' its geometry (or, we can overwrite the geometries directly in case we do not need the original ones, as in `gdf.geometry=gdf.envelope`)." ] }, { @@ -407,8 +499,9 @@ "metadata": {}, "source": [ "Another useful property of the geometry column is the geometry type, as shown in the following code.\n", - "Note that the types of geometries contained in a geometry column (and, thus, a vector layer) are not necessarily the same for every row.\n", - "Accordingly, the `.type` property returns a `Series` (of type `string`), rather than a single value (the same can be done with the shortcut `gdf.geom_type`)." + "Note that the types of geometries contained in a geometry column (and, thus, a vector layer) are not necessarily the same for every row. \n", + "It is possible to have multiple geometry types in a single `GeoSeries`.\n", + "Accordingly, the `.type` property returns a `Series` (with values of type `str`, i.e., strings), rather than a single value (the same can be done with the shortcut `gdf.geom_type`)." ] }, { @@ -424,7 +517,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "To summarize the occurrence of different geometry types in a geometry column, we can use the **pandas** method called `value_counts`." + "To summarize the occurrence of different geometry types in a geometry column, we can use the **pandas** `.value_counts` method.\n", + "In this case, we see that the `gdf` layer contains only `'MultiPolygon'` geometries." ] }, { @@ -440,10 +534,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "It is possible to have multiple geometry types in a single `GeoSeries`.\n", - "However, in this case, we see that the `gdf` layer contains only `'MultiPolygon'` geometries.\n", - "\n", - "A `GeoDataFrame` can also have multiple `GeoSeries`." + "A `GeoDataFrame` can also have multiple `GeoSeries` columns, as demonstrated in the following code section." ] }, { @@ -462,9 +553,11 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Only one geometry column at a time is \"active\", in the sense that it is being accessed in operations involving the geometries (such as `.centroid`, `.crs`, etc.).\n", - "To switch the active geometry column from one `GeoSeries` column to another, we use `set_geometry`.\n", - "@fig-switch-to-centroids and @fig-switch-to-polygons shows interactive maps of the `gdf` layer with the `'bbox'` and `'polygon'` geometry columns activated, respectively." 
+    "Only one geometry column at a time is 'active', in the sense that it is being accessed in operations involving the geometries (such as `.centroid`, `.crs`, etc.).\n",
+    "To switch the active geometry column from one `GeoSeries` column to another, we use `.set_geometry`.\n",
+    "@fig-switch-to-centroids and @fig-switch-to-polygons show interactive maps of the `gdf` layer with the `'bbox'` and `'polygon'` geometry columns activated, respectively.\n",
+    "\n",
+    "::: {.content-visible when-format=\"html\"}"
   ]
  },
  {
@@ -479,6 +572,48 @@
    "execution_count": null,
    "outputs": []
  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    ":::\n",
+    "::: {.content-visible when-format=\"pdf\"}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "metadata": {},
+   "source": [
+    "#| eval: false\n",
+    "gdf = gdf.set_geometry('bbox')\n",
+    "gdf.explore()"
+   ],
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "code",
+   "metadata": {},
+   "source": [
+    "#| echo: false\n",
+    "#| output: false\n",
+    "#| error: true\n",
+    "gdf = gdf.set_geometry('bbox')\n",
+    "map_to_png.map_to_png(gdf.explore(), 'fig-switch-to-centroids')"
+   ],
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "![Switching to the `'bbox'` geometry column in the `world` layer, and plotting it](images/fig-switch-to-centroids.png){#fig-switch-to-centroids}\n",
+    ":::\n",
+    "\n",
+    "::: {.content-visible when-format=\"html\"}"
+   ]
+  },
  {
   "cell_type": "code",
   "metadata": {},
@@ -495,21 +630,56 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
+    ":::\n",
+    "::: {.content-visible when-format=\"pdf\"}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "metadata": {},
+   "source": [
+    "#| eval: false\n",
+    "gdf = gdf.set_geometry('polygon')\n",
+    "gdf.explore()"
+   ],
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "code",
+   "metadata": {},
+   "source": [
+    "#| echo: false\n",
+    "#| output: false\n",
+    "#| error: true\n",
+    "gdf = gdf.set_geometry('polygon')\n",
+    "map_to_png.map_to_png(gdf.explore(), 'fig-switch-to-polygons')"
+   ],
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "![Switching to the `'polygon'` geometry column in the `world` layer, and plotting it](images/fig-switch-to-polygons.png){#fig-switch-to-polygons}\n",
+    ":::\n",
+    "\n",
    "### The Simple Features standard {#sec-simple-features}\n",
    "\n",
    "Geometries are the basic building blocks of vector layers.\n",
    "Although the Simple Features standard defines about 20 types of geometries, we will focus on the seven most commonly used types: `POINT`, `LINESTRING`, `POLYGON`, `MULTIPOINT`, `MULTILINESTRING`, `MULTIPOLYGON` and `GEOMETRYCOLLECTION`.\n",
-    "A useful list of possible geometry types can be found in R's **sf** package [documentation](https://r-spatial.github.io/sf/articles/sf1.html#simple-feature-geometry-types).\n",
+    "A useful list of possible geometry types can be found in R's **sf** package documentation[^sf_docs].\n",
+    "\n",
+    "[^sf_docs]: [https://r-spatial.github.io/sf/articles/sf1.html#simple-feature-geometry-types](https://r-spatial.github.io/sf/articles/sf1.html#simple-feature-geometry-types)\n",
    "\n",
    "Simple feature geometries can be represented by well-known binary (WKB) and well-known text (WKT) encodings.\n",
    " WKB representations are usually hexadecimal strings easily readable for computers, and this is why GIS software and spatial databases use WKB to transfer and store geometry objects.\n",
    "WKT, on the other hand, is a human-readable text markup description of Simple Features.\n",
-    "\n",
-    "\n",
    "Both formats are exchangeable, and if we present one, we will naturally choose the WKT representation.\n",
    "\n",
    "The foundation of each geometry type is the point.\n",
-    "A point is simply a coordinate in 2D, 3D, or 4D space such as shown in @fig-point and @fig-point2.\n",
+    "A point is simply a coordinate in two-dimensional, three-dimensional, or four-dimensional space, such as shown in @fig-point.\n",
    "\n",
    "``` text\n",
    "POINT (5 2)\n",
@@ -529,7 +699,7 @@
    "```\n",
    "\n",
    "So far we have created geometries with only one geometric entity per feature.\n",
-    "However, the Simple Features standard allows multiple geometries to exist within a single feature, using \"multi\" versions of each geometry type, as illustrated in @fig-multipoint, @fig-multilinestring, and @fig-multipolygon1.\n",
+    "However, the Simple Features standard allows multiple geometries to exist within a single feature, using 'multi' versions of each geometry type, as illustrated in @fig-multipoint, @fig-multilinestring, and @fig-multipolygon1.\n",
    "\n",
    "``` text\n",
    "MULTIPOINT (5 2, 1 3, 3 4, 3 2)\n",
    "MULTILINESTRING ((1 5, 4 4, 4 1, 2 2, 3 2), (1 2, 2 4))\n",
    "MULTIPOLYGON (((1 5, 2 2, 4 1, 4 4, 1 5), (0 2, 1 2, 1 3, 0 3, 0 2)))\n",
    "```\n",
    "\n",
-    "Finally, a geometry collection can contain any combination of geometries including (multi)points and linestrings (@fig-geometrycollection).\n",
+    "Finally, a geometry collection can contain any combination of geometries of the other six types, such as the combination of a multipoint and linestring shown below (@fig-geometrycollection).\n",
    "\n",
    "``` text\n",
    "GEOMETRYCOLLECTION (MULTIPOINT (5 2, 1 3, 3 4, 3 2),\n",
    "                    MULTILINESTRING ((1 5, 4 4, 4 1, 2 2, 3 2), (1 2, 2 4)))\n",
    "```\n",
    "\n",
    "### Geometries {#sec-geometries}\n",
    "\n",
-    "Each element in the geometry column is a geometry object, of class `shapely` [@shapely].\n",
-    "For example, here is one specific geometry selected by implicit index (Canada, i.e., the 4^th^ element in `gdf`'s geometry column')."
+    "Each element in the geometry column (`GeoSeries`) is a geometry object of class `shapely` [@shapely].\n",
+    "For example, here is one specific geometry selected by implicit index (Canada, the 4^th^ element in `gdf`'s geometry column)."
   ]
  },
  {
@@ -585,7 +755,7 @@
    "In the first example (a `'Point'`) we show two types of inputs to create a geometry: a list of coordinates or a `string` in the WKT format.\n",
    "In the examples for the remaining geometries we use the former approach.\n",
    "\n",
-    "Creating a `'Point'` geometry from a list of coordinates uses the `shapely.Point` function (@fig-point)."
+    "Creating a `'Point'` geometry from a list of coordinates uses the `shapely.Point` function in the following expression (@fig-point)."
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "source": [
    "#| label: fig-point\n",
-    "#| fig-cap: A `Point` geometry (created from a `list`)\n",
+    "#| fig-cap: A `Point` geometry (created either from a `list` or WKT)\n",
    "point = shapely.Point([5, 2])\n",
    "point"
   ],
@@ -604,16 +774,15 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "Alternatively, we can use the `shapely.from_wkt` to transform a WKT string to a `shapely` geometry object.\n",
-    "Here is an example of creating the same `'Point'` geometry from WKT (@fig-point2)."
+    "Alternatively, we can use `shapely.from_wkt` to transform a WKT string to a `shapely` geometry object.\n",
+    "Here is an example of creating the same `'Point'` geometry from WKT (@fig-point)."
] }, { "cell_type": "code", "metadata": {}, "source": [ - "#| label: fig-point2\n", - "#| fig-cap: A `Point` geometry (created from a WKT string)\n", + "#| output: false\n", "point = shapely.from_wkt('POINT (5 2)')\n", "point" ], @@ -655,7 +824,7 @@ "#| fig-cap: A `Polygon` geometry\n", "polygon = shapely.Polygon(\n", " [(1,5), (2,2), (4,1), (4,4), (1,5)], ## Exterior\n", - " [[(2,4), (3,4), (3,3), (2,3), (2,4)]] ## Holes\n", + " [[(2,4), (3,4), (3,3), (2,3), (2,4)]] ## Hole(s)\n", ")\n", "polygon" ], @@ -718,7 +887,7 @@ "#| fig-cap: A `MultiPolygon` geometry\n", "multipolygon = shapely.MultiPolygon([\n", " [[(1,5), (2,2), (4,1), (4,4), (1,5)], []], ## Polygon 1 \n", - " [[(0,2), (1,2), (1,3), (0,3), (0,2)], []] ## Polygon 2, etc.\n", + " [[(0,2), (1,2), (1,3), (0,3), (0,2)], []] ## Polygon 2, etc.\n", "])\n", "multipolygon" ], @@ -729,18 +898,17 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Since the required input has four hierarchical levels, it may be more clear to create the single-part `'Polygon'` geometries in advance, using the respective function (`shapely.Polygon`), and then pass them to `shapely.MultiPolygon` (@fig-multipolygon2). (The same technique can be used with the other `shapely.Multi*` functions.)" + "Since the required input has four hierarchical levels, it may be more clear to create the single-part `'Polygon'` geometries in advance, using the respective function (`shapely.Polygon`), and then pass them to `shapely.MultiPolygon` (@fig-multipolygon1). (The same technique can be used with the other `shapely.Multi*` functions.)" ] }, { "cell_type": "code", "metadata": {}, "source": [ - "#| label: fig-multipolygon2\n", - "#| fig-cap: A `MultiPolygon` geometry\n", + "#| output: false\n", "multipolygon = shapely.MultiPolygon([\n", " shapely.Polygon([(1,5), (2,2), (4,1), (4,4), (1,5)]), ## Polygon 1 \n", - " shapely.Polygon([(0,2), (1,2), (1,3), (0,3), (0,2)]) ## Polygon 2, etc.\n", + " shapely.Polygon([(0,2), (1,2), (1,3), (0,3), (0,2)]) ## Polygon 2, etc.\n", "])\n", "multipolygon" ], @@ -770,8 +938,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "`shapely` geometries act as atomic units of vector data, meaning that there is no concept of geometry *sets*: each operation accepts individual geometry object(s) as input, and retunrs an individual geometry as output. (The `GeoSeries` and `GeoDataFrame` objects, defined in **geopandas**, are used to deal with sets of `shapely` geometries, collectively)\n", - "For example, the following expression calculates the difference between the buffered `multipolygon` (using distance of `0.2`) and itself (@fig-mpol-buffer-difference):" + "`shapely` geometries act as atomic units of vector data, meaning that there is no concept of geometry *sets*: each operation accepts individual geometry object(s) as input, and returns an individual geometry as output. 
\n", + "(The `GeoSeries` and `GeoDataFrame` objects, defined in **geopandas**, are used to deal with sets of `shapely` geometries, collectively.)\n", + "For example, the following expression calculates the difference (see @sec-clipping) between the buffered (see @sec-buffers) `multipolygon` (using distance of `0.2`) and itself (@fig-mpol-buffer-difference):" ] }, { @@ -789,7 +958,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "As demonstrated above, a `shapely` geometry object is automatically evaluated to a small image of the geometry (when using an interface capable of displaying it, such as a Jupyter Notebook).\n", + "As demonstrated in the last few figures, a `shapely` geometry object is automatically evaluated to a small image of the geometry (when using an interface capable of displaying it, such as Jupyter Notebook).\n", "To print the WKT string instead, we can use the `print` function:" ] }, @@ -824,9 +993,11 @@ "cell_type": "markdown", "metadata": {}, "source": [ + "Also see @sec-type-transformations, where `.coords`, `.geoms`, and `.exterior` are used to transform a given `shapely` geometry to a different type (e.g., `'Polygon'` to `'MultiPoint'`).\n", + "\n", "### Vector layer from scratch {#sec-vector-layer-from-scratch}\n", "\n", - "In the previous sections, we started with a vector layer (`GeoDataFrame`), from an existing GeoPackage file, and \"decomposed\" it to extract the geometry column (`GeoSeries`, @sec-geometry-columns) and separate geometries (`shapely`, see @sec-geometries).\n", + "In the previous sections, we started with a vector layer (`GeoDataFrame`), from an existing GeoPackage file, and 'decomposed' it to extract the geometry column (`GeoSeries`, @sec-geometry-columns) and separate geometries (`shapely`, see @sec-geometries).\n", "In this section, we will demonstrate the opposite process, constructing a `GeoDataFrame` from `shapely` geometries, combined into a `GeoSeries`.\n", "This will help you better understand the structure of a `GeoDataFrame`, and may come in handy when you need to programmatically construct simple vector layers, such as a line between two given points.\n", "\n", @@ -836,13 +1007,13 @@ "![Creating a `GeoDataFrame` from scratch](images/gdf-flow.svg){#fig-gdf-flow}\n", "\n", "The final result, a vector layer (`GeoDataFrame`) is therefore a hierarchical structure (@fig-gdf-structure), containing the geometry column (`GeoSeries`), which in turn contains geometries (`shapely`).\n", - "Each of the \"internal\" components can be accessed, or \"extracted\", which is sometimes necessary, as we will see later on.\n", + "Each of the 'internal' components can be accessed, or 'extracted', which is sometimes necessary, as we will see later on.\n", "\n", - "![Structure of a `GeoDataFrame`](images/gdf-structure.svg){#fig-gdf-structure}\n", + "![Structure of a `GeoDataFrame`](images/gdf-structure.svg){width=40% fig-align=\"center\" #fig-gdf-structure}\n", "\n", "Non-geographic attributes may represent the name of the feature, and other attributes such as measured values, groups, etc.\n", "To illustrate attributes, we will represent a temperature of 25°C in London on June 21st, 2023.\n", - "This example contains a geometry (the coordinates), and three attributes with three different classes (place name, temperature and date).\n", + "This example contains a geometry (the coordinates), and three attributes with three different classes (place name, temperature, and date).\n", "Objects of class `GeoDataFrame` represent such data by combining the 
attributes (`Series`) with the simple feature geometry column (`GeoSeries`).\n",
    "First, we create a point geometry, which we know how to do from @sec-geometries (@fig-point-lnd)."
   ]
  },
  {
@@ -885,7 +1056,7 @@
    "The geometry column is a `GeoSeries`, named `geometry`.\n",
    "The other attributes (if any) may be defined using `list` or `Series` objects.\n",
    "Here, for simplicity, we use the `list` option for defining the three attributes `name`, `temperature`, and `date`.\n",
-    "Again, note that the `list` can be of length \>1, in case we are creating a layer with more than one feature."
+    "Again, note that the `list` can be of length \>1, in case we are creating a layer with more than one feature (i.e., multiple rows)."
   ]
  },
  {
@@ -955,8 +1126,10 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "Now, we are able to create an interactive map of the `towns_layer` object(@fig-layer-from-scratch-explore).\n",
-    "To make the points easier to see, we are customizing a fill color and size (we elaborate on `.explore` options in @sec-interactive-maps)."
+    "Now, we are able to create an interactive map of the `towns_layer` object (@fig-layer-from-scratch-explore).\n",
+    "To make the points easier to see, we are customizing a fill color and size (we elaborate on `.explore` options in @sec-interactive-maps).\n",
+    "\n",
+    "::: {.content-visible when-format=\"html\"}"
   ]
  },
  {
@@ -974,8 +1147,41 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "Spatial object can be also created from a `pandas.DataFrame` object that contains columns with coordinates.\n",
-    "For that, we need to first create a `GeoSeries` object from the coordinates, and then combine it with `DataFrame` to a `GeoDataFrame` object."
+    ":::\n",
+    "::: {.content-visible when-format=\"pdf\"}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "metadata": {},
+   "source": [
+    "#| eval: false\n",
+    "towns_layer.explore(color='red', marker_kwds={'radius': 10})"
+   ],
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "code",
+   "metadata": {},
+   "source": [
+    "#| echo: false\n",
+    "#| output: false\n",
+    "#| error: true\n",
+    "map_to_png.map_to_png(towns_layer.explore(color='red', marker_kwds={'radius': 10}), 'fig-layer-from-scratch-explore')"
+   ],
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "![`towns_layer`, created from scratch, visualized using `.explore`](images/fig-layer-from-scratch-explore.png){#fig-layer-from-scratch-explore}\n",
+    ":::\n",
+    "\n",
+    "A spatial (point) layer can also be created from a `DataFrame` object (package **pandas**) that contains columns with coordinates.\n",
+    "To demonstrate, we first create a `GeoSeries` object from the coordinates, and then combine it with the `DataFrame` to form a `GeoDataFrame`."
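A minimal sketch of this pattern; the town names and coordinates below are illustrative, not necessarily the book's exact example:

```python
import pandas as pd
import geopandas as gpd

# Illustrative attribute table with coordinate columns
df = pd.DataFrame({
    'name': ['London', 'Paris'],
    'x': [-0.1, 2.3],
    'y': [51.5, 48.9],
})
geometry = gpd.GeoSeries.from_xy(df['x'], df['y'], crs=4326)  # GeoSeries of points
towns = gpd.GeoDataFrame(df, geometry=geometry)               # combine into a vector layer
```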
] }, { @@ -1000,12 +1206,12 @@ "metadata": {}, "source": [ "The output gives the same result as previous `towns_layer`.\n", - "This approach is particularly useful when we need to read data from a CSV file, e.g., using `pandas.read_csv`, and want to turn the resulting `DataFrame` into a `GeoDataFrame` (see another example in @sec-spatial-joining).\n", + "This approach is particularly useful when we need to read data from a CSV file, e.g., using `pd.read_csv`, and want to turn the resulting `DataFrame` into a `GeoDataFrame` (see another example in @sec-spatial-joining).\n", "\n", "### Derived numeric properties {#sec-area-length}\n", "\n", - "Vector layers are characterized by two essential derived numeric properties: Length (`.length`)---applicable to lines and Area (`.area`)---applicable to polygons.\n", - "Area and length can be calculated for any data structures discussed above, either a `shapely` geometry, in which case the returned value is a number or for `GeoSeries` or `DataFrame`, in which case the returned value is a numeric `Series`." + "Vector layers are characterized by two essential derived numeric properties: *length* (`.length`)---applicable to lines, and *area* (`.area`)---applicable to polygons.\n", + "Area and length can be calculated for any data structures discussed above, either a `shapely` geometry, in which case the returned value is a number, or for `GeoSeries` or `DataFrame`, in which case the returned value is a numeric `Series`." ] }, { @@ -1044,7 +1250,7 @@ "\n", "To obtain meaningful length and area measurements for data in a geographic CRS, the geometries first need to be transformed to a projected CRS (see @sec-reprojecting-vector-geometries) applicable to the area of interest.\n", "For example, the area of Slovenia can be calculated in the UTM zone 33N CRS (`crs=32633`).\n", - "The result is in $m^2$, the units of the CRS of this dataset." + "The result is in $m^2$, the units of the UTM zone 33N CRS." ] }, { @@ -1063,20 +1269,22 @@ "## Raster data {#sec-raster-data}\n", "\n", "The spatial raster data model represents the world with the continuous grid of cells (often also called pixels; @fig-raster-intro-plot1 (A)). \n", - "This data model often refers to so-called regular grids, in which each cell has the same, constant size---and we will focus on the regular grids in this book only. \n", + "This data model often refers to so-called regular grids, in which each cell has the same, constant size---and we will focus only on regular grids in this book. \n", "However, several other types of grids exist, including rotated, sheared, rectilinear, and curvilinear grids (see Chapter 1 of @pebesma_spatial_2022 or Chapter 2 of @tennekes_elegant_2022).\n", "\n", "The raster data model usually consists of a raster header (or metadata) and a matrix (with rows and columns) representing equally spaced cells (often also called pixels; @fig-raster-intro-plot1 (A)). \n", - "The raster header defines the coordinate reference system, the extent and the origin. \n", - "The origin (or starting point) is frequently the coordinate of the lower-left corner of the matrix. \n", - "The metadata defines the extent via the origin, the number of columns, the number of rows, and the cell size resolution. \n", - "The matrix representation avoids storing explicitly the coordinates for the four corner points (in fact it only stores one coordinate, namely the origin) of each cell, as would be the case for rectangular vector polygons. 
\n", + "The raster header defines the coordinate reference system, the origin and the resolution. \n", + "The origin (or starting point) is typically the coordinate of the lower-left corner of the matrix. \n", + "The metadata defines the origin, and the cell size, i.e., resolution.\n", + "Combined with the column and row count, the extent can also be derived.\n", + "The matrix representation avoids storing explicitly the coordinates for the four corner points (in fact it only stores one coordinate, namely the origin) of each cell, as would be the case for rectangular vector polygons.\n", "This and map algebra (@sec-map-algebra) makes raster processing much more efficient and faster than vector data processing. \n", - "However, in contrast to vector data, the cell of one raster layer can only hold a single value. The value might be numeric or categorical (@fig-raster-intro-plot1 (C)).\n", + "However, in contrast to vector data, the cell of one raster layer can only hold a single value. \n", + "The cell values are numeric, representing either a continuous or a categorical variable (@fig-raster-intro-plot1 (C)).\n", "\n", "![Raster data types: (A) cell IDs, (B) cell values, (C) a colored raster map](images/raster-intro-plot1.png){#fig-raster-intro-plot1}\n", "\n", - "Raster maps usually represent continuous phenomena such as elevation, temperature, population density or spectral data. \n", + "Raster maps usually represent continuous phenomena such as elevation, temperature, population density, or spectral data. \n", "Discrete features such as soil or land-cover classes can also be represented in the raster data model. \n", "Both uses of raster datasets are illustrated in @fig-raster-intro-plot2, which shows how the borders of discrete features may become blurred in raster datasets. 
\n", "Depending on the nature of the application, vector representations of discrete features may be more suitable.\n", @@ -1089,12 +1297,12 @@ "The two most notable approaches for working with rasters in Python are provided by **rasterio** and **rioxarray** packages.\n", "As we will see shortly, they differ in scope and underlying data models.\n", "Specifically, **rasterio** represents rasters as **numpy** arrays associated with a separate object holding the spatial metadata.\n", - "The **rioxarray** package, a warpper of **rasterio**, however, represents rasters with **xarray** \"extended\" arrays, which are an extension of **numpy** array designed to hold axis labels and attributes in the same object, together with the array of raster values.\n", + "The **rioxarray** package, a wrapper of **rasterio**, however, represents rasters with **xarray** 'extended' arrays, which are an extension of **numpy** array designed to hold axis labels and attributes in the same object, together with the array of raster values.\n", "Similar approaches are provided by less well-known **xarray-spatial** and **geowombat** packages.\n", - "Comparatively, **rasterio** is more well-established, but it is more low-level (which has both advantabes and distadvantages).\n", + "Comparatively, **rasterio** is more well-established, but it is more low-level (which has both advantages and distadvantages).\n", "\n", "All of the above-mentioned packages, however, are not exhaustive in the same way **geopandas** is.\n", - "For example, when working with **rasterio**, on the one hand, more packages may be needed to accomplish common tasks such as zonal statistics (package **rasterstats**) or calculating topographic indices (package **richdem**).\n", + "For example, when working with **rasterio**, more packages may be needed to accomplish common tasks such as zonal statistics (package **rasterstats**) or calculating topographic indices (package **richdem**).\n", "\n", "\n", "In the following two sections, we introduce **rasterio**, which is the raster-related package we are going to work with through the rest of the book.\n", @@ -1103,11 +1311,7 @@ "\n", "To work with the **rasterio** package, we first need to import it.\n", "Additionally, as the raster data is stored within **numpy** arrays, we import the **numpy** package and make all its functions accessible for effective data manipulation. \n", - " \n", - "\n", - "Finally, we import the **rasterio.plot** sub-module for its `rasterio.plot.show` function that allows for quick visualization of rasters.\n", - "\n", - "" + "Finally, we import the **rasterio.plot** sub-module for its `rasterio.plot.show` function that allows for quick visualization of rasters." 
]
  },
  {
@@ -1128,18 +1332,16 @@
    "Rasters are typically imported from existing files.\n",
    "When working with **rasterio**, importing a raster is actually a two-step process:\n",
    "\n",
-    "- First, we open a raster file \"connection\" using `rasterio.open`\n",
+    "- First, we open a raster file 'connection' using `rasterio.open`\n",
    "- Second, we read raster values from the connection using the `.read` method\n",
    "\n",
-    "This separation is analogous to basic Python functions for reading from files, such as `open` and `.readline` to read from a text file.\n",
+    "This type of separation is analogous to basic Python functions for reading from files, such as `open` and `.readline` to read from a text file.\n",
    "The rationale is that we do not always want to read all information from the file into memory, which is particularly important as raster size can be larger than RAM size.\n",
-    "\n",
-    "\n",
    "Accordingly, the second step (`.read`) is selective, meaning that the user can fine-tune the subset of values (bands, rows/columns, resolution, etc.) that are actually being read.\n",
    "For example, we may want to read just one raster band rather than reading all bands.\n",
    "\n",
-    "In the first step, we pass a file path to the `rasterio.open` function to create a `DatasetReader` file connection.\n",
-    "For this example, we use a single-band raster representing elevation in Zion National Park."
+    "In the first step, we pass a file path to the `rasterio.open` function to create a `DatasetReader` file connection, hereby named `src`.\n",
+    "For this example, we use a single-band raster representing elevation in Zion National Park, stored in `srtm.tif`."
   ]
  },
  {
@@ -1165,6 +1367,7 @@
   "source": [
    "#| label: fig-rasterio-plot\n",
    "#| fig-cap: Basic plot of a raster, with the data coming from a **rasterio** file connection\n",
+    "#| out-width: 60%\n",
    "rasterio.plot.show(src);"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
@@ -1175,7 +1378,7 @@
   "metadata": {},
   "source": [
    "The `DatasetReader` contains the raster metadata, that is, all of the information other than the raster values.\n",
-    "Let us examine it with the `meta` property."
+    "Let's examine it with the `.meta` property."
]
  },
  {
@@ -1191,13 +1394,11 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "\n",
-    " \n",
    "Namely, it allows us to see the following properties, which we will elaborate on below, and in later chapters:\n",
    "\n",
    "- `driver`---The raster file format (see @sec-data-output-raster)\n",
    "- `dtype`---Data type (see @tbl-numpy-data-types)\n",
-    "- `nodata`---The value being used as \"No Data\" flag (see @sec-data-output-raster)\n",
+    "- `nodata`---The value being used as the 'No Data' flag (see @sec-data-output-raster)\n",
    "- Dimensions:\n",
    "    - `width`---Number of columns\n",
    "    - `height`---Number of rows\n",
@@ -1207,7 +1408,7 @@
    "\n",
    "The last item (i.e., `transform`) deserves more attention.\n",
    "To position a raster in geographical space, in addition to the CRS, we must specify the raster *origin* ($x_{min}$, $y_{max}$) and resolution ($delta_{x}$, $delta_{y}$).\n",
-    "In the transformation matrix notation, these data items are stored as follows:\n",
+    "In the transformation matrix notation, assuming a regular grid, these data items are stored as follows:\n",
    "\n",
    "```{text}\n",
    "Affine(delta_x, 0.0, x_min,\n",
    "       0.0, delta_y, y_max)\n",
    "```\n",
    "\n",
    "Note that, by convention, the raster y-axis origin is set to the maximum value ($y_{max}$) rather than the minimum, and, accordingly, the y-axis resolution ($delta_{y}$) is negative.\n",
+    "In other words, since the origin is in the *top*-left corner, advancing along the y-axis is done through negative steps (downwards).\n",
    "\n",
-    "Finally, the `.read` method of the `DatasetReader` is used to read the actual raster values.\n",
+    "In the second step, the `.read` method of the `DatasetReader` is used to read the actual raster values.\n",
    "Importantly, we can read:\n",
    "\n",
    "- All layers (as in `.read()`)\n",
    "- A particular layer (as in `.read(1)`)\n",
    "- A subset of layers (as in `.read([1,2])`)\n",
    "\n",
    "Note that the layer indices start from `1`, contrary to the Python convention of the first index being `0`.\n",
    "\n",
-    "The resulting object is a **numpy** array [@numpy], with either two or three dimensions:\n",
+    "The object returned by `.read` is a **numpy** array [@numpy], with either two or three dimensions:\n",
    "\n",
    "- *Three* dimensions, when reading more than one layer (e.g., `.read()` or `.read([1,2])`). In such case, the dimensions pattern is `(layers, rows, columns)`\n",
    "- *Two* dimensions, when reading one specific layer (e.g., `.read(1)`). In such case, the dimensions pattern is `(rows, columns)`\n",
    "\n",
-    "Let's read the first (and only) layer from the `srtm.tif` raster, using the file connection object `src` using the `.read(1)` method."
+    "Let's read the first (and only) layer from the `srtm.tif` raster, using the file connection object `src` and the `.read` method.\n",
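+    "\n",
+    "As a quick check of the dimensionality rules described above (a sketch, assuming the `src` connection from before), one could compare the shapes of both reading modes:\n",
+    "\n",
+    "```python\n",
+    "src.read().shape   ## three dimensions: (layers, rows, columns)\n",
+    "src.read(1).shape  ## two dimensions: (rows, columns)\n",
+    "```"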
]
  },
  {
@@ -1246,7 +1448,7 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "The result is a two-dimensional **numpy** array in which each value represents the elevation of the corresponding pixel.\n",
+    "The result is a two-dimensional **numpy** array where each value represents the elevation of the corresponding pixel.\n",
    "\n",
    "The relation between a **rasterio** file connection and the derived properties is summarized in @fig-rasterio-structure.\n",
    "The file connection (created with `rasterio.open`) gives access to the two components of raster data: the metadata (via the `.meta` property) and the values (via the `.read` method).\n",
    "\n",
@@ -1257,20 +1459,18 @@
    "\n",
    "In this section, we are going to demonstrate the creation of rasters from scratch.\n",
    "We will construct two small rasters, `elev` and `grain`, which we will use in examples later in the book.\n",
-    "Unlike creating a vector layer (see @sec-vector-layer-from-scratch), creating a raster from scratch is rarely needed in practice because aligning a raster with the proper spatial extent is challenging to do programmatically (\"georeferencing\" tools in GIS software are a better fit for the job).\n",
+    "Unlike creating a vector layer (see @sec-vector-layer-from-scratch), creating a raster from scratch is rarely needed in practice because aligning a raster with the proper spatial extent is challenging to do programmatically ('georeferencing' tools in GIS software are a better fit for the job).\n",
    "Nevertheless, the examples will be helpful to become more familiar with the **rasterio** data structures.\n",
    "\n",
    "Conceptually, a raster is an array combined with georeferencing information, where the latter comprises:\n",
    "\n",
-    "- A transformation matrix, linking pixel indices with coordinates in a particular coordinate system\n",
+    "- A transformation matrix, containing the origin and resolution, thus linking pixel indices with coordinates in a particular coordinate system\n",
    "- A CRS definition, specifying the association of that coordinate system with the surface of the earth (optional)\n",
    "\n",
    "Therefore, to create a raster, we first need to have an array with the values, and then supplement it with the georeferencing information.\n",
    "Let's create the arrays `elev` and `grain`.\n",
    "The `elev` array is a $6 \\times 6$ array with sequential values from `1` to `36`.\n",
-    "It can be created as follows using the `np.arange` function and `.reshape` method.\n",
-    "\n",
-    ""
+    "It can be created as follows using the `np.arange` function and `.reshape` method from **numpy**."
   ]
  },
  {
@@ -1287,8 +1487,8 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "The `grain` array represents a categorical raster with values `0`, `1`, `2`, corresponding to categories \"clay\", \"silt\", \"sand\", respectively.\n",
-    "We will create it from a specific arrangement of pixel values using the **numpy** `array` and `reshape` functions."
+    "The `grain` array represents a categorical raster with values `0`, `1`, `2`, corresponding to categories 'clay', 'silt', 'sand', respectively.\n",
+    "We will create it from a specific arrangement of pixel values, using **numpy**'s `np.array` and `.reshape`.\n",
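+    "\n",
+    "The book uses a specific hand-picked arrangement of values (not reproduced here); as a runnable stand-in, a random arrangement of the three categories could be generated like this:\n",
+    "\n",
+    "```python\n",
+    "## Hypothetical stand-in: random categories 0-2 instead of the book's fixed arrangement\n",
+    "grain = np.random.default_rng(1).integers(0, 3, size=(6, 6), dtype=np.uint8)\n",
+    "```"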
]
  },
  {
@@ -1314,8 +1514,6 @@
   "metadata": {},
   "source": [
    "Note that in both cases, we are using the `uint8` (unsigned integer in 8 bits, i.e., `0-255`) data type, which is sufficient to represent all possible values of the given rasters (see @tbl-numpy-data-types).\n",
-    "\n",
-    "\n",
    "This is the recommended approach for a minimal memory footprint.\n",
    "\n",
    "What is missing now is the georeferencing information (see @sec-using-rasterio).\n",
    "\n",
    "- The origin ($x_{min}$, $y_{max}$) is at `-1.5,1.5`\n",
    "- The raster resolution ($delta_{x}$, $delta_{y}$) is `0.5,-0.5`\n",
    "\n",
-    "We can add this information using [`rasterio.transform.from_origin`](rasterio.transform.from_origin), and specifying `west`, `north`, `xsize`, and `ysize` parameters."
+    "We can add this information using `rasterio.transform.from_origin`, specifying the `west`, `north`, `xsize`, and `ysize` parameters.\n",
+    "The resulting transformation matrix object is hereby named `new_transform`."
   ]
  },
  {
@@ -1387,10 +1586,10 @@
    "At this point, we have two rasters, each composed of an array and related transformation matrix.\n",
    "We can work with the raster using **rasterio** by:\n",
    "\n",
-    "- Passing the transformation matrix wherever actual raster pixel coordinates are important (such as in function `show` above)\n",
-    "- Keeping in mind that any other layer we use in the analysis is in the same CRS of those coordinates\n",
+    "- Passing the transformation matrix wherever actual raster pixel coordinates are important (such as in function `rasterio.plot.show` above)\n",
+    "- Keeping in mind that any other layer we use in the analysis is in the same CRS\n",
    "\n",
-    "Finally, to export the raster for permanent storage, along with the CRS definition, we need to go through the following steps:\n",
+    "Finally, to export the raster for permanent storage, along with the spatial metadata, we need to go through the following steps:\n",
    "\n",
    "1. Create a raster file connection (where we set the transform and the CRS, among other settings)\n",
    "2. Write the array with raster values into the connection\n",
    "3. Close the connection\n",
    "\n",
    "Don't worry if the code below is unclear; the concepts related to writing raster data to file will be explained in @sec-data-output-raster. \n",
    "For now, for completeness, and also to use these rasters in subsequent chapters without having to re-create them from scratch, we just provide the code for exporting the `elev` and `grain` rasters into the `output` directory.\n",
-    "In the case of `elev`, we do it as follows with the `open`, `write`, and `close` methods of the **rasterio** package.\n",
-    "\n",
-    ""
+    "In the case of `elev`, we do it as follows with the `rasterio.open`, `.write`, and `.close` functions and methods of the **rasterio** package."
   ]
  },
  {
@@ -1430,7 +1627,7 @@
   "source": [
    "Note that the CRS we (arbitrarily) set for the `elev` raster is WGS84, defined using `crs=4326` according to the EPSG code.\n",
    "\n",
-    "Exporting the `grain` raster is done in the same way, with the only difference being the array we write into the connection."
+    "Exporting the `grain` raster is done in the same way, with the only differences being the file name and the array we write into the connection.\n",
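+    "\n",
+    "For instance, a sketch of that export pattern applied to `grain` (assuming the `new_transform` object from above, and `crs=4326` as used for `elev`):\n",
+    "\n",
+    "```python\n",
+    "## Step 1: create a file connection in write ('w') mode, setting the spatial metadata\n",
+    "new_dataset = rasterio.open(\n",
+    "    'output/grain.tif', 'w', driver='GTiff',\n",
+    "    height=grain.shape[0], width=grain.shape[1],\n",
+    "    count=1, dtype=grain.dtype,\n",
+    "    crs=4326, transform=new_transform\n",
+    ")\n",
+    "## Step 2: write the array into band 1; Step 3: close the connection\n",
+    "new_dataset.write(grain, 1)\n",
+    "new_dataset.close()\n",
+    "```"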
]
  },
  {
@@ -1495,7 +1692,7 @@
    "This can be seen in @fig-geocentric-vs-local, where the local datum is fitted to the area of the Philippines, but is misaligned with most of the rest of the planet's surface.\n",
    "Both datums in @fig-geocentric-vs-local are put on top of a geoid---a model of global mean sea level.\n",
    "\n",
-    "![Geocentric and local geodetic datums shown on top of a geoid (in false color and the vertical exaggeration by 10,000 scale factor). Image of the geoid is adapted from the work of [@essd-11-647-2019].](https://r.geocompx.org/figures/02_datum_fig.png){#fig-geocentric-vs-local}\n",
+    "![Geocentric and local geodetic datums shown on top of a geoid (in false color, with vertical exaggeration by a scale factor of 10,000). Image of the geoid is adapted from the work of [@essd-11-647-2019].](images/geocompr_02_datum_fig.png){#fig-geocentric-vs-local}\n",
    "\n",
    "### Projected coordinate reference systems {#sec-projected-coordinate-reference-systems}\n",
    "\n",
@@ -1549,7 +1746,7 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "A quick summary of different projections, their types, properties, and suitability can be found in \"Map Projections\" (1993) and at .\n",
+    "A quick summary of different projections, their types, properties, and suitability can be found at .\n",
    "We will expand on CRSs and explain how to project from one CRS to another in @sec-reproj-geo-data.\n",
    "But, for now, it is sufficient to know:\n",
    "\n",
@@ -1574,11 +1771,7 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "We can also illustrate the difference between a geographic and a projected CRS by plotting the `zion` data in both CRSs (@fig-zion-crs). Note that we are using the [`.grid`](https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.grid.html) method of **matplotlib** to draw grid lines on top of the plot.\n",
-    "\n",
-    "\n",
-    ""
+    "We can also illustrate the difference between a geographic and a projected CRS by plotting the `zion` data in both CRSs (@fig-zion-crs). Note that we are using the `.grid` method of **matplotlib** to draw grid lines on top of the plot.\n",
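+    "\n",
+    "A minimal sketch of how the two versions could be obtained (assuming `zion` is stored in a projected CRS, so that the geographic copy is derived with `.to_crs`):\n",
+    "\n",
+    "```python\n",
+    "zion_geo = zion.to_crs(4326)  ## reproject to a geographic CRS (WGS84)\n",
+    "ax = zion_geo.plot()\n",
+    "ax.grid()                     ## draw grid lines on top of the plot\n",
+    "```"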
] }, { @@ -1639,17 +1832,15 @@ "It is up to the user to determine which units the result is given in, and treat the result accordingly.\n", "For example, if the area output was in $m^2$ and we need the result in $km^2$, then we need to divide the result by $1000^2$.\n", "\n", - "## Exercises\n", - "\n", - "## References" + "\n" ] } ], "metadata": { "kernelspec": { - "name": "python3", + "display_name": "Python 3", "language": "python", - "display_name": "Python 3 (ipykernel)" + "name": "python3" } }, "nbformat": 4, diff --git a/ipynb/02-attribute-operations.ipynb b/ipynb/02-attribute-operations.ipynb index 8f91fce5..0075bfab 100644 --- a/ipynb/02-attribute-operations.ipynb +++ b/ipynb/02-attribute-operations.ipynb @@ -4,12 +4,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Attribute data operations {#sec-attr}\n", + "---\n", + "jupyter: python3\n", + "---\n", "\n", - "## Prerequisites {.unnumbered}\n", + "# Attribute data operations {#sec-attr}\n", "\n", - "\n", - "" + "## Prerequisites {.unnumbered}" ] }, { @@ -17,12 +18,17 @@ "metadata": {}, "source": [ "#| echo: false\n", - "import matplotlib.pyplot as plt\n", - "import pandas as pd\n", - "pd.options.display.max_rows = 6\n", - "pd.options.display.max_columns = 6\n", - "pd.options.display.max_colwidth = 35\n", - "plt.rcParams['figure.figsize'] = (5, 5)" + "import book_options" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "#| echo: false\n", + "import book_options_pdf" ], "execution_count": null, "outputs": [] @@ -59,6 +65,7 @@ "metadata": {}, "source": [ "#| echo: false\n", + "#| include: false\n", "import os\n", "from urllib.request import urlretrieve\n", "\n", @@ -95,15 +102,15 @@ "\n", "Attribute data is non-spatial information associated with geographic (geometry) data.\n", "A bus stop provides a simple example: its position would typically be represented by latitude and longitude coordinates (geometry data), in addition to its name.\n", - "The Elephant & Castle / New Kent Road bus stop in London, for example has coordinates of `-0.098` degrees longitude and `51.495` degrees latitude which can be represented as `POINT (-0.098 51.495)` using the Simple Feature representation described in @sec-spatial-class.\n", + "A bus stop in London, for example, has coordinates of `-0.098` degrees longitude and `51.495` degrees latitude which can be represented as `POINT (-0.098 51.495)` using the Simple Feature representation described in @sec-spatial-class.\n", "Attributes, such as the name of the bus stop, are the topic of this chapter.\n", "\n", "Another example of an attribute is the elevation value for a specific grid cell in raster data.\n", "Unlike the vector data model, the raster data model stores the coordinate of the grid cell indirectly, meaning the distinction between attribute and spatial information is less clear.\n", - "Think of a pixel in the 3rd row and the 4th column of a raster matrix: its spatial location is defined by its index in the matrix.\n", + "Think of a pixel in the 3^rd^ row and the 4^th^ column of a raster matrix: its spatial location is defined by its index in the matrix.\n", "In this case, we need to move four cells in the x direction (typically east/right on maps) and three cells in the y direction (typically south/down) from the origin.\n", "The raster's resolution is also important as it defines the distance for each x- and y-step.\n", - "The resolution and the origin are stored in the raster's header, which is a vital component 
of raster datasets which specifies how pixels relate to geographic coordinates (see also @sec-spatial-operations).\n", + "The resolution and the origin are stored in the raster's metadata (header), which is a vital component of raster datasets which specifies how pixels relate to geographic coordinates (see also @sec-spatial-operations).\n", "\n", "This chapter teaches how to manipulate geographic objects based on attributes such as the names of bus stops in a vector dataset and elevations of pixels in a raster dataset.\n", "For vector data, this means techniques such as subsetting and aggregation (see @sec-vector-attribute-subsetting and @sec-vector-attribute-aggregation).\n", @@ -112,17 +119,17 @@ "This is good news: skills developed in this chapter are cross-transferable.\n", "@sec-spatial-operations extends the methods presented here to the spatial world.\n", "\n", - "After a deep dive into various types of vector attribute operations in the next section, raster attribute data operations are covered in @sec-raster-subsetting, which demonstrates extracting cell values from one or more layer (raster subsetting).\n", + "After a deep dive into various types of vector attribute operations in the next section, raster attribute data operations are covered in @sec-raster-subsetting, which demonstrates extracting cell values from one or more layers (raster subsetting).\n", "@sec-summarizing-raster-objects provides an overview of 'global' raster operations which can be used to summarize entire raster datasets.\n", "\n", "## Vector attribute manipulation {#sec-vector-attribute-manipulation}\n", "\n", - "As mentioned in @sec-vector-layers, vector layers (`GeoDataFrame`, from package **geopandas**) are basically extended tables (`DataFrame` from package **pandas**), the difference being that a vector layer has a geometry column.\n", + "As mentioned in @sec-vector-layers, vector layers (`GeoDataFrame`, from package **geopandas**) are basically extended tables (`DataFrame` from package **pandas**), the only differences being the geometry column and class.\n", "Therefore, all ordinary table-related operations from package **pandas** are supported for **geopandas** vector layers as well, as shown below.\n", "\n", "### Vector attribute subsetting {#sec-vector-attribute-subsetting}\n", "\n", - "**pandas** supports several subsetting interfaces, though the most [recommended](https://stackoverflow.com/questions/38886080/python-pandas-series-why-use-loc) ones are `.loc`, which uses **pandas** indices, and `.iloc`, which uses (implicit) **numpy**-style numeric indices.\n", + "**pandas** supports several subsetting interfaces, though the most recommended ones are `.loc`, which uses **pandas** indices, and `.iloc`, which uses (implicit) **numpy**-style numeric indices.\n", "\n", "In both cases, the method is followed by square brackets, and two indices, separated by a comma.\n", "Each index can be:\n", @@ -130,38 +137,21 @@ "- A specific value, as in `1`\n", "- A `list`, as in `[0,2,4]`\n", "- A slice, as in `0:3`\n", - "- `:`---indicating \"all\" indices, as in `[:]` \n", + "- `:`---indicating 'all' indices, as in `[:]`\n", "\n", - "An exception to this rule is selecting columns using a list, which we do using shorter notation, as in `df[['a','b']]`, instead of `df.loc[:, ['a','b']]`, to select columns `'a'` and `'b'` from `df`.\n", + "An exception to this guideline is selecting columns using a list, which we do using shorter notation, as in `df[['a','b']]`, instead of `df.loc[:, ['a','b']]`, to select 
columns `'a'` and `'b'` from `df`.\n",
    "\n",
    "Here are a few examples of subsetting the `GeoDataFrame` of world countries (@fig-gdf-plot).\n",
    "First, we are subsetting rows by position.\n",
-    "This can be done using the three following approaches, which all return the same result. \n",
-    "\n",
-    "\n",
-    "\n",
-    "In the expression #1, we are using the expressive notation `[0:3,:]`, meaning \"rows 1,2,3, all columns\". Keep in mind that indices in Python start from 0, and slices are inclusive of the start and exclusive of the end.; therefore, `0:3` means indices `0`, `1`, `2`, i.e., first three rows in this example. In expression #2, we omit the columns index, as well as the starting index, that is, `[:3]`, doing the same with less code. In expression #3, we are using the `.head` method to select the first N rows."
+    "In the first example, we are using `[0:3,:]`, meaning 'rows 1,2,3, all columns'. Keep in mind that indices in Python start from 0, and slices are inclusive of the start and exclusive of the end; therefore, `0:3` means indices `0`, `1`, `2`, i.e., the first three rows in this example.\n",
+    ""
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "source": [
-    "#| eval: false\n",
-    "world.iloc[0:3, :] # approach #1\n",
-    "world.iloc[:3] # approach #2\n",
-    "world.head(3) # approach #3"
+    "world.iloc[0:3, :]"
   ],
   "execution_count": null,
   "outputs": []
  },
-  {
-   "cell_type": "code",
-   "metadata": {},
-   "source": [
-    "#| echo: false\n",
-    "world.head(3)"
-   ],
-   "execution_count": null,
-   "outputs": []
-  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "Subsetting columns by name is not done with the `.iloc` method, but requires specifying the column names directly in a double square bracket `[[` notation."
+    "Subsetting columns by name is not done with the `.iloc` method, but instead requires specifying the column names in `.loc`, or directly in a double square bracket `[[` notation."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "To select many successive columns, we can use the `:` notation, as in `world.loc[:, 'name_long':'pop']`, which selects all columns from `name_long` to `pop` (inclusive)."
+    "To select many successive columns, we can use the `:` (slice) notation, as in `world.loc[:, 'name_long':'pop']`, which selects all columns from `name_long` to `pop` (inclusive)."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "\n",
-    "\n",
-    "\n",
    "Removing rows or columns is done using the `.drop` method.\n",
    "We can remove specific rows by specifying their ids, e.g., dropping rows 2, 3, and 5 in the following example."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "We can also rename columns using the [`.rename`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.rename.html) method, in which we pass a dictionary of the form `old_name:new_name` to the `columns` argument."
+    "We can also rename columns using the `.rename` method, in which we pass a dictionary with items of the form `old_name:new_name` to the `columns` argument.\n",
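+    "\n",
+    "For example (a sketch; the new column name `'name'` here is hypothetical):\n",
+    "\n",
+    "```python\n",
+    "world.drop([2, 3, 5])                         ## drop rows by index\n",
+    "world.rename(columns={'name_long': 'name'})  ## rename a column\n",
+    "```"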
]
  },
  {
@@ -289,7 +276,7 @@
    "The standard **numpy** comparison operators (@tbl-comparison-operators) can be used in boolean subsetting with **pandas**/**geopandas**.\n",
    "\n",
    "| `Symbol` | `Name` |\n",
-    "|---------------|---------------------------------|\n",
+    "|:---------------:|:---------------------------------:|\n",
    "| `==` | Equal to |\n",
    "| `!=` | Not equal to |\n",
    "| `>`, `<` | Greater/Less than |\n",
@@ -298,7 +285,7 @@
    "\n",
    ": Comparison operators that return boolean values (`True`/`False`). {#tbl-comparison-operators}\n",
    "\n",
-    "The following example demonstrates logical vectors for subsetting by creating a new `GeoDataFrame` object called `small_countries` that contains only those countries and other teritories from the `world` object whose surface area is smaller than 10,000 $km^2$.\n",
+    "The following example demonstrates logical vectors for subsetting by creating a new `GeoDataFrame` object called `small_countries` that contains only those countries and other territories from the `world` object whose surface area is smaller than 10,000 $km^2$.\n",
    "The first step is to create a logical vector (a `Series` object) that is `True` for countries with an area smaller than 10,000 $km^2$ and `False` otherwise.\n",
    "Then, we use this vector to subset the `world` dataset, which returns a new `GeoDataFrame` object containing only the small countries."
   ]
  },
  {
@@ -374,7 +361,9 @@
   "source": [
    "Logical operators `&`, `|`, and `~` (@tbl-comparison-operators) can be used to combine multiple conditions.\n",
    "For example, here are all countries in North America or South America.\n",
-    "Keep in mind that the parentheses around each condition (here, and in analogous cases using other operators) are crucial; otherwise, due to Python's [precedence rules](https://docs.python.org/3/reference/expressions.html#operator-precedence), the `|` operator is executed before `==` and we get an error."
+    "Keep in mind that the parentheses around each condition (here, and in analogous cases using other operators) are crucial; otherwise, due to Python's precedence rules[^python_precedence_rules], the `|` operator is executed before `==` and we get an error.\n",
+    "\n",
+    "[^python_precedence_rules]: [https://docs.python.org/3/reference/expressions.html#operator-precedence](https://docs.python.org/3/reference/expressions.html#operator-precedence)"
   ]
  },
  {
@@ -394,7 +383,7 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "However, specifically, expressions combining multiple comparisons with `==` combined with `|` can be replaced with the [`.isin`](https://pandas.pydata.org/docs/reference/api/pandas.Series.isin.html) method and a `list` of values to compare with.\n",
+    "Specifically, expressions combining multiple `==` comparisons with `|` can be replaced with the `.isin` method and a `list` of values to compare with.\n",
    "The advantage of `.isin` is more concise and easier-to-manage code, especially when the number of comparisons is large.\n",
    "For example, the following expression gives the same result as above.\n",
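+    "\n",
+    "A sketch of the equivalent `.isin` expression (following the two continents used above):\n",
+    "\n",
+    "```python\n",
+    "world[world['continent'].isin(['North America', 'South America'])]\n",
+    "```"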
]
  },
  {
@@ -421,14 +410,14 @@
    "The aim is to find the `sum()` of country populations for each continent, resulting in a smaller table or vector layer (of continents).\n",
    "Since aggregation is a form of data reduction, it can be a useful early step when working with large datasets.\n",
    "\n",
-    "Attribute-based aggregation can be achieved using a combination of `.groupby` and `.sum`, where the former groups the data by the grouping variable(s) and the latter calculates the sum of the remaining columns."
+    "Attribute-based aggregation can be achieved using a combination of `.groupby` and `.sum` (package **pandas**), where the former groups the data by the grouping variable(s) and the latter calculates the sum of the specified column(s). The `.reset_index` method moves the grouping variable into an ordinary column, rather than an index (the default), which is something we typically want to do."
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "source": [
-    "world_agg1 = world[['continent', 'pop']].groupby('continent').sum()\n",
+    "world_agg1 = world.groupby('continent')[['pop']].sum().reset_index()\n",
    "world_agg1"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "If we want to include the geometry in the aggregation result, we can use the `.dissolve` method.\n",
    "That way, in addition to the summed population, we also get the associated geometry per continent, i.e., the union of all countries.\n",
    "Note that we use the `by` parameter to choose which column(s) are used for grouping, and the `aggfunc` parameter to choose the aggregation function for non-geometry columns.\n",
-    "Note that the `.reset_index` method is used (here, and elsewhere in the book) to turn **pandas** and **geopandas** [*indices*](https://pandas.pydata.org/docs/reference/api/pandas.Index.html), which are automatically created for grouping variables in grouping operations such as `.dissolve`, \"back\" into ordinary columns, which are more appropriate in the scope of this book.\n",
-    "\n",
-    ""
+    "Again, note that the `.reset_index` method is used (here, and elsewhere in the book) to turn **pandas** and **geopandas** row *indices*, which are automatically created for grouping variables in grouping operations such as `.dissolve`, 'back' into ordinary columns, which are more appropriate in the scope of this book."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "In this case, the resulting `world_agg2` object is a `GeoDataFrame` containing 8 features representing the continents of the world (and the open ocean) that we can plot (@fig-spatial-aggregation). The `plt.subplots` function is hereby used to control plot dimensions (to make the plot wider and narrower) (see @sec-static-styling).\n",
-    "\n",
-    ""
+    "In this case, the resulting `world_agg2` object is a `GeoDataFrame` containing 8 features representing the continents of the world that we can plot (@fig-spatial-aggregation). The `plt.subplots` function is hereby used to control plot dimensions (to make the plot wider and less tall) (see @sec-static-styling)."
] }, { @@ -491,9 +476,7 @@ "As a more complex example, the following code shows how we can calculate the total population, area, and count of countries, per continent.\n", "It is done by passing a dictionary to the `aggfunc` parameter, where the keys are the column names and the values are the aggregation functions.\n", "The result is a `GeoDataFrame` object with 8 rows (one per continent) and 4 columns (one for the continent name and one for each of the three aggregated attributes).\n", - "The `rename` method is used to rename the `'name_long'` column into `'n'`, as it now expresses the count of names (i.e., the number of countries) rather than their names.\n", - "\n", - "" + "The `rename` method is used to rename the `'name_long'` column into `'n'`, as it now expresses the count of names (i.e., the number of countries) rather than their names." ] }, { @@ -506,7 +489,7 @@ " 'name_long': 'count',\n", " 'pop': 'sum',\n", " 'area_km2': 'sum'\n", - " }).rename(columns={'name_long': 'n'})\n", + " }).rename(columns={'name_long': 'n'}).reset_index()\n", "world_agg3" ], "execution_count": null, @@ -516,7 +499,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Figure @fig-spatial-aggregation-different-functions visualizes the three aggregated attributes of our resulting layer `world_agg3`." + "@fig-spatial-aggregation-different-functions visualizes the three aggregated attributes of our resulting layer `world_agg3`." ] }, { @@ -550,9 +533,9 @@ "There are several other table-related operations that are possible, such as creating new columns or sorting the values.\n", "In the following code example, given the `world_agg3` continent summary (@fig-spatial-aggregation-different-functions), we:\n", "\n", - "- drop the geometry columns,\n", + "- drop the geometry column,\n", "- calculate population density of each continent,\n", - "- arrange continents by the number countries they contain, and\n", + "- arrange continents by the number of countries each contains, and\n", "- keep only the 3 most populous continents." ] }, @@ -576,8 +559,8 @@ "### Vector attribute joining {#sec-vector-attribute-joining}\n", "\n", "Combining data from different sources is a common task in data preparation.\n", - "Joins do this by combining tables based on a shared \"key\" variable.\n", - "**pandas** has a function named [`pd.merge`](https://pandas.pydata.org/docs/reference/api/pandas.merge.html) for joining `(Geo)DataFrames` based on common column(s) that follows conventions used in the database language SQL [@grolemund_r_2016].\n", + "Joins do this by combining tables based on a shared 'key' variable.\n", + "**pandas** has a function named `pd.merge` for joining `(Geo)DataFrames` based on common column(s) that follows conventions used in the database language SQL [@grolemund_r_2016].\n", "The `pd.merge` result can be either a `DataFrame` or a `GeoDataFrame` object, depending on the inputs.\n", "\n", "A common type of attribute join on spatial data is to join `DataFrames` to `GeoDataFrames`.\n", @@ -621,7 +604,7 @@ "source": [ "The result is a `GeoDataFrame` object identical to the original `world` object, but with two new variables (`coffee_production_2016` and `coffee_production_2017`) on coffee production.\n", "This can be plotted as a map, as illustrated (for `coffee_production_2017`) in @fig-join-coffee-production. 
\n", - "Note that, here and in many other examples in later chapters, we are using a technique to plot two layers (all of the world countries outline, and coffee production with symbology) at once, which will be \"formally\" introduced towards the end of the book in @sec-plot-static-layers.\n", + "Note that, here and in many other examples in later chapters, we are using a technique to plot two layers (all of the world countries outline, and coffee production with symbology) at once, which will be 'formally' introduced towards the end of the book in @sec-plot-static-layers.\n", "\n", "" ] @@ -642,15 +625,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "To work, attribute-based joins need a \"key variable\" in both datasets (`on` parameter of `pd.merge`).\n", + "To work, attribute-based joins need a 'key variable' in both datasets (`on` parameter of `pd.merge`).\n", "In the above example, both `world_coffee` and `world` DataFrames contained a column called `name_long`.\n", "\n", "::: callout-note\n", "By default, `pd.merge` uses all columns with matching names. However, it is recommended to explicitly specify the names of the columns to be used for matching, like we did in the last example.\n", ":::\n", "\n", - "\n", - "\n", "In case where column names are not the same, you can use `left_on` and `right_on` to specify the respective columns.\n", "\n", "Note that the result `world_coffee` has the same number of rows as the original dataset `world`.\n", @@ -676,31 +657,6 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", "### Creating attributes and removing spatial information {#sec-creating-attributes-and-removing-spatial-information}\n", "\n", "Often, we would like to create a new column based on already existing columns.\n", @@ -745,11 +701,9 @@ "metadata": {}, "source": [ "The resulting `GeoDataFrame` object has a new column called `con_reg` representing the continent and region of each country, e.g., `'South America:Americas'` for Argentina and other South America countries.\n", - "The opposite operation, splitting one column into multiple columns based on a separator string, is done using the [`.str.split`](https://pandas.pydata.org/docs/reference/api/pandas.Series.str.split.html) method.\n", - "As a result we go back to the previous state of two separate `continent` and `region_un` columns (only that their position is now last, since they are newly created).\n", - "The `str.split` method returns a column of `list`s by default; to place the strings into separate `str` columns we use the `expand=True` argument.\n", - "\n", - "" + "The opposite operation, splitting one column into multiple columns based on a separator string, is done using the `.str.split` method.\n", + "As a result, we go back to the previous state of two separate `continent` and `region_un` columns (only that their position is now last, since they are newly created).\n", + "The `str.split` method returns a column of `list`s by default; to place the strings into separate `str` columns we use the `expand=True` argument." 
]
  },
  {
@@ -784,7 +738,7 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "To change all column names at once, we assign a `list` of the \"new\" column names into the `.columns` property.\n",
+    "To change all column names at once, we assign a `list` of the 'new' column names into the `.columns` property.\n",
    "The `list` must be of the same length as the number of columns (i.e., `world.shape[1]`).\n",
    "This is illustrated below, which outputs the same `world2` object, but with very short names."
   ]
  },
  {
@@ -852,10 +806,7 @@
    "\n",
    "Raster cell values can be considered the counterpart of vector attribute values. \n",
    "In this section, we cover operations that deal with raster values in a similar way, namely as a series of numbers. \n",
-    "This type of operations include subsetting raster values (@sec-raster-subsetting) and calculating global summaries of raster values (@sec-summarizing-raster-objects).\n",
-    "\n",
-    "\n",
-    "\n",
+    "This type of operation includes subsetting raster values (@sec-raster-subsetting) and calculating global summaries of raster values (@sec-summarizing-raster-objects).\n",
    "\n",
    "### Raster subsetting {#sec-raster-subsetting}\n",
    "\n",
  {
   "cell_type": "code",
   "metadata": {},
   "source": [
-    "elev[1, 2] ## Value at row 2, column 3"
+    "elev[1, 2]"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "::: callout-note\n",
-    "You can see that the above array is three-dimensional according to the number of brackets `[`, or check explicitly using `.shape` or [`.ndim`](https://numpy.org/doc/stable/reference/generated/numpy.ndarray.ndim.html).\n",
+    "You can see that the above array is three-dimensional according to the number of brackets `[`, or check explicitly using `.shape` or `.ndim`.\n",
    ":::\n",
    "\n",
    "In three-dimensional arrays, we access cell values using three indices, keeping in mind that the dimensions order is `(layers, rows, columns)`.\n",
-    "For example, to get the same value shown above, at row 2, column 3 (at band 1), we use `elev[0,1,2]` returns instead of `elev[1,2]`."
+    "For example, to get the same value shown above, at row 2, column 3 (at band 1), we use `elev[0,1,2]` instead of `elev[1,2]`."
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "source": [
-    "elev3d[0, 1, 2] ## Value at band 1, row 2, column 3"
+    "elev3d[0, 1, 2] "
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "\n",
-    "\n",
-    "\n",
    "### Summarizing raster objects {#sec-summarizing-raster-objects}\n",
    "\n",
    "Global summaries of raster values can be calculated by applying **numpy** summary functions on the array with raster values, e.g., `np.mean`."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "Note that \"No Data\"-safe functions--such as `np.nanmean`---should be used in case the raster contains \"No Data\" values which need to be ignored.\n",
-    "Before we can demonstrate that, we must convert the array from `int` to `float`, as `int` arrays cannot contain `np.nan` (due to [computer memory limitations](https://en.wikipedia.org/wiki/NaN#Integer_NaN)).\n",
-    "\n",
-    ""
+    "Note that 'No Data'-safe functions---such as `np.nanmean`---should be used in case the raster contains 'No Data' values which need to be ignored.\n",
+    "Before we can demonstrate that, we must convert the array from `int` to `float`, as `int` arrays cannot contain `np.nan` (due to computer memory limitations).\n",
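+    "\n",
+    "For example, one way to do the conversion (a sketch; the book may keep a separate copy of the array instead of overwriting it):\n",
+    "\n",
+    "```python\n",
+    "elev = elev.astype('float64')  ## float arrays can hold np.nan\n",
+    "```"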
]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now we can insert an `np.nan` value into the array, for example to a cell located in the first row and third column.\n",
-    "(Trying to do so in the original `elev` array raises an error, because an `int` array cannot accomodate `np.nan`, as mentioned above; try it to see for yourself.)\n",
-    "\n",
-    ""
+    "(Doing so in the original `elev` array raises an error, because an `int` array cannot accommodate `np.nan`, as mentioned above; try it to see for yourself.)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "With the `np.nan` value inplace, the summary value becomes unknown (`np.nan`)."
+    "With the `np.nan` value in place, the `np.mean` summary value becomes unknown (`np.nan`)."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "To get a summary of all non-missing values, we need to use the specialized **numpy** functions that ignore \"No Data\" values:"
+    "To get a summary of all non-missing values, we need to use one of the specialized **numpy** functions that ignore 'No Data' values, such as `np.nanmean`:"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Raster value statistics can be visualized in a variety of ways.\n",
-    "One approach is to \"flatten\" the raster values into a one-dimensional array (`flatten`), then use a graphical function such as [`plt.hist`](https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.hist.html) or [`plt.boxplot`](https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.boxplot.html) (from **matplotlib.pyplot**).\n",
+    "One approach is to 'flatten' the raster values into a one-dimensional array (using `.flatten`), then use a graphical function such as `plt.hist` or `plt.boxplot` (from **matplotlib.pyplot**).\n",
    "For example, the following code section shows the distribution of values in `elev` using a histogram (@fig-raster-hist)."
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "source": [
    "#| label: fig-raster-hist\n",
-    "#| fig-cap: Distribution of cell values in continuous raster (`elev.tif`)\n",
+    "#| fig-cap: Distribution of cell values in a continuous raster (`elev.tif`)\n",
    "plt.hist(elev.flatten());"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "\n",
-    "\n",
-    "\n",
    "To summarize the distribution of values in a categorical raster, we can calculate the frequencies of unique values, and draw them using a barplot. \n",
    "Let's demonstrate using the `grain.tif` small categorical raster. "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "To calculate the frequency of unique values in an array, we use the [`np.unique`](https://numpy.org/doc/stable/reference/generated/numpy.unique.html) with the `return_counts=True` option. \n",
+    "To calculate the frequency of unique values in an array, we use the `np.unique` function with the `return_counts=True` option. \n",
    "The result is a `tuple` with two corresponding arrays: the unique values, and their counts."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "These two arrays can be passed to the [`plt.bar`](https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.bar.html) function to draw a barplot, as shown in @fig-raster-bar."
+    "These two arrays can be passed to the `plt.bar` function to draw a barplot, as shown in @fig-raster-bar.\n",
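+    "\n",
+    "Schematically, the two steps just described amount to (a sketch):\n",
+    "\n",
+    "```python\n",
+    "values, counts = np.unique(grain, return_counts=True)\n",
+    "plt.bar(values, counts)\n",
+    "```"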
] }, { @@ -1137,17 +1078,15 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Exercises\n", - "\n", - "## References" + "\n" ] } ], "metadata": { "kernelspec": { - "name": "python3", + "display_name": "Python 3", "language": "python", - "display_name": "Python 3 (ipykernel)" + "name": "python3" } }, "nbformat": 4, diff --git a/ipynb/03-spatial-operations.ipynb b/ipynb/03-spatial-operations.ipynb index 84980c9f..e9332a5a 100644 --- a/ipynb/03-spatial-operations.ipynb +++ b/ipynb/03-spatial-operations.ipynb @@ -4,6 +4,10 @@ "cell_type": "markdown", "metadata": {}, "source": [ + "---\n", + "jupyter: python3\n", + "---\n", + "\n", "# Spatial data operations {#sec-spatial-operations}\n", "\n", "## Prerequisites {.unnumbered}" @@ -14,12 +18,17 @@ "metadata": {}, "source": [ "#| echo: false\n", - "import matplotlib.pyplot as plt\n", - "import pandas as pd\n", - "pd.set_option('display.max_rows', 4)\n", - "pd.set_option('display.max_columns', 6)\n", - "pd.options.display.max_colwidth = 35\n", - "plt.rcParams['figure.figsize'] = (5, 5)" + "import book_options" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "#| echo: false\n", + "import book_options_pdf" ], "execution_count": null, "outputs": [] @@ -37,6 +46,8 @@ "source": [ "import os\n", "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import pandas as pd\n", "import scipy.ndimage\n", "import scipy.stats\n", "import shapely\n", @@ -53,7 +64,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "It also relies on the following data files:" + "It also relies on the following data files: " ] }, { @@ -98,14 +109,20 @@ "source": [ "## Introduction\n", "\n", - "\n", - "\n", - "\n", - "Spatial operations, including spatial joins between vector datasets and local and focal operations on raster datasets, are a vital part of geocomputation. This chapter shows how spatial objects can be modified in a multitude of ways based on their location and shape. Many spatial operations have a non-spatial (attribute) equivalent, so concepts such as subsetting and joining datasets demonstrated in the previous chapter are applicable here. This is especially true for vector operations: @sec-vector-attribute-manipulation on vector attribute manipulation provides the basis for understanding its spatial counterpart, namely spatial subsetting (covered in @sec-spatial-subsetting-vector). Spatial joining (@sec-spatial-joining) and aggregation (@sec-vector-spatial-aggregation) also have non-spatial counterparts, covered in the previous chapter.\n", + "Spatial operations, including spatial joins between vector datasets and local and focal operations on raster datasets, are a vital part of geocomputation. \n", + "This chapter shows how spatial objects can be modified in a multitude of ways based on their location and shape. Many spatial operations have a non-spatial (attribute) equivalent, so concepts such as subsetting and joining datasets demonstrated in the previous chapter are applicable here.\n", + "This is especially true for vector operations: @sec-vector-attribute-manipulation on vector attribute manipulation provides the basis for understanding its spatial counterpart, namely spatial subsetting (covered in @sec-spatial-subsetting-vector). 
\n", + "Spatial joining (@sec-spatial-joining) and aggregation (@sec-vector-spatial-aggregation) also have non-spatial counterparts, covered in the previous chapter.\n", "\n", - "Spatial operations differ from non-spatial operations in a number of ways, however. Spatial joins, for example, can be done in a number of ways---including matching entities that intersect with or are within a certain distance of the target dataset---while the attribution joins discussed in @sec-vector-attribute-joining in the previous chapter can only be done in one way. Different types of spatial relationship between objects, including intersects and disjoint, are described in @sec-topological-relations. Another unique aspect of spatial objects is distance: all spatial objects are related through space, and distance calculations can be used to explore the strength of this relationship, as described in the context of vector data in @sec-distance-relations.\n", + "Spatial operations differ from non-spatial operations in a number of ways, however. \n", + "Spatial joins, for example, can be done in a number of ways---including matching entities that intersect with or are within a certain distance of the target dataset---while the attribution joins discussed in @sec-vector-attribute-joining in the previous chapter can only be done in one way.\n", + "Different types of spatial relationships between objects, including intersects and disjoints, are described in @sec-topological-relations.\n", + "Another unique aspect of spatial objects is distance: all spatial objects are related through space, and distance calculations can be used to explore the strength of this relationship, as described in the context of vector data in @sec-distance-relations.\n", "\n", - "Spatial operations on raster objects include subsetting---covered in @sec-spatial-subsetting-raster---and merging several raster 'tiles' into a single object, as demonstrated in @sec-merging-rasters. Map algebra covers a range of operations that modify raster cell values, with or without reference to surrounding cell values. The concept of map algebra, vital for many applications, is introduced in @sec-map-algebra; local, focal and zonal map algebra operations are covered in sections @sec-raster-local-operations, @sec-focal-operations, and @sec-zonal-operations, respectively. Global map algebra operations, which generate summary statistics representing an entire raster dataset, and distance calculations on rasters, are discussed in Section @sec-global-operations-and-distances. In the final section (@sec-merging-rasters) the process of merging two raster datasets is discussed and demonstrated with reference to a reproducible example.\n", + "Spatial operations on raster objects include subsetting---covered in @sec-spatial-subsetting-raster---and merging several raster 'tiles' into a single object, as demonstrated in @sec-merging-rasters.\n", + "Map algebra covers a range of operations that modify raster cell values, with or without reference to surrounding cell values.\n", + "The concept of map algebra, vital for many applications, is introduced in @sec-map-algebra; local, focal, and zonal map algebra operations are covered in sections @sec-raster-local-operations, @sec-focal-operations, and @sec-zonal-operations, respectively. 
\n", + "Global map algebra operations, which generate summary statistics representing an entire raster dataset, and distance calculations on rasters, are discussed in Section @sec-global-operations-and-distances.\n", "\n", "::: callout-note\n", "It is important to note that spatial operations that use two spatial objects rely on both objects having the same coordinate reference system, a topic that was introduced in @sec-coordinate-reference-systems-intro and which will be covered in more depth in @sec-reproj-geo-data. \n", @@ -113,13 +130,8 @@ "\n", "## Spatial operations on vector data {#sec-spatial-vec}\n", "\n", - "\n", - "\n", - "\n", "This section provides an overview of spatial operations on vector geographic data represented as Simple Features using the **shapely** and **geopandas** \n", "packages. \n", - "\n", - "\n", "@sec-spatial-ras then presents spatial operations on raster datasets, using the **rasterio** and **scipy** packages.\n", "\n", "### Spatial subsetting {#sec-spatial-subsetting-vector}\n", @@ -129,7 +141,7 @@ "\n", "Spatial subsetting is the process of taking a spatial object and returning a new object containing only features that relate in space to another object.\n", "Analogous to attribute subsetting (covered in @sec-vector-attribute-subsetting), subsets of `GeoDataFrame`s can be created with square bracket (`[`) operator using the syntax `x[y]`, where `x` is an `GeoDataFrame` from which a subset of rows/features will be returned, and `y` is a boolean `Series`.\n", - "The difference is, that, in spatial subsetting `y` is created based on another geometry and using one of the binary geometry relation methods, such as `.intersects` (see @sec-topological-relations), rather based on comparison based on ordinary columns.\n", + "The difference is, that, in spatial subsetting `y` is created based on another geometry and using one of the binary geometry relation methods, such as `.intersects` (see @sec-topological-relations), rather than based on comparison based on ordinary columns.\n", "\n", "To demonstrate spatial subsetting, we will use the `nz` and `nz_height` layers, which contain geographic data on the 16 main regions and 101 highest points in New Zealand, respectively (@fig-spatial-subset (a)), in a projected coordinate system.\n", "The following expression creates a new object, `canterbury`, representing only one region --- Canterbury." @@ -149,8 +161,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Then, we use the `.intersects` method evaluate, for each of the `nz_height` points, whether they intersect with Canterbury.\n", - "The result `canterbury_height` is a boolean `Series` with the \"answers\"." + "Then, we use the `.intersects` method to evaluate, for each of the `nz_height` points, whether they intersect with Canterbury.\n", + "The result `canterbury_height` is a boolean `Series` with the 'answers'." 
] }, { @@ -216,7 +228,7 @@ "source": [ "Like in attribute subsetting (@sec-vector-attribute-subsetting), we are using a boolean series (`sel`), of the same length as the number of rows in the filtered table (`nz_height`), created based on a condition applied on itself.\n", "The difference is that the condition is not a comparison of attribute values, but an evaluation of a spatial relation.\n", - "Namely, we evaluate whether each geometry of `nz_height` intersects with `canterbury` geometry, using the `.intersects` method.\n", + "Namely, we evaluate whether each geometry of `nz_height` intersects with the `canterbury` geometry, using the `.intersects` method.\n", "\n", "Various topological relations can be used for spatial subsetting which determine the type of spatial relationship that features in the target object must have with the subsetting object to be selected.\n", "These include touches, crosses, or within, as we will see shortly in @sec-topological-relations.\n", @@ -267,11 +279,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "In case we need to subset according to several geometries at once, e.g., find out which points intersect with both Canterbury and Southland, we can dissolve the filtering subset, using `.unary_union`, before applying the `.intersects` (or any other) operator.\n", + "In case we need to subset according to several geometries at once, e.g., find out which points intersect with both Canterbury and Southland, we can dissolve the filtering subset, using `.union_all`, before applying the `.intersects` (or any other) operator.\n", "For example, here is how we can subset the `nz_height` points which intersect with Canterbury or Southland.\n", - "(Note that we are also using the `.isin` method, as demonstrated in the end of @sec-vector-attribute-subsetting.)\n", - "\n", - "" + "(Note that we are also using the `.isin` method, as demonstrated at the end of @sec-vector-attribute-subsetting.)" ] }, { @@ -279,34 +289,13 @@ "metadata": {}, "source": [ "canterbury_southland = nz[nz['Name'].isin(['Canterbury', 'Southland'])]\n", - "sel = nz_height.intersects(canterbury_southland.unary_union)\n", + "sel = nz_height.intersects(canterbury_southland.union_all())\n", "canterbury_southland_height = nz_height[sel]\n", "canterbury_southland_height" ], "execution_count": null, "outputs": [] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "\n", - "" - ] - }, - { - "cell_type": "code", - "metadata": {}, - "source": [ - "#| eval: false\n", - "#| echo: false\n", - "nz_height.overlay(canterbury_southland) " - ], - "execution_count": null, - "outputs": [] - }, { "cell_type": "markdown", "metadata": {}, @@ -319,7 +308,7 @@ "metadata": {}, "source": [ "#| label: fig-spatial-subset2\n", - "#| fig-cap: Spatial subsetting of points by intersection with more that one polygon\n", + "#| fig-cap: Spatial subsetting of points by intersection with more than one polygon\n", "#| fig-subcap: \n", "#| - Original points (red)\n", "#| - Spatial subset based on intersection (red), geometry used for subsetting (Canterbury and Southland) (grey)\n", @@ -339,31 +328,31 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The next section further explores different types of spatial relation, also known as binary predicates (of which `.intersects` and `.disjoint` are two examples), that can be used to identify whether or not two features are spatially related.\n", + "The next section further explores different types of spatial relations, also known as binary 
predicates (of which `.intersects` and `.disjoint` are two examples), that can be used to identify whether or not two features are spatially related.\n", "\n", "### Topological relations {#sec-topological-relations}\n", "\n", "Topological relations describe the spatial relationships between objects.\n", - "\"Binary topological relationships\", to give them their full name, are logical statements (in that the answer can only be `True` or `False`) about the spatial relationships between two objects defined by ordered sets of points (typically forming points, lines and polygons) in two or more dimensions [@egenhofer_mathematical_1990].\n", + "'Binary topological relationships', to give them their full name, are logical statements (in that the answer can only be `True` or `False`) about the spatial relationships between two objects defined by ordered sets of points (typically forming points, lines, and polygons) in two or more dimensions [@egenhofer_mathematical_1990].\n", "That may sound rather abstract and, indeed, the definition and classification of topological relations is based on mathematical foundations first published in book form in 1966 [@spanier_algebraic_1995], with the field of algebraic topology continuing into the 21st century [@dieck_algebraic_2008].\n", "\n", "Despite their mathematical origins, topological relations can be understood intuitively with reference to visualizations of commonly used functions that test for common types of spatial relationships.\n", "@fig-spatial-relations shows a variety of geometry pairs and their associated relations.\n", "The third and fourth pairs in @fig-spatial-relations (from left to right and then down) demonstrate that, for some relations, order is important: while the relations equals, intersects, crosses, touches and overlaps are symmetrical, meaning that if `x.relation(y)` is true, `y.relation(x)` will also be true, relations in which the order of the geometries are important such as contains and within are not.\n", - "\n", - "\n", "\n", "::: callout-note\n", - "Notice that each geometry pair has a [\"DE-9IM\"](https://en.wikipedia.org/wiki/DE-9IM) string such as `FF2F11212`.\n", + "Notice that each geometry pair has a 'DE-9IM'[^de-9im] string such as `FF2F11212`.\n", "DE-9IM strings describe the dimensionality (0=points, 1=lines, 2=polygons) of the pairwise intersections of the interior, boundary, and exterior, of two geometries (i.e., nine values of 0/1/2 encoded into a string).\n", "This is an advanced topic beyond the scope of this book, which can be useful to understand the difference between relation types, or define custom types of relations.\n", - "See the [DE-9IM strings](https://r.geocompx.org/spatial-operations#de-9im-strings) section in Geocomputation with R [@lovelace_geocomputation_2019]. \n", - "Also note that the **shapely** package contains the `.relate` and `.relate_pattern` [methods](https://shapely.readthedocs.io/en/stable/manual.html#de-9im-relationships), for derive and test for DE-9IM patterns, respectively.\n", + "See the DE-9IM strings section in Geocomputation with R [@lovelace_geocomputation_2019]. \n", + "Also note that the **shapely** package contains the `.relate` and `.relate_pattern` methods, to derive and to test for DE-9IM patterns, respectively.\n", ":::\n", "\n", - "![Topological relations between vector geometries, inspired by Figures 1 and 2 in Egenhofer and Herring (1990). 
The relations for which the `x.relation(y)` is true are printed for each geometry pair, with `x` represented in pink and `y` represented in blue. The nature of the spatial relationship for each pair is described by the Dimensionally Extended 9-Intersection Model string.](https://r.geocompx.org/04-spatial-operations_files/figure-html/relations-1.png){#fig-spatial-relations}\n", "\n", - "In **shapely**, methods testing for different types of topological relations are known as [\"relationships\"](https://shapely.readthedocs.io/en/stable/manual.html#relationships).\n", + "[^de-9im]: [https://en.wikipedia.org/wiki/DE-9IM](https://en.wikipedia.org/wiki/DE-9IM)\n", "\n", + "![Topological relations between vector geometries, inspired by Figures 1 and 2 in @egenhofer_mathematical_1990. The relations for which the `x.relation(y)` is true are printed for each geometry pair, with `x` represented in pink and `y` represented in blue. The nature of the spatial relationship for each pair is described by the Dimensionally Extended 9-Intersection Model string.](images/relations-1.png){#fig-spatial-relations}\n", "\n", + "In **shapely**, methods testing for different types of topological relations are known as 'relationships'.\n", "**geopandas** provides their wrappers (with the same method name) which can be applied on multiple geometries at once (such as `.intersects` and `.disjoint` applied on all points in `nz_height`, see @sec-spatial-subsetting-vector).\n", "To see how topological relations work in practice, let's create a simple reproducible example, building on the relations illustrated in @fig-spatial-relations and consolidating knowledge of how vector geometries are represented from a previous chapter (@sec-geometry-columns and @sec-geometries)." ] }, @@ -392,9 +381,7 @@ "metadata": {}, "source": [ "The sample dataset which we created is composed of three `GeoSeries`, named `points`, `line`, and `poly`, which are visualized in @fig-spatial-relations-geoms.\n", - "The last expression is a `for` loop used to add text labels (`1`, `2`, and `3`) to identify the points; we are going to explain the concepts of text annotations with **geopandas** `.plot` in @sec-plot-static-labels.\n", - "\n", - "" + "The last expression is a `for` loop used to add text labels (`0`, `1`, and `2`) to identify the points; we are going to explain the concepts of text annotations with **geopandas** `.plot` in @sec-plot-static-labels." ] }, { @@ -402,7 +389,7 @@ "metadata": {}, "source": [ "#| label: fig-spatial-relations-geoms\n", - "#| fig-cap: Points, line and polygon objects arranged to illustrate topological relations\n", + "#| fig-cap: Points (`points`), line (`line`), and polygon (`poly`) objects used to illustrate topological relations\n", "base = poly.plot(color='lightgrey', edgecolor='red')\n", "line.plot(ax=base, color='black', linewidth=7)\n", "points.plot(ax=base, color='none', edgecolor='black')\n", @@ -420,7 +407,7 @@ "metadata": {}, "source": [ "A simple query is: which of the points in `points` intersect in some way with polygon `poly`?\n", - "The question can be answered by visual inspection (points 1 and 3 are touching and are within the polygon, respectively).\n", + "The question can be answered by visual inspection (points `0` and `2` are touching and are within the polygon, respectively).\n", "Alternatively, we can get the solution with the `.intersects` method, which reports whether or not each geometry in a `GeoSeries` (`points`) intersects with a single `shapely` geometry (`poly.iloc[0]`)."
] }, @@ -438,17 +425,17 @@ "metadata": {}, "source": [ "The result shown above is a boolean `Series`.\n", - "Its contents should match our intuition: positive (`True`) results are returned for the first and third point, and a negative result (`False`) for the second.\n", + "Its contents should match our intuition: positive (`True`) results are returned for the points `0` and `2`, and a negative result (`False`) for point `1`.\n", "Each value in this `Series` represents a feature in the first input (`points`).\n", "\n", - "All earlier examples in this chapter demonstrate the \"many-to-one\" mode of `.intersects` and analogous methods, where the relation is evaluated between each of several geometries in a `GeoSeries`/`GeoDataFrame`, and an individual `shapely` geometry.\n", + "All earlier examples in this chapter demonstrate the 'many-to-one' mode of `.intersects` and analogous methods, where the relation is evaluated between each of several geometries in a `GeoSeries`/`GeoDataFrame`, and an individual `shapely` geometry.\n", "A second mode of those methods (not demonstrated here) is when both inputs are `GeoSeries`/`GeoDataFrame` objects.\n", - "In such case, a \"pairwise\" evaluation takes place between geometries aligned by index (`align=True`, the default) or by position (`align=False`).\n", + "In such case, a 'pairwise' evaluation takes place between geometries aligned by index (`align=True`, the default) or by position (`align=False`).\n", "For example, the expression `nz.intersects(nz)` returns a `Series` of 16 `True` values, indicating (unsurprisingly) that each geometry in `nz` intersects with itself.\n", "\n", - "A third mode is when we are interested in a \"many-to-many\" evaluation, i.e., obtaining a matrix of all pairwise combinations of geometries from two `GeoSeries` objects.\n", + "A third mode is when we are interested in a 'many-to-many' evaluation, i.e., obtaining a matrix of all pairwise combinations of geometries from two `GeoSeries` objects.\n", "At the time of writing, there is no built-in method to do this in **geopandas**.\n", - "However, the [`.apply`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.apply.html) method can be used to repeat a \"many-to-one\" evaluation over all geometries in the second layer, resulting in a matrix of *pairwise* results.\n", + "However, the `.apply` method (package **pandas**) can be used to repeat a 'many-to-one' evaluation over all geometries in the second layer, resulting in a matrix of *pairwise* results.\n", "We will create another `GeoSeries` with two polygons, named `poly2`, to demonstrate this." ] }, @@ -492,11 +479,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Now we can use to get the intersection relations matrix.\n", - "\n", - "\n", - "The result is a `DataFrame`, where each row represents a `points` geometry and each column represents a `poly` geometry.\n", - "We can see that the first point intersects with both polygons, while the second and third points intersect with one of the polygons each." + "Now we can use `.apply` to get the intersection relations matrix.\n", + "The result is a `DataFrame`, where each row represents a `points` geometry and each column represents a `poly2` geometry.\n", + "We can see that the point `0` intersects with both polygons, while points `1` and `2` intersect with one of the polygons each." 
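+ "\n",
+ "::: callout-note\n",
+ "As a minimal sketch of the pairwise mode mentioned above (the geometries here are arbitrary, for illustration only), two aligned `GeoSeries` are evaluated element by element:\n",
+ "\n",
+ "```python\n",
+ "a = gpd.GeoSeries([shapely.Point(0, 0), shapely.Point(1, 1)])\n",
+ "b = gpd.GeoSeries([shapely.Point(0, 0), shapely.Point(2, 2)])\n",
+ "a.intersects(b, align=False)  # pairwise, by position: [True, False]\n",
+ "```\n",
+ ":::"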
] }, { @@ -513,7 +498,7 @@ "metadata": {}, "source": [ "::: callout-note\n", - "The [`.apply`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.apply.html) method (package **pandas**) is used to apply a function along one of the axes of a `DataFrame` (or `GeoDataFrame`).\n", + "The `.apply` method (package **pandas**) is used to apply a function along one of the axes of a `DataFrame` (or `GeoDataFrame`).\n", "That is, we can apply a function on all rows (`axis=1`) or all columns (`axis=0`, the default). \n", "When the function being applied returns a single value, the output of `.apply` is a `Series` (e.g., `.apply(len)` returns the lengths of all columns, because `len` returns a single value). \n", "When the function returns a `Series`, then `.apply` returns a `DataFrame` (such as in the above example).\n", ":::\n", "\n", "::: callout-note\n", @@ -521,9 +506,7 @@ "Since the above result, like any pairwise matrix, (1) is composed of values of the same type, and (2) has no contrasting role for rows and columns, it may be more convenient to use a plain **numpy** array to work with it. \n", - "In such case, we can use the [`.to_numpy`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_numpy.html) method to go from `DataFrame` to `ndarray`.\n", - "\n", - "" + "In such case, we can use the `.to_numpy` method to go from `DataFrame` to `ndarray`." ] }, { @@ -541,7 +524,7 @@ "source": [ ":::\n", "\n", - "The `.intersects` method returns `True` even in cases where the features just touch: intersects is a 'catch-all' topological operation which identifies many types of spatial relation, as illustrated in @fig-spatial-relations.\n", + "The `.intersects` method returns `True` even in cases where the features just touch: intersects is a 'catch-all' topological operation which identifies many types of spatial relations, as illustrated in @fig-spatial-relations.\n", "More restrictive questions include: which points lie within the polygon, and which features are on or contain a shared boundary with it?\n", "The first question can be answered with `.within`, and the second with `.touches`." ] }, @@ -568,7 +551,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Note that although the first point touches the boundary polygon, it is not within it; the third point is within the polygon but does not touch any part of its border.\n", + "Note that although point `0` touches the boundary of the polygon, it is not within it; point `2` is within the polygon but does not touch any part of its border.\n", "The opposite of `.intersects` is `.disjoint`, which returns only objects that do not spatially relate in any way to the selecting object." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ @@ -585,7 +568,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Another useful type of relation is \"within distance\", where we detect features that intersect with the target buffered by particular distance.\n", + "Another useful type of relation is 'within distance', where we detect features that intersect with the target buffered by a particular distance.\n", "Buffer distance determines how close target objects need to be before they are selected.\n", "This can be done by literally buffering (@sec-geometries) the target geometry, and evaluating intersection (`.intersects`).\n", "Another way is to calculate the distances using the `.distance` method, and then evaluate whether they are within a threshold distance."
@@ -604,11 +587,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "\n", - "\n", - "Note that although the second point is more than `0.2` units of distance from the nearest vertex of `poly`, it is still selected when the distance is set to `0.2`.\n", - "This is because distance is measured to the nearest edge, in this case the part of the polygon that lies directly above point 2 in Figure @fig-spatial-relations.\n", - "We can verify the actual distance between the second point and the polygon is `0.13`, as follows." + "Note that although point `1` is more than `0.2` units of distance from the nearest vertex of `poly`, it is still selected when the distance is set to `0.2`.\n", + "This is because distance is measured to the nearest edge, in this case, the part of the polygon that lies directly above point `1` in @fig-spatial-relations-geoms.\n", + "We can verify that the actual distance between point `1` and the polygon is `0.13`, as follows." ] }, { @@ -624,7 +605,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "This is also a good opportunity to repeat that all distance-related calculations in **gopandas** (and **shapely**) assume planar geometry, and only take into account the coordinate values. It is up to the user to make sure that all input layers are in the same projected CRS, so that this type of calculations make sense (see @sec-geometry-operations-on-projected-and-unprojected-data and @sec-when-to-reproject).\n", + "This is also a good opportunity to repeat that all distance-related calculations in **geopandas** (and **shapely**) assume planar geometry, and only take into account the coordinate values. It is up to the user to make sure that all input layers are in the same projected CRS, so that these types of calculations make sense (see @sec-geometry-operations-on-projected-and-unprojected-data and @sec-when-to-reproject).\n", "\n", "### Spatial joining {#sec-spatial-joining}\n", "\n", @@ -634,9 +615,7 @@ "\n", "The following example illustrates the process: imagine you have ten points randomly distributed across the Earth's surface and you ask, for the points that are on land, which countries are they in?\n", "Implementing this idea in a reproducible example will build your geographic data handling skills and show how spatial joins work.\n", - "The starting point is to create points that are randomly scattered over the planar surface that represents Earth's geographic coordinates, in decimal degrees (@fig-spatial-join (a)).\n", - "\n", - "" + "The starting point is to create points that are randomly scattered over the planar surface that represents Earth's geographic coordinates, in decimal degrees (@fig-spatial-join (a))." ] }, { @@ -658,7 +637,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The scenario illustrated in @fig-spatial-join shows that the `random_points` object (top left) lacks attribute data, while the world (top right) has attributes, including country names shown for a sample of countries in the legend.\n", + "The scenario illustrated in @fig-spatial-join shows that the `random_points` object (top left) lacks attribute data, while the world (top right) has attributes, including country names that are shown for a sample of countries in the legend.\n", "Before creating the joined dataset, we use spatial subsetting to create `world_random`, which contains only countries that contain random points, to verify that the number of country names returned in the joined dataset should be four (see the top right panel of @fig-spatial-join (b))."
] }, @@ -666,7 +645,7 @@ "cell_type": "code", "metadata": {}, "source": [ - "world_random = world[world.intersects(random_points.unary_union)]\n", + "world_random = world[world.intersects(random_points.union_all())]\n", "world_random" ], "execution_count": null, "outputs": [] }, @@ -730,7 +709,7 @@ "\n", "Sometimes two geographic datasets do not touch but still have a strong geographic relationship.\n", "The datasets `cycle_hire` and `cycle_hire_osm` provide a good example.\n", - "Plotting them reeveals that they are often closely related but they do not seem to touch, as shown in @fig-cycle-hire." + "Plotting them reveals that they are often closely related but they do not seem to touch, as shown in @fig-cycle-hire." ] }, @@ -788,11 +767,7 @@ "source": [ "This is when a non-overlapping join is needed.\n", "Spatial join (`gpd.sjoin`) along with buffered geometries (see @sec-buffers) can be used to do that, as demonstrated below using a threshold distance of 20 $m$.\n", - "\n", - "\n", - "Note that we transform the data to a projected CRS (`27700`) to use real buffer distances, in meters (see @sec-geometry-operations-on-projected-and-unprojected-data).\n", - "\n", - "" + "Note that we transform the data to a projected CRS (`27700`) to use real buffer distances, in meters (see @sec-geometry-operations-on-projected-and-unprojected-data)." ] }, { @@ -802,7 +777,11 @@ "metadata": {}, "source": [ "crs = 27700\n", "cycle_hire_buffers = cycle_hire.copy().to_crs(crs)\n", "cycle_hire_buffers.geometry = cycle_hire_buffers.buffer(20)\n", - "cycle_hire_buffers = gpd.sjoin(cycle_hire_buffers, cycle_hire_osm.to_crs(crs))\n", + "cycle_hire_buffers = gpd.sjoin(\n", + "    cycle_hire_buffers, \n", + "    cycle_hire_osm.to_crs(crs), \n", + "    how='left'\n", + ")\n", "cycle_hire_buffers" ], "execution_count": null, "outputs": [] }, @@ -815,11 +794,7 @@ "source": [ "Note that the number of rows in the joined result is greater than the target.\n", "This is because some cycle hire stations in `cycle_hire_buffers` have multiple matches in `cycle_hire_osm`.\n", "To aggregate the values for the overlapping points and return the mean, we can use the aggregation methods shown in @sec-vector-attribute-aggregation, resulting in an object with the same number of rows as the target.\n", - "We also go back from buffers to points using `.centroid` method.\n", - "\n", - "\n", - "" + "We also go back from buffers to points using the `.centroid` method."
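+ "\n",
+ "::: callout-note\n",
+ "A minimal sketch of these two steps, assuming (for illustration only) that the station identifier column is named `'id'` and the joined numeric attribute of interest is `'capacity'`:\n",
+ "\n",
+ "```python\n",
+ "# Mean of the joined 'capacity' values per station 'id' (column names assumed)\n",
+ "capacity = cycle_hire_buffers.groupby('id')[['capacity']].mean().reset_index()\n",
+ "# Replace the buffer geometries with their centroids, i.e., back to points\n",
+ "cycle_hire_buffers.geometry = cycle_hire_buffers.centroid\n",
+ "```\n",
+ ":::"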
] }, { @@ -850,7 +825,7 @@ "#| fig-cap: Non-overlapping join\n", "#| fig-subcap: \n", "#| - Input (`cycle_hire_osm`)\n", - "#| - Join result (`z`)\n", + "#| - Join result (`cycle_hire_buffers`)\n", "#| layout-ncol: 2\n", "# Input\n", "fig, ax = plt.subplots(1, 1, figsize=(6, 3))\n", @@ -912,7 +887,7 @@ "cell_type": "code", "metadata": {}, "source": [ - "nz_height2 = nz_height2.groupby('Name')[['elevation']].mean()\n", + "nz_height2 = nz_height2.groupby('Name')[['elevation']].mean().reset_index()\n", "nz_height2" ], "execution_count": null, "outputs": [] }, @@ -939,10 +914,10 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We now have create the `nz_height4` layer, which gives the average `nz_height` elevation value per polygon.\n", + "We have now created the `nz2` layer, which gives the average `nz_height` elevation value per polygon.\n", "The result is shown in @fig-nz-avg-nz-height.\n", "Note that the `missing_kwds` part determines the style of geometries where the symbology attribute (`elevation`) is missing, because there were no `nz_height` points overlapping with them.\n", - "The default is to omit them, which is usually not what we want, but with `{'color':'none','edgecolor':'black'}`, those polygons are shown with black outline and no fill." + "The default is to omit them, which is usually not what we want, but with `{'color':'grey','edgecolor':'black'}`, those polygons are shown with a black outline and grey fill." ] }, { @@ -955,7 +930,7 @@ "source": [ "    column='elevation', \n", "    legend=True,\n", "    cmap='Blues', edgecolor='black',\n", - "    missing_kwds={'color': 'none', 'edgecolor': 'black'}\n", + "    missing_kwds={'color': 'grey', 'edgecolor': 'black'}\n", ");" ], "execution_count": null, "outputs": [] }, @@ -967,24 +942,17 @@ "source": [ "### Joining incongruent layers {#sec-joining-incongruent-layers}\n", "\n", - "\n", - "\n", - "\n", "Spatial congruence is an important concept related to spatial aggregation.\n", "An aggregating object (which we will refer to as `y`) is congruent with the target object (`x`) if the two objects have shared borders.\n", "Often this is the case for administrative boundary data, whereby larger units---such as Middle Layer Super Output Areas (MSOAs) in the UK, or districts in many other European countries---are composed of many smaller units.\n", "\n", "Incongruent aggregating objects, by contrast, do not share common borders with the target [@qiu_development_2012].\n", "This is problematic for spatial aggregation (and other spatial operations), as illustrated in @fig-nz-and-grid: aggregating the centroid of each sub-zone will not return accurate results.\n", - "Areal interpolation overcomes this issue by transferring values from one set of areal units to another, using a range of algorithms including simple area weighted approaches and more sophisticated approaches such as 'pycnophylactic' methods [@tobler_smooth_1979].\n", + "Areal interpolation overcomes this issue by transferring values from one set of areal units to another, using a range of algorithms including simple area-weighted approaches and more sophisticated approaches such as 'pycnophylactic' methods [@tobler_smooth_1979].\n", "\n", - "To demonstrate joining incongruent layers, we will create a \"synthetic\" layer comprising a [regular grid](https://gis.stackexchange.com/questions/322589/rasterizing-polygon-grid-in-python-geopandas-rasterio) of rectangles of size $100\times100$ $km$, covering the extent of the `nz` layer. 
\n", + "To demonstrate joining incongruent layers, we will create a 'synthetic' layer comprising a regular grid of rectangles of size $100\\times100$ $km$, covering the extent of the `nz` layer. \n", "This recipe can be used to create a regular grid covering any given layer (other than `nz`), at the specified resolution (`res`). \n", - "Most of the functions have been explained in previous chapters; we leave it as an exerise for the reader to explore how the code works.\n", - "\n", - "\n", - "\n", - "" + "Most of the functions have been explained in previous chapters; we leave it as an exercise for the reader to explore how the code works." ] }, { @@ -1038,7 +1006,7 @@ " column='Population', \n", " edgecolor='black', \n", " legend=True, \n", - " cmap='viridis_r'\n", + " cmap='Reds'\n", ");" ], "execution_count": null, @@ -1048,9 +1016,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Our goal, now, is to \"transfer\" the `'Population'` attribute (@fig-nz-and-grid) to the rectangular grid polygons, which is an example of a join between incongruent layers.\n", + "Our goal, now, is to 'transfer' the `'Population'` attribute (@fig-nz-and-grid) to the rectangular grid polygons, which is an example of a join between incongruent layers.\n", "To do that, we basically need to calculate--for each `grid` cell---the weighted sum of the population in `nz` polygons coinciding with that cell.\n", - "The weights in the weighted sum calculation are the ratios between the area of the coinciding \"part\" out of the entire `nz` polygon.\n", + "The weights in the weighted sum calculation are the ratios between the area of the coinciding 'part' out of the entire `nz` polygon.\n", "That is, we (inevitably) assume that the population in each `nz` polygon is equally distributed across space, therefore a partial `nz` polygon contains the respective partial population size.\n", "\n", "We start by calculating the entire area of each `nz` polygon, as follows, using the `.area` method (@sec-area-length)." @@ -1070,8 +1038,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Next, we use the [`.overlay`](https://geopandas.org/en/stable/docs/reference/api/geopandas.GeoDataFrame.overlay.html) method to calculate the pairwise intersections between `nz` and `grid`.\n", - "As a result, we now have a layer where each `nz` polygon is \"split\" according to the `grid` polygons, hereby named `nz_grid`." + "Next, we use the `.overlay` method to calculate the pairwise intersections between `nz` and `grid`.\n", + "As a result, we now have a layer where each `nz` polygon is split according to the `grid` polygons, hereby named `nz_grid`." ] }, { @@ -1125,7 +1093,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The resulting layer `nz_grid`, which the `area_sub` attribute, is shown in @fig-nz-and-grid2." + "The resulting layer `nz_grid`, with the `area_sub` attribute, is shown in @fig-nz-and-grid2." 
] }, { @@ -1135,7 +1103,13 @@ "#| label: fig-nz-and-grid2\n", "#| fig-cap: The areas of pairwise intersections in the `nz_grid` layer\n", "base = grid.plot(color='none', edgecolor='grey')\n", - "nz_grid.plot(ax=base, column='area_sub', edgecolor='black', legend=True, cmap='viridis_r');" + "nz_grid.plot(\n", + " ax=base, \n", + " column='area_sub', \n", + " edgecolor='black',\n", + " legend=True, \n", + " cmap='Reds'\n", + ");" ], "execution_count": null, "outputs": [] @@ -1144,9 +1118,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Note that each of the \"intersections\" still holds the `Population` attribute of its \"origin\" feature of `nz`, i.e., each portion of the `nz` area is associated with the original complete population count for that area.\n", + "Note that each of the intersections still holds the `Population` attribute of its 'origin' feature of `nz`, i.e., each portion of the `nz` area is associated with the original complete population count for that area.\n", "The real population size of each `nz_grid` feature, however, is smaller, or equal, depending on the geographic area proportion that it occupies out of the original `nz` feature.\n", - "To make the \"correction\", we first calculate the ratio (`area_prop`) and then multiply it by the population.\n", + "To make the correction, we first calculate the ratio (`area_prop`) and then multiply it by the population.\n", "The new (lowercase) attribute `population` now has the correct estimate of population sizes in `nz_grid`:" ] }, @@ -1166,7 +1140,7 @@ "metadata": {}, "source": [ "What is left to be done is to sum (see @sec-vector-attribute-aggregation) the population in all parts forming the same grid cell and join (see @sec-vector-attribute-joining) them back to the `grid` layer.\n", - "Note that many of the grid cells have \"No Data\" for population, because they have no intersection with `nz` at all (@fig-nz-and-grid)." + "Note that many of the grid cells have 'No Data' for population, because they have no intersection with `nz` at all (@fig-nz-and-grid)." ] }, { @@ -1193,7 +1167,12 @@ "source": [ "#| label: fig-nz-and-grid3\n", "#| fig-cap: 'The `nz` layer and a regular grid of rectangles: final result'\n", - "base = grid.plot(column='population', edgecolor='black', legend=True, cmap='viridis_r');\n", + "base = grid.plot(\n", + " column='population', \n", + " edgecolor='black',\n", + " legend=True, \n", + " cmap='Reds'\n", + ");\n", "nz.plot(ax=base, color='none', edgecolor='grey', legend=True);" ], "execution_count": null, @@ -1230,24 +1209,19 @@ "source": [ "The procedure in this section is known as an area-weighted interpolation of a spatially *extensive* (e.g., population) variable.\n", "In extensive interpolation, we assume that the variable of interest represents counts (such as, here, inhabitants) uniformly distributed across space. 
\n", - "In such case, each \"part\" of a given polygon captures the respective proportion of counts (such as, half of a region with $N$ inhabitants contains $N/2$ ihnabitants).\n", + "In such case, each part of a given polygon captures the respective proportion of counts (such as, half of a region with $N$ inhabitants contains $N/2$ inhabitants).\n", "Accordingly, summing the parts gives the total count of the total area.\n", "\n", "An area-weighted interpolation of a spatially *intensive* variable (e.g., population density) is almost identical, except that we would have to calculate the weighted `.mean` rather than `.sum`, to preserve the average rather than the sum.\n", "In intensive interpolation, we assume that the variable of interest represents counts per unit area, i.e., density.\n", - "Since density is (assumed to be) uniform, any \"part\" of a given polygon has exactly the same density as that of the whole polygon.\n", + "Since density is (assumed to be) uniform, any part of a given polygon has exactly the same density as that of the whole polygon.\n", "Density values are therefore computed as weighted averages, rather than sums, of the parts. \n", - "Also see the \"Area-weighted interpolation\" [section](https://r-spatial.org/book/05-Attributes.html#sec-area-weighted) in @pebesma_spatial_2023.\n", - "\n", - "\n", + "Also, see the 'Area-weighted interpolation' section in @pebesma_spatial_2023.\n", "\n", "### Distance relations {#sec-distance-relations}\n", "\n", - "\n", - "\n", - "\n", "While topological relations are binary---a feature either intersects with another or does not---distance relations are continuous.\n", - "The distance between two objects is calculated with the [`.distance`](https://geopandas.org/en/stable/docs/reference/api/geopandas.GeoSeries.distance.html) method.\n", + "The distance between two objects is calculated with the `.distance` method.\n", "The method is applied on a `GeoSeries` (or a `GeoDataFrame`), with the argument being an individual `shapely` geometry.\n", "The result is a `Series` of pairwise distances.\n", "\n", @@ -1255,27 +1229,23 @@ "**geopandas** uses similar syntax and mode of operation for many of its methods and functions, including:\n", "\n", "* Numeric calculations, such as `.distance` (this section), returning numeric values\n", - "* Topological evaluations methods, such as `.intersects` or `.disjoint` (@sec-topological-relations), returning boolean values \n", + "* Topological evaluation methods, such as `.intersects` or `.disjoint` (@sec-topological-relations), returning boolean values \n", "* Geometry generating-methods, such as `.intersection` (@sec-clipping), returning geometries\n", "\n", - "In all cases, the input is a `GeoSeries` and (or a `GeoDataFrame`) and a `shapely` geometry, and the output is a `Series` or `GeoSeries` of results, contrasting each geometry from the `GeoSeries` with the `shapely` geometry. The examples in this book demonstrate this, so called \"many-to-one\", mode of the functions. \n", + "In all cases, the input is a `GeoSeries` and (or a `GeoDataFrame`) and a `shapely` geometry, and the output is a `Series` or `GeoSeries` of results, contrasting each geometry from the `GeoSeries` with the `shapely` geometry. \n", + "The examples in this book demonstrate this, so-called 'many-to-one', mode of the functions. 
\n", "\n", "All of the above-mentioned methods also have a pairwise mode, perhaps less useful and not used in the book, where we evaluate relations between pairs of geometries in two `GeoSeries`, aligned either by index or by position. \n", ":::\n", "\n", - "\n", - "\n", - "\n", - "To illustrate the `.distance` method, let's take the three highest point in New Zealand with `.sort_values` and `.iloc`." + "To illustrate the `.distance` method, let's take the three highest points in New Zealand with `.sort_values` and `.iloc`." ] }, { "cell_type": "code", "metadata": {}, "source": [ - "nz_highest = nz_height \\\n", - " .sort_values(by='elevation', ascending=False) \\\n", - " .iloc[:3, :]\n", + "nz_highest = nz_height.sort_values(by='elevation', ascending=False).iloc[:3, :]\n", "nz_highest" ], "execution_count": null, @@ -1317,7 +1287,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "To obtain a distance matrix, i.e., a pairwise set of distances between all combinations of features in objects `x` and `y`, we need to use the method (analogous to the way we created the `.intersects` boolean matrix in @sec-topological-relations).\n", + "To obtain a distance matrix, i.e., a pairwise set of distances between all combinations of features in objects `x` and `y`, we need to use the `.apply` method (analogous to the way we created the `.intersects` boolean matrix in @sec-topological-relations).\n", "To illustrate this, let's now take two regions in `nz`, Otago and Canterbury, represented by the object `co`." ] }, @@ -1355,7 +1325,7 @@ "metadata": {}, "source": [ "Note that the distance between the second and third features in `nz_height` and the second feature in `co` is zero.\n", - "This demonstrates the fact that distances between points and polygons refer to the distance to any part of the polygon: the second and third points in `nz_height` are in Otago, which can be verified by plotting them (two almost completly overlappling points in @fig-nz-height-and-otago)." + "This demonstrates the fact that distances between points and polygons refer to the distance to any part of the polygon: the second and third points in `nz_height` are in Otago, which can be verified by plotting them (two almost completely overlappling points in @fig-nz-height-and-otago)." 
] }, { @@ -1364,8 +1334,17 @@ "source": [ "#| label: fig-nz-height-and-otago\n", "#| fig-cap: The first three `nz_height` points, and the Otago and Canterbury regions from `nz`\n", - "base = co.plot(color='lightgrey', edgecolor='black')\n", - "nz_height.iloc[:3, :].plot(ax=base, color='none', edgecolor='black');" + "fig, ax = plt.subplots()\n", + "co.plot(color='lightgrey', edgecolor='black', ax=ax)\n", + "co.apply(\n", + " lambda x: ax.annotate(\n", + " text=x['Name'], \n", + " xy=x.geometry.centroid.coords[0], \n", + " ha='center'\n", + " ), \n", + " axis=1\n", + ")\n", + "nz_height.iloc[:3, :].plot(color='none', edgecolor='black', ax=ax);" ], "execution_count": null, "outputs": [] @@ -1376,24 +1355,17 @@ "source": [ "## Spatial operations on raster data {#sec-spatial-ras}\n", "\n", - "\n", - "\n", - "\n", "This section builds on @sec-manipulating-raster-objects, which highlights various basic methods for manipulating raster datasets, to demonstrate more advanced and explicitly spatial raster operations, and uses the `elev.tif` and `grain.tif` rasters manually created in @sec-raster-from-scratch.\n", "\n", "### Spatial subsetting {#sec-spatial-subsetting-raster}\n", "\n", "The previous chapter (and especially @sec-manipulating-raster-objects) demonstrated how to retrieve values associated with specific row and column combinations from a raster.\n", - "\n", - "\n", "Raster values can also be extracted by location (coordinates) and other spatial objects.\n", - "To use coordinates for subsetting, we can use the [`.sample`](https://rasterio.readthedocs.io/en/stable/api/rasterio.io.html#rasterio.io.DatasetReader.sample) method of a `rasterio` file connection object, combined with a list of coordinate tuples.\n", - "The methods is demonstrated below to find the value of the cell that covers a point located at coordinates of `(0.1,0.1)` in `elev`.\n", + "To use coordinates for subsetting, we can use the `.sample` method of a `rasterio` file connection object, combined with a list of coordinate tuples.\n", + "The method is demonstrated below to find the value of the cell that covers a point located at coordinates of `(0.1,0.1)` in `elev`.\n", "The returned object is a *generator*. \n", "The rationale for returning a generator, rather than a `list`, is memory efficiency. \n", - "The number of sampled points may be huge, in which case we would want to \"generate\" the values one at a time rather than all at once.\n", - "\n", - "" + "The number of sampled points may be huge, in which case we would want to generate the values one at a time rather than all at once." 
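+ "\n",
+ "::: callout-note\n",
+ "A minimal sketch of consuming the generator returned by `.sample` (assuming the `src_elev` file connection used here): materialize all values at once with `list`, or iterate over it to process one value at a time.\n",
+ "\n",
+ "```python\n",
+ "coords = [(0.1, 0.1), (1.1, 1.1)]\n",
+ "values = list(src_elev.sample(coords))  # one array per point, one element per band\n",
+ "```\n",
+ ":::"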
] }, { @@ -1463,8 +1435,10 @@ "#| fig-cap: The `elev.tif` raster, and two points where we extract its values\n", "fig, ax = plt.subplots()\n", "rasterio.plot.show(src_elev, ax=ax)\n", - "gpd.GeoSeries([shapely.Point(0.1, 0.1)]).plot(color='black', ax=ax)\n", - "gpd.GeoSeries([shapely.Point(1.1, 1.1)]).plot(color='black', ax=ax);" + "gpd.GeoSeries([shapely.Point(0.1, 0.1)]) \\\n", + " .plot(color='black', edgecolor='white', markersize=50, ax=ax)\n", + "gpd.GeoSeries([shapely.Point(1.1, 1.1)]) \\\n", + " .plot(color='black', edgecolor='white', markersize=50, ax=ax);" ], "execution_count": null, "outputs": [] @@ -1473,21 +1447,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "\n", - "\n", - "\n", "::: callout-note\n", "We elaborate on the plotting technique used to display the points and the raster in @sec-plot-static-layers.\n", "We will also introduce a more user-friendly and general method to extract raster values to points, using the **rasterstats** package, in @sec-extraction-to-points.\n", ":::\n", "\n", "Another common use case of spatial subsetting is using a boolean mask, based on another raster with the same extent and resolution, or the original one, as illustrated in @fig-raster-subset.\n", - "To do that, we \"erase\" the values in the array of one raster, according to another corresponding \"mask\" raster.\n", - "For example, let us read (@sec-using-rasterio) the `elev.tif` raster values into an array named `elev` (@fig-raster-subset (a)), \n", - "\n", - "\n", - "\n", - "" + "To do that, we erase the values in the array of one raster, according to another corresponding mask raster.\n", + "For example, let's read (@sec-using-rasterio) the `elev.tif` raster values into an array named `elev` (@fig-raster-subset (a))." ] }, { @@ -1526,7 +1493,7 @@ "In other words, we want to mask `elev` with `mask`.\n", "The result will be stored in a copy named `masked_elev` (@fig-raster-subset (c)).\n", "In the case of `elev.tif`, to be able to store `np.nan` in the array of values, we also need to convert it to `float` (see @sec-summarizing-raster-objects). \n", - "Afterwards, masking is a matter of assigning `np.nan` into a subset defined by the mask, using the [\"boolean array indexing\"](https://numpy.org/doc/stable/user/basics.indexing.html#boolean-array-indexing) syntax of **numpy**." + "Afterwards, masking is a matter of assigning `np.nan` into a subset defined by the mask, using the 'boolean array indexing' syntax of **numpy**." ] }, { @@ -1570,7 +1537,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The \"mask\" can be create from the array itself, using condition(s). \n", + "The mask can be created from the array itself, using condition(s). \n", "That way, we can replace some values (e.g., values assumed to be wrong) with `np.nan`, such as in the following example." 
] }, @@ -1594,13 +1561,13 @@ "\n", "### Map algebra {#sec-map-algebra}\n", "\n", - "The term 'map algebra' was coined in the late 1970s to describe a \"set of conventions, capabilities, and techniques\" for the analysis of geographic raster and (although less prominently) vector data [@tomlin_map_1994].\n", + "The term 'map algebra' was coined in the late 1970s to describe a 'set of conventions, capabilities, and techniques' for the analysis of geographic raster and (although less prominently) vector data [@tomlin_map_1994].\n", "In this context, we define map algebra more narrowly, as operations that modify or summarize raster cell values, with reference to surrounding cells, zones, or statistical functions that apply to every cell.\n", "\n", - "Map algebra operations tend to be fast, because raster datasets only implicitly store coordinates, hence the old adage \"raster is faster but vector is corrector\".\n", + "Map algebra operations tend to be fast, because raster datasets only implicitly store coordinates, hence the old adage 'raster is faster but vector is corrector'.\n", "The location of cells in raster datasets can be calculated by using its matrix position and the resolution and origin of the dataset (stored in the raster metadata, @sec-using-rasterio).\n", "For the processing, however, the geographic position of a cell is barely relevant as long as we make sure that the cell position is still the same after the processing.\n", - "Additionally, if two or more raster datasets share the same extent, projection and resolution, one could treat them as matrices for the processing.\n", + "Additionally, if two or more raster datasets share the same extent, projection, and resolution, one could treat them as matrices for the processing.\n", "\n", "Map algebra (or cartographic modeling with raster data) divides raster operations into four subclasses [@tomlin_geographic_1990], with each working on one or several grids simultaneously:\n", "\n", @@ -1609,14 +1576,14 @@ "- Zonal operations are similar to focal operations, but the surrounding pixel grid on which new values are computed can have irregular sizes and shapes (@sec-zonal-operations)\n", "- Global or per-raster operations; that means the output cell derives its value potentially from one or several entire rasters (@sec-global-operations-and-distances)\n", "\n", - "This typology classifies map algebra operations by the number of cells used for each pixel processing step and the type of the output.\n", - "For the sake of completeness, we should mention that raster operations can also be classified by discipline such as terrain, hydrological analysis, or image classification.\n", + "This typology classifies map algebra operations by the number of cells used for each pixel processing step and the type of output.\n", + "For the sake of completeness, we should mention that raster operations can also be classified by disciplines such as terrain, hydrological analysis, or image classification.\n", "The following sections explain how each type of map algebra operations can be used, with reference to worked examples.\n", "\n", "### Local operations {#sec-raster-local-operations}\n", "\n", "Local operations comprise all cell-by-cell operations in one or several layers.\n", - "Raster algebra is a classical use case of local operations---this includes adding or subtracting values from a raster, squaring and multiplying rasters.\n", + "Raster algebra is a classical use case of local operations---this includes adding or subtracting values from a 
raster, squaring, and multiplying rasters.\n", "Raster algebra also allows logical operations such as finding all raster cells that are greater than a specific value (e.g., `5` in our example below).\n", "Local operations are applied using the **numpy** array operations syntax, as demonstrated below.\n", "\n", @@ -1637,7 +1604,7 @@ "metadata": {}, "source": [ "Now, any element-wise array operation can be applied using **numpy** arithmetic or conditional operators and functions, comprising local raster operations in spatial analysis terminology.\n", - "For example `elev + elev` adds the values of `elev` to itself, resulting in a raster with double values." + "For example, `elev + elev` adds the values of `elev` to itself, resulting in a raster with double values." ] }, { "cell_type": "code", "metadata": {}, "source": [ "elev + elev" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ @@ -1653,11 +1620,9 @@ - "Note that some functions and operators automatically change the data type to accommodate the resulting values, while other operators do not, potentially resulting in overflow (i.e., incorrect values for results beyond the data type range, such as trying to accomodate values above `255` in an `int8` array).\n", + "Note that some functions and operators automatically change the data type to accommodate the resulting values, while other operators do not, potentially resulting in overflow (i.e., incorrect values for results beyond the data type range, such as trying to accommodate values above `255` in an `int8` array).\n", "For example, `elev**2` (`elev` squared) results in overflow.\n", - "Since the `**` operator does not automatically change the data type, leaving it as `int8`, the resulting array has incorrect values for `16**2`, `17**2`, etc., which are above `255` and therefore cannot be accomodated.\n", - "\n", - "" + "Since the `**` operator does not automatically change the data type, leaving it as `int8`, the resulting array has incorrect values for `16**2`, `17**2`, etc., which are above `255` and therefore cannot be accommodated." ] }, { "cell_type": "code", "metadata": {}, "source": [ @@ -1674,9 +1639,7 @@ "metadata": {}, "source": [ "To avoid this situation, we can, for instance, transform `elev` to the standard `int64` data type, using `.astype` before applying the `**` operator.\n", - "That way all, results up to `36**2` (`1296`) can be easily accomodated, since the `int64` data type supports values up to `9223372036854775807` (@tbl-numpy-data-types).\n", - "\n", - "" + "That way, all results up to `36**2` (`1296`) can be easily accommodated, since the `int64` data type supports values up to `9223372036854775807` (@tbl-numpy-data-types)." ] }, { "cell_type": "code", "metadata": {}, "source": [ @@ -1694,10 +1657,6 @@ "source": [ "Now we get correct results.\n", "\n", - "::: callout-note\n", - "**numpy** has the special data types `np.int_` and `np.float_`, which refer to \"default\" `int` and `float` data types. These are platform dependent, but typically resolve to `np.int64` and `np.float64`. Furthermore, the standard Python types `int` and `float` refer to those two **numpy** types, respectively. Therefore, for example, either of the three objects `np.int64`, `np.int_` and `int` can be passed to `.astype` in the above example, with identical result. Whereas we've used the shortest one, `int`.\n", - ":::\n", - "\n", "@fig-raster-local-operations demonstrates the result of the last two examples (`elev+elev` and `elev.astype(int)**2`), and two other ones (`np.log(elev)` and `elev>5`)."
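+ "\n",
+ "::: callout-note\n",
+ "When unsure whether results will fit in a given integer data type, its representable range can be checked with `np.iinfo`; a quick sketch:\n",
+ "\n",
+ "```python\n",
+ "np.iinfo(np.uint8).max   # 255\n",
+ "np.iinfo(np.int64).max   # 9223372036854775807\n",
+ "```\n",
+ ":::"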
] }, { @@ -1725,8 +1684,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Another good example of local operations is the classification of intervals of numeric values into groups such as grouping a digital elevation model into low (class `1`), middle (class `2`) and high elevations (class `3`).\n", - "Here, we assign the raster values in the ranges `0`--`12`, `12`--`24` and `24`--`36` are reclassified to take values `1`, `2` and `3`, respectively." + "Another good example of local operations is the classification of intervals of numeric values into groups such as grouping a digital elevation model into low (class `1`), middle (class `2`), and high (class `3`) elevations.\n", + "Here, the raster values in the ranges `0`--`12`, `12`--`24`, and `24`--`36` are reclassified to take values `1`, `2`, and `3`, respectively." ] }, { @@ -1768,10 +1727,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The calculation of the [Normalized Difference Vegetation Index (NDVI)](https://en.wikipedia.org/wiki/Normalized_difference_vegetation_index) is a well-known local (pixel-by-pixel) raster operation.\n", + "The calculation of the Normalized Difference Vegetation Index (NDVI)[^ndvi] is a well-known local (pixel-by-pixel) raster operation.\n", "It returns a raster with values between `-1` and `1`; positive values indicate the presence of living plants (mostly \> `0.2`).\n", - "NDVI is calculated from red and near-infrared (NIR) bands of remotely sensed imagery, typically from satellite systems such as Landsat or Sentinel 2.\n", - "Vegetation absorbs light heavily in the visible light spectrum, and especially in the red channel, while reflecting NIR light, which is emulated in the NVDI formula (@eq-ndvi).\n", + "NDVI is calculated from red and near-infrared (NIR) bands of remotely sensed imagery, typically from satellite systems such as Landsat or Sentinel-2.\n", + "Vegetation absorbs light heavily in the visible light spectrum, and especially in the red channel, while reflecting NIR light, which is emulated in the NDVI formula (@eq-ndvi),\n", + "\n", + "[^ndvi]: [https://en.wikipedia.org/wiki/Normalized_difference_vegetation_index](https://en.wikipedia.org/wiki/Normalized_difference_vegetation_index)\n", "\n", "$$\n", "NDVI=\\frac{NIR-Red}{NIR+Red}\n", "$$ {#eq-ndvi}\n", "\n", ", where $NIR$ is the near-infrared band and $Red$ is the red band.\n", "\n", "Let's calculate NDVI for the multispectral Landsat satellite file (`landsat.tif`) of the Zion National Park.\n", + "The file `landsat.tif` contains surface reflectance values (range `0`--`1`) in the blue, green, red, and near-infrared (NIR) bands.\n", "We start by reading the file and extracting the NIR and red bands, which are the fourth and third bands, respectively.\n", "Next, we apply the formula to calculate the NDVI values."
] @@ -1788,6 +1750,7 @@ "cell_type": "code", "metadata": {}, "source": [ + "#| warning: false\n", "landsat = src_landsat.read()\n", "nir = landsat[3]\n", "red = landsat[2]\n", @@ -1796,32 +1759,12 @@ "execution_count": null, "outputs": [] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We also convert values \\>`1` to \"No Data\".\n", - "\n", - "" - ] - }, - { - "cell_type": "code", - "metadata": {}, - "source": [ - "ndvi[ndvi>1] = np.nan" - ], - "execution_count": null, - "outputs": [] - }, { "cell_type": "markdown", "metadata": {}, "source": [ "When plotting an RGB image using the `rasterio.plot.show` function, the function assumes that values are in the range `[0,1]` for floats, or `[0,255]` for integers (otherwise clipped) and the order of bands is RGB.\n", - "To \"prepare\" the multi-band raster for `rasterio.plot.show`, we therefore reverse the order of the first three bands (to go from B-G-R-NIR to R-G-B), using the `[:3]` slice to select the first three bands and then the `[::-1]` slice to reverse the bands order, and divide by the raster maximum to set the maximum value to `1`.\n", - "\n", - "" + "To prepare the multi-band raster for `rasterio.plot.show`, we, therefore, reverse the order of the first three bands (to go from B-G-R-NIR to R-G-B), using the `[:3]` slice to select the first three bands and then the `[::-1]` slice to reverse the bands order, and divide by the raster maximum to set the maximum value to `1`." ] }, { @@ -1842,8 +1785,8 @@ "The default is to start from the beginning, go to the end, and use steps of `1`. \n", "Otherwise, `start` is inclusive and `end` is exclusive, whereas negative `step` values imply going backwards starting from the end. \n", "Also, always keep in mind that Python indices start from `0`.\n", - "When subsetting two- or three-dimensional objects, indices for each dimension are separated by commas, where either index can be set to `:` meaning \"all values\".\n", - "The last dimensions can also be omitted implying `:`, e.g., to subset the first three bands from a three-dimensional array `a` we can use either `a[:3,:,:]` or `a[:3]`\n", + "When subsetting two- or three-dimensional objects, indices for each dimension are separated by commas, where either index can be set to `:` meaning 'all values'.\n", + "The last dimensions can also be omitted implying `:`, e.g., to subset the first three bands from a three-dimensional array `a` we can use either `a[:3,:,:]` or `a[:3]`.\n", "\n", "In the above example: \n", "\n", @@ -1876,23 +1819,23 @@ "source": [ "### Focal operations {#sec-focal-operations}\n", "\n", - "While local functions operate on one cell at time (though possibly from multiple layers), focal operations take into account a central (focal) cell and its neighbors.\n", - "The neighborhood (also named kernel, filter or moving window) under consideration is typically of $3 \\times 3$ cells (that is, the central cell and its eight surrounding neighbors), but can take on any other (not necessarily rectangular) shape as defined by the user.\n", + "While local functions operate on one cell at a time (though possibly from multiple layers), focal operations take into account a central (focal) cell and its neighbors.\n", + "The neighborhood (also named kernel, filter, or moving window) under consideration is typically of $3 \\times 3$ cells (that is, the central cell and its eight surrounding neighbors), but can take on any other (not necessarily rectangular) shape as defined by the user.\n", "A focal operation applies an 
aggregation function to all cells within the specified neighborhood, uses the corresponding output as the new value for the central cell, and moves on to the next central cell (@fig-focal-filter).\n", "Other names for this operation are spatial filtering and convolution [@burrough_principles_2015].\n", "\n", - "![Input raster (left) and resulting output raster (right) due to a focal operation---finding the minimum value in $3 \times 3$ moving windows.](https://r.geocompx.org/figures/04_focal_example.png){#fig-focal-filter}\n", + "![Input raster (left) and resulting output raster (right) due to a focal operation---finding the minimum value in $3 \times 3$ moving windows.](images/04_focal_example.png){#fig-focal-filter}\n", "\n", - "In Python, the [**scipy.ndimage**](https://docs.scipy.org/doc/scipy/tutorial/ndimage.html) [@scipy] package has a comprehensive collection of [functions](https://docs.scipy.org/doc/scipy/reference/ndimage.html#filters) to perform filtering of **numpy** arrays, such as:\n", + "In Python, the **scipy.ndimage** [@scipy] package has a comprehensive collection of functions to perform filtering of **numpy** arrays, such as:\n", "\n", - "- [`scipy.ndimage.minimum_filter`](https://docs.scipy.org/doc/scipy/reference/generated/scipy.ndimage.minimum_filter.html)\n", - "- [`scipy.ndimage.maximum_filter`](https://docs.scipy.org/doc/scipy/reference/generated/scipy.ndimage.maximum_filter.html)\n", - "- [`scipy.ndimage.uniform_filter`](https://docs.scipy.org/doc/scipy/reference/generated/scipy.ndimage.uniform_filter.html) (i.e., mean filter)\n", - "- [`scipy.ndimage.median_filter`](https://docs.scipy.org/doc/scipy/reference/generated/scipy.ndimage.median_filter.html) etc.\n", + "- `scipy.ndimage.minimum_filter`,\n", + "- `scipy.ndimage.maximum_filter`,\n", + "- `scipy.ndimage.uniform_filter` (i.e., mean filter),\n", + "- `scipy.ndimage.median_filter`, etc.\n", "\n", - "In this group of functions, we define the shape of the moving window with either one of `size`---a single number (e.g., `3`), or tuple (e.g., `(3,3)`), implying a filter of those dimensions or `footprint`---a boolean array, representing both the window shape and the identity of elements being included\n", + "In this group of functions, we define the shape of the moving window with one of two arguments: `size`---a single number (e.g., `3`) or a tuple (e.g., `(3,3)`), implying a filter of those dimensions---or `footprint`---a boolean array, representing both the window shape and the identity of the elements being included.\n", "\n", - "In addition to specific built-in filters, `convolve`---applies the sum function after multiplying by a custom `weights` array and `generic_filter`---makes it possible to pass any custom function, where the user can specify any type of custom window-based calculation.\n", + "In addition to the specific built-in filters, `convolve` applies the sum function after multiplying by a custom `weights` array, and `generic_filter` makes it possible to pass any custom function, where the user can specify any type of custom window-based calculation.\n", "\n", "For example, here we apply the minimum filter with a window size of `3` on `elev`.\n", "As a result, we now have a new array `elev_min`, where each value is the minimum in the corresponding $3 \times 3$ neighborhood in `elev`."
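+ "\n",
+ "::: callout-note\n",
+ "The `footprint` argument makes non-rectangular neighborhoods possible; for example, a 'plus'-shaped $3 \times 3$ window (a minimal sketch, assuming `elev` and **scipy.ndimage** are loaded as above):\n",
+ "\n",
+ "```python\n",
+ "footprint = np.array([\n",
+ "    [0, 1, 0],\n",
+ "    [1, 1, 1],\n",
+ "    [0, 1, 0]\n",
+ "], dtype=bool)\n",
+ "elev_min_plus = scipy.ndimage.minimum_filter(elev, footprint=footprint)\n",
+ "```\n",
+ ":::"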
@@ -1913,12 +1856,12 @@ "metadata": {}, "source": [ "Special care should be given to the edge pixels -- how should they be calculated?\n", - "The **scipy.ndimage** filtering functions give several options through the `mode` parameter (see the documentation of any filtering function, such as [scipy.ndimage.median_filter](https://docs.scipy.org/doc/scipy/reference/generated/scipy.ndimage.median_filter.html), for the definition of each mode): `reflect` (the default), `constant`, `nearest`, `mirror`, `wrap`.\n", + "The **scipy.ndimage** filtering functions give several options through the `mode` parameter (see the documentation of any filtering function, such as `scipy.ndimage.median_filter`, for the definition of each mode): `reflect` (the default), `constant`, `nearest`, `mirror`, `wrap`.\n", "Sometimes artificially extending raster edges is considered unsuitable.\n", - "In other words, we may wish the resulting raster to contain pixel values with \"complete\" windows only, for example to have a uniform sample size or because values in all directions matter (such as in topographic calculations).\n", + "In other words, we may wish the resulting raster to contain pixel values with 'complete' windows only, for example, to have a uniform sample size or because values in all directions matter (such as in topographic calculations).\n", "There is no specific option *not* to extend edges in **scipy.ndimage**.\n", "However, to get the same effect, the edges of the filtered array can be assigned with `np.nan`, in a number of rows and columns according to filter size.\n", - "For example, when using a filter of `size=3`, the outermost \"layer\" of pixels may be assigned with `np.nan`, reflecting the fact that these pixels have incomplete $3 \\times 3$ neighborhoods:" + "For example, when using a filter of `size=3`, the outermost 'layer' of pixels may be assigned with `np.nan`, reflecting the fact that these pixels have incomplete $3 \\times 3$ neighborhoods (@fig-focal-filter):" ] }, { @@ -1942,12 +1885,9 @@ "\n", "Focal functions or filters play a dominant role in image processing.\n", "For example, low-pass or smoothing filters use the mean function to remove extremes.\n", - "By contrast, high-pass filters accentuate features.\n", - "The line detection Laplace and Sobel filters might serve as an example here.\n", - "\n", - "\n", + "By contrast, high-pass filters, often created with custom neighborhood weights, accentuate features.\n", "\n", - "In the case of categorical data, we can replace the mean with the mode, which is the most common value.\n", + "In the case of categorical data, we can replace the mean with the mode, i.e., the most common value.\n", "To demonstrate applying a mode filter, let's read the small sample categorical raster `grain.tif`." ] }, @@ -1965,9 +1905,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "There is no built-in filter function for a mode filter in **scipy.ndimage**, but we can use the [`scipy.ndimage.generic_filter`](https://docs.scipy.org/doc/scipy/reference/generated/scipy.ndimage.generic_filter.html) function along with a custom filtering function, internally utilizing [`scipy.stats.mode`](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.mode.html). \n", - "\n", - "" + "There is no built-in filter function for a mode filter in **scipy.ndimage**, but we can use the `scipy.ndimage.generic_filter` function along with a custom filtering function, internally utilizing `scipy.stats.mode`. 
" ] }, { @@ -1997,9 +1935,7 @@ "\n", "Terrain processing is another important application of focal operations.\n", "Such functions are provided by multiple Python packages, including the general purpose **xarray** package, and more specialized packages such as **richdem** and **pysheds**.\n", - "\n", - "\n", - "Useful terrain [metrics](https://richdem.readthedocs.io/en/latest/python_api.html?highlight=TerrainAttribute#richdem.TerrainAttribute) include:\n", + "Useful terrain metrics include:\n", "\n", "- Slope, measured in units of percent, degreees, or radians [@horn_1981]\n", "- Aspect, meaning each cell's downward slope direction [@horn_1981]\n", @@ -2008,18 +1944,29 @@ "For example, each of these, and other, terrain metrics can be computed with the **richdem** package.\n", "\n", "::: callout-note\n", - "Terrain metrics are essentially focal filters with customized functions. Using `scipy.ndimage.generic_filter`, along with such custom functions, is an option for those who would like to calculate terrain metric through coding by hand and/or limiting their code dependencies. For example, the [How Aspect works](https://pro.arcgis.com/en/pro-app/latest/tool-reference/spatial-analyst/how-aspect-works.htm) and [How Slope works](https://pro.arcgis.com/en/pro-app/latest/tool-reference/spatial-analyst/how-slope-works.htm) pages from the ArcGIS Pro documentation provide exlanations and formulas of the required funtions for aspect and slope metrics (@fig-raster-slope), respectively, which can be translated to **numpy**-based functions to be used in `scipy.ndimage.generic_filter` to calculate those metrics.\n", + "Terrain metrics are essentially focal filters with customized functions. \n", + "Using `scipy.ndimage.generic_filter`, along with such custom functions, is an option for those who would like to calculate terrain metric through coding by hand and/or limiting their code dependencies.\n", + "For example, the *How Aspect works*[^how_aspect_works] and *How Slope works*[^how_slope_works] pages from the ArcGIS Pro documentation provide explanations and formulas of the required functions for aspect and slope metrics (@fig-raster-slope), respectively, which can be translated to **numpy**-based functions to be used in `scipy.ndimage.generic_filter` to calculate those metrics.\n", ":::\n", "\n", - "Another extremely fast, memory-efficient, and concise, alternative, is to the use the GDAL program called [`gdaldem`](https://gdal.org/programs/gdaldem.html).\n", + "[^how_aspect_works]: [https://pro.arcgis.com/en/pro-app/latest/tool-reference/spatial-analyst/how-aspect-works.htm](https://pro.arcgis.com/en/pro-app/latest/tool-reference/spatial-analyst/how-aspect-works.htm)\n", + "\n", + "[^how_slope_works]: [https://pro.arcgis.com/en/pro-app/latest/tool-reference/spatial-analyst/how-slope-works.htm](https://pro.arcgis.com/en/pro-app/latest/tool-reference/spatial-analyst/how-slope-works.htm)\n", + "\n", + "Another extremely fast, memory-efficient, and concise, alternative, is to the use the GDAL program called `gdaldem`.\n", "`gdaldem` can be used to calculate slope, aspect, and other terrain metrics through a single command, accepting an input file path and exporting the result to a new file.\n", "This is our first example in the book where we demonstrate a situation where it may be worthwhile to leave the Python environment, and utilize a GDAL program directly, rather than through their wrappers (such as **rasterio** and other Python packages), whether to access a computational algorithm not easily 
accessible in a Python package, or to benefit from GDAL's memory efficiency and speed.\n", "\n", "::: callout-note\n", - "GDAL contains a collection of over 40 [programs](https://gdal.org/programs/index.html), mostly aimed at raster processing. These include programs for fundamental operations, such as [`gdal_translate`](https://gdal.org/programs/gdal_translate.html#gdal-translate) (convert between raster file formats), [`gdalwarp`](https://gdal.org/programs/gdalwarp.html#gdalwarp) (raster reprojection), [`gdal_rasterize`](https://gdal.org/programs/gdal_rasterize.html#gdal-rasterize) (rasterize vector features), and [`gdal_merge.py`](https://gdal.org/programs/gdal_merge.html#gdal-merge) (raster mosaic), as well as numerous miscellaneous programs. In this book, we use **rasterio** for the above-mentioned operations, although the GDAL programs are a good alternative for those who are more comfortable with the command line. However, we do use two GDAL programs for tasks that are lacking in **rasterio** and not well-implemented in other Python packages: `gdaldem` (this section), and `gdal_contour` (@sec-raster-to-contours).\n", + "GDAL contains a collection of over 40 programs, mostly aimed at raster processing. These include programs for fundamental operations, such as:\n", + "\n", + "* `gdal_translate`---convert between raster file formats\n", + "* `gdalwarp`---raster reprojection\n", + "* `gdal_rasterize`---rasterize vector features\n", + "* `gdal_merge.py`---raster mosaic \n", + "\n", + "In this book, we use **rasterio** for the above-mentioned operations, although the GDAL programs are a good alternative for those who are more comfortable with the command line. However, we do use two GDAL programs for tasks that are lacking in **rasterio** and not well-implemented in other Python packages: `gdaldem` (this section), and `gdal_contour` (@sec-raster-to-contours).\n", ":::\n", - "\n", - "\n", "\n", "GDAL, along with all of its programs, should be available in your Python environment, since GDAL is a dependency of **rasterio**.\n", "The following example, which should be run from the command line, takes the `srtm_32612.tif` raster (which we are going to create in @sec-reprojecting-raster-geometries, and which is therefore in the `'output'` directory), calculates slope (in decimal degrees, between `0` and `90`), and exports the result to a new file `srtm_32612_slope.tif`.\n", @@ -2041,7 +1988,7 @@ "metadata": {}, "source": [ "Here we ran the `gdaldem` command through `os.system`, in order to remain in the Python environment, even though we are calling an external program.\n", - "You can also run the standalone command in the command line interface you are using, such as the Anaconda Prompt:\n", + "Alternatively, you can run the standalone command in the command line interface you are using, such as the Anaconda Prompt:\n", "\n", "```{sh}\n", "gdaldem slope output/srtm_32612.tif output/srtm_32612_slope.tif\n", "```\n", @@ -2066,13 +2013,7 @@ "metadata": {}, "source": [ "@fig-raster-slope shows the results, using our more familiar plotting methods from **rasterio**.\n", - "The code section is relatively long due to the workaround to create a color key (see @sec-plot-symbology) and removing \"No Data\" flag values from the arrays so that the color key does not include them.
Also note that we are using one of **matplotlib**'s the [cyclic color scales](https://matplotlib.org/stable/users/explain/colors/colormaps.html#cyclic) (`'twilight'`) when plotting aspect (@fig-raster-slope (c)).\n", - "\n", - "\n", - "\n", - "\n", - "" + "The code section is relatively long due to the workaround to create a color key (see @sec-plot-symbology) and removing 'No Data' flag values from the arrays so that the color key does not include them. Also note that we are using one of **matplotlib**'s cyclic color scales (`'twilight'`) when plotting aspect (@fig-raster-slope (c))." ] }, { @@ -2120,14 +2061,14 @@ "Just like focal operations, zonal operations apply an aggregation function to multiple raster cells.\n", "However, a second raster, usually with categorical values, defines the zonal filters (or 'zones') in the case of zonal operations, as opposed to a predefined neighborhood window in the case of the focal operations presented in the previous section.\n", "Consequently, raster cells defining the zonal filter do not necessarily have to be neighbors.\n", - "Our `grain.tif` raster is a good example, as illustrated in @fig-rasterio-plot-elev: different grain sizes are spread irregularly throughout the raster.\n", + "Our `grain.tif` raster is a good example, as illustrated in @fig-rasterio-plot-grain: different grain sizes are spread irregularly throughout the raster.\n", "Finally, the result of a zonal operation is a summary table grouped by zone, which is why this operation is also known as zonal statistics in the GIS world.\n", "This is in contrast to focal operations (@sec-focal-operations) which return a raster object.\n", "\n", "To demonstrate, let's get back to the `grain.tif` and `elev.tif` rasters.\n", "To calculate zonal statistics, we use the arrays with raster values, which we already imported earlier.\n", "Our intention is to calculate the average (or any other summary function, for that matter) of *elevation* in each zone defined by *grain* values.\n", - "To do that, first we first obtain the unique values defining the zones using [`np.unique`](https://numpy.org/doc/stable/reference/generated/numpy.unique.html)." + "To do that, we first obtain the unique values defining the zones using `np.unique`." ] }, { @@ -2143,9 +2084,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Now, we can use [dictionary comprehension](https://docs.python.org/3/tutorial/datastructures.html#dictionaries) to \"split\" the `elev` array into separate one-dimensional arrays with values per `grain` group, with keys being the unique `grain` values.\n", - "\n", - "" + "Now, we can use dictionary comprehension (see note below) to split the `elev` array into separate one-dimensional arrays with values per `grain` group, with keys being the unique `grain` values."
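A minimal sketch of this idea, with stand-in arrays in place of the `elev.tif` and `grain.tif` values:

```python
import numpy as np

# Illustrative stand-ins for the elev and grain arrays
elev = np.arange(1, 37, dtype=float).reshape(6, 6)
grain = np.random.randint(0, 3, (6, 6))

z = np.unique(grain)                             # the zone values
elev_by_zone = {i: elev[grain == i] for i in z}  # dictionary comprehension
zonal_means = {i: elev_by_zone[i].mean() for i in z}
```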
] }, { @@ -2163,7 +2102,7 @@ "metadata": {}, "source": [ "::: callout-note\n", - "[List comprehension](https://docs.python.org/3/tutorial/datastructures.html#list-comprehensions) and dictionary comprehension are concise ways to create a `list` or a `dict`, respectively, from an iterable object.\n", + "*List comprehension* and *dictionary comprehension* are concise ways to create a `list` or a `dict`, respectively, from an iterable object.\n", "Both are, conceptually, a concise syntax to replace `for` loops where we iterate over an object and return a same-length object with the results.\n", "Here are minimal examples of list and dictionary comprehension, respectively, to demonstrate the idea:\n", "\n", @@ -2203,19 +2142,15 @@ "In the first case, one can calculate the distance from each cell to specific target cells or vector geometries.\n", "For example, one might want to compute the distance to the nearest coast (see @sec-distance-to-nearest-geometry).\n", "We might also want to consider topography, meaning that we are not only interested in the pure distance but would also like to avoid crossing mountain ranges on the way to the coast.\n", - "To do so, we can weight the distance with elevation so that each additional altitudinal meter \"prolongs\" the Euclidean distance (this is beyond the scope of the book).\n", - "\n", - "\n", + "To do so, we can weight the distance with elevation so that each additional altitudinal meter 'prolongs' the Euclidean distance (this is beyond the scope of the book).\n", "Visibility and viewshed computations also belong to the family of global operations (also beyond the scope of the book).\n", - "\n", - "\n", "\n", "### Map algebra counterparts in vector processing\n", "\n", "Many map algebra operations have a counterpart in vector processing [@liu_essential_2009].\n", - "Computing a distance raster (global operation) while only considering a maximum distance (logical focal operation) is the equivalent to a vector buffer operation (@sec-buffers).\n", + "Computing a distance raster (global operation) while only considering a maximum distance (logical focal operation) is the equivalent of a vector buffer operation (@sec-buffers).\n", "Reclassifying raster data (either local or zonal function depending on the input) is equivalent to dissolving vector data (@sec-geometry-unions).\n", - "Overlaying two rasters (local operation), where one contains \"No Data\" values representing a mask, is similar to vector clipping (Section @sec-clipping).\n", + "Overlaying two rasters (local operation), where one contains 'No Data' values representing a mask, is similar to vector clipping (@sec-clipping).\n", "Quite similar to spatial clipping is intersecting two layers (@sec-spatial-subsetting-vector, @sec-joining-incongruent-layers).\n", "The difference is that these two layers (vector or raster) simply share an overlapping area.\n", "However, be careful with the wording.\n", @@ -2227,13 +2162,12 @@ "\n", "Suppose we would like to compute the NDVI (see @sec-raster-local-operations), and additionally want to compute terrain attributes from elevation data for observations within a study area.\n", "Such computations rely on remotely sensed information.\n", - "The corresponding source imagery is often divided into scenes covering a specific spatial extent (i.e., \"tiles\"), and frequently, a study area covers more than one scene.\n", - "Then, we would need to merge (also known as \"mosaic\") the scenes covered by our study area.\n", - "In case when all scenes are
\"aligned\" (i.e., share the same origin and resolution), this can be thought of as simply gluing them into one big raster; otherwise, all scenes should be resampled (see @sec-raster-resampling) to the grid defined by the first scene.\n", + "The corresponding source imagery is often divided into scenes covering a specific spatial extent (i.e., tiles), and frequently, a study area covers more than one scene.\n", + "Then, we would need to merge (also known as mosaic) the scenes covering our study area.\n", + "In case when all scenes are aligned (i.e., share the same origin and resolution), this can be thought of as simply gluing them into one big raster; otherwise, all scenes need to be resampled (see @sec-raster-resampling) to the same grid (e.g., the one defined by the first scene).\n", "\n", - "For example, let us merge digital elevation data from two SRTM elevation tiles, for Austria (`'aut.tif'`) and Switzerland (`'ch.tif'`).\n", - "Merging can be done using function `rasterio.merge.merge`, which accepts a `list` of raster file connections, and returns the new `ndarray` and a \"transform\", representing the resulting mosaic.\n", - "" + "For example, let's merge digital elevation data from two SRTM elevation tiles, for Austria (`'aut.tif'`) and Switzerland (`'ch.tif'`).\n", + "Merging can be done using function `rasterio.merge.merge`, which accepts a `list` of raster file connections, and returns the new `ndarray` and the corresponding transform object, representing the resulting mosaic." ] }, { @@ -2286,7 +2220,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "By default in `rasterio.merge.merge` (`method='first'`), areas of overlap retain the value of the *first* raster.\n", + "By default in `rasterio.merge.merge`, areas of overlap retain the value of the *first* raster (`method='first'`).\n", "Other possible methods are:\n", "\n", "- `'last'`---Value of the last raster\n", @@ -2296,24 +2230,22 @@ "When dealing with non-overlapping tiles, such as `aut.tif` and `ch.tif` (above), the `method` argument has no practical effect.\n", "However, it becomes relevant when we want to combine spectral imagery from scenes that were taken on different dates.\n", "The above four options for `method` do not cover the commonly required scenario when we would like to compute the *mean* value---for example to calculate a seasonal average NDVI image from a set of partially overlapping satellite images (such as Landsat).\n", - "An alternative worflow to `rasterio.merge.merge`, for calculating a mosaic as well as \"averaging\" any overlaps, is to go through two steps:\n", - "\n", - "- Resampling all scenes into a common \"global\" grid (@sec-raster-resampling), thereby producing a series of \"matching\" rasters (with the area surrounding each scene set as \"No Data\")\n", - "- Averaging the rasters through raster algebra (@sec-raster-local-operations), using `np.mean(m,axis=0)` or `np.nanmean(m,axis=0)` (depending whether we prefer to ignore \"No Data\" or not), where `m` is the multi-band array, which would return a single-band array of averages\n", + "An alternative workflow to `rasterio.merge.merge`, for calculating a mosaic as well as averaging any overlaps, is to go through two steps:\n", "\n", - "## Exercises\n", + "- Resampling all scenes into a common 'global' grid (@sec-raster-resampling), thereby producing a series of matching rasters (with the area surrounding each scene set as 'No Data')\n", + "- Averaging the rasters through raster algebra (@sec-raster-local-operations), using 
`np.mean(m,axis=0)` or `np.nanmean(m,axis=0)` (depending whether we prefer to ignore 'No Data' or not), where `m` is the multi-band array, which would return a single-band array of averages\n", "\n", - "- Write a function which accepts and array and an `int` specifying the number of rows/columns to erase along an array edges. The function needs to return the modified array with `np.nan` values along its edges.\n", + "\n" ] } ], "metadata": { "kernelspec": { - "name": "python3", + "display_name": "Python 3", "language": "python", - "display_name": "Python 3 (ipykernel)" + "name": "python3" } }, "nbformat": 4, diff --git a/ipynb/04-geometry-operations.ipynb b/ipynb/04-geometry-operations.ipynb index 0a0eafe0..6ccf4136 100644 --- a/ipynb/04-geometry-operations.ipynb +++ b/ipynb/04-geometry-operations.ipynb @@ -4,6 +4,10 @@ "cell_type": "markdown", "metadata": {}, "source": [ + "---\n", + "jupyter: python3\n", + "---\n", + "\n", "# Geometry operations {#sec-geometric-operations}\n", "\n", "## Prerequisites {.unnumbered}" @@ -14,12 +18,24 @@ "metadata": {}, "source": [ "#| echo: false\n", - "import matplotlib.pyplot as plt\n", - "import pandas as pd\n", - "pd.options.display.max_rows = 6\n", - "pd.options.display.max_columns = 6\n", - "pd.options.display.max_colwidth = 35\n", - "plt.rcParams['figure.figsize'] = (5, 5)" + "import book_options" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "::: {.content-visible when-format=\"pdf\"}" + ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "#| echo: false\n", + "import book_options_pdf" ], "execution_count": null, "outputs": [] @@ -28,6 +44,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ + ":::\n", + "\n", "This chapter requires importing the following packages:" ] }, @@ -37,6 +55,8 @@ "source": [ "import sys\n", "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import pandas as pd\n", "import shapely\n", "import geopandas as gpd\n", "import topojson as tp\n", @@ -85,7 +105,7 @@ "\n", "@sec-geo-ras covers geometric transformations on raster objects.\n", "This involves changing the size and number of the underlying pixels, and assigning them new values.\n", - "It teaches how to change the extent and the origin of a raster \"manually\" (@sec-extent-and-origin), how to change the resolution in fixed \"steps\" through aggregation and disaggregation (@sec-raster-agg-disagg), and finally how to resample a raster into any existing template, which is the most general and often most practical approach (@sec-raster-resampling).\n", + "It teaches how to change the extent and the origin of a raster manually (@sec-extent-and-origin), how to change the resolution in fixed steps through aggregation and disaggregation (@sec-raster-agg-disagg), and finally how to resample a raster into any existing template, which is the most general and often most practical approach (@sec-raster-resampling).\n", "These operations are especially useful if one would like to align raster datasets from diverse sources.\n", "Aligned raster objects share a one-to-one correspondence between pixels, allowing them to be processed using map algebra operations (@sec-raster-local-operations).\n", "\n", @@ -100,10 +120,10 @@ "\n", "### Simplification {#sec-simplification}\n", "\n", - "Simplification is a process for generalization of vector objects (lines and polygons) usually for use in smaller scale maps.\n", - "Another reason for simplifying objects is to reduce the amount of 
memory, disk space and network bandwidth they consume: it may be wise to simplify complex geometries before publishing them as interactive maps.\n", - "The **geopandas** package provides the [`.simplify`](https://geopandas.org/en/stable/docs/reference/api/geopandas.GeoSeries.simplify.html) method, which uses the GEOS implementation of the Douglas-Peucker algorithm to reduce the vertex count.\n", - "`.simplify` uses the `tolerance` to control the level of generalization in map units [@douglas_algorithms_1973].\n", + "Simplification is a process for generalization of vector objects (lines and polygons), usually for use in smaller-scale maps.\n", + "Another reason for simplifying objects is to reduce the amount of memory, disk space, and network bandwidth they consume: it may be wise to simplify complex geometries before publishing them as interactive maps.\n", + "The **geopandas** package provides the `.simplify` method, which uses the GEOS implementation of the Douglas-Peucker algorithm to reduce the vertex count.\n", + "`.simplify` uses `tolerance` to control the level of generalization in map units [@douglas_algorithms_1973].\n", "\n", "For example, a simplified version of the `'LineString'` geometry representing the river Seine and tributaries, using a tolerance of `2000` meters, can be created using the `seine.simplify(2000)` command (@fig-simplify-lines)." ] }, { @@ -150,11 +170,7 @@ "Simplification is also applicable for polygons.\n", "This is illustrated using `us_states`, representing the contiguous United States.\n", "As we show in @sec-reproj-geo-data, for many calculations **geopandas** (through **shapely**, and, ultimately, GEOS) assumes that the data is in a projected CRS, and this could lead to unexpected results when applying distance-related operators.\n", - "Therefore, the first step is to project the data into some adequate projected CRS, such as US National Atlas Equal Area (EPSG:`9311`) (on the left in Figure @fig-simplify-polygons), using `.to_crs` (@sec-reprojecting-vector-geometries).\n", - "\n", - "\n", - "" + "Therefore, the first step is to project the data into some adequate projected CRS, such as US National Atlas Equal Area (EPSG:`9311`) (on the left in @fig-simplify-polygons), using `.to_crs` (@sec-reprojecting-vector-geometries)."
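In code, this workflow might look like the following sketch, assuming `us_states` is the layer imported in the chapter's prerequisites (the tolerance value is illustrative):

```python
# Project to US National Atlas Equal Area (EPSG:9311), then simplify
# with an illustrative tolerance of 100 km (i.e., 100000 map units)
us_states9311 = us_states.to_crs(9311)
us_states_simp1 = us_states9311.simplify(100000)
```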
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "A limitation with `.simplify`, however, is that it simplifies objects on a per-geometry basis.\n", - "This means the \"topology\" is lost, resulting in overlapping and \"holey\" areal units as illustrated in @fig-simplify-polygons (b).\n", + "This means the topology is lost, resulting in overlapping and 'holey' areal units as illustrated in @fig-simplify-polygons (b).\n", "The `.toposimplify` method from package **topojson** provides an alternative that overcomes this issue.\n", - "By [default](https://mattijn.github.io/topojson/example/settings-tuning.html#simplify_algorithm) it uses the Douglas-Peucker algorithm like the `.simplify` method.\n", - "However, another algorithm, known as Visvalingam-Whyatt, which overcomes some limitations of the Douglas-Peucker algorithm [@visvalingam_line_1993], is also available in `.toposimplify`.\n", - "The main advanatage of `.toposimplify` is that it is topologically \"aware\": it simplifies the combined borders of the polygons (rather than each polygon on its own), thus ensuring that the overlap is maintained.\n", + "The main advantage of `.toposimplify` is that it is topologically 'aware': it simplifies the combined borders of the polygons (rather than each polygon on its own), thus ensuring that the overlap is maintained.\n", "The following code chunk uses `.toposimplify` to simplify `us_states9311`.\n", - "Note that, when using the **topojson** package, we first need to calculate a \"topology\" object, using function `tp.Topology`, and then apply the sumplification function, such as `.toposimplify`, to obtain a simplified layer.\n", - "We are also using the `.to_gdf` method to return a `GeoDataFrame`. \n", - "\n", - "" + "Note that, when using the **topojson** package, we first need to calculate a topology object, using the function `tp.Topology`, and then apply the simplification function, such as `.toposimplify`, to obtain a simplified layer.\n", + "We are also using the `.to_gdf` method to return a `GeoDataFrame`. " ] }, { "cell_type": "code", "metadata": {}, "source": [ + "#| warning: false\n", "topo = tp.Topology(us_states9311, prequantize=False)\n", "us_states_simp2 = topo.toposimplify(100000).to_gdf()" ], "execution_count": null, "outputs": [] }, { @@ -242,7 +255,7 @@ "\n", "Centroid operations identify the center of geographic objects.\n", "Like statistical measures of central tendency (including mean and median definitions of 'average'), there are many ways to define the geographic center of an object.\n", - "All of them create single point representations of more complex vector objects.\n", + "All of them create single-point representations of more complex vector objects.\n", "\n", "The most commonly used centroid operation is the geographic centroid.\n", "This type of centroid operation (often referred to as 'the centroid') represents the center of mass in a spatial object (think of balancing a plate on your finger).\n", @@ -283,7 +296,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The centroids and points in surface are illustrated in @fig-centroid-pnt-on-surface." + "The centroids and points on surface are illustrated in @fig-centroid-pnt-on-surface."
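As a sketch, these point representations can be computed with the **geopandas** `.centroid` property and the `.representative_point` method (only `nz_centroid` is visible in the plotting code below; the other names are illustrative):

```python
# Geographic centroids of the nz and seine layers from this chapter
nz_centroid = nz.centroid
seine_centroid = seine.centroid

# Points guaranteed to fall on the surface of each geometry
nz_pos = nz.representative_point()
seine_pos = seine.representative_point()
```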
] }, { @@ -296,6 +309,7 @@ "#| fig-subcap: \n", "#| - New Zealand\n", "#| - Seine\n", + "\n", "# New Zealand\n", "base = nz.plot(color='white', edgecolor='lightgrey')\n", "nz_centroid.plot(ax=base, color='None', edgecolor='black')\n", @@ -322,7 +336,24 @@ "\n", "@fig-buffers illustrates buffers of two different sizes (5 and 50 $km$) surrounding the river Seine and tributaries.\n", "These buffers were created with the commands below, using the `.buffer` method, applied to a `GeoSeries` or `GeoDataFrame`.\n", - "The `.buffer` method requires one important argument: the buffer distance, provided in the units of the CRS, in this case, meters (@fig-buffers)." + "The `.buffer` method requires one important argument: the buffer distance, provided in the units of the CRS, in this case, meters." ] }, { + "cell_type": "code", + "metadata": {}, + "source": [ + "seine_buff_5km = seine.buffer(5000)\n", + "seine_buff_50km = seine.buffer(50000)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The results are shown in @fig-buffers." ] }, { @@ -330,13 +361,12 @@ "metadata": {}, "source": [ "#| label: fig-buffers\n", - "#| fig-cap: Buffers around the Seine dataset of 5 km (left) and 50 km (right). Note the colors, which reflect the fact that one buffer is created per geometry feature.\n", + "#| fig-cap: Buffers around the Seine dataset of 5 $km$ and 50 $km$. Note the colors, which reflect the fact that one buffer is created per geometry feature.\n", "#| layout-ncol: 2\n", "#| fig-subcap: \n", "#| - 5 $km$ buffer\n", "#| - 50 $km$ buffer\n", - "seine_buff_5km = seine.buffer(5000)\n", - "seine_buff_50km = seine.buffer(50000)\n", + "\n", "seine_buff_5km.plot(color='none', edgecolor=['c', 'm', 'y']);\n", "seine_buff_50km.plot(color='none', edgecolor=['c', 'm', 'y']);" ], "execution_count": null, "outputs": [] }, { @@ -381,7 +411,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Alternative option is to add a secondary geometry column directly to the original `GeoDataFrame`." + "An alternative option is to add a secondary geometry column directly to the original `GeoDataFrame`."
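A sketch of this approach (the `'buffer'` column name and the copy are illustrative):

```python
# Keep the original geometry and store the 5 km buffers as a second column
seine2 = seine.copy()
seine2['buffer'] = seine2.buffer(5000)
```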
] }, { @@ -398,7 +428,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "You can then switch to either geometry column (i.e., make it the \"active\" geometry column) using `.set_geometry`, as in:" + "You can then switch to either geometry column (i.e., make it 'active') using `.set_geometry`, as in:" ] }, { @@ -434,17 +464,15 @@ "### Affine transformations {#sec-affine-transformations}\n", "\n", "Affine transformations include, among others, shifting (translation), scaling, and rotation, or any combination of these.\n", - "They preserves lines and parallelism, by angles and lengths are not necessarily preserved.\n", + "They preserve lines and parallelism, but angles and lengths are not necessarily preserved.\n", "These transformations are an essential part of geocomputation.\n", "For example, shifting is needed for label placement, scaling is used in non-contiguous area cartograms, and many affine transformations are applied when reprojecting or improving the geometry that was created based on a distorted or wrongly projected map.\n", "\n", "The **geopandas** package implements affine transformations for objects of classes `GeoSeries` and `GeoDataFrame`.\n", - "In both cases, the method is applied on the `GeoSeries` part, returning a just the `GeoSeries` of transformed geometries.\n", - "\n", - "\n", + "In both cases, the method is applied on the `GeoSeries` part, returning just the `GeoSeries` of transformed geometries.\n", "\n", - "Affine transformations of `GeoSeries` can be done using the [`.affine_transform`](https://geopandas.org/en/stable/docs/reference/api/geopandas.GeoSeries.affine_transform.html) method, which is a wrapper around the `shapely.affinity.affine_transform` function.\n", - "As [documented](https://shapely.readthedocs.io/en/stable/manual.html#shapely.affinity.affine_transform), a 2D affine transformation requires a six-parameter list `[a,b,d,e,xoff,yoff]` which represents the following equations for transforming the coordinates (@eq-affine1 and @eq-affine2)/\n", + "Affine transformations of `GeoSeries` can be done using the `.affine_transform` method, which is a wrapper around the `shapely.affinity.affine_transform` function.\n", + "A two-dimensional affine transformation requires a six-parameter list `[a,b,d,e,xoff,yoff]` which represents @eq-affine1 and @eq-affine2 for transforming the coordinates.\n", "\n", "$$\n", "x' = a x + b y + x_\\mathrm{off}\n", "$$ {#eq-affine1}\n", "\n", "$$\n", "y' = d x + e y + y_\\mathrm{off}\n", "$$ {#eq-affine2}\n", "\n", - "There are also simplified `GeoSeries` [methods](https://geopandas.org/en/stable/docs/user_guide/geometric_manipulations.html#affine-transformations) for specific scenarios, such as:\n", + "There are also simplified `GeoSeries` methods for specific scenarios, such as:\n", "\n", "- `.translate(xoff=0.0, yoff=0.0)`\n", "- `.scale(xfact=1.0, yfact=1.0, origin='center')`\n", "- `.rotate(angle, origin='center', use_radians=False)`\n", - "- `.skew(angle, origin='center', use_radians=False)`\n", "\n", - "For example, *shifting* only requires the $x_{off}$ and $y_{off}$, using [`.translate`](https://geopandas.org/en/stable/docs/reference/api/geopandas.GeoSeries.translate.html).\n", - "\n", - "\n", + "For example, *shifting* only requires the $x_\\mathrm{off}$ and $y_\\mathrm{off}$, using `.translate`.\n", "The code below shifts the y-coordinates of `nz` by 100 $km$ to the north, but leaves the x-coordinates untouched."
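A sketch of such a shift, together with its `.affine_transform` equivalent under @eq-affine1 and @eq-affine2 (the variable names are illustrative):

```python
# Shift all geometries 100 km north (the CRS units are meters)
nz_shift = nz.translate(0, 100000)

# Equivalent affine transformation: identity [a,b,d,e] = [1,0,0,1]
# plus the y-offset
nz_shift2 = nz.geometry.affine_transform([1, 0, 0, 1, 0, 100000])
```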
] }, @@ -482,19 +507,15 @@ "metadata": {}, "source": [ "::: callout-note\n", - "**shapely**, and consequently **geopandas**, operations, typically [ignore](https://shapely.readthedocs.io/en/stable/manual.html#geometric-objects) the z-dimension of geometries in operations. For example, `shapely.LineString([(0,0,0),(0,0,1)]).length` returns `0` (and not `1`), since `.length` ignores the z-dimension. In this book (like in most real-world spatial analysis applications), we deal only with two-dimensional geometries.\n", + "**shapely**, and consequently **geopandas**, operations, typically ignore the z-dimension (if there is one) of geometries in operations. For example, `shapely.LineString([(0,0,0),(0,0,1)]).length` returns `0` (and not `1`), since `.length` ignores the z-dimension. This is not an issue in this book (and in most real-world spatial analysis applications), since we are dealing only with two-dimensional geometries.\n", ":::\n", "\n", "Scaling enlarges or shrinks objects by a factor, and can be applied either globally or locally.\n", "Global scaling increases or decreases all coordinates values in relation to the origin coordinates, while keeping all geometries topological relations intact.\n", - "**geopandas** implements local scaling using the [`.scale`](https://geopandas.org/en/stable/docs/reference/api/geopandas.GeoSeries.scale.html) method.\n", - "\n", - "\n", + "**geopandas** implements scaling using the `.scale` method.\n", "Local scaling treats geometries independently and requires points around which geometries are going to be scaled, e.g., centroids.\n", "In the example below, each geometry is shrunk by a factor of two around the centroids (@fig-affine-transformations (b)).\n", - "To achieve that, we pass the `0.5` and `0.5` scaling factors (for x and y, respectively), and the `'centroid'` option for the point of origin.\n", - "\n", - "" + "To achieve that, we pass the `0.5` and `0.5` scaling factors (for x and y, respectively), and the `'centroid'` option for the point of origin." ] }, { @@ -511,13 +532,11 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "::: callout-note\n", "When setting the `origin` in `.scale`, other than `'centroid'` it is possible to use `'center'`, for the bounding box center, or specific point coordinates, such as `(0,0)`.\n", - ":::\n", "\n", - "Rotating the geometries can be done using the [`.rotate`](https://geopandas.org/en/stable/docs/reference/api/geopandas.GeoSeries.rotate.html) method.\n", + "Rotating the geometries can be done using the `.rotate` method.\n", "When rotating, we need to specify the rotation angle (positive values imply clockwise rotation) and the `origin` points (using the same options as in `scale`).\n", - "For example, the following expression rotates `nz` by 30 degrees counter-clockwise, around the geometry centroids." + "For example, the following expression rotates `nz` by $30\\degree$ counter-clockwise, around the geometry centroids." ] }, { @@ -534,7 +553,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "@fig-affine-transformations shows the original layer `nz`, and the shifting, scaling and rotation results." + "@fig-affine-transformations shows the original layer `nz`, and the shifting, scaling, and rotation results." 
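For reference, the scaling and rotation shown in the figure can be sketched as follows (the parameter values mirror the description above; the angle's sign follows **shapely**'s convention):

```python
# Shrink each geometry by half, around its own centroid
nz_scale = nz.scale(0.5, 0.5, origin='centroid')

# Rotate each geometry by 30 degrees around its centroid
nz_rotate = nz.rotate(-30, origin='centroid')
```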
] }, { @@ -542,7 +561,7 @@ "metadata": {}, "source": [ "#| label: fig-affine-transformations\n", - "#| fig-cap: 'Illustrations of affine transformations: shift, scale and rotate'\n", + "#| fig-cap: 'Affine transformations of the `nz` layer: shift, scale, and rotate'\n", "#| layout-ncol: 3\n", "#| fig-subcap: \n", "#| - Shift\n", @@ -565,13 +584,10 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "\n", - "\n", - "\n", "### Pairwise geometry-generating operations {#sec-clipping}\n", "\n", "Spatial clipping is a form of spatial subsetting that involves changes to the geometry columns of at least some of the affected features.\n", - "Clipping can only apply to features more complex than points: lines, polygons and their 'multi' equivalents.\n", + "Clipping can only apply to features more complex than points: lines, polygons, and their 'multi' equivalents.\n", "To illustrate the concept we will start with a simple example: two overlapping circles with a center point one unit away from each other and a radius of one (@fig-overlapping-circles)." ] }, @@ -592,7 +608,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Imagine you want to select not one circle or the other, but the space covered by both x and y.\n", + "Imagine you want to select not one circle or the other, but the space covered by both `x` and `y`.\n", "This can be done using the `.intersection` method from **shapely**, illustrated using objects named `x` and `y` which represent the left- and right-hand circles (@fig-intersection)." ] }, @@ -620,7 +636,7 @@ "metadata": {}, "source": [ "#| label: fig-difference\n", - "#| fig-cap: Difference between `x` and `y` (namely, `x` \"minus\" `y`)\n", + "#| fig-cap: Difference between `x` and `y` (namely, `x` 'minus' `y`)\n", "x.difference(y)" ], "execution_count": null, @@ -658,15 +674,15 @@ "The **geopandas** package, as is often the case, contains wrappers of these **shapely** functions to be applied to multiple, or pairwise, use cases.\n", "For example, applying either of the pairwise methods on a `GeoSeries` or `GeoDataFrame`, combined with a `shapely` geometry, returns the pairwise (many-to-one) results (which is analogous to other operators, like `.intersects` or `.distance`, see @sec-spatial-subsetting-vector and @sec-distance-relations, respectively).\n", "\n", - "Let's demonstrate the \"many-to-one\" scenario by calculating the difference between each geometry in a `GeoSeries` and a \"fixed\" `shapely` geometry.\n", - "To create the latter, let's take `x` and combine it with itself translated (@sec-affine-transformations) to a distance of `1` and `2` units \"upwards\" on the y-axis." + "Let's demonstrate the 'many-to-one' scenario by calculating the difference between each geometry in a `GeoSeries` and a fixed `shapely` geometry.\n", + "To create the latter, let's take `x` and combine it with itself translated (@sec-affine-transformations) to a distance of `1` and `2` units 'upwards' on the y-axis." 
] }, { "cell_type": "code", "metadata": {}, "source": [ - "geom1 = gpd.GeoSeries([x])\n", + "geom1 = gpd.GeoSeries(x)\n", "geom2 = geom1.translate(0, 1)\n", "geom3 = geom1.translate(0, 2)\n", "geom = pd.concat([geom1, geom2, geom3])\n", @@ -687,9 +703,10 @@ "metadata": {}, "source": [ "#| label: fig-geom-intersection\n", - "#| fig-cap: A `GeoSeries` with three circles, and a `shapely` geometry that we will \"subtract\" from it (in red)\n", + "#| fig-cap: A `GeoSeries` with three circles (in grey), and a `shapely` geometry that we will subtract from it (in red)\n", + "\n", "fig, ax = plt.subplots()\n", - "geom.plot(color='lightgrey', edgecolor='black', ax=ax)\n", + "geom.plot(color='#00000030', edgecolor='black', ax=ax)\n", "gpd.GeoSeries(y).plot(color='#FF000040', edgecolor='black', ax=ax);" ], "execution_count": null, @@ -699,7 +716,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Now, using `.intersection` automatically applies the **shapely** method of the same name on each geometry in `geom`, returning a new `GeoSeries`, which we name `geom_inter_y`, with the pairwise \"intersections\".\n", + "Now, using `.intersection` automatically applies the **shapely** method of the same name on each geometry in `geom`, returning a new `GeoSeries`, which we name `geom_inter_y`, with the pairwise intersections.\n", "Note the empty third geometry (can you explain the meaning of this result?)." ] }, @@ -727,7 +744,7 @@ "#| label: fig-geom-intersection2\n", "#| fig-cap: The output `GeoSeries`, after subtracting a `shapely` geometry using `.intersection`\n", "\n", - "geom_inter_y.plot(color='lightgrey', edgecolor='black');" + "geom_inter_y.plot(color='#00000030', edgecolor='black');" ], "execution_count": null, "outputs": [] @@ -736,20 +753,21 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The `.overlay` method (see @sec-joining-incongruent-layers) further extends this technique, making it possible to apply \"many-to-many\" pairwise geometry generations between all pairs of two `GeoDataFrame`s.\n", + "The `.overlay` method (see @sec-joining-incongruent-layers) further extends this technique, making it possible to apply 'many-to-many' pairwise geometry generations between all pairs of two `GeoDataFrame`s.\n", "The output is a new `GeoDataFrame` with the pairwise outputs, plus the attributes of both inputs which were the inputs of the particular pairwise output geometry.\n", - "See the [\"Set operations with overlay\"](https://geopandas.org/en/stable/docs/user_guide/set_operations.html) article in the **geopandas** documentation for examples of `.overlay`.\n", + "Also see the *Set operations with overlay*[^set_ops_w_overlay] article in the **geopandas** documentation for examples of `.overlay`.\n", + "\n", + "[^set_ops_w_overlay]: [https://geopandas.org/en/stable/docs/user_guide/set_operations.html](https://geopandas.org/en/stable/docs/user_guide/set_operations.html)\n", "\n", "### Subsetting vs. 
clipping {#sec-subsetting-vs-clipping}\n", "\n", "In the last two chapters we have introduced two types of spatial operators: boolean, such as `.intersects` (@sec-spatial-subsetting-vector), and geometry-generating, such as `.intersection` (@sec-clipping).\n", "Here, we illustrate the difference between them.\n", "We do this using the specific scenario of subsetting points by polygons, where (unlike in other cases) both methods can be used for the same purpose, giving the same result.\n", - "\n", "\n", "To illustrate the point, we will subset points that cover the bounding box of the circles `x` and `y` from @fig-overlapping-circles.\n", - "Some points will be inside just one circle, some will be inside both and some will be inside neither.\n", - "The following code sections generates the sample data for this section, a simple random distribution of points within the extent of circles `x` and `y`, resulting in output illustrated in @fig-random-points.\n", + "Some points will be inside just one circle, some will be inside both, and some will be inside neither.\n", + "The following code sections generate the sample data for this section, a simple random distribution of points within the extent of circles `x` and `y`, resulting in output illustrated in @fig-random-points.\n", "We create the sample points in three steps.\n", "First, we figure out the bounds where random points are to be generated." ] }, { @@ -789,7 +807,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Third, we transform the list of coordinates into a `list` of `shapely` points and then to a `GeoSeries`." + "Third, we transform the list of coordinates into a `list` of `shapely` points, and then to a `GeoSeries`." ] }, { @@ -814,10 +832,10 @@ "metadata": {}, "source": [ "#| label: fig-random-points\n", - "#| fig-cap: Randomly distributed points within the bounding box enclosing circles x and y. The point that intersects with both objects x and y are highlighted. \n", + "#| fig-cap: Randomly distributed points within the bounding box enclosing circles `x` and `y`\n", "base = pnt.plot(color='none', edgecolor='black')\n", - "gpd.GeoSeries([x]).plot(ax=base, color='none', edgecolor='darkgrey');\n", - "gpd.GeoSeries([y]).plot(ax=base, color='none', edgecolor='darkgrey');" + "gpd.GeoSeries(x).plot(ax=base, color='none', edgecolor='darkgrey');\n", + "gpd.GeoSeries(y).plot(ax=base, color='none', edgecolor='darkgrey');" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now, we can get back to our question: how to subset the points to only return the point that intersects with both `x` and `y`?\n", "The code chunks below demonstrate two ways to achieve the same result.\n", - "In the first approach, we can calculate a boolean `Series`, evaluating whether each point of `pnt` intersects with the intersection of `x` and `y` (see @sec-spatial-subsetting-vector) and then use it to subset `pnt` to get the result `pnt1`." + "In the first approach, we can calculate a boolean `Series`, evaluating whether each point of `pnt` intersects with the intersection of `x` and `y` (see @sec-spatial-subsetting-vector), and then use it to subset `pnt` to get the result `pnt1`."
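Assuming `pnt`, `x`, and `y` as created above, this first approach might look like the following sketch:

```python
# Boolean Series: which points fall in the intersection of x and y?
sel = pnt.intersects(x.intersection(y))
pnt1 = pnt[sel]
```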
] }, { @@ -847,7 +865,7 @@ "metadata": {}, "source": [ "In the second approach, we can also find the intersection between the input points represented by `pnt`, using the intersection of `x` and `y` as the subsetting/clipping object.\n", - "Since the second argument is an individual `shapely` geometry (`x.intersection(y)`), we get \"pairwise\" intersections of each `pnt` with it (see @sec-clipping):" + "Since the second argument is an individual `shapely` geometry (`x.intersection(y)`), we get 'pairwise' intersections of each `pnt` with it (see @sec-clipping):" ] }, { @@ -874,8 +892,8 @@ "#| label: fig-intersection-points\n", "#| fig-cap: Randomly distributed points within the bounding box enclosing circles `x` and `y`. The points that intersect with both objects `x` and `y` are highlighted. \n", "base = pnt.plot(color='none', edgecolor='black')\n", - "gpd.GeoSeries([x]).plot(ax=base, color='none', edgecolor='darkgrey');\n", - "gpd.GeoSeries([y]).plot(ax=base, color='none', edgecolor='darkgrey');\n", + "gpd.GeoSeries(x).plot(ax=base, color='none', edgecolor='darkgrey');\n", + "gpd.GeoSeries(y).plot(ax=base, color='none', edgecolor='darkgrey');\n", "pnt2.plot(ax=base, color='red');" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "The only difference between the two approaches is that `.intersection` returns all \"intersections\", even if they are empty.\n", + "The only difference between the two approaches is that `.intersection` returns all intersections, even if they are empty.\n", "When these are filtered out, `pnt2` becomes identical to `pnt1`:" ] }, { @@ -903,14 +921,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "\n", - "\n", "The example above is rather contrived and provided for educational rather than applied purposes.\n", - "However, we encourage the reader to reproduce the results to deepen your understanding for handling geographic vector objects in Python. \n", - "\n", - "\n", - "\n", + "However, we encourage you to reproduce the results to deepen your understanding of handling geographic vector objects in Python. \n", "\n", "### Geometry unions {#sec-geometry-unions}\n", "\n", @@ -941,7 +953,7 @@ "metadata": {}, "source": [ "#| label: fig-dissolve\n", - "#| fig-cap: \"Spatial aggregation on contiguous polygons, illustrated by aggregating the population of 49 US states into 4 regions, with population represented by color. Note the operation automatically dissolves boundaries between states.\"\n", + "#| fig-cap: 'Spatial aggregation on contiguous polygons, illustrated by aggregating the population of 49 US states into 4 regions, with population represented by color. Note the operation automatically dissolves boundaries between states.'\n", "#| layout-ncol: 2\n", "#| fig-subcap: \n", "#| - 49 States\n", @@ -961,9 +973,9 @@ "metadata": {}, "source": [ "What is happening with the geometries here?\n", - "Behind the scenes, `.dissolve` combines the geometries and dissolve the boundaries between them using the [`.unary_union`](https://geopandas.org/en/stable/docs/reference/api/geopandas.GeoSeries.unary_union.html#geopandas.GeoSeries.unary_union) method per group.\n", - "This is demonstrated in the code chunk below which creates a united western US using the standalone `unary_union` operation.\n", - "Note that the result is a `shapely` geometry, as the individual attributes are \"lost\" as part of dissolving (@fig-dissolve2)."
+ "Behind the scenes, `.dissolve` combines the geometries and dissolves the boundaries between them using the `.union_all` method per group.\n", + "This is demonstrated in the code chunk below which creates a united western US using the standalone `.union_all` operation.\n", + "Note that the result is a `shapely` geometry, as the individual attributes are 'lost' as part of dissolving (@fig-dissolve2)." ] }, { @@ -973,7 +985,7 @@ "#| label: fig-dissolve2\n", "#| fig-cap: Western US\n", "us_west = us_states[us_states['REGION'] == 'West']\n", - "us_west_union = us_west.geometry.unary_union\n", + "us_west_union = us_west.geometry.union_all()\n", "us_west_union" ], "execution_count": null, @@ -983,7 +995,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "To dissolve two (or more) groups of a `GeoDataFrame` into one geometry, we can either (a) use a combined condition or (b) concatenate the two separate subsets and then dissove using `.unary_union`." + "To dissolve two (or more) groups of a `GeoDataFrame` into one geometry, we can either (a) use a combined condition or (b) concatenate the two separate subsets and then dissolve using `.union_all`." ] }, { @@ -993,11 +1005,11 @@ "# Approach 1\n", "sel = (us_states['REGION'] == 'West') | (us_states['NAME'] == 'Texas')\n", "texas_union = us_states[sel]\n", - "texas_union = texas_union.geometry.unary_union\n", + "texas_union = texas_union.geometry.union_all()\n", "# Approach 2\n", "us_west = us_states[us_states['REGION'] == 'West']\n", "texas = us_states[us_states['NAME'] == 'Texas']\n", - "texas_union = pd.concat([us_west, texas]).unary_union" + "texas_union = pd.concat([us_west, texas]).union_all()" ], "execution_count": null, "outputs": [] @@ -1026,27 +1038,24 @@ "source": [ "### Type transformations {#sec-type-transformations}\n", "\n", - "\n", - "\n", - "\n", - "Transformation of geometries, from one type to another, also known as \"geometry casting\", is often required to facilitate spatial analysis.\n", + "Transformation of geometries, from one type to another, also known as 'geometry casting', is often required to facilitate spatial analysis.\n", "Either the **geopandas** or the **shapely** packages can be used for geometry casting, depending on the type of transformation, and the way that the input is organized (whether and individual geometry, or a vector layer).\n", "Therefore, the exact expression(s) depend on the specific transformation we are interested in.\n", "\n", - "In general, you need to figure out the required input of the respective construstor function according to the \"destination\" geometry (e.g., `shapely.LineString`, etc.), then reshape the input of the \"source\" geometry into the right form to be passed to that function.\n", + "In general, you need to figure out the required input of the respective constructor function according to the 'destination' geometry (e.g., `shapely.LineString`, etc.), then reshape the input of the source geometry into the right form to be passed to that function.\n", "Or, when available, you can use a wrapper from **geopandas**.\n", "\n", - "In this section we demonstrate several common scenarios. \n", + "In this section, we demonstrate several common scenarios. 
\n", "We start with transformations of individual geometries from one type to another, using **shapely** methods:\n", "\n", "* `'MultiPoint'` to `'LineString'` (@fig-type-transform-linestring)\n", "* `'MultiPoint'` to `'Polygon'` (@fig-type-transform-polygon)\n", "* `'LineString'` to `'MultiPoint'` (@fig-type-transform-multipoint2)\n", - "* `'LineString'` to `'Polygon'` (@fig-type-transform-polygon2)\n", + "* `'Polygon'` to `'MultiPoint'` (@fig-type-transform-polygon2)\n", "* `'Polygon'`s to `'MultiPolygon'` (@fig-type-transform-multipolygon)\n", "* `'MultiPolygon'`s to `'Polygon'`s (@fig-type-transform-multipolygon1, @fig-type-transform-multipolygon2)\n", "\n", - "Then, we move on and demonstrate casting workflows on `GeoDataFrame`s, where we have further considerations, such as keeping track of geometry attributes, and the possibility of dissolving, rather than just combining, geometries. As we will see, these are done either by \"manually\" applying **shapely** methods on all geometries in the given layer, or using **geopandas** wrapper methods which do it automatically:\n", + "Then, we move on and demonstrate casting workflows on `GeoDataFrame`s, where we have further considerations, such as keeping track of geometry attributes, and the possibility of dissolving, rather than just combining, geometries. As we will see, these are done either by manually applying **shapely** methods on all geometries in the given layer, or using **geopandas** wrapper methods which do it automatically:\n", "\n", "* `'MultiLineString'` to `'LineString'`s (using `.explode`) (@fig-multilinestring-to-linestring)\n", "* `'LineString'` to `'MultiPoint'`s (using `.apply`) (@fig-linestring-to-multipoint)\n", @@ -1075,10 +1084,8 @@ "metadata": {}, "source": [ "A `'LineString'` can be created using `shapely.LineString` from a `list` of points.\n", - "Thus, a `'MultiPoint'` can be converted to a `'LineString'` by extracting the individual points into a `list`, then passing them to `shapely.LineString` (@fig-type-transform-linestring). \n", - "The `.geoms` property, mentioned in @sec-geometries, give access to the indivudual parts that comprise a multi-part geometry; it is one of the **shapely** access methods to internal parts of a geometry.\n", - "\n", - "" + "Thus, a `'MultiPoint'` can be converted to a `'LineString'` by passing the points into a `list`, then passing them to `shapely.LineString` (@fig-type-transform-linestring). \n", + "The `.geoms` property, mentioned in @sec-geometries, gives access to the individual parts that comprise a multi-part geometry, as an iterable object similar to a `list`; it is one of the **shapely** access methods to internal parts of a geometry." 
] }, { @@ -1108,7 +1115,7 @@ "source": [ "#| label: fig-type-transform-polygon\n", "#| fig-cap: A `'Polygon'` created from the `'MultiPoint'` in @fig-type-transform-multipoint\n", - "polygon = shapely.Polygon([[p.x, p.y] for p in multipoint.geoms])\n", + "polygon = shapely.Polygon(multipoint.geoms)\n", "polygon" ], "execution_count": null, "outputs": [] }, { @@ -1144,7 +1151,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Conversion from `'MultiPoint'` to `'LineString'` (@fig-type-transform-linestring) is a common operation that creates a line object from ordered point observations, such as GPS measurements or geotagged media.\n", + "Conversion from `'MultiPoint'` to `'LineString'`, shown above (@fig-type-transform-linestring), is a common operation that creates a line object from ordered point observations, such as GPS measurements or geotagged media.\n", "This allows spatial operations such as the length of the path traveled.\n", "Conversion from `'MultiPoint'` or `'LineString'` to `'Polygon'` (@fig-type-transform-polygon) is often used to calculate an area, for example from the set of GPS measurements taken around a lake or from the corners of a building lot.\n", @@ -1184,7 +1191,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Using these methods, we can transform between `'Point'`, `'LineString'`, and `'Polygon'` geometries, assuming there is a sufficient number of points (at least two to form a line, and at least three to form a polygon).\n", + "Using these methods, we can transform between `'Point'`, `'LineString'`, and `'Polygon'` geometries, assuming there is a sufficient number of points (at least two for a line, and at least three for a polygon).\n", "When dealing with multi-part geometries using **shapely**, we can:\n", "\n", "- Access single-part geometries (e.g., each `'Polygon'` in a `'MultiPolygon'` geometry) using `.geoms[i]`, where `i` is the index of the geometry\n", @@ -1213,7 +1220,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Now, here is how we can get back the `'Polygon'` part 1 (@fig-type-transform-multipolygon1):" + "Given `multipolygon`, here is how we can get back the `'Polygon'` part 1 (@fig-type-transform-multipolygon1):" ] }, { @@ -1221,7 +1228,7 @@ "metadata": {}, "source": [ "#| label: fig-type-transform-multipolygon1\n", - "#| fig-cap: The 1^st^ \"part\" extracted from the `'MultiPolygon'` in @fig-type-transform-multipolygon\n", + "#| fig-cap: The 1^st^ part extracted from the `'MultiPolygon'` in @fig-type-transform-multipolygon\n", "\n", "multipolygon.geoms[0]" ], "execution_count": null, "outputs": [] }, { @@ -1240,7 +1247,7 @@ "metadata": {}, "source": [ "#| label: fig-type-transform-multipolygon2\n", - "#| fig-cap: The 2^nd^ \"part\" extracted from the `'MultiPolygon'` in @fig-type-transform-multipolygon\n", + "#| fig-cap: The 2^nd^ part extracted from the `'MultiPolygon'` in @fig-type-transform-multipolygon\n", "\n", "multipolygon.geoms[1]" ], "execution_count": null, "outputs": [] }, { @@ -1251,13 +1258,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "However, dealing with multi-part geometries can be easier with **geopandas**. 
Thanks to the fact that geometries in a `GeoDataFrame` are associated with attributes, we can keep track of the origin of each geometry: duplicating the attributes when going from multi-part to single-part (using `.explode`, see below), or \"collapsing\" the attributes through aggregation when going from single-part to multi-part (using `.dissolve`, see @sec-geometry-unions).\n", - "\n", - "\n", + "However, dealing with multi-part geometries can be easier with **geopandas**. Thanks to the fact that geometries in a `GeoDataFrame` are associated with attributes, we can keep track of the origin of each geometry: duplicating the attributes when going from multi-part to single-part (using `.explode`, see below), or 'collapsing' the attributes through aggregation when going from single-part to multi-part (using `.dissolve`, see @sec-geometry-unions).\n", "\n", "Let's demonstrate going from multi-part to single-part (@fig-multilinestring-to-linestring) and then back to multi-part (@sec-geometry-unions), using a small line layer.\n", - "\n", - "\n", "As input, we will create a `'MultiLineString'` geometry composed of three lines (@fig-type-transform-multilinestring3)." ] }, { "cell_type": "code", "metadata": {}, "source": [ - "geom = gpd.GeoSeries([ml])\n", + "geom = gpd.GeoSeries(ml)\n", "geom" ], "execution_count": null, "outputs": [] }, { @@ -1316,8 +1319,8 @@ "source": [ "You can imagine it as a road or river network.\n", "The above layer `dat` has only one row that defines all the lines.\n", - "This restricts the number of operations that can be done, for example it prevents adding names to each line segment or calculating lengths of single lines.\n", - "Using **shapely** methods with which we are already familiar with (see above), the individual single-part geometries (i.e., the \"parts\") can be accessed through the `.geoms` property." + "This restricts the number of operations that can be done, for example, it prevents adding names to each line segment or calculating lengths of single lines.\n", + "Using **shapely** methods with which we are already familiar (see above), the individual single-part geometries (i.e., the 'parts') can be accessed through the `.geoms` property." ] }, { @@ -1333,7 +1336,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "However, specifically for the \"multi-part to single part\" type transformation scenarios, there is also a method called `.explode`, which can convert an entire multi-part `GeoDataFrame` to a single-part one.\n", + "However, specifically for the 'multi-part to single-part' type transformation scenarios, there is also a method called `.explode`, which can convert an entire multi-part `GeoDataFrame` to a single-part one.\n", "The advantage is that the original attributes (such as `id`) are retained, so that we can keep track of the original multi-part geometry properties that each part came from.\n", "The `index_parts=True` argument also lets us keep track of the original multipart geometry indices, and part indices, named `level_0` and `level_1`, respectively."
] @@ -1342,6 +1345,7 @@ "cell_type": "code", "metadata": {}, "source": [ + "#| warning: false\n", "dat1 = dat.explode(index_parts=True).reset_index()\n", "dat1" ], @@ -1353,7 +1357,7 @@ "metadata": {}, "source": [ "For example, here we see that all `'LineString'` geometries came from the same multi-part geometry (`level_0`=`0`), which had three parts (`level_1`=`0`,`1`,`2`).\n", - "@fig-multilinestring-to-linestring demonstrates the effect of `.explode` in converting a layer with multi-part geometries into a layer with single part geometries." + "@fig-multilinestring-to-linestring demonstrates the effect of `.explode` in converting a layer with multi-part geometries into a layer with single-part geometries." ] }, { @@ -1361,13 +1365,13 @@ "metadata": {}, "source": [ "#| label: fig-multilinestring-to-linestring\n", - "#| fig-cap: Transformation a `'MultiLineString'` layer with one feature, into a `'LineString'` layer with three features, using `.explode`\n", + "#| fig-cap: Transformation of a `'MultiLineString'` layer with one feature, into a `'LineString'` layer with three features, using `.explode`\n", "#| layout-ncol: 2\n", "#| fig-subcap: \n", "#| - MultiLineString layer\n", "#| - LineString layer, after applying `.explode`\n", - "dat.plot(column='id');\n", - "dat1.plot(column='level_1');" + "dat.plot(column='id', linewidth=7);\n", + "dat1.plot(column='level_1', linewidth=7);" ], "execution_count": null, "outputs": [] @@ -1379,7 +1383,7 @@ "As a side-note, let's demonstrate how the above **shapely** casting methods can be translated to **geopandas**. \n", "Suppose that we want to transform `dat1`, which is a layer of type `'LineString'` with three features, to a layer of type `'MultiPoint'` (also with three features). \n", "Recall that for a single geometry, we use the expression `shapely.MultiPoint(x.coords)`, where `x` is a `'LineString'` (@fig-type-transform-multipoint2). 
\n", - "When dealing with a `GeoDataFrame`, we wrap the conversion into `.apply`, to apply it on all geometries:" + "When dealing with a `GeoDataFrame`, we wrap the conversion into `.apply`, to apply it to all geometries:" ] }, { @@ -1405,13 +1409,13 @@ "metadata": {}, "source": [ "#| label: fig-linestring-to-multipoint\n", - "#| fig-cap: Transformation a `'LineString'` layer with three features, into a `'MultiPoint'` layer (also with three features), using `.apply` and **shapely** methods\n", + "#| fig-cap: Transformation of a `'LineString'` layer with three features, into a `'MultiPoint'` layer (also with three features), using `.apply` and **shapely** methods\n", "#| layout-ncol: 2\n", "#| fig-subcap: \n", "#| - LineString layer\n", "#| - MultiPoint layer\n", - "dat1.plot(column='level_1');\n", - "dat2.plot(column='level_1');" + "dat1.plot(column='level_1', linewidth=7);\n", + "dat2.plot(column='level_1', markersize=50);" ], "execution_count": null, "outputs": [] @@ -1420,8 +1424,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The opposite transformation, i.e., \"single-part to multi-part\", is achieved using the `.dissolve` method (which we are already familiar with, see @sec-geometry-unions).\n", - "For example, here is how we can get back to the `'MultiLineString'` geometry:" + "The opposite transformation, i.e., 'single-part to multi-part', is achieved using the `.dissolve` method (which we are already familiar with, see @sec-geometry-unions).\n", + "For example, here is how we can get from the `'LineString'` layer with three features back to the `'MultiLineString'` layer with one feature (since, in this case, there is just one group):" ] }, { @@ -1437,7 +1441,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The next code chunk is another example, dissolving the `nz` north and south parts into `'MultiPolygon'` geometries." + "The next code chunk is another example, dissolving the 16 polygons in `nz` into two geometries of the north and south parts (i.e., the two `'Island'` groups)." ] }, { @@ -1457,9 +1461,9 @@ "metadata": {}, "source": [ "Note that `.dissolve` not only combines single-part into multi-part geometries, but also dissolves any internal borders.\n", - "So, in fact, the result may be single-part (in case when all parts touch each other, unlike in `nz`).\n", + "So, in fact, the resulting geometries may be single-part (in case when all parts touch each other, unlike in `nz`).\n", "If, for some reason, we want to combine geometries into multi-part *without* dissolving, we can fall back to the **pandas** `.agg` method (custom table aggregation), supplemented with a **shapely** function specifying how exactly we want to transform each group of geometries into a new single geometry.\n", - "In the following example, for instance, we collect all `'Polygon'` and `'MultiPolygon'` parts of `nz` into a single `'MultiPolygon'` geometry with many separate parts (i.e., without dissolving), per group (`Island`)." + "In the following example, for instance, we collect all `'Polygon'` and `'MultiPolygon'` parts of `nz` into a single `'MultiPolygon'` geometry with many separate parts (i.e., without dissolving), per group." 
   ]
  },
  {
@@ -1484,7 +1488,7 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "The difference between the last two results (with and without dissolving, respectively) is not evident in the printout: in both cases we got a layer with two features of type `'MultiPolygon'`.\n",
+    "The difference between the last two results `nz_dis1` and `nz_dis2` (with and without dissolving, respectively) is not evident in the printout: in both cases we got a layer with two features of type `'MultiPolygon'`.\n",
    "However, in the first case internal borders were dissolved, while in the second case they were not.\n",
    "This is illustrated in @fig-combine-geoms:"
   ]
  },
@@ -1494,7 +1498,7 @@
   "metadata": {},
   "source": [
    "#| label: fig-combine-geoms\n",
-    "#| fig-cap: Combining New Zealand geometries into one, for each island, with and witout dissolving\n",
+    "#| fig-cap: Combining New Zealand geometries into one, for each island, with and without dissolving\n",
    "#| layout-ncol: 2\n",
    "#| fig-subcap: \n",
    "#| - Dissolving (using the **geopandas** `.dissolve` method)\n",
@@ -1513,12 +1517,9 @@
    "\n",
    "## Geometric operations on raster data {#sec-geo-ras}\n",
    "\n",
-    "\n",
-    "\n",
-    "\n",
-    " Geometric raster operations include the shift, flipping, mirroring, scaling, rotation or warping of images. \n",
-    " These operations are necessary for a variety of applications including georeferencing, used to allow images to be overlaid on an accurate map with a known CRS [@liu_essential_2009]. \n",
-    " A variety of georeferencing techniques exist, including:\n",
+    "Geometric raster operations include the shift, flipping, mirroring, scaling, rotation, or warping of images. \n",
+    "These operations are necessary for a variety of applications including georeferencing, used to allow images to be overlaid on an accurate map with a known CRS [@liu_essential_2009]. \n",
+    "A variety of georeferencing techniques exist, including:\n",
    "\n",
    "* Georectification based on known ground control points\n",
    "* Orthorectification, which also accounts for local topography\n",
@@ -1526,14 +1527,14 @@
    "\n",
    "Python is rather unsuitable for the first two points, since these often require manual intervention, which is why they are usually done with the help of dedicated GIS software. \n",
    "On the other hand, aligning several images is possible in Python, and this section shows, among other things, how to do so. \n",
-    "This often includes changing the extent, the resolution and the origin of an image. \n",
-    "A matching projection is of course also required but is already covered @sec-reprojecting-raster-geometries.\n",
+    "This often includes changing the extent, the resolution, and the origin of an image. \n",
+    "A matching projection is of course also required, but is already covered in @sec-reprojecting-raster-geometries.\n",
    "\n",
    "In any case, there are other reasons to perform a geometric operation on a single raster image. \n",
    "For instance, a common reason for aggregating a raster is to decrease run-time or save disk space. 
\n", "Of course, this approach is only recommended if the task at hand allows a coarser resolution of raster data.\n", "\n", - "### Geometric intersections {#sec-raster-geometric-intersections}\n", + "\n", "\n", "\n", "\n", @@ -1541,181 +1542,70 @@ "\n", "\n", "\n", - "In @sec-spatial-subsetting-raster we have shown how to extract values from a raster overlaid by coordinates or by a matching boolean mask.\n", - "A different case is when the area of interest is defined by any general (possibly non-matching) raster B, to retrieve a spatial output of a (smaller) subset of raster A we can:\n", + "\n", + "\n", "\n", - "- Extract the bounding box polygon of B (hereby, `clip`)\n", - "- Mask and crop A (hereby, `elev.tif`) using B (@sec-raster-cropping)\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", "\n", - "For example, suppose that we want to get a subset of the `elev.tif` raster using another, smaller, raster.\n", - "To demonstrate this, let's create (see @sec-raster-from-scratch) that smaller raster, hereby named `clip`.\n", - "First, we need to create a $3 \\times 3$ array of raster values." - ] - }, - { - "cell_type": "code", - "metadata": {}, - "source": [ - "clip = np.array([1] * 9).reshape(3, 3)\n", - "clip" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Then, we define the transformation matrix, in such a way that `clip` intersects with `elev.tif` (@fig-raster-intersection)." - ] - }, - { - "cell_type": "code", - "metadata": {}, - "source": [ - "new_transform = rasterio.transform.from_origin(\n", - " west=0.9, \n", - " north=0.45, \n", - " xsize=0.3, \n", - " ysize=0.3\n", - ")\n", - "new_transform" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now, for subsetting, we will derive a `shapely` geometry representing the `clip` raster extent, using [`rasterio.transform.array_bounds`](https://rasterio.readthedocs.io/en/latest/api/rasterio.transform.html#rasterio.transform.array_bounds)." - ] - }, - { - "cell_type": "code", - "metadata": {}, - "source": [ - "bbox = rasterio.transform.array_bounds(\n", - " clip.shape[1], # columns\n", - " clip.shape[0], # rows\n", - " new_transform\n", - ")\n", - "bbox" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The four numeric values can be transformed into a rectangular `shapely` geometry using `shapely.box` (@fig-raster-clip-bbox)." - ] - }, - { - "cell_type": "code", - "metadata": {}, - "source": [ - "#| label: fig-raster-clip-bbox\n", - "#| fig-cap: '`shapely` geometry derived from a clipping raster bounding box coordinates, a preliminary step for geometric intersection between two rasters'\n", - "bbox = shapely.box(*bbox)\n", - "bbox" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "@fig-raster-intersection shows the alignment of `bbox` and `elev.tif`." 
- ] - }, - { - "cell_type": "code", - "metadata": {}, - "source": [ - "#| label: fig-raster-intersection\n", - "#| fig-cap: The `elev.tif` raster, and the extent of another (smaller) raster `clip` which we use to subset it\n", - "fig, ax = plt.subplots()\n", - "rasterio.plot.show(src_elev, ax=ax)\n", - "gpd.GeoSeries([bbox]).plot(color='none', ax=ax);" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "From here on, subsetting can be done using masking and cropping, just like with any vector layer other than `bbox`, regardless whether it is rectangular or not.\n", - "We elaborate on masking and cropping in @sec-raster-cropping (check that section for details about `rasterio.mask.mask`), but, for completeness, here is the code for the last step of masking and cropping:" - ] - }, - { - "cell_type": "code", - "metadata": {}, - "source": [ - "out_image, out_transform = rasterio.mask.mask(\n", - " src_elev, \n", - " [bbox], \n", - " crop=True,\n", - " all_touched=True,\n", - " nodata=0\n", - ")" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The resulting subset array `out_image` contains all pixels intersecting with `clip` *pixels* (not necessarily with the centroids!).\n", - "However, due to the `all_touched=True` argument, those pixels which intersect with `clip`, but their centroid does not, retain their original values (e.g., `17`, `23`) rather than turned into \"No Data\" (e.g., `0`)." - ] - }, - { - "cell_type": "code", - "metadata": {}, - "source": [ - "out_image" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Therefore, in our case, subset `out_image` dimensions are $2 \\times 2$ (@fig-raster-intersection2; also see @fig-raster-intersection)." 
-   ]
-  },
-  {
-   "cell_type": "code",
-   "metadata": {},
-   "source": [
-    "#| label: fig-raster-intersection2\n",
-    "#| fig-cap: The resulting subset of the `elev.tif` raster\n",
-    "fig, ax = plt.subplots()\n",
-    "rasterio.plot.show(out_image, transform=out_transform, ax=ax)\n",
-    "gpd.GeoSeries([bbox]).plot(color='none', ax=ax);"
-   ],
-   "execution_count": null,
-   "outputs": []
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
    "### Extent and origin {#sec-extent-and-origin}\n",
    "\n",
-    "When merging or performing map algebra on rasters, their resolution, projection, origin and/or extent have to match.\n",
+    "When merging or performing map algebra on rasters, their resolution, projection, origin, and/or extent have to match.\n",
    "Otherwise, how should we add the values of one raster with a resolution of `0.2` decimal degrees to a second raster with a resolution of `1` decimal degree?\n",
    "The same problem arises when we would like to merge satellite imagery from different sensors with different projections and resolutions.\n",
    "We can deal with such mismatches by aligning the rasters.\n",
    "Typically, raster alignment is done through resampling---that way, it is guaranteed that the rasters match exactly (@sec-raster-resampling).\n",
-    "However, sometimes it can be useful to modify raster placement and extent \"manually\", by adding or removing rows and columns, or by modifying the origin, that is, slightly shifting the raster.\n",
+    "However, sometimes it can be useful to modify raster placement and extent manually, by adding or removing rows and columns, or by modifying the origin, that is, slightly shifting the raster.\n",
    "Sometimes, there are reasons other than alignment with a second raster for manually modifying raster extent and placement.\n",
    "For example, it may be useful to add extra rows and columns to a raster prior to focal operations, so that it is easier to operate on the edges.\n",
    "\n",
@@ -1737,7 +1627,7 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "To pad an `ndarray`, we can use the [`np.pad`](https://numpy.org/doc/stable/reference/generated/numpy.pad.html) function.\n",
+    "To pad an `ndarray`, we can use the `np.pad` function.\n",
    "The function accepts an array, and a tuple of the form `((rows_top,rows_bottom),(columns_left, columns_right))`.\n",
    "Also, we can specify the value that's being used for padding with `constant_values` (e.g., `18`).\n",
    "For example, here we pad `r` with one extra row and two extra columns, on both sides, resulting in the array `r_pad`:"
  ]
  },
@@ -1759,11 +1649,9 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "\n",
-    "\n",
    "However, for `r_pad` to be used in any spatial operation, we also have to update its transformation matrix.\n",
    "Whenever we add extra columns on the left, or extra rows on top, the raster *origin* changes.\n",
-    "To reflect this fact, we have to take to \"original\" origin and add the required multiple of pixel widths or heights (i.e., raster resolution steps).\n",
+    "To reflect this fact, we have to take the 'original' origin and add the required multiple of pixel widths or heights (i.e., raster resolution steps).\n",
    "The transformation matrix of a raster is accessible from the raster file metadata (@sec-raster-from-scratch) or, as a shortcut, through the `.transform` property of the raster file connection.\n",
    "For example, the next code chunk shows the transformation matrix of `elev.tif`.\n",
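+    "\n",
+    "As a schematic reminder (using the **affine** notation that **rasterio** transforms are built on), a 'north-up' transform has the following structure, where `xmin` and `ymax` are the origin coordinates and `dx` and `dy` are the pixel sizes:\n",
+    "```\n",
+    "Affine(dx,  0.0, xmin,\n",
+    "       0.0, -dy, ymax)\n",
+    "```"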
   ]
  },
@@ -1879,7 +1767,7 @@
   "source": [
    "We can shift a raster origin not just when padding, but in any other use case, just by changing its transformation matrix.\n",
    "The effect is that the raster is going to be shifted (which is analogous to `.translate` for shifting a vector layer, see @sec-affine-transformations).\n",
-    "Manually shifting a raster to arbitrary distance is rarely needed in real-life scenarios, but it is useful to know how to do it at least for better understanding the concept of *raster origin*.\n",
+    "Manually shifting a raster by an arbitrary distance is rarely needed in real-life scenarios, but it is useful to know how to do it, at least for a better understanding of the concept of *raster origin*.\n",
    "As an example, let's shift the origin of `elev.tif` by `(-0.25,0.25)`.\n",
    "First, we need to calculate the new origin."
   ]
  },
@@ -1900,8 +1788,6 @@
   "metadata": {},
   "source": [
    "To shift the origin in other directions we should change the two operators (`-`, `+`) accordingly.\n",
-    "\n",
-    "\n",
    "\n",
    "Then, same as when padding (see above), we create an updated transformation matrix."
   ]
  },
@@ -1933,7 +1819,7 @@
   "metadata": {},
   "source": [
    "#| label: fig-raster-shift-origin2\n",
-    "#| fig-cap: The padded `elev.tif` raster (@fig-raster-shift-origin) further shifted by `(0.25,0.25)`, and the extent of the original `elev.tif` raster (in red)\n",
+    "#| fig-cap: The `elev.tif` raster shifted by `(-0.25,0.25)`, and its original extent (in red)\n",
    "fig, ax = plt.subplots()\n",
    "rasterio.plot.show(r, transform=new_transform, cmap='Greys', ax=ax)\n",
    "elev_bbox.plot(color='none', edgecolor='red', ax=ax);"
   ],
   "execution_count": null,
   "outputs": []
  },
@@ -1947,19 +1833,17 @@
   "source": [
    "### Aggregation and disaggregation {#sec-raster-agg-disagg}\n",
    "\n",
-    "Raster datasets vary based on their resolution, from high resolution datasets that enable individual trees to be seen, to low resolution datasets covering large swaths of the Earth.\n",
+    "Raster datasets vary based on their resolution, from high-resolution datasets that enable individual trees to be seen, to low-resolution datasets covering large swaths of the Earth.\n",
    "Raster datasets can be transformed to either decrease (aggregate) or increase (disaggregate) their resolution, for a number of reasons.\n",
    "For example, aggregation can be used to reduce computational resource requirements of raster storage and subsequent steps, while disaggregation can be used to match other datasets, or to add detail.\n",
-    "As an example, we here change the spatial resolution of `dem.tif` by a factor of `5` (@fig-raster-aggregate).\n",
    "\n",
    "::: callout-note\n",
    "Raster aggregation is, in fact, a special case of raster resampling (see @sec-raster-resampling), where the target raster grid is aligned with the original raster, only with coarser pixels.\n",
-    "Conversely, raster resampling is the general case where the new grid is not necessarily an aggregation of the original one, but any other type of grid (such as a rotated and/or shifted one, etc.).\n",
+    "Conversely, raster resampling is the general case where the new grid is not necessarily an aggregation of the original one, but any other type of grid (i.e., shifted and/or having increased/reduced resolution, by any factor).\n",
    ":::\n",
-    "\n",
-    "\n",
    "\n",
-    "To aggregate a raster using **rasterio**, we go through [two steps](https://rasterio.readthedocs.io/en/stable/topics/resampling.html):\n",
+    "As an example, here we change the spatial resolution of `dem.tif` by a factor of `5` (@fig-raster-aggregate).\n",
+    "To aggregate a raster 
using **rasterio**, we go through two steps:\n", "\n", "- Reading the raster values (using `.read`) into an `out_shape` that is different from the original `.shape`\n", "- Updating the `transform` according to `out_shape`\n", @@ -1999,10 +1883,10 @@ "source": [ "To aggregate, instead of reading the raster values the usual way, as in `src.read(1)`, we can specify `out_shape` to read the values into a different shape.\n", "Here, we calculate a new shape which is downscaled by a factor of `5`, i.e., the number of rows and columns is multiplied by `0.2`.\n", - "We must truncate any \"partial\" rows and columns, e.g., using `int`.\n", - "Each new pixel is now obtained, or \"resampled\", from $\\sim 5 \\times 5 = \\sim 25$ \"old\" raster values.\n", + "We must truncate any partial rows and columns, e.g., using `int`.\n", + "Each new pixel is now obtained, or resampled, from $\\sim 5 \\times 5 = \\sim 25$ 'old' raster values.\n", "It is crucial to choose an appropriate *resampling method* through the `resampling` parameter.\n", - "Here we use [`rasterio.enums.Resampling.average`](https://rasterio.readthedocs.io/en/stable/api/rasterio.enums.html#rasterio.enums.Resampling), i.e., the new \"large\" pixel value is the average of all coinciding small pixels, which makes sense for our elevation data in `dem.tif`. \n", + "Here we use `rasterio.enums.Resampling.average`, i.e., the new 'large' pixel value is the average of all coinciding small pixels, which makes sense for our elevation data in `dem.tif`. \n", "See @sec-raster-resampling for a list of other available methods." ] }, @@ -2042,9 +1926,6 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "\n", - "\n", - "\n", "What's left to be done is the second step, to update the transform, taking into account the change in raster shape.\n", "This can be done as follows, using `.transform.scale`." ] @@ -2112,9 +1993,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Then we can create a new file (`dem_agg5.tif`) in writing mode, and write the values from the aggregated array `r` into the 1^st^ band of the file (see @sec-data-output-raster for a detailed explanation of writing raster files with **rasterio**).\n", - "\n", - "" + "Then we can create a new file (`dem_agg5.tif`) in writing mode, and write the values from the aggregated array `r` into the 1^st^ band of the file (see @sec-data-output-raster for a detailed explanation of writing raster files with **rasterio**)." ] }, { @@ -2133,15 +2012,15 @@ "metadata": {}, "source": [ "::: callout-note\n", - "The `**` syntax in Python is known as variable-length [\"*keyword* arguments\"](https://docs.python.org/3/glossary.html#term-argument).\n", + "The `**` syntax in Python is known as variable-length '*keyword* arguments'.\n", "It is used to pass a dictionary of numerous `parameter:argument` pairs to named arguments of a function.\n", - "In `rasterio.open` writing mode, the \"keyword arguments\" syntax often comes in handy, because, instead of specifying each and every property of a new file, we pass a (modified) `.meta` dictionary based on another, template, raster. \n", + "In `rasterio.open` writing mode, the 'keyword arguments' syntax often comes in handy, because, instead of specifying each and every property of a new file, we pass a (modified) `.meta` dictionary based on another, template, raster. 
\n", "\n", "Technically, keep in mind that the expression:\n", "```\n", "rasterio.open('out.tif', 'w', **dst_kwargs)\n", "```\n", - "where `dst_kwargs` is a `dict` of the following form (typically coming from a template raster, possibly with few \"updated\" properties using `.update`, see above):\n", + "where `dst_kwargs` is a `dict` of the following form (typically coming from a template raster, possibly with few updated properties using `.update`, see above):\n", "```\n", "{'driver': 'GTiff',\n", " 'dtype': 'float32',\n", @@ -2159,13 +2038,11 @@ " ...\n", ")\n", "```\n", - "\"*Positional* arguments\" is a related technique; see note in @sec-reprojecting-raster-geometries.\n", + "*Positional* arguments is a related technique; see note in @sec-reprojecting-raster-geometries.\n", ":::\n", "\n", "The opposite operation, namely disaggregation, is when we increase the resolution of raster objects.\n", "Either of the supported resampling methods (see @sec-raster-resampling) can be used.\n", - "\n", - "\n", "However, since we are not actually summarizing information but transferring the value of a large pixel into multiple small pixels, it makes sense to use either:\n", "\n", "- Nearest neighbor resampling (`rasterio.enums.Resampling.nearest`), when want to keep the original values as-is, since modifying them would be incorrect (such as in categorical rasters)\n", @@ -2233,7 +2110,7 @@ "source": [ "The original raster `dem.tif` was already quite detailed, so it would be difficult to see any difference when plotting it along with the disaggregation result.\n", "A zoom-in of a small section of the rasters works better.\n", - "@fig-raster-disaggregate allows us to see the top-left corner of the original raster and the disaggregated one, demonstrating the increase in the number of pixels through disaggregation." + "@fig-raster-disaggregate shows the top-left corners of the original raster and the disaggregated one, demonstrating the increase in the number of pixels through disaggregation." ] }, { @@ -2241,7 +2118,7 @@ "metadata": {}, "source": [ "#| label: fig-raster-disaggregate\n", - "#| fig-cap: Disaggregating a raster by a factor of 5, using bilinear tresampling. Only the a small portion (top-left corner) of the rasters is shown, to zoom-in and demonstrate the effect of disaggregation.\n", + "#| fig-cap: Disaggregating a raster by a factor of 5, using bilinear tresampling. Only a small portion (top-left corner) of the rasters is shown, to zoom-in and demonstrate the effect of disaggregation.\n", "#| layout-ncol: 2\n", "#| fig-subcap: \n", "#| - Original\n", @@ -2268,22 +2145,20 @@ "There are several methods for estimating values for a raster with different resolutions/origins (@fig-raster-resample).\n", "The main resampling methods include:\n", "\n", - "- Nearest neighbor: assigns the value of the nearest cell of the original raster to the cell of the target one. This is a fast simple technique that is usually suitable for resampling categorical rasters\n", - "- Bilinear interpolation: assigns a weighted average of the four nearest cells from the original raster to the cell of the target one. This is the fastest method that is appropriate for continuous rasters\n", - "- Cubic interpolation: uses values of the 16 nearest cells of the original raster to determine the output cell value, applying third-order polynomial functions. 
Used for continuous rasters and results in a smoother surface compared to bilinear interpolation, but is computationally more demanding\n",
    "- Cubic spline interpolation---also uses values of the 16 nearest cells of the original raster to determine the output cell value, but applies cubic splines (piecewise third-order polynomial functions). Used for continuous rasters\n",
    "- Lanczos windowed sinc resampling---uses values of the 36 nearest cells of the original raster to determine the output cell value. Used for continuous rasters\n",
    "- Additionally, we can use straightforward summary methods, taking into account all pixels that coincide with the target pixel, such as average (@fig-raster-aggregate), minimum, maximum (@fig-raster-resample), median, mode, and sum\n",
    "\n",
    "The above explanation highlights that only nearest neighbor resampling is suitable for categorical rasters, while all remaining methods can be used (with different outcomes) for continuous rasters.\n",
    "\n",
-    "\n",
-    "\n",
-    "With **rasterio**, resampling can be done using the [`rasterio.warp.reproject`](https://rasterio.readthedocs.io/en/stable/api/rasterio.warp.html#rasterio.warp.reproject) function .\n",
+    "With **rasterio**, resampling can be done using the `rasterio.warp.reproject` function.\n",
    "To clarify this naming convention, note that raster *reprojection* is not fundamentally different from *resampling*---the difference is just whether the target grid is in the same CRS as the origin (resampling) or in a different CRS (reprojection).\n",
    "In other words, reprojection is *resampling* into a grid that is in a different CRS.\n",
    "Accordingly, both resampling and reprojection are done using the same function `rasterio.warp.reproject`.\n",
-    "We will demonstrate reprojection using `rasterio.warp.reproject` later in @sec-reprojecting-raster-geometries.\n",
+    "We will demonstrate *reprojection* using `rasterio.warp.reproject` later in @sec-reprojecting-raster-geometries.\n",
    "\n",
    "The information required for `rasterio.warp.reproject`, whether we are resampling or reprojecting, is:\n",
    "\n",
@@ -2292,25 +2167,24 @@
    "\n",
    "Importantly, `rasterio.warp.reproject` can work with file connections, such as a connection to an output file in write (`'w'`) mode.\n",
    "This makes the function efficient for large rasters.\n",
-    "\n",
-    "\n",
    "\n",
    "The source and destination CRS are straightforward to specify, 
depending on our choice.\n",
-    "The source transform is also available, e.g., through the `.transform` property of the source file connection.\n",
+    "The source transform is also readily available, through the `.transform` property of the source file connection.\n",
    "The only complicated part is to figure out the *destination transform*.\n",
    "When resampling, the transform is typically derived either from a *template* raster, such as an existing raster file that we would like our origin raster to match, or from a numeric specification of our target grid (see below).\n",
    "Otherwise, when the exact grid is not of importance, we can simply aggregate or disaggregate our raster as shown above (@sec-raster-agg-disagg).\n",
-    "(Note that when *reprojecting*, the target transform is not as straightforward to figure out, therefore we further use the `rasterio.warp.calculate_default_transform` function to compute it, as will be shown in @sec-reprojecting-raster-geometries.)\n",
+    "(Note that when *reprojecting*, the target transform is more difficult to figure out, therefore we further use the `rasterio.warp.calculate_default_transform` function to compute it, as will be shown in @sec-reprojecting-raster-geometries.)\n",
    "\n",
    "Finally, the resampling method is specified through the `resampling` parameter of `rasterio.warp.reproject`. \n",
    "The default is nearest neighbor resampling. \n",
-    "However, as mentioned above, you should be aware of the distiction between resampling methods, and choose the appropriate one according to the data type (continuous/categorical), the input and output resolution, and resampling purposes.\n",
-    "Possible arguments for [`resampling`](https://rasterio.readthedocs.io/en/stable/api/rasterio.enums.html#rasterio.enums.Resampling) include:\n",
+    "However, as mentioned above, you should be aware of the distinction between resampling methods, and choose the appropriate one according to the data type (continuous/categorical), the input and output resolution, and resampling purposes.\n",
+    "Possible arguments for `resampling` include:\n",
    "\n",
    "- `rasterio.enums.Resampling.nearest`---Nearest neighbor\n",
    "- `rasterio.enums.Resampling.bilinear`---Bilinear\n",
    "- `rasterio.enums.Resampling.cubic`---Cubic\n",
    "- `rasterio.enums.Resampling.lanczos`---Lanczos windowed\n",
+    "- `rasterio.enums.Resampling.average`---Average\n",
    "- `rasterio.enums.Resampling.mode`---Mode, i.e., most common value\n",
    "- `rasterio.enums.Resampling.min`---Minimum\n",
    "- `rasterio.enums.Resampling.max`---Maximum\n",
@@ -2318,7 +2192,7 @@
    "- `rasterio.enums.Resampling.sum`---Sum\n",
    "\n",
    "Let's demonstrate resampling into a destination grid which is specified through numeric constraints, such as the extent and resolution.\n",
-    "These could have been specified manually (such as here), or obtained from a template raster metadata that we would like to match.\n",
+    "Again, these could have been specified manually (such as here), or obtained from a template raster's metadata that we would like to match.\n",
    "Note that the resolution of the destination grid is \~10 times coarser (300 $m$) than the original resolution of `dem.tif` (\~30 $m$) (@fig-raster-resample).\n",
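+    "\n",
+    "Schematically, constructing such a destination transform from an extent and a resolution can be sketched as follows (variable names and values here are illustrative, not necessarily those used in the next code chunks):\n",
+    "```\n",
+    "# Sketch: destination grid defined by its top-left corner and resolution\n",
+    "xmin, ymax = 0.0, 1.0   # placeholder top-left corner coordinates\n",
+    "res = 300               # target resolution, in CRS units\n",
+    "dst_transform = rasterio.transform.from_origin(\n",
+    "    west=xmin,\n",
+    "    north=ymax,\n",
+    "    xsize=res,\n",
+    "    ysize=res\n",
+    ")\n",
+    "```"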
] }, @@ -2361,10 +2235,10 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Again, note that in case we needed to resample into a grid specified by an existing \"template\" raster, we could skip this step and simply read the transform from the template file, as in `rasterio.open('template.tif').transform`.\n", + "In case we needed to resample into a grid specified by an existing template raster, we could have skipped this step and simply read the transform from the template file, as in `rasterio.open('template.tif').transform`.\n", "\n", - "Now, we can move on to creating the destination file connection.\n", - "For that, we also have to know the raster dimensions that can be derived from the extent and the resolution." + "We can move on to creating the destination file connection.\n", + "For that, we also have to know the raster dimensions, which can be derived from the extent and the resolution." ] }, { @@ -2383,7 +2257,7 @@ "metadata": {}, "source": [ "Now we can create the destination file connection.\n", - "We are using the same metadata as the source file, except for the dimensions and the transform, which are going to be different and reflecting the resampling process." + "We are using the same metadata as the source file, except for the dimensions and the transform, which are going to be different and reflect the resampling process." ] }, { @@ -2406,7 +2280,7 @@ "metadata": {}, "source": [ "Finally, we reproject using function `rasterio.warp.reproject`.\n", - "Note that the source and destination are specified using [`rasterio.band`](https://rasterio.readthedocs.io/en/latest/api/rasterio.html#rasterio.band) applied on either the file connection, reflecting the fact that we operate on a specific layer of the rasters.\n", + "Note that the source and destination are specified using `rasterio.band` applied on both file connections, reflecting the fact that we operate on a specific layer of the rasters.\n", "The resampling method being used here is nearest neighbor resampling (`rasterio.enums.Resampling.nearest`)." ] }, @@ -2431,9 +2305,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "In the end, we close the file connection and create a new file `output/dem_resample_nearest.tif` with the resampling result (@fig-raster-resample).\n", - "\n", - "" + "In the end, we close the file connection, thus finalizing the new file `output/dem_resample_nearest.tif` with the resampling result (@fig-raster-resample)." ] }, { @@ -2450,16 +2322,13 @@ "metadata": {}, "source": [ "Here is another code section just to demonstrate a different resampling method, the maximum resampling, i.e., every new pixel gets the maximum value of all the original pixels it coincides with (@fig-raster-resample).\n", - "Note that all arguments in the `rasterio.warp.reproject` function call are identical to the previous example, except for the `resampling` method.\n", - "\n", - "" + "Note that all arguments in the `rasterio.warp.reproject` function call are identical to the previous example, except for the `resampling` method." 
] }, { "cell_type": "code", "metadata": {}, "source": [ - "#| eval: false\n", "dst = rasterio.open('output/dem_resample_maximum.tif', 'w', **dst_kwargs)\n", "rasterio.warp.reproject(\n", " source=rasterio.band(src, 1),\n", @@ -2487,12 +2356,12 @@ "metadata": {}, "source": [ "#| label: fig-raster-resample\n", - "#| fig-cap: Visual comparison of the original raster and two different resampling methods'\n", + "#| fig-cap: The original raster `dem.tif` and two different resampling method results\n", "#| layout-ncol: 3\n", "#| fig-subcap: \n", - "#| - Input\n", - "#| - Nearest neighbor\n", - "#| - Maximum\n", + "#| - Input\n", + "#| - Nearest neighbor\n", + "#| - Maximum\n", "# Input\n", "fig, ax = plt.subplots(figsize=(4,4))\n", "rasterio.plot.show(src, ax=ax);\n", @@ -2510,17 +2379,15 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Exercises\n", - "\n", - "## References" + "" ] } ], "metadata": { "kernelspec": { - "name": "python3", + "display_name": "Python 3", "language": "python", - "display_name": "Python 3 (ipykernel)" + "name": "python3" } }, "nbformat": 4, diff --git a/ipynb/05-raster-vector.ipynb b/ipynb/05-raster-vector.ipynb index 9777cdbc..b114988e 100644 --- a/ipynb/05-raster-vector.ipynb +++ b/ipynb/05-raster-vector.ipynb @@ -4,6 +4,10 @@ "cell_type": "markdown", "metadata": {}, "source": [ + "---\n", + "jupyter: python3\n", + "---\n", + "\n", "# Raster-vector interactions {#sec-raster-vector}\n", "\n", "## Prerequisites {.unnumbered}" @@ -14,12 +18,17 @@ "metadata": {}, "source": [ "#| echo: false\n", - "import matplotlib.pyplot as plt\n", - "import pandas as pd\n", - "pd.options.display.max_rows = 6\n", - "pd.options.display.max_columns = 6\n", - "pd.options.display.max_colwidth = 35\n", - "plt.rcParams['figure.figsize'] = (5, 5)" + "import book_options" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "#| echo: false\n", + "import book_options_pdf" ], "execution_count": null, "outputs": [] @@ -28,9 +37,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "This chapter requires importing the following packages:\n", - "\n", - "" + "This chapter requires importing the following packages:" ] }, { @@ -41,6 +48,7 @@ "import math\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", + "import pandas as pd\n", "import shapely\n", "import geopandas as gpd\n", "import rasterio\n", @@ -91,13 +99,13 @@ "- Extracting raster values using different types of vector data (Section @sec-raster-extraction)\n", "- Raster-vector conversion (@sec-rasterization and @sec-spatial-vectorization)\n", "\n", - "These concepts are demonstrated using data from in previous chapters, to understand their potential real-world applications.\n", + "These concepts are demonstrated using data from previous chapters, to understand their potential real-world applications.\n", "\n", "## Raster masking and cropping {#sec-raster-cropping}\n", "\n", "Many geographic data projects involve integrating data from many different sources, such as remote sensing images (rasters) and administrative boundaries (vectors).\n", "Often the extent of input raster datasets is larger than the area of interest.\n", - "In this case raster *masking*, *cropping*, or both, are useful for unifying the spatial extent of input data (@fig-raster-crop (b) and (c), and the following two examples, illustrate the difference between masking and cropping).\n", + "In this case, raster *masking*, *cropping*, or both, are useful for unifying the spatial extent 
of input data (@fig-raster-crop (b) and (c), and the following two examples, illustrate the difference between masking and cropping).\n", "Both operations reduce object memory use and associated computational resources for subsequent analysis steps, and may be a necessary preprocessing step before creating attractive maps involving raster data.\n", "\n", "We will use two layers to illustrate raster cropping:\n", @@ -106,9 +114,7 @@ "- The `zion.gpkg` vector layer representing the Zion National Park boundaries (a `GeoDataFrame` named `zion`)\n", "\n", "Both target and cropping objects must have the same projection.\n", - "Since it is easier and more precise to reproject vector layers, compared to rasters, we use the following expression to reproject (@sec-reprojecting-vector-geometries) the vector layer `zion` into the CRS of the raster `src_srtm`.\n", - "\n", - "" + "Since it is easier and more precise to reproject vector layers, compared to rasters, we use the following expression to reproject (@sec-reprojecting-vector-geometries) the vector layer `zion` into the CRS of the raster `src_srtm`." ] }, { @@ -124,7 +130,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "To mask the image, i.e., convert all pixels which do not intersect with the `zion` polygon to \"No Data\", we use the [`rasterio.mask.mask`](https://rasterio.readthedocs.io/en/stable/api/rasterio.mask.html#rasterio.mask.mask) function.\n" + "To mask the image, i.e., convert all pixels which do not intersect with the `zion` polygon to 'No Data', we use the `rasterio.mask.mask` function.\n" ] }, { @@ -145,11 +151,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Note that we need to choose and specify a \"No Data\" value, within the valid range according to the data type.\n", + "Note that we need to choose and specify a 'No Data' value, within the valid range according to the data type.\n", "Since `srtm.tif` is of type `uint16` (how can we check?), we choose `9999` (a positive integer that is guaranteed not to occur in the raster).\n", - "Also note that **rasterio** does not directly support **geopandas** data structures, so we need to pass a \"collection\" of **shapely** geometries: a `GeoSeries` (see above) or a `list` of **shapely** geometries (see next example) both work.\n", - "\n", - "\n", + "Also note that **rasterio** does not directly support **geopandas** data structures, so we need to pass a 'collection' of **shapely** geometries: a `GeoSeries` (see above) or a `list` of **shapely** geometries (see next example) both work.\n", "The output consists of two objects.\n", "The first one is the `out_image` array with the masked values." ] @@ -186,9 +190,9 @@ "Note that masking (without cropping!) does not modify the raster extent.\n", "Therefore, the new transform is identical to the original (`src_srtm.transform`).\n", "\n", - "Unfortunately, the `out_image` and `out_transform` objects do not contain any information indicating that `9999` represents \"No Data\".\n", + "Unfortunately, the `out_image` and `out_transform` objects do not contain any information indicating that `9999` represents 'No Data'.\n", "To associate the information with the raster, we must write it to file along with the corresponding metadata.\n", - "For example, to write the masked raster to file, we first need to modify the \"No Data\" setting in the metadata." + "For example, to write the masked raster to file, we first need to modify the 'No Data' setting in the metadata." 
   ]
  },
  {
@@ -224,7 +228,7 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "Now we can re-import the raster and check that the \"No Data\" value is correctly set."
+    "Now we can re-import the raster and check that the 'No Data' value is correctly set."
   ]
  },
  {
@@ -241,7 +245,7 @@
   "metadata": {},
   "source": [
    "The `.meta` property contains the `nodata` entry.\n",
-    "Now, any relevant operation (such as plotting, see @fig-raster-crop (b)) will take \"No Data\" into account."
+    "Now, any relevant operation (such as plotting, see @fig-raster-crop (b)) will take 'No Data' into account."
   ]
  },
  {
@@ -259,10 +263,10 @@
   "source": [
    "The related operation, cropping, reduces the raster extent to the extent of the vector layer:\n",
    "\n",
-    "- To just crop, *without* masking, we can derive the bounding box polygon of the vector layer, and then crop using that polygon, also combined with `crop=True` (@fig-raster-crop (c))\n",
-    "- To crop *and* mask, we can use `rasterio.mask.mask`, same as above for masking, just setting `crop=True` instead of the default `crop=False` (@fig-raster-crop (d))\n",
+    "* To crop *and* mask, we can use `rasterio.mask.mask`, same as above for masking, while setting `crop=True` (@fig-raster-crop (d))\n",
+    "* To just crop, *without* masking, we can derive the bounding box polygon of the vector layer, and then crop using that polygon, also combined with `crop=True` (@fig-raster-crop (c))\n",
    "\n",
-    "For the example of cropping only, the extent polygon of `zion` can be obtained as a `shapely` geometry object using the `.unary_union.envelope` property(@fig-zion-bbox)."
+    "For the example of cropping only, the extent polygon of `zion` can be obtained as a `shapely` geometry object using `.union_all().envelope` (@fig-zion-bbox)."
   ]
  },
  {
@@ -271,7 +275,7 @@
   "source": [
    "#| label: fig-zion-bbox\n",
    "#| fig-cap: Bounding box `'Polygon'` geometry of the `zion` layer\n",
-    "bb = zion.unary_union.envelope\n",
+    "bb = zion.union_all().envelope\n",
    "bb"
   ],
   "execution_count": null,
   "outputs": []
  },
@@ -282,7 +286,7 @@
   "metadata": {},
   "source": [
    "The extent can now be used for masking.\n",
-    "Here, we are also using the `all_touched=True` option so that pixels partially overlapping with the extent are also included in the output."
+    "Here, we are also using the `all_touched=True` option, so that pixels which are partially overlapping with the extent are also included in the output."
   ]
  },
  {
@@ -304,20 +308,13 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "In the case of cropping, there is no particular reason to write the result to file for easier plotting, such as in the other two examples, since there are no 'No Data' values (@fig-raster-crop (c)).\n",
    "\n",
    "::: callout-note\n",
    "As mentioned above, **rasterio** functions typically accept vector geometries in the form of `lists` of `shapely` objects. `GeoSeries` are conceptually very similar, and also accepted. 
However, even an individual geometry has to be in a `list`, which is why we pass `[bb]`, and not `bb`, in the above `rasterio.mask.mask` function call (the latter would raise an error).\n",
    ":::\n",
    "\n",
-    "\n",
-    "\n",
-    "Finally, the third example is where we perform crop both and mask operations, using `rasterio.mask.mask` with `crop=True`.\n",
-    "\n",
-    "\n",
-    "\n",
-    "\n",
-    ""
+    "Finally, the third example is where we perform both crop and mask operations, using `rasterio.mask.mask` with `crop=True`, passing `zion.geometry`."
   ]
  },
  {
@@ -338,7 +335,7 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "When writing the result to file, it is here crucial to update the transform and dimensions, since they were modified as a result of cropping.\n",
+    "When writing the result to a file, it is crucial here to update the transform and dimensions, since they were modified as a result of cropping.\n",
    "Also note that `out_image_mask_crop` is a three-dimensional array (even though it has one band in this case), so the number of rows and columns are in `.shape[1]` and `.shape[2]` (rather than `.shape[0]` and `.shape[1]`), respectively."
   ]
  },
@@ -375,8 +372,7 @@
   "cell_type": "code",
   "metadata": {},
   "source": [
-    "src_srtm_mask_crop = rasterio.open('output/srtm_masked_cropped.tif')\n",
-    "out_image_mask_crop.shape"
+    "src_srtm_mask_crop = rasterio.open('output/srtm_masked_cropped.tif')"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "@fig-raster-crop shows the original raster, and the all of the masked and cropped results."
+    "@fig-raster-crop shows the original raster, and the three masking and/or cropping results."
   ]
  },
@@ -431,8 +427,8 @@
    "\n",
    "In the following examples, we use a package called **rasterstats**, which is specifically aimed at extracting raster values:\n",
    "\n",
-    "- To *points* (@sec-extraction-to-points) or to *lines* (@sec-extraction-to-lines), via the [`rasterstats.point_query`](https://pythonhosted.org/rasterstats/rasterstats.html#rasterstats.point_query) function\n",
-    "- To *polygons* (@sec-extraction-to-polygons), via the [`rasterstats.zonal_stats`](https://pythonhosted.org/rasterstats/rasterstats.html#rasterstats.zonal_stats) function\n",
+    "* To *points* (@sec-extraction-to-points) or to *lines* (@sec-extraction-to-lines), via the `rasterstats.point_query` function\n",
+    "* To *polygons* (@sec-extraction-to-polygons), via the `rasterstats.zonal_stats` function\n",
    "\n",
    "### Extraction to points {#sec-extraction-to-points}\n",
    "\n",
@@ -445,10 +441,10 @@
   "metadata": {},
   "source": [
    "#| label: fig-zion-points\n",
    "#| fig-cap: 30 point locations within the Zion National Park, with elevation in the background\n",
    "fig, ax = plt.subplots()\n",
    "rasterio.plot.show(src_srtm, ax=ax)\n",
-    "zion_points.plot(ax=ax, color='black');"
+    "zion_points.plot(ax=ax, color='black', edgecolor='white');"
   ],
   "execution_count": null,
   "outputs": []
  },
@@ -479,11 +475,11 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "The first two arguments are the vector layer and the array with rastetr values. \n",
-    "The `nodata` and `affine` arguments are used to align the array values into the CRS, and to correctly treat \"No Data\" flags. 
\n", - "Finally, the `interpolate` argument controls the way that the cell values are asigned to the point; `interpolate='nearest'` typically makes more sense, as opposed to the other option `interpolate='bilinear'` which is the default.\n", + "The first two arguments are the vector layer and the array with raster values. \n", + "The `nodata` and `affine` arguments are used to align the array values into the CRS, and to correctly treat 'No Data' flags. \n", + "Finally, the `interpolate` argument controls the way that the cell values are assigned to the point; `interpolate='nearest'` typically makes more sense, as opposed to the other option `interpolate='bilinear'` which is the default.\n", "\n", - "Alternatively, we can pass a raster file path to `rasterstats.point_query`, in which case `nodata` and `affine` are not necessary, as the function can understand those properties from the raster file." + "Alternatively, we can pass a raster file path to `rasterstats.point_query`, in which case `nodata` and `affine` are not necessary, as the function can understand those properties directly from the raster file." ] }, { @@ -503,10 +499,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "\n", - "\n", - "\n", - "The resulting object is a `list` of raster values, corresponding to `zion_points`.\n", + "Either way, the resulting object is a `list` of raster values, corresponding to `zion_points`.\n", "For example, here are the elevations of the first five points." ] }, @@ -519,15 +512,6 @@ "execution_count": null, "outputs": [] }, - { - "cell_type": "code", - "metadata": {}, - "source": [ - "result2[:5]" - ], - "execution_count": null, - "outputs": [] - }, { "cell_type": "markdown", "metadata": {}, @@ -551,9 +535,6 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "\n", - "\n", - "\n", "The function supports extracting from just one raster band at a time.\n", "When passing an array, we can read the required band (as in, `.read(1)`, `.read(2)`, etc.).\n", "When passing a raster file path, we can set the band using the `band_num` argument (the default being `band_num=1`).\n", @@ -564,7 +545,7 @@ "The typical line extraction algorithm is to extract one value for each raster cell touched by a line.\n", "However, this particular approach is not recommended to obtain values along the transects, as it is hard to get the correct distance between each pair of extracted raster values.\n", "\n", - "For line extraction, a better approach is to split the line into many points (at equal distances along the line) and then extract the values for these points using the \"extraction to points\" technique (@sec-extraction-to-points).\n", + "For line extraction, a better approach is to split the line into many points (at equal distances along the line) and then extract the values for these points using the 'extraction to points' technique (@sec-extraction-to-points).\n", "To demonstrate this, the code below creates (see @sec-vector-data for recap) `zion_transect`, a straight line going from northwest to southeast of the Zion National Park." ] }, @@ -621,8 +602,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Next, we need to calculate the distances, along the line, where points are going to be generated, using [`np.arange`](https://numpy.org/doc/stable/reference/generated/numpy.arange.html).\n", - "This is a numeric sequence starting at `0`, going up to line `.length`, in steps of `250` ($m$)." 
+    "Next, we need to calculate the distances, along the line, where points are going to be generated.\n",
+    "We do this using `np.arange`.\n",
+    "The result is a numeric sequence starting at `0`, going up to the line's `.length`, in steps of `250` ($m$)."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "The distances cutoffs are used to sample (\"interpolate\") points along the line.\n",
-    "The **shapely** [`.interpolate`](https://shapely.readthedocs.io/en/stable/manual.html#object.interpolate) method is used to generate the points, which then are reprojected back to the geographic CRS of the raster (EPSG:`4326`)."
+    "The distance cutoffs are used to sample ('interpolate') points along the line.\n",
+    "The **shapely** `.interpolate` method is used to generate the points, which then are reprojected back to the geographic CRS of the raster (EPSG:`4326`)."
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "source": [
-    "zion_transect_pnt = [zion_transect_utm.interpolate(distance) for distance in distances]\n",
-    "zion_transect_pnt = gpd.GeoSeries(zion_transect_pnt, crs=32612).to_crs(src_srtm.crs)\n",
+    "#| code-overflow: wrap\n",
+    "zion_transect_pnt = [zion_transect_utm.interpolate(d) for d in distances]\n",
+    "zion_transect_pnt = gpd.GeoSeries(zion_transect_pnt, crs=32612) \\\n",
+    "    .to_crs(src_srtm.crs)\n",
    "zion_transect_pnt"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "Finally, we extract the elevation values for each point in our transect and combine the information with `zion_transect_pnt` (after \"promoting\" it to a `GeoDataFrame`, to accommodate extra attributes), using the point extraction method shown earlier (@sec-extraction-to-points).\n",
+    "Finally, we extract the elevation values for each point in our transect and combine the information with `zion_transect_pnt` (after 'promoting' it to a `GeoDataFrame`, to accommodate extra attributes), using the point extraction method shown earlier (@sec-extraction-to-points).\n",
    "We also attach the respective distance cutoff points `distances`."
  ]
  },
@@ -719,7 +703,7 @@
    "### Extraction to polygons {#sec-extraction-to-polygons}\n",
    "\n",
    "The final type of geographic vector object for raster extraction is polygons.\n",
-    "Like lines, polygons tend to return many raster values per polygon.\n",
+    "Like lines, polygons tend to return many raster values per vector geometry.\n",
    "For continuous rasters (@fig-raster-extract-to-polygon (a)), we typically want to generate summary statistics for raster values per polygon, for example to characterize a single region or to compare many regions.\n",
    "The generation of raster summary statistics, by polygons, is demonstrated in the code below using `rasterstats.zonal_stats`, which creates a list of summary statistics (in this case a list of length 1, since there is just one polygon).\n",
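+    "\n",
+    "The shape of such a call is sketched below (argument values here are illustrative; the actual code chunk follows):\n",
+    "```\n",
+    "# Sketch: summary statistics of raster values per polygon\n",
+    "result = rasterstats.zonal_stats(\n",
+    "    zion,                      # vector layer (polygons)\n",
+    "    src_srtm.read(1),          # array of raster values\n",
+    "    nodata=9999,\n",
+    "    affine=src_srtm.transform,\n",
+    "    stats=['mean', 'min', 'max']\n",
+    ")\n",
+    "```"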
]
 },
@@ -766,17 +750,17 @@
 "source": [
 "Because there is only one polygon in the example, a `DataFrame` with a single row is returned.\n",
 "However, if `zion` was composed of more than one polygon, we would accordingly get more rows in the `DataFrame`.\n",
- "The result provides useful summaries, for example that the maximum height in the park is around `2661` $m$ above see level.\n",
+ "The result provides useful summaries, for example that the maximum height in the park is `2661` $m$ above sea level.\n",
 "\n",
 "Note the `stats` argument, where we determine what type of statistics are calculated per polygon.\n",
- "Possible values other than `'mean'`, `'min'`, `'max'` are:\n",
+ "Possible values other than `'mean'`, `'min'`, and `'max'` are:\n",
 "\n",
- "- `'count'`---The number of valid (i.e., excluding \"No Data\") pixels\n",
- "- `'nodata'`---The number of pixels with 'No Data\"\n",
+ "- `'count'`---The number of valid (i.e., excluding 'No Data') pixels\n",
+ "- `'nodata'`---The number of pixels with 'No Data'\n",
 "- `'majority'`---The most frequently occurring value\n",
 "- `'median'`---The median value\n",
 "\n",
- "See the [documentation](https://pythonhosted.org/rasterstats/manual.html#statistics) of `rasterstats.zonal_stats` for the complete list.\n",
+ "See the documentation of `rasterstats.zonal_stats` for the complete list.\n",
 "Additionally, the `rasterstats.zonal_stats` function accepts user-defined functions for calculating any custom statistics, as shown in the sketch below.\n",
 "\n",
 "To count occurrences of categorical raster values within polygons (@fig-raster-extract-to-polygon (b)), we can use masking (@sec-raster-cropping) combined with `np.unique`, as follows."
@@ -790,7 +774,7 @@
 " src_nlcd, \n",
 " zion.geometry.to_crs(src_nlcd.crs), \n",
 " crop=False, \n",
- " nodata=9999\n",
+ " nodata=src_nlcd.nodata\n",
 ")\n",
 "counts = np.unique(out_image, return_counts=True)\n",
 "counts"
@@ -802,7 +786,7 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
- "According to the result, for example, pixel value `2` (\"Developed\" class) appears in `4205` pixels within the Zion polygon.\n",
+ "According to the result, for example, the value `2` ('Developed' class) appears in `4205` pixels within the Zion polygon.\n",
 "\n",
 "@fig-raster-extract-to-polygon illustrates the two types of raster extraction to polygons described above."
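+ "As an aside, user-defined functions are plugged in through the `add_stats` argument of `rasterstats.zonal_stats`. The following is a minimal sketch (hedged, not the chapter's own code) computing the interquartile range of elevation per polygon, reusing `zion` and `src_srtm` from above:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {},
+ "source": [
+ "import numpy as np\n",
+ "import rasterstats\n",
+ "\n",
+ "def iqr(x):\n",
+ "    # x is a masked array of the raster values within one polygon\n",
+ "    v = x.compressed()\n",
+ "    return float(np.percentile(v, 75) - np.percentile(v, 25))\n",
+ "\n",
+ "rasterstats.zonal_stats(\n",
+ "    zion.to_crs(src_srtm.crs),\n",
+ "    src_srtm.read(1),\n",
+ "    nodata=src_srtm.nodata,\n",
+ "    affine=src_srtm.transform,\n",
+ "    stats=['mean'],\n",
+ "    add_stats={'iqr': iqr}\n",
+ ")"
+ ],
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "As with the built-in statistics, the result is a `list` of `dict`s, one per input polygon."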
]
 },
@@ -838,48 +822,47 @@
 "\n",
 "## Rasterization {#sec-rasterization}\n",
 "\n",
- "\n",
- "\n",
- "\n",
 "Rasterization is the conversion of vector objects into their representation in raster objects.\n",
 "Usually, the output raster is used for quantitative analysis (e.g., analysis of terrain) or modeling.\n",
 "As we saw in @sec-spatial-class, the raster data model has some characteristics that make it conducive to certain methods.\n",
 "Furthermore, the process of rasterization can help simplify datasets because the resulting values all have the same spatial resolution: rasterization can be seen as a special type of geographic data aggregation.\n",
 "\n",
- "The **rasterio** package contains the [`rasterio.features.rasterize`](https://rasterio.readthedocs.io/en/stable/api/rasterio.features.html#rasterio.features.rasterize) function for doing this work.\n",
- "To make it happen, we need to have the \"template\" grid definition, i.e., the \"template\" raster defining the extent, resolution and CRS of the output, in the `out_shape` (the output dimensions) and `transform` (the transformation matrix) arguments of `rasterio.features.rasterize`.\n",
+ "The **rasterio** package contains the `rasterio.features.rasterize` function for doing this work.\n",
+ "To make it happen, we need to have the 'template' grid definition, i.e., the 'template' raster defining the extent, resolution, and CRS of the output, in the `out_shape` (the output dimensions) and `transform` (the transformation matrix) arguments of `rasterio.features.rasterize`.\n",
 "If we have an existing template raster, we simply need to query its `.shape` and `.transform`.\n",
 "On the other hand, if we need to create a custom template, e.g., covering the vector layer extent with specified resolution, there is some extra work to calculate both of these objects (see next example).\n",
 "\n",
 "As for the vector geometries and their associated values, the `rasterio.features.rasterize` function requires the input vector shapes in the form of an iterable object of `geometry,value` pairs, where:\n",
 "\n",
 "- `geometry` is the given geometry (**shapely** geometry object)\n",
- "- `value` is the value to be \"burned\" into pixels coinciding with the geometry (`int` or `float`)\n",
+ "- `value` is the value to be 'burned' into pixels coinciding with the geometry (`int` or `float`)\n",
 "\n",
 "Furthermore, we define how to deal with multiple values burned into the same pixel, using the `merge_alg` parameter.\n",
- "The default `merge_alg=rasterio.enums.MergeAlg.replace` means that \"later\" values replace \"earlier\" ones, i.e., the pixel gets the \"last\" burned value.\n",
+ "The default `merge_alg=rasterio.enums.MergeAlg.replace` means that 'later' values replace 'earlier' ones, i.e., the pixel gets the 'last' burned value.\n",
 "The other option `merge_alg=rasterio.enums.MergeAlg.add` means that burned values are summed, i.e., the pixel gets the sum of all burned values.\n",
 "\n",
 "When rasterizing lines and polygons, we also have the choice between two pixel-matching algorithms. 
\n",
- "The default, `all_touched=False`, implies pixels that are selected by [Bresenham's line algorithm](https://en.wikipedia.org/wiki/Bresenham%27s_line_algorithm) (for lines) or pixels whose center is within the polygon (for polygons).\n",
+ "The default, `all_touched=False`, implies that pixels are selected by Bresenham's line algorithm[^bresenham] (for lines), or that pixels whose center is within the polygon are selected (for polygons).\n",
 "The other option `all_touched=True`, as the name suggests, implies that all pixels intersecting with the geometry are matched.\n",
 "\n",
- "Finally, we can set the `fill` value, which is the value that \"unaffected\" pixels get, with `fill=0` being the default.\n",
+ "[^bresenham]: [https://en.wikipedia.org/wiki/Bresenham%27s_line_algorithm](https://en.wikipedia.org/wiki/Bresenham%27s_line_algorithm)\n",
+ "\n",
+ "Finally, we can set the `fill` value, which is the value that 'unaffected' pixels get, with `fill=0` being the default.\n",
 "\n",
 "How the `rasterio.features.rasterize` function works with all of these various parameters will be made clear in the next examples.\n",
 "\n",
- "The geographic resolution of the \"template\" raster has a major impact on the results: if it is too low (cell size is too large), the result may miss the full geographic variability of the vector data; if it is too high, computational times may be excessive.\n",
+ "The geographic resolution of the 'template' raster has a major impact on the results: if it is too low (cell size is too large), the result may miss the full geographic variability of the vector data; if it is too high, computational times may be excessive.\n",
 "There are no simple rules to follow when deciding on an appropriate geographic resolution, which is heavily dependent on the intended use of the results.\n",
- "Often the target resolution is imposed on the user, for example when the output of rasterization needs to be aligned to the existing raster.\n",
+ "Often the target resolution is imposed on the user, for example when the output of rasterization needs to be aligned to an existing raster.\n",
 "\n",
 "Depending on the input data, rasterization typically takes one of two forms, which we demonstrate next:\n",
 "\n",
 "- in *point* rasterization (@sec-rasterizing-points), we typically choose how to treat multiple points: either to summarize presence/absence, point count, or summed attribute values (@fig-rasterize-points)\n",
- "- in *line* and *polygon* rasterization (@sec-rasterizing-lines-and-polygons), there are typically no such \"overlaps\" and we simply \"burn\" attribute values, or fixed values, into pixels coinciding with the given geometries (@fig-rasterize-lines-polygons)\n",
+ "- in *line* and *polygon* rasterization (@sec-rasterizing-lines-and-polygons), there are typically no such 'overlaps' and we simply 'burn' attribute values, or fixed values, into pixels coinciding with the given geometries (@fig-rasterize-lines-polygons)\n",
 "\n",
 "### Rasterizing points {#sec-rasterizing-points}\n",
 "\n",
- "To demonstrate point rasterization, we will prepare a \"template\" raster that has the same extent and CRS as the input vector data `cycle_hire_osm_projected` (a dataset on cycle hire points in London, illustrated in @fig-rasterize-points (a)) and a spatial resolution of 1000 $m$.\n",
+ "To demonstrate point rasterization, we will prepare a 'template' raster that has the same extent and CRS as the input vector data `cycle_hire_osm_projected` (a dataset on cycle hire points in London, illustrated in 
@fig-rasterize-points (a)) and a spatial resolution of 1000 $m$.\n",
 "To do that, we first take our point layer and transform it to a projected CRS."
 ]
 },
@@ -941,18 +924,14 @@
 "metadata": {},
 "source": [
 "Finally, we are ready to rasterize.\n",
- "As mentioned abover, point rasterization can be a very flexible operation: the results depend not only on the nature of the template raster, but also on the pixel \"activation\" method, namely the way we deal with multiple points matching the same pixel.\n",
+ "As mentioned above, point rasterization can be a very flexible operation: the results depend not only on the nature of the template raster, but also on the pixel 'activation' method, namely the way we deal with multiple points matching the same pixel.\n",
 "\n",
 "To illustrate this flexibility, we will try three different approaches to point rasterization (@fig-rasterize-points (b)-(d)).\n",
 "First, we create a raster representing the presence or absence of cycle hire points (known as presence/absence rasters).\n",
 "In this case, we transfer the value of `1` to all pixels in which at least one point falls.\n",
 "In the **rasterio** framework, we use the `rasterio.features.rasterize` function, which requires an iterable object of `geometry,value` pairs. \n",
 "In this first example, we transform the point `GeoDataFrame` into a `list` of `shapely` geometries and the (fixed) value of `1`, using list comprehension as follows.\n",
- "The first five elements of the `list` are hereby printed to illustrate its structure.\n",
- "\n",
- "\n",
- ""
+ "The first five elements of the `list` are printed below to illustrate its structure."
 ]
 },
 {
@@ -969,11 +948,9 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
- "The list of `geometry,value` pairs is passed to `rasterio.features.rasterize`, along with the `shape` and `transform` which define the raster template.\n",
- "The result `ch_raster1` is an `ndarray` with the burned values of `1` where the pixel coincides with at least one point, and `0` in \"unaffected\" pixels.\n",
- "Note that `merge_alg=rasterio.enums.MergeAlg.replace` (the default) is used here, which means that a pixel get `1` when one or more point fall in it, or keeps the original `0` value otherwise.\n",
- "\n",
- ""
+ "The list of `geometry,value` pairs is passed to `rasterio.features.rasterize`, along with the `out_shape` and `transform` which define the raster template.\n",
+ "The result `ch_raster1` is an `ndarray` with the burned values of `1` where the pixel coincides with at least one point, and `0` in 'unaffected' pixels.\n",
+ "Note that `merge_alg=rasterio.enums.MergeAlg.replace` (the default) is used here, which means that a pixel gets `1` when one or more points fall in it, or keeps the original `0` value otherwise."
 ]
 },
 {
@@ -997,10 +974,7 @@
 "In our second variant of point rasterization, we count the number of bike hire stations. \n",
 "To do that, we use the fixed value of `1` (same as in the last example), but this time combined with the `merge_alg=rasterio.enums.MergeAlg.add` argument. \n",
 "That way, multiple values burned into the same pixel are *summed*, rather than replaced by the last one (which is the default behavior).\n",
- "The new output, `ch_raster2`, shows the number of cycle hire points in each grid cell.\n",
- "\n",
- ""
+ "The new output, `ch_raster2`, shows the number of cycle hire points in each grid cell."
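+ "Before inspecting it, the difference between the two merge algorithms can be illustrated on a tiny, self-contained example (a hedged sketch with a hypothetical 4×4 grid, unrelated to the London data):"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {},
+ "source": [
+ "import rasterio.enums\n",
+ "import rasterio.features\n",
+ "import shapely\n",
+ "from rasterio.transform import from_origin\n",
+ "\n",
+ "# Hypothetical 4x4 grid of one-unit cells, top-left corner at (0, 4)\n",
+ "tiny_transform = from_origin(0, 4, 1, 1)\n",
+ "# Two points falling in the same (top-left) cell\n",
+ "pairs = [(shapely.Point(0.5, 3.5), 1), (shapely.Point(0.5, 3.5), 1)]\n",
+ "summed = rasterio.features.rasterize(\n",
+ "    pairs,\n",
+ "    out_shape=(4, 4),\n",
+ "    transform=tiny_transform,\n",
+ "    merge_alg=rasterio.enums.MergeAlg.add\n",
+ ")\n",
+ "summed[0, 0]  # 2, because the two burned values were summed"
+ ],
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "With `merge_alg=rasterio.enums.MergeAlg.replace` (the default), the same cell would get `1`. Returning to the cycle hire example:"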
]
 },
 {
@@ -1025,10 +999,8 @@
 "source": [
 "The cycle hire locations have different numbers of bicycles described by the capacity variable, raising the question: what is the capacity in each grid cell?\n",
 "To calculate that, in our third point rasterization variant we sum the field (`'capacity'`) rather than the fixed values of `1`.\n",
- "This requires using a more complex list comprehension expression, where we also (1) extract both geometries and the attribute of interest, and (2) filter out \"No Data\" values, which can be done as follows.\n",
- "You are invited to run the separate parts to see how this works; the important point is that, in the end, we get the list `g` with the `geometry,value` pairs to be burned, only that the `value` is now variable, rather than fixed, among points.\n",
- "\n",
- ""
+ "This requires using a more complex list comprehension expression, where we also (1) extract both geometries and the attribute of interest, and (2) filter out 'No Data' values, which can be done as follows.\n",
+ "You are invited to run the separate parts to see how this works; the important point is that, in the end, we get the list `g` with the `geometry,value` pairs to be burned, except that the `value` is now variable, rather than fixed, among points."
 ]
 },
 {
@@ -1128,14 +1100,14 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
- "Second, we \"cast\" the polygon into a `'MultiLineString'` geometry, using the [`.boundary`](https://geopandas.org/en/stable/docs/reference/api/geopandas.GeoSeries.boundary.html) property that `GeoSeries` have."
+ "Second, we 'cast' the polygon into a `'MultiLineString'` geometry, using the `.boundary` property that `GeoSeries` and `GeoDataFrame`s have."
 ]
 },
 {
 "cell_type": "code",
 "metadata": {},
 "source": [
- "california_borders = california.geometry.boundary\n",
+ "california_borders = california.boundary\n",
 "california_borders"
 ],
 "execution_count": null,
 "outputs": []
 },
 {
 "cell_type": "markdown",
 "metadata": {},
 "source": [
- "Third, we create the `transform` and `shape` describing our template raster, with a resolution of a `0.5` degree, using the same approach as in @sec-rasterizing-points."
+ "Third, we create the `transform` and `shape` describing our template raster, with a resolution of `0.5` degree, using the same approach as in @sec-rasterizing-points."
 ]
 },
 {
@@ -1176,7 +1148,7 @@
 "When considering line or polygon rasterization, one useful additional argument is `all_touched`.\n",
 "By default it is `False`, but when changed to `True`, all cells that are touched by a line or polygon border get a value.\n",
 "Line rasterization with `all_touched=True` is demonstrated in the code below (@fig-rasterize-lines-polygons, left).\n",
- "We are also using `fill=np.nan` to set \"background\" values as \"No Data\"."
+ "We are also using `fill=np.nan` to set 'background' values to 'No Data'."
 ]
 },
 {
@@ -1188,7 +1160,8 @@
 " out_shape=shape,\n",
 " transform=transform,\n",
 " all_touched=True,\n",
- " fill=np.nan\n",
+ " fill=np.nan,\n",
+ " dtype=np.float64\n",
 ")"
 ],
 "execution_count": null,
 "outputs": []
 },
 {
 "cell_type": "markdown",
 "metadata": {},
 "source": [
- "Compare it to a polygon rasterization, with `all_touched=False` (the default), which selects only raster cells whose centroids are inside the selector polygon, as illustrated in @fig-rasterize-lines-polygons (right)."
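+ "As a quick sanity check, the burned cells can be counted (a hedged aside; the name `california_raster1` for the result of the call above is an assumption):"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {},
+ "source": [
+ "import numpy as np\n",
+ "\n",
+ "# Burned cells hold 1 and 'background' cells hold NaN, so the\n",
+ "# sum of the non-NaN values equals the number of burned cells\n",
+ "int(np.nansum(california_raster1))"
+ ],
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [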
+ "Compare it to polygon rasterization, with `all_touched=False` (the default), which selects only raster cells whose centroids are inside the selector polygon, as illustrated in @fig-rasterize-lines-polygons (right)."
 ]
 },
 {
@@ -1209,7 +1182,8 @@
 " [(g, 1) for g in california.geometry],\n",
 " out_shape=shape,\n",
 " transform=transform,\n",
- " fill=np.nan\n",
+ " fill=np.nan,\n",
+ " dtype=np.float64\n",
 ")"
 ],
 "execution_count": null,
 "outputs": []
 },
 {
 "cell_type": "markdown",
 "metadata": {},
 "source": [
 "To illustrate which raster pixels are actually selected as part of rasterization, we also show them as points.\n",
- "This also requires the following code section to calculate the points, which we explain in @sec-spatial-vectorization.\n",
- "\n",
- ""
+ "This also requires the following code section to calculate the points, which we explain in @sec-spatial-vectorization."
 ]
 },
 {
@@ -1281,7 +1253,7 @@
 "## Spatial vectorization {#sec-spatial-vectorization}\n",
 "\n",
 "Spatial vectorization is the counterpart of rasterization (@sec-rasterization).\n",
- "It involves converting spatially continuous raster data into spatially discrete vector data such as points, lines or polygons.\n",
+ "It involves converting spatially continuous raster data into spatially discrete vector data such as points, lines, or polygons.\n",
 "There are three standard methods to convert a raster to a vector layer, which we cover next:\n",
 "\n",
 "- Raster to polygons (@sec-raster-to-polygons)---converting raster cells to rectangular polygons, representing pixel areas\n",
@@ -1292,10 +1264,8 @@
 "\n",
 "### Raster to polygons {#sec-raster-to-polygons}\n",
 "\n",
- "The [`rasterio.features.shapes`](https://rasterio.readthedocs.io/en/stable/api/rasterio.features.html#rasterio.features.shapes) gives access to raster pixels as polygon geometries, along with the associated raster values.\n",
+ "The `rasterio.features.shapes` function gives access to raster pixels as polygon geometries, along with the associated raster values.\n",
 "The returned object is a generator (see note in @sec-spatial-subsetting-raster), yielding `geometry,value` pairs.\n",
- "\n",
- "\n",
 "\n",
 "For example, the following expression returns a generator named `shapes`, referring to the pixel polygons."
 ]
 },
 {
 "cell_type": "code",
 "metadata": {},
 "source": [
- "shapes = rasterio.features.shapes(\n",
- " rasterio.band(src_grain, 1) \n",
- ")\n",
+ "shapes = rasterio.features.shapes(rasterio.band(src_grain, 1))\n",
 "shapes"
 ],
 "execution_count": null,
 "outputs": []
 },
 {
 "cell_type": "markdown",
 "metadata": {},
 "source": [
- "Each element in `pol` is a `tuple` of length 2, containing the GeoJSON-like `dict`---representing the polygon geometry and the value of the pixel(s)---which comprise the polygon.\n",
+ "Each element in `pol` is a `tuple` of length 2, containing the GeoJSON-like `dict` representing the polygon geometry, and the value of the pixel(s) which comprise the polygon.\n",
 "For example, here is the first element of `pol`."
]
 },
@@ -1349,18 +1317,13 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
- "\n",
- "\n",
- "\n",
 "::: callout-note\n",
- "Note that, when transforming a raster cell into a polygon, five coordinate pairs need to be kept in memory to represent its geometry (explaining why rasters are often fast compared with vectors!).\n",
+ "Note that, when transforming a raster cell into a polygon, five coordinate pairs need to be kept in memory to represent its geometry (explaining why rasters are often fast compared with vectors!).\n",
 ":::\n",
 "\n",
 "To transform the `list` coming out of `rasterio.features.shapes` into the familiar `GeoDataFrame`, we need a few more steps of data reshaping.\n",
- "First, we apply the [`shapely.geometry.shape`](https://shapely.readthedocs.io/en/stable/manual.html#shapely.geometry.shape) function to go from a `list` of GeoJSON-like `dict`s to a `list` of `shapely` geometry objects.\n",
- "The `list` can then be converted to a `GeoSeries` (see @sec-vector-layer-from-scratch).\n",
- "\n",
- ""
+ "First, we apply the `shapely.geometry.shape` function to go from a `list` of GeoJSON-like `dict`s to a `list` of `shapely` geometry objects.\n",
+ "The `list` can then be converted to a `GeoSeries` (see @sec-vector-layer-from-scratch)."
 ]
 },
 {
@@ -1378,7 +1341,7 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
- "The values can also be extracted from the `rasterio.features.shapes` and turned into a corresponding `Series`."
+ "The values can also be extracted from the `rasterio.features.shapes` result and turned into a corresponding `Series`."
 ]
 },
 {
@@ -1433,11 +1396,11 @@
 "source": [
 "As highlighted using `edgecolor='black'`, neighboring pixels sharing the same raster value are dissolved into larger polygons.\n",
 "The `rasterio.features.shapes` function unfortunately does not offer a way to avoid this type of dissolving.\n",
- "One [suggestion](https://gis.stackexchange.com/questions/455980/vectorizing-all-pixels-as-separate-polygons-using-rasterio#answer-456251) is to add unique values between `0` and `0.9999` to all pixels, convert to polygons, and then get back to the original values using [`np.floor`](https://numpy.org/doc/stable/reference/generated/numpy.floor.html).\n",
+ "One [suggestion](https://gis.stackexchange.com/questions/455980/vectorizing-all-pixels-as-separate-polygons-using-rasterio#answer-456251) is to add unique values between `0` and `0.9999` to all pixels, convert to polygons, and then get back to the original values using `np.floor`.\n",
 "\n",
 "### Raster to points {#sec-raster-to-points}\n",
 "\n",
- "To transform a raster to points, we can use the [`rasterio.transform.xy`](https://rasterio.readthedocs.io/en/latest/api/rasterio.transform.html#rasterio.transform.xy). \n",
+ "To transform a raster to points, we can use the `rasterio.transform.xy` function. \n",
 "As the name suggests, the function accepts row and column indices, and transforms them into x- and y-coordinates (using the raster's transformation matrix).\n",
 "For example, the coordinates of the top-left pixel can be calculated by passing the `(row,col)` indices of `(0,0)`."
 ]
 },
@@ -1477,7 +1440,7 @@
 ":::\n",
 "\n",
 "To generalize the above expression to calculate the coordinates of *all* pixels, we first need to generate a grid of all possible row/column index combinations.\n",
- "This can be done using [`np.meshgrid`](https://numpy.org/doc/stable/reference/generated/numpy.meshgrid.html), as follows."
+ "This can be done using `np.meshgrid`, as follows."
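+ "First, a tiny illustration of what `np.meshgrid` produces (a hedged aside using a hypothetical 2×3 grid):"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {},
+ "source": [
+ "import numpy as np\n",
+ "\n",
+ "# Row and column indices of every cell in a hypothetical 2x3 grid\n",
+ "rows, cols = np.meshgrid(np.arange(2), np.arange(3), indexing='ij')\n",
+ "rows.shape, rows[1, 2], cols[1, 2]  # ((2, 3), 1, 2)"
+ ],
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Applied to our raster, the same call generates the full index grids."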
]
 },
 {
@@ -1576,8 +1539,8 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
- "This \"high-level\" workflow, like many other **rasterio**-based workflows covered in the book, is a commonly used one but lacking from the package itself. \n",
- "From the user perspective, it may be a good idea to wrap the workflow into a function (e.g., `raster_to_points(src)`, returning a `GeoDataFrame`), to be re-used whenever we need it.\n",
+ "This 'high-level' workflow, like many other **rasterio**-based workflows covered in the book, is commonly used but missing from the package itself. \n",
+ "From the user's perspective, it may be a good idea to wrap the workflow into a function (e.g., `raster_to_points(src)`, returning a `GeoDataFrame`), to be re-used whenever we need it.\n",
 "\n",
 "@fig-raster-to-points shows the input raster and the resulting point layer."
 ]
 },
 {
@@ -1598,8 +1561,8 @@
 "rasterio.plot.show(src_elev, ax=ax);\n",
 "# Points\n",
 "fig, ax = plt.subplots()\n",
- "pnt.plot(column='value', legend=True, ax=ax)\n",
- "rasterio.plot.show(src_elev, cmap='Greys', ax=ax);"
+ "pnt.plot(column='value', legend=True, edgecolor='black', ax=ax)\n",
+ "rasterio.plot.show(src_elev, alpha=0, ax=ax);"
 ],
 "execution_count": null,
 "outputs": []
 },
 {
 "cell_type": "markdown",
 "metadata": {},
 "source": [
- "Note that \"No Data\" pixels can be filtered out from the conversion, if necessary (see @sec-distance-to-nearest-geometry).\n",
+ "Note that 'No Data' pixels can be filtered out from the conversion, if necessary (see @sec-distance-to-nearest-geometry).\n",
 "\n",
 "### Raster to contours {#sec-raster-to-contours}\n",
 "\n",
- "Another common type of spatial vectorization is the creation of contour lines representing lines of continuous height or temperatures (*isotherms*), for example.\n",
+ "Another common type of spatial vectorization is the creation of contour lines, representing lines of continuous height or temperatures (*isotherms*), for example.\n",
 "We will use a real-world digital elevation model (DEM) because the artificial raster `elev.tif` produces parallel lines (task for the reader: verify this and explain why this happens).\n",
- "Plotting contour lines is straightforward, using the `contour=True` option of `rasterio.plot.show` (@fig-raster-contours1)."
+ "*Plotting* contour lines is straightforward, using the `contour=True` option of `rasterio.plot.show` (@fig-raster-contours1)."
 ]
 },
 {
@@ -1640,12 +1603,12 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
- "Unfortunately, `rasterio` does not provide any way of extracting the contour lines in the form of a vector layer, for uses other than plotting.\n",
+ "Unfortunately, **rasterio** does not provide any way of extracting the contour lines in the form of a vector layer, for uses other than plotting.\n",
 "\n",
 "There are two possible workarounds:\n",
 "\n",
- "1. Using `gdal_contour` on the [command line](https://gdal.org/programs/gdal_contour.html) (see below), or through its Python interface [**osgeo**](https://gis.stackexchange.com/questions/360431/how-can-i-create-contours-from-geotiff-and-python-gdal-rasterio-etc-into-sh)\n",
- "2. Writing a custom function to export contour coordinates generated by, e.g., [**matplotlib**](https://www.tutorialspoint.com/how-to-get-coordinates-from-the-contour-in-matplotlib) or [**skimage**](https://gis.stackexchange.com/questions/268331/how-can-i-extract-contours-from-a-raster-with-python)\n",
+ "1. 
Using `gdal_contour` on the command line (see below), or through its Python interface **osgeo**\n",
+ "2. Writing a custom function to export contour coordinates generated by, e.g., **matplotlib** or **skimage**\n",
 "\n",
 "We demonstrate the first approach, using `gdal_contour`.\n",
 "Although we deviate from the Python-focused approach towards more direct interaction with GDAL, the benefit of `gdal_contour` is its proven algorithm, customized to spatial data, with many relevant options.\n",
@@ -1710,20 +1673,20 @@
 "source": [
 "## Distance to nearest geometry {#sec-distance-to-nearest-geometry}\n",
 "\n",
- "Calculating a raster of distances to the nearest geometry is an example of a \"global\" raster operation (@sec-global-operations-and-distances).\n",
+ "Calculating a raster of distances to the nearest geometry is an example of a 'global' raster operation (@sec-global-operations-and-distances).\n",
 "To demonstrate it, suppose that we need to calculate a raster representing the distance to the nearest coast in New Zealand.\n",
- "This example also wraps many of the concepts introduced in this chapter and in previous chapter, such as raster aggregation (@sec-raster-agg-disagg), raster conversion to points (@sec-raster-to-points), and rasterizing points (@sec-rasterizing-points).\n",
+ "This example also brings together many of the concepts introduced in this chapter and in previous chapters, such as raster aggregation (@sec-raster-agg-disagg), raster conversion to points (@sec-raster-to-points), and rasterizing points (@sec-rasterizing-points).\n",
 "\n",
- "For the coastline, we will dissolve the New Zealand administrative division polygon layer and \"extract\" the boundary as a `'MultiLineString'` geometry."
+ "For the coastline, we will dissolve the New Zealand administrative division polygon layer and 'extract' the boundary as a `'MultiLineString'` geometry (@fig-nz-coastline). Note that `.dissolve(by=None)` (@sec-vector-attribute-aggregation) calls `.union_all` on all geometries (i.e., aggregates everything into one group), which is what we want to do here."
 ]
 },
 {
 "cell_type": "code",
 "metadata": {},
 "source": [
- "coastline = gpd.GeoSeries(nz.unary_union, crs=nz.crs) \\\n",
- " .to_crs(src_nz_elev.crs) \\\n",
- " .boundary\n",
+ "#| label: fig-nz-coastline\n",
+ "#| fig-cap: New Zealand coastline geometry\n",
+ "coastline = nz.dissolve().to_crs(src_nz_elev.crs).boundary.iloc[0]\n",
 "coastline"
 ],
 "execution_count": null,
 "outputs": []
 },
 {
 "cell_type": "markdown",
 "metadata": {},
 "source": [
- "For a \"template\" raster, we will aggregate the New Zealand DEM, in the `nz_elev.tif` file, to 5 times coarser resolution.\n",
- "The code section below follows the aggeregation example in @sec-raster-agg-disagg.\n",
- "\n",
- ""
+ "For a 'template' raster, we will aggregate the New Zealand DEM, in the `nz_elev.tif` file, to 5 times coarser resolution.\n",
+ "The code section below follows the aggregation example in @sec-raster-agg-disagg."
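+ "Since the details were covered in @sec-raster-agg-disagg, only the shape of the call is sketched here (hedged; the names `r` and `new_transform` match their use below):"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {},
+ "source": [
+ "import rasterio.enums\n",
+ "\n",
+ "factor = 0.2  # five times coarser resolution\n",
+ "r = src_nz_elev.read(1,\n",
+ "    out_shape=(\n",
+ "        int(src_nz_elev.height * factor),\n",
+ "        int(src_nz_elev.width * factor)\n",
+ "    ),\n",
+ "    resampling=rasterio.enums.Resampling.average\n",
+ ")\n",
+ "new_transform = src_nz_elev.transform * \\\n",
+ "    src_nz_elev.transform.scale(1 / factor, 1 / factor)"
+ ],
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Here, `rasterio.enums.Resampling.average` sets each coarse cell to the mean of the original cells it covers."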
] }, { @@ -1774,11 +1735,11 @@ "metadata": {}, "source": [ "#| label: fig-raster-distances1\n", - "#| fig-cap: Template with cell IDs to calculate distance to nearest geometry\n", + "#| fig-cap: Template to calculate distance to nearest geometry (coastlines, in red)\n", "\n", "fig, ax = plt.subplots()\n", "rasterio.plot.show(r, transform=new_transform, ax=ax)\n", - "gpd.GeoSeries(coastline).plot(ax=ax, edgecolor='black');" + "gpd.GeoSeries(coastline).plot(ax=ax, edgecolor='red');" ], "execution_count": null, "outputs": [] @@ -1844,7 +1805,7 @@ "image = rasterio.features.rasterize(\n", " distances,\n", " out_shape=r.shape,\n", - " dtype=np.float_,\n", + " dtype=np.float64,\n", " transform=new_transform,\n", " fill=np.nan\n", ")\n", @@ -1857,9 +1818,6 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "\n", - "\n", - "\n", "The final result, a raster of distances to the nearest coastline, is shown in @fig-raster-distances2." ] }, @@ -1871,7 +1829,7 @@ "#| fig-cap: Distance to nearest coastline in New Zealand\n", "fig, ax = plt.subplots()\n", "rasterio.plot.show(image, transform=new_transform, ax=ax)\n", - "gpd.GeoSeries(coastline).plot(ax=ax, edgecolor='black');" + "gpd.GeoSeries(coastline).plot(ax=ax, edgecolor='red');" ], "execution_count": null, "outputs": [] @@ -1880,15 +1838,15 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Exercises" + "" ] } ], "metadata": { "kernelspec": { - "name": "python3", + "display_name": "Python 3", "language": "python", - "display_name": "Python 3 (ipykernel)" + "name": "python3" } }, "nbformat": 4, diff --git a/ipynb/06-reproj.ipynb b/ipynb/06-reproj.ipynb index 87b33f65..14693b54 100644 --- a/ipynb/06-reproj.ipynb +++ b/ipynb/06-reproj.ipynb @@ -4,6 +4,10 @@ "cell_type": "markdown", "metadata": {}, "source": [ + "---\n", + "jupyter: python3\n", + "---\n", + "\n", "# Reprojecting geographic data {#sec-reproj-geo-data}\n", "\n", "## Prerequisites {.unnumbered}" @@ -14,12 +18,17 @@ "metadata": {}, "source": [ "#| echo: false\n", - "import matplotlib.pyplot as plt\n", - "import pandas as pd\n", - "pd.options.display.max_rows = 6\n", - "pd.options.display.max_columns = 6\n", - "pd.options.display.max_colwidth = 35\n", - "plt.rcParams['figure.figsize'] = (5, 5)" + "import book_options" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "#| echo: false\n", + "import book_options_pdf" ], "execution_count": null, "outputs": [] @@ -53,9 +62,6 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "\n", - "\n", - "\n", "It also relies on the following data files:" ] }, @@ -63,6 +69,7 @@ "cell_type": "code", "metadata": {}, "source": [ + "#| warning: false\n", "src_srtm = rasterio.open('data/srtm.tif')\n", "src_nlcd = rasterio.open('data/nlcd.tif')\n", "zion = gpd.read_file('data/zion.gpkg')\n", @@ -82,49 +89,40 @@ "This chapter builds on that knowledge and goes further.\n", "It demonstrates how to set and transform geographic data from one CRS to another and, furthermore, highlights specific issues that can arise due to ignoring CRSs that you should be aware of, especially if your data is stored with lon/lat coordinates.\n", "\n", - "\n", - "\n", - "\n", "It is important to know if your data is in a projected or geographic coordinate system, and the consequences of this for geometry operations.\n", "However, if you know the CRS of your data and the consequences for geometry operations (covered in the next section), CRSs should just work behind the scenes: people often suddenly need 
to learn about CRSs when things go wrong.\n", "Having a clearly defined project CRS that all project data is in, plus understanding how and why to use different CRSs, can ensure that things do not go wrong.\n", "Furthermore, learning about coordinate systems will deepen your knowledge of geographic datasets and how to use them effectively.\n", "\n", - "This chapter teaches the fundamentals of CRSs, demonstrates the consequences of using different CRSs (including what can go wrong), and how to \"reproject\" datasets from one coordinate system to another.\n", + "This chapter teaches the fundamentals of CRSs, demonstrates the consequences of using different CRSs (including what can go wrong), and how to 'reproject' datasets from one coordinate system to another.\n", "In the next section we introduce CRSs in Python, followed by @sec-querying-and-setting-coordinate-systems which shows how to get and set CRSs associated with spatial objects.\n", "@sec-geometry-operations-on-projected-and-unprojected-data demonstrates the importance of knowing what CRS your data is in with reference to a worked example of creating buffers.\n", "We tackle questions of when to reproject and which CRS to use in @sec-when-to-reproject and @sec-which-crs-to-use, respectively.\n", - "Finally, we cover reprojecting vector and raster objects in @sec-reprojecting-vector-geometries and @sec-reprojecting-raster-geometries and modifying map projections in @sec-custom-map-projections.\n", + "Finally, we cover reprojecting vector and raster objects in @sec-reprojecting-vector-geometries and @sec-reprojecting-raster-geometries and using custom projections in @sec-custom-map-projections.\n", "\n", "## Coordinate Reference Systems {#sec-coordinate-reference-systems}\n", "\n", - "Most modern geographic tools that require CRS conversions, including Python packages and desktop GIS software such as QGIS, interface with [PROJ](https://proj.org/), an open source C++ library that \"transforms coordinates from one coordinate reference system (CRS) to another\".\n", + "Most modern geographic tools that require CRS conversions, including Python packages and desktop GIS software such as QGIS, interface with PROJ, an open source C++ library that 'transforms coordinates from one coordinate reference system (CRS) to another'.\n", "CRSs can be described in many ways, including the following:\n", "\n", - "- Simple, yet potentially ambiguous, statements, such as, \"it's in lon/lat coordinates\"\n", - "- Formalized, yet now outdated, 'proj-strings' such as `+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs`\n", - "- With an identifying 'authority:code' text string such as `EPSG:4326`\n", + "- Simple, yet potentially ambiguous, statements, such as 'it's in lon/lat coordinates'\n", + "- Formalized, yet now outdated, 'proj-strings', such as `+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs`\n", + "- With an identifying 'authority:code' text string, such as `EPSG:4326`\n", "\n", "Each refers to the same thing: the 'WGS84' coordinate system that forms the basis of Global Positioning System (GPS) coordinates and many other datasets.\n", "But which one is correct?\n", "\n", - "The short answer is that the third way to identify CRSs is correct: `EPSG:4326` is understood by **geopandas** and **rasterio** packages covered in this book, plus many other software projects for working with geographic data including [QGIS](https://docs.qgis.org/3.22/en/docs/user_manual/working_with_projections/working_with_projections.html) and 
[PROJ](https://proj.org/development/quickstart.html).\n",
+ "The short answer is that the third way to identify CRSs is correct: `EPSG:4326` is understood by **geopandas** and **rasterio** packages covered in this book, plus many other software projects for working with geographic data including QGIS and PROJ.\n",
 "`EPSG:4326` is future-proof.\n",
- "Furthermore, although it is machine readable, unlike the proj-string representation `EPSG:4326` is short, easy to remember and highly 'findable' online (searching for `EPSG:4326` yields a dedicated page on the website [epsg.io](https://epsg.io/4326), for example).\n",
- "The more concise identifier `4326` is also understood by **geopandas** and **rasterio**, but we recommend the more explicit `AUTHORITY:CODE` representation to prevent ambiguity and to provide context.\n",
- "\n",
- "\n",
+ "Furthermore, although it is machine readable, unlike the proj-string representation, `EPSG:4326` is short, easy to remember, and highly 'findable' online (searching for `EPSG:4326` yields a dedicated page on the website epsg.io[^epsgio], for example).\n",
+ "The more concise identifier `4326` is also understood by **geopandas** and **rasterio**.\n",
+ "\n",
+ "[^epsgio]: [https://epsg.io/4326](https://epsg.io/4326)\n",
 "\n",
- "The longer answer is that none of the three descriptions are sufficient, and more detail is needed for unambiguous CRS handling and transformations: due to the complexity of CRSs, it is not possible to capture all relevant information about them in such short text strings.\n",
+ "The longer answer is that none of the three descriptions is sufficient, and more detail is needed for unambiguous CRS handling and transformations: due to the complexity of CRSs, it is not possible to capture all relevant information about them in such short text strings.\n",
 "For this reason, the Open Geospatial Consortium (OGC, which also developed the Simple Features specification that the **geopandas** package implements) developed an open standard format for describing CRSs that is called WKT (Well Known Text).\n",
- "\n",
- "This is detailed in a [100+ page document](https://portal.opengeospatial.org/files/18-010r7) that \"defines the structure and content of a text string implementation of the abstract model for coordinate reference systems described in ISO 19111:2019\" [@opengeospatialconsortium_wellknown_2019].\n",
- "\n",
- "\n",
- "The [WKT representation](https://en.wikipedia.org/wiki/Well-known_text_representation_of_coordinate_reference_systems) of the WGS84 CRS, which has the identifier `EPSG:4326` is as follows.\n",
- "\n",
- ""
+ "This is detailed in a 100+ page document that 'defines the structure and content of a text string implementation of the abstract model for coordinate reference systems described in ISO 19111:2019' [@opengeospatialconsortium_wellknown_2019].\n",
+ "The WKT representation of the WGS84 CRS, which has the identifier `EPSG:4326`, is as follows."
]
 },
 {
@@ -141,11 +139,9 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
- "\n",
- "\n",
- "The output of the command shows how the CRS identifier (also known as a Spatial Reference Identifier or [SRID](https://postgis.net/workshops/postgis-intro/projection.html)) works: it is simply a look-up, providing a unique identifier associated with a more complete WKT representation of the CRS.\n",
+ "The output of the command shows how the CRS identifier (also known as a Spatial Reference Identifier, or SRID) works: it is simply a look-up, providing a unique identifier associated with a more complete WKT representation of the CRS.\n",
 "This raises the question: what happens if there is a mismatch between the identifier and the longer WKT representation of a CRS?\n",
- "On this point Open Geospatial Consortium [@opengeospatialconsortium_wellknown_2019] is clear, the verbose WKT representation takes precedence over the [identifier](https://docs.opengeospatial.org/is/18-010r7/18-010r7.html#37):\n",
+ "On this point the Open Geospatial Consortium [@opengeospatialconsortium_wellknown_2019] is clear that the verbose WKT representation takes precedence over the identifier:\n",
 "\n",
 "> Should any attributes or values given in the cited identifier be in conflict with attributes or values given explicitly in the WKT description, the WKT values shall prevail.\n",
 "\n",
@@ -171,25 +167,21 @@
 "source": [
 "WKT strings are exhaustive, detailed, and precise, allowing for unambiguous CRS storage and transformations.\n",
 "They contain all relevant information about any given CRS, including its datum and ellipsoid, prime meridian, projection, and units.\n",
- "\n",
- "\n",
 "\n",
 "Recent PROJ versions (6+) still allow use of proj-strings to define coordinate operations, but some proj-string keys (`+nadgrids`, `+towgs84`, `+k`, `+init=epsg:`) are either no longer supported or are discouraged.\n",
- "\n",
- "\n",
 "Additionally, only three datums (i.e., WGS84, NAD83, and NAD27) can be directly set in proj-string.\n",
- "Longer explanations of the evolution of CRS definitions and the PROJ library can be found in [@bivand_progress_2021], Chapter 2 of [@pebesma_spatial_2022], and a [blog post by Floris Vanderhaeghe](https://inbo.github.io/tutorials/tutorials/spatial_crs_coding/).\n",
+ "Longer explanations of the evolution of CRS definitions and the PROJ library can be found in [@bivand_progress_2021], Chapter 2 of [@pebesma_spatial_2022], and a blog post by Floris Vanderhaeghe[^floris_blog].\n",
+ "\n",
+ "[^floris_blog]: [https://inbo.github.io/tutorials/tutorials/spatial_crs_coding/](https://inbo.github.io/tutorials/tutorials/spatial_crs_coding/)\n",
 "\n",
 "::: callout-note\n",
- "As outlined in the [PROJ documentation](https://proj.org/development/reference/cpp/cpp_general.html), there are different versions of the WKT CRS format including WKT1 and two variants of WKT2, the latter of which (WKT2, 2018 specification) corresponds to the ISO 19111:2019 [@opengeospatialconsortium_wellknown_2019].\n",
+ "As outlined in the PROJ documentation, there are different versions of the WKT CRS format including WKT1 and two variants of WKT2, the latter of which (WKT2, 2018 specification) corresponds to ISO 19111:2019 [@opengeospatialconsortium_wellknown_2019].\n",
 ":::\n",
- "\n",
- "\n",
 "\n",
 "## Querying and setting coordinate systems {#sec-querying-and-setting-coordinate-systems}\n",
 "\n",
 "Let's see how CRSs are stored in Python spatial objects and how they can be queried and set.\n",
 "First, we will look at getting and 
setting CRSs in vector geographic data objects.\n",
 "Consider the `GeoDataFrame` object named `world`, imported from a file `world.gpkg` that represents countries worldwide.\n",
 "Its CRS can be retrieved using the `.crs` property."
 ]
 },
@@ -243,8 +235,7 @@
 "cell_type": "code",
 "metadata": {},
 "source": [
- "world.crs.axis_info[0].unit_name\n",
- "world.crs.axis_info[1].unit_name"
+ "world.crs.axis_info[0].unit_name, world.crs.axis_info[1].unit_name"
 ],
 "execution_count": null,
 "outputs": []
 },
@@ -269,12 +260,6 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
- "\n",
- "\n",
- "\n",
- "\n",
- "\n",
- "\n",
 "In cases when a coordinate reference system (CRS) is missing or the wrong CRS is set, the `.set_crs` method can be used on a `GeoSeries` or a `GeoDataFrame` to set it.\n",
 "The CRS can be specified using an EPSG code as the first argument.\n",
 "If the object already has a different CRS definition, we must also specify `allow_override=True` to replace it (otherwise we get an error).\n",
@@ -316,8 +301,6 @@
 "Replacing the CRS definition for a **rasterio** file connection is typically not necessary, because it is not considered in any operation; only the transformation matrix and coordinates are.\n",
 "One exception is when writing the raster, in which case we need to construct the metadata of the raster file to be written, and therein specify the CRS anyway (@sec-raster-from-scratch).\n",
 "However, if we, for some reason, need to change the CRS definition in the file connection metadata, we can do that when opening the file in `r+` (reading and writing) mode.\n",
- "\n",
- "\n",
 "To demonstrate, we will create a copy of the `nlcd.tif` file, named `nlcd2.tif`, "
 ]
 },
@@ -352,7 +335,7 @@
 "metadata": {},
 "source": [
 "::: callout-note\n",
- "The `rasterio.open` function `mode`s generally follows Python's standard [file connection](https://docs.python.org/3/tutorial/inputoutput.html#reading-and-writing-files) modes, with possible arguments being `'r'` (read), `'w'` (write), `'r+'` (read/write), and `'w+'` (write/read) (the `'a'` \"append\" mode is irrelevant for raster files). In the book, and in general, the most commonly used modes are `'r'` (read) and `'w'` (write). `'r+'`, used in the last example, means 'read/write'. Unlike with `'w'`, `'r+'` does not delete the existing content on open, making `'r+'` suitable for making changes in an existing file (such as in the last example, where we replaced the CRS).\n",
+ "The `rasterio.open` function `mode`s generally follow Python's standard file connection modes, with possible arguments being `'r'` (read), `'w'` (write), `'r+'` (read/write), and `'w+'` (write/read) (the `'a'` 'append' mode is irrelevant for raster files). In the book, and in general, the most commonly used modes are `'r'` (read) and `'w'` (write). `'r+'`, used in the last example, means 'read/write'. Unlike with `'w'`, `'r+'` does not delete the existing content on open, making `'r+'` suitable for making changes in an existing file (such as here, replacing the CRS).\n",
 ":::\n",
 "\n",
 "To replace the definition with a new one, such as `EPSG:3857`, we can assign to the `.crs` property, as shown below."
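+ "In outline, the pattern looks as follows (a hedged sketch; the exact path of the `nlcd2.tif` copy is an assumption):"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {},
+ "source": [
+ "import rasterio\n",
+ "\n",
+ "# 'r+' keeps the existing content and allows in-place metadata edits;\n",
+ "# the path of the copy created above is assumed here\n",
+ "src_nlcd2 = rasterio.open('output/nlcd2.tif', 'r+')\n",
+ "src_nlcd2.crs = rasterio.crs.CRS.from_epsg(3857)"
+ ],
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Reopening the file afterwards would show the new CRS definition."
+ ]
+ },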
@@ -390,7 +373,7 @@
 "source": [
 "Importantly, the `.set_crs` (for vector layers) or the assignment to `.crs` (for rasters), as shown above, do not alter coordinates' values or geometries.\n",
 "Their role is only to set metadata information about the object's CRS.\n",
- "Consequently, the objects we created, `world3`, `world4`, and `src_nlcd2` are \"incorrect\", in the sense that the geometries are in fact given in a different CRS than specified in the associated CRS definition.\n",
+ "Consequently, the objects we created, `world3`, `world4`, and `src_nlcd2`, are 'incorrect', in the sense that the geometries are in fact given in a different CRS than specified in the associated CRS definition.\n",
 "\n",
 "In some cases, the CRS of a geographic object is unknown, as is the case in the London dataset created in the code chunk below, building on the example of London introduced in @sec-vector-layer-from-scratch."
 ]
 },
@@ -428,10 +411,8 @@
 "metadata": {},
 "source": [
 "This implies that **geopandas** does not know what the CRS is and is unwilling to guess.\n",
- "Unless a CRS is manually specified or is loaded from a source that has CRS metadata, **geopandas** does not make any explicit assumptions about which coordinate systems, other than to say \"I don't know\".\n",
- "This behavior makes sense given the diversity of available CRSs but differs from some approaches, such as the GeoJSON file format specification, which makes the simplifying [assumption](https://datatracker.ietf.org/doc/html/rfc7946#section-4) that all coordinates have a lon/lat CRS: `EPSG:4326`.\n",
- "\n",
- "\n",
+ "Unless a CRS is manually specified or is loaded from a source that has CRS metadata, **geopandas** does not make any explicit assumption about which coordinate system is in use, other than to say 'I don't know'.\n",
+ "This behavior makes sense given the diversity of available CRSs but differs from some approaches, such as the GeoJSON file format specification, which makes the simplifying assumption that all coordinates have a lon/lat CRS: `EPSG:4326`.\n",
 "\n",
 "A CRS can be added to `GeoSeries` or `GeoDataFrame` objects using the `.set_crs` method, as mentioned above."
] @@ -449,9 +430,6 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "\n", - "\n", - "\n", "When working with **geopandas** and **rasterio**, datasets without a specified CRS are not an issue in most workflows, since only the coordinates are considered.\n", "It is up to the user to make sure that, when working with more than one layer, all of the coordinates are given in the same CRS (whether specified or not).\n", "When exporting the results, though, it is important to keep the CRS definition in place, because other software typically *do* use, and require, the CRS definition in calculation.\n", @@ -460,7 +438,7 @@ "## Geometry operations on projected and unprojected data {#sec-geometry-operations-on-projected-and-unprojected-data}\n", "\n", "The **geopandas** package, through its dependency **shapely**, assumes planar geometry and works with distance/area values assumed to be in CRS units.\n", - "In fact, the CRS definition is typically ignored, and the respective functions (such as in plotting and distance calculations) are applied on the \"bare\" **shapely** geometries.\n", + "In fact, the CRS definition is typically ignored, and the respective functions (such as in plotting and distance calculations) are applied on the 'bare' **shapely** geometries.\n", "Accordingly, it is crucial to make sure that:\n", "\n", "- Geometric calculations are only applied in projected CRS\n", @@ -542,12 +520,12 @@ "uk = world[world['name_long'] == 'United Kingdom']\n", "uk_proj = uk.to_crs(27700)\n", "# Around projected point\n", - "base = uk_proj.plot(color='none', edgecolor='darkgrey')\n", - "lnd_layer_proj_buff.plot(color='lightgrey', edgecolor='black', ax=base)\n", + "base = uk_proj.plot(color='none', edgecolor='darkgrey', linewidth=0.5)\n", + "lnd_layer_proj_buff.plot(color='grey', edgecolor='black', alpha=0.5, ax=base)\n", "lnd_layer_proj.plot(color='red', ax=base);\n", "# Around point in lon/lat\n", - "base = uk.plot(color='none', edgecolor='darkgrey')\n", - "lnd_layer_buff.plot(color='lightgrey', edgecolor='black', ax=base)\n", + "base = uk.plot(color='none', edgecolor='darkgrey', linewidth=0.5)\n", + "lnd_layer_buff.plot(color='grey', edgecolor='black', alpha=0.5, ax=base)\n", "lnd_layer.plot(color='red', ax=base);" ], "execution_count": null, @@ -571,7 +549,7 @@ "## When to reproject? 
{#sec-when-to-reproject}\n",
 "\n",
 "The previous section showed how to set the CRS manually, with an expression such as `lnd_layer.set_crs(4326)`.\n",
- "In real world applications, however, CRSs are usually set automatically when data is read-in.\n",
+ "In real-world applications, however, CRSs are usually set automatically when data is read in.\n",
 "Thus, in many projects the main CRS-related task is to transform objects from one CRS into another.\n",
 "But when should data be transformed?\n",
 "And into which CRS?\n",
 "However, there are some general principles provided in this section that can help you decide.\n",
 "\n",
 "First, it's worth considering when to transform.\n",
- "In some cases transformation to a geographic CRS is essential, such as when publishing data online (for example, a Leaflet-based map using Python package [**folium**](https://python-visualization.github.io/folium/latest/)).\n",
+ "In some cases, transformation to a geographic CRS is essential, such as when publishing data online (for example, a Leaflet-based map using Python package **folium**).\n",
 "Another case is when two objects with different CRSs must be compared or combined, as shown when we try to find the distance between two objects with different CRSs."
 ]
 },
@@ -596,39 +574,39 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
- "Here, we got a meaningless result, and a warning.\n",
+ "Here, we got a meaningless distance value of `559715`, and a warning.\n",
 "\n",
 "To make the `lnd_layer` and `lnd_layer_proj` objects geographically comparable, one of them must be transformed into the CRS of the other.\n",
 "But which CRS to use?\n",
 "The answer depends on context: many projects, especially those involving web mapping, require outputs in `EPSG:4326`, in which case it is worth transforming the projected object.\n",
 "If, however, the project requires geometric calculations, implying planar geometry, e.g., calculating buffers (@sec-geometry-operations-on-projected-and-unprojected-data), it is necessary to transform data with a geographic CRS into an equivalent object with a projected CRS, such as the British National Grid (`EPSG:27700`).\n",
 "That is the subject of @sec-which-crs-to-use.\n",
- "\n",
- "\n",
 "\n",
 "## Which CRS to use? 
{#sec-which-crs-to-use}\n",
 "\n",
- "The question of which CRS is tricky, and there is rarely a \"right\" answer: \"There exist no all-purpose projections, all involve distortion when far from the center of the specified frame\" [@bivand_applied_2013].\n",
+ "The question of which CRS to use is tricky, and there is rarely a 'right' answer: 'There exist no all-purpose projections, all involve distortion when far from the center of the specified frame' [@bivand_applied_2013].\n",
 "Additionally, you should not be attached just to one projection for every task.\n",
 "It is possible to use one projection for some part of the analysis, another projection for a different part, and yet another for visualization.\n",
 "Always try to pick the CRS that serves your goal best!\n",
 "\n",
- "When selecting *geographic* CRSs, the answer is often [WGS84](https://en.wikipedia.org/wiki/World_Geodetic_System#A_new_World_Geodetic_System:_WGS_84).\n",
+ "When selecting *geographic* CRSs, the answer is often WGS84.\n",
 "It is used not only for web mapping, but also because GPS datasets and thousands of raster and vector datasets are provided in this CRS by default.\n",
 "WGS84 is the most common CRS in the world, so it is worth knowing its EPSG code: `4326`.\n",
- "This \"magic number\" can be used to convert objects with unusual projected CRSs into something that is widely understood.\n",
+ "This 'magic number' can be used to convert objects with unusual projected CRSs into something that is widely understood.\n",
 "\n",
 "What about when a *projected* CRS is required?\n",
- "In some cases, it is not something that we are free to decide: \"often the choice of projection is made by a public mapping agency\" [@bivand_applied_2013].\n",
+ "In some cases, it is not something that we are free to decide: 'often the choice of projection is made by a public mapping agency' [@bivand_applied_2013].\n",
 "This means that when working with local data sources, it is likely preferable to work with the CRS in which the data was provided, to ensure compatibility, even if the official CRS is not the most accurate.\n",
 "The example of London was easy to answer because the British National Grid (with its associated EPSG code `27700`) is well known, and the original dataset (`lnd_layer`) already had that CRS.\n",
 "\n",
- "A commonly used default is Universal Transverse Mercator ([UTM](https://en.wikipedia.org/wiki/Universal_Transverse_Mercator_coordinate_system)), a set of CRSs that divides the Earth into 60 longitudinal wedges and 20 latitudinal segments.\n",
+ "A commonly used default is Universal Transverse Mercator (UTM), a set of CRSs that divide the Earth into 60 longitudinal wedges and 20 latitudinal segments.\n",
 "The transverse Mercator projection used by UTM CRSs is conformal but distorts areas and distances with increasing severity with distance from the center of the UTM zone.\n",
- "Documentation from the GIS software Manifold therefore suggests restricting the longitudinal extent of projects using UTM zones to 6 degrees from the central meridian (source: [manifold.net](http://www.manifold.net/doc/mfd9/universal_transverse_mercator_projection.htm)).\n",
+ "Documentation from the GIS software Manifold therefore suggests restricting the longitudinal extent of projects using UTM zones to 6 degrees from the central meridian[^manifold_recommendation].\n",
 "Therefore, we recommend using UTM only when your focus is on preserving angles for a relatively small area!\n",
 "\n",
+ "[^manifold_recommendation]: [http://www.manifold.net/doc/mfd9/universal_transverse_mercator_projection.htm](http://www.manifold.net/doc/mfd9/universal_transverse_mercator_projection.htm)\n",
+ "\n",
 "Almost every place on Earth has a UTM code, such as 
'60H' which refers, among others, to northern New Zealand.\n",
 "UTM EPSG codes run sequentially from `32601` to `32660` for northern hemisphere locations and from `32701` to `32760` for southern hemisphere locations.\n",
 "\n",
 "To show how the system works, let's create a function, `lonlat2UTM`, to calculate the EPSG code associated with any point on the planet."
 ]
 },
@@ -669,7 +647,7 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
- "Here is another example for London (where we \"unpack\" the coordinates of the 1^st^ geometry in `lnd_layer` into the `lonlat2UTM` function arguments)."
+ "Here is another example for London (where we 'unpack' the coordinates of the 1^st^ geometry in `lnd_layer` into the `lonlat2UTM` function arguments)."
 ]
 },
@@ -692,15 +670,17 @@
 "In cases where an appropriate CRS is not immediately clear, the choice of CRS should depend on the properties that are most important to preserve in the subsequent maps and analysis.\n",
 "All CRSs are either equal-area, equidistant, conformal (with shapes remaining unchanged), or some combination of compromises of those (@sec-projected-coordinate-reference-systems).\n",
 "Custom CRSs with local parameters can be created for a region of interest and multiple CRSs can be used in projects when no single CRS suits all tasks.\n",
- "\"Geodesic calculations\" can provide a fall-back if no CRSs are appropriate (see ).\n",
+ "'Geodesic calculations' can provide a fall-back if no CRSs are appropriate[^proj_geodesic].\n",
 "Regardless of the projected CRS used, the results may not be accurate for geometries covering hundreds of kilometers.\n",
 "\n",
+ "[^proj_geodesic]: [https://proj.org/geodesic.html](https://proj.org/geodesic.html)\n",
+ "\n",
 "When deciding on a custom CRS, we recommend the following:\n",
 "\n",
- "- A Lambert azimuthal equal-area ([LAEA](https://en.wikipedia.org/wiki/Lambert_azimuthal_equal-area_projection)) projection for a custom local projection (set latitude and longitude of origin to the center of the study area), which is an equal-area projection at all locations but distorts shapes beyond thousands of kilometers\n",
- "- Azimuthal equidistant ([AEQD](https://en.wikipedia.org/wiki/Azimuthal_equidistant_projection)) projections for a specifically accurate straight-line distance between a point and the center point of the local projection\n",
- "- Lambert conformal conic ([LCC](https://en.wikipedia.org/wiki/Lambert_conformal_conic_projection)) projections for regions covering thousands of kilometers, with the cone set to keep distance and area properties reasonable between the secant lines\n",
- "- Stereographic ([STERE](https://en.wikipedia.org/wiki/Stereographic_projection)) projections for polar regions, but taking care not to rely on area and distance calculations thousands of kilometers from the center\n",
+ "- A Lambert azimuthal equal-area (LAEA) projection for a custom local projection (set latitude and longitude of origin to the center of the study area), which is an equal-area projection at all locations but distorts shapes beyond thousands of kilometers\n",
+ "- Azimuthal equidistant (AEQD) projections for a specifically accurate straight-line distance between a point and the center 
point of the local projection\n", + "- Lambert conformal conic (LCC) projections for regions covering thousands of kilometers, with the cone set to keep distance and area properties reasonable between the secant lines\n", + "- Stereographic (STERE) projections for polar regions, but taking care not to rely on area and distance calculations thousands of kilometers from the center\n", "\n", "One possible approach to automatically select a projected CRS specific to a local dataset is to create an azimuthal equidistant (AEQD) projection for the center-point of the study area.\n", "This involves creating a custom CRS (with no EPSG code) with units of meters based on the center point of a dataset.\n", @@ -717,7 +697,7 @@ "Reprojecting vectors thus consists of transforming the coordinates of these points, which form the vertices of lines and polygons.\n", "\n", "@sec-geometry-operations-on-projected-and-unprojected-data contains an example in which at a `GeoDataFrame` had to be transformed into an equivalent object, with a different CRS, to calculate the distance between two objects. \n", - "Reprojection of vector layers is done using the [.to_crs](https://geopandas.org/en/stable/docs/reference/api/geopandas.GeoDataFrame.to_crs.html) method." + "Reprojection of vector layers is done using the `.to_crs` method." ] }, { @@ -750,27 +730,10 @@ "metadata": {}, "source": [ "It may come as a surprise that `lnd_layer` and `lnd_layer2` are just over 2 $km$ apart!\n", - "The difference in location between the two points is not due to imperfections in the transforming operation (which is in fact very accurate) but the low precision of the manually-created coordinates that created `lnd_layer` and `lnd_layer_proj`.\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", + "The difference in location between the two points is not due to imperfections in the transforming operation (which is in fact very accurate) but rather to the low precision of the manually specified coordinates when creating `lnd_layer` and `lnd_layer_proj`.\n", "\n", "Reprojecting to a different CRS is also demonstrated below using `cycle_hire_osm`, a point layer that represents 'docking stations' where you can hire bicycles in London.\n", - "The contents of the CRS object associated with a given geometry column is changed when the object's CRS is transformed.\n", + "The contents of the CRS object associated with a given geometry column are changed when the object's CRS is transformed.\n", "In the code chunk below, we create a new version of `cycle_hire_osm` with a projected CRS." ] }, { @@ -788,8 +751,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The resulting object has a new CRS with an EPSG code `27700`.\n", - "But how to find out more details about this EPSG code, or any code?\n", + "The resulting object has a new CRS, corresponding to the EPSG code `27700`.\n", + "How can we find out more details about this EPSG code, or any other code?\n", "One option is to search for it online.\n", "Another option is to create a standalone CRS object within the Python environment (using `pyproj.CRS.from_string` or `pyproj.CRS.from_epsg`, see @sec-coordinate-reference-systems), and then query its properties, such as `.name` and `.to_wkt()`."
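As a minimal sketch of that last option, assuming only that **pyproj** is installed, the CRS object can be built and inspected like this:

```python
import pyproj

# Build a standalone CRS object from the EPSG code...
crs = pyproj.CRS.from_epsg(27700)
# ...and query its properties
crs.name      # e.g., 'OSGB36 / British National Grid'
crs.to_wkt()  # the full WKT definition as a string
```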
] @@ -808,10 +771,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "\n", - "\n", - " \n", - "The result shows that the EPSG code `27700` represents the British National Grid, a result that could have been found by searching online for \"[EPSG 27700](https://www.google.com/search?q=CRS+27700)\".\n", + "The result shows that the EPSG code `27700` represents the British National Grid, a result that could have been found by searching online for 'EPSG 27700'.\n", "\n", "## Reprojecting raster geometries {#sec-reprojecting-raster-geometries}\n", "\n", @@ -819,26 +779,24 @@ "However, there are important differences in reprojection of vectors and rasters: transforming a vector object involves changing the coordinates of every vertex, but this does not apply to raster data.\n", "Rasters are composed of rectangular cells of the same size (expressed by map units, such as degrees or meters), so it is usually impracticable to transform coordinates of pixels separately.\n", "Raster reprojection involves creating a new raster object in the destination CRS, often with a different number of columns and rows than the original.\n", - "The attributes must subsequently be re-estimated, allowing the new pixels to be \"filled\" with appropriate values.\n", + "The attributes must subsequently be re-estimated, allowing the new pixels to be 'filled' with appropriate values.\n", "In other words, raster reprojection can be thought of as two separate spatial operations: a vector reprojection of the raster extent to another CRS (@sec-reprojecting-vector-geometries), and computation of new pixel values through resampling (@sec-raster-resampling).\n", "Due to this additional complexity, in most cases when both raster and vector data are used, it is better to avoid reprojecting rasters and reproject vectors instead.\n", "\n", "::: callout-note\n", "Reprojection of the regular rasters is also known as warping.\n", - "Additionally, there is a second similar operation called \"transformation\".\n", + "Additionally, there is a second similar operation called 'transformation'.\n", "Instead of resampling all of the values, it leaves all values intact but recomputes new coordinates for every raster cell, changing the grid geometry.\n", "For example, it could convert the input raster (a regular grid) into a curvilinear grid.\n", "The **rasterio**, like common raster file formats (such as GeoTIFF), does not support curvilinear grids.\n", - "The **xarray** package, for instance, can be used to [work with](https://docs.xarray.dev/en/stable/examples/multidimensional-coords.html) curvilinear grids.\n", + "The **xarray** package, for instance, can be used to work with curvilinear grids.\n", ":::\n", - "\n", - "\n", "\n", "The raster reprojection process is done using two functions from the `rasterio.warp` sub-package:\n", "\n", - "1. `rasterio.warp.calculate_default_transform`: [`calculate_default_transform`](https://rasterio.readthedocs.io/en/latest/api/rasterio.warp.html#rasterio.warp.calculate_default_transform), is used to calculate the new transformation matrix in the destination CRS, according to the source raster dimensions and bounds.\n", + "1. 
`rasterio.warp.calculate_default_transform`, used to calculate the new transformation matrix in the destination CRS, according to the source raster dimensions and bounds.\n", "Alternatively, the destination transformation matrix can be obtained from an existing raster; this is common practice when we need to align one raster with another, for instance to be able to combine them in raster algebra operations (@sec-raster-local-operations) (see below)\n", - "2. `rasterio.warp.reproject`: introduced in @sec-raster-resampling, calculates cell values in the destination grid, using the user-selected resampling method (such as nearest neighbor, or bilinear)\n", + "2. `rasterio.warp.reproject`, introduced in @sec-raster-resampling, calculates cell values in the destination grid, using the user-selected resampling method (such as nearest neighbor, or bilinear)\n", "\n", "Let's take a look at two examples of raster transformation: using categorical and continuous data.\n", "Land cover data are usually represented by categorical maps.\n", @@ -894,7 +852,22 @@ " src_nlcd.width,\n", " src_nlcd.height,\n", " *src_nlcd.bounds\n", - ")\n", + ")" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here is the result." + ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ "dst_transform" ], "execution_count": null, @@ -922,11 +895,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "\n", - "\n", - "\n", "::: callout-note\n", - "The `*` syntax in Python is known as variable-length [\"*positional* arguments\"](https://docs.python.org/3/glossary.html#term-argument).\n", + "The `*` syntax in Python is known as variable-length '*positional* arguments'.\n", "It is used to pass a `list` or `tuple` (or other iterables object) to positional arguments of a function.\n", "\n", "For example, in the last code block, `*`, in `*src_nlcd.bounds`, is used to unpack `src_nlcd.bounds` (an iterable of length 4) to four separate arguments (`left`, `bottom`, `right`, and `top`), which `rasterio.warp.calculate_default_transform` requires in that order.\n", @@ -953,18 +923,16 @@ " src_nlcd.bounds[3]\n", ")\n", "```\n", - "\"*Keyword* arguments\" is a related technique; see note in @sec-raster-agg-disagg.\n", + "'*Keyword* arguments' is a related technique; see note in @sec-raster-agg-disagg.\n", ":::\n", "\n", "\n", - "Recall from @sec-raster-resampling that resampling using `rasterio.warp.reproject` can take place directly into a \"destination\" raster file connection.\n", + "Recall from @sec-raster-resampling that resampling using `rasterio.warp.reproject` can take place directly into a 'destination' raster file connection.\n", "Therefore, our next step is to create the metadata file used for writing the reprojected raster to file.\n", "For convenience, we are taking the metadata of the source raster (`src_nlcd.meta`), making a copy (`dst_kwargs`), and then updating those specific properties that need to be changed.\n", - "Note that the reprojection process typically creates \"No Data\" pixels, even when there were none in the input raster, since the raster orientation changes and the edges need to be \"filled\" to get back a rectangular extent.\n", - "For example, a reprojected raster may appear as a \"tilted\" rectangle, inside a larger straight rectangular extent, whereas the margins around the tilted rectangle are inevitably filled with \"No Data\" (e.g., the white stripes surrounding the edges in @fig-raster-reproject-nlcd (b) are \"No 
Data\" pixels created as a result of reprojection). \n", - "\n", - "\n", - "We need to specify a \"No Data\" value of our choice, if there is no existing definition, or keep the existing source raster \"No Data\" setting, such as `255` in this case." + "Note that the reprojection process typically creates 'No Data' pixels, even when there were none in the input raster, since the raster orientation changes and the edges need to be 'filled' to get back a rectangular extent.\n", + "For example, a reprojected raster may appear as a 'tilted' rectangle, inside a larger straight rectangular extent, whereas the margins around the tilted rectangle are inevitably filled with 'No Data' (e.g., the white stripes surrounding the edges in @fig-raster-reproject-nlcd (b) are 'No Data' pixels created as a result of reprojection).\n", + "We need to specify a 'No Data' value of our choice, if there is no existing definition, or keep the existing source raster 'No Data' setting, such as `255` in this case." ] }, { @@ -989,7 +957,7 @@ "source": [ "Now, we are ready to create the reprojected raster.\n", "Here, reprojection takes place between two file connections, meaning that the raster value arrays are not being read into memory at once.\n", - "It is also possible to reproject into an in-memory `ndarray` object, see the [documentation](https://rasterio.readthedocs.io/en/latest/api/rasterio.warp.html#rasterio.warp.reproject).\n", + "(It is also possible to reproject into an in-memory `ndarray` object.)\n", "\n", "To write the reprojected raster, we first create a destination file connection `dst_nlcd`, pointing at the output file path of our choice (`'output/nlcd_4326.tif'`), using the updated metadata object created earlier (`dst_kwargs`):" ] @@ -1007,13 +975,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Then, we use the `rasterio.warp.reproject` function to calculate and write the reprojection result into the `dst_nlcd` file connection.\n" + "Then, we use the `rasterio.warp.reproject` function to calculate and write the reprojection result into the `dst_nlcd` file connection." ] }, { "cell_type": "code", "metadata": {}, "source": [ + "#| output: false\n", "rasterio.warp.reproject(\n", " source=rasterio.band(src_nlcd, 1),\n", " destination=rasterio.band(dst_nlcd, 1),\n", @@ -1031,9 +1000,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Note--like in the example in @sec-raster-resampling---that the `source` and `destination` accept a \"band\" object, created using `rasterio.band`.\n", + "Note--like in the example in @sec-raster-resampling---that the `source` and `destination` accept a 'band' object, created using `rasterio.band`.\n", "In this case, there is just one band.\n", - "If there were more bands, we would have to repeat the procedure for each band, using `i` instead of `1` inside a [loop](https://rasterio.readthedocs.io/en/latest/topics/reproject.html#reprojecting-a-geotiff-dataset).\n", + "If there were more bands, we would have to repeat the procedure for each band, using `i` instead of `1` inside a loop.\n", "Finally, we close the file connection so that the data are actually written." 
] }, @@ -1076,7 +1045,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Examining the unique raster values tells us that the new raster has the same categories, plus the value `255` representing \"No Data\":" + "Examining the unique raster values tells us that the new raster has the same categories, plus the value `255` representing 'No Data':" ] }, { @@ -1127,11 +1096,11 @@ "In the above example, we automatically calculated an optimal (i.e., most information preserving) destination grid using `rasterio.warp.calculate_default_transform`.\n", "This is appropriate when there are no specific requirements for the destination raster spatial properties.\n", "Namely, we are not required to obtain a specific origin and resolution, but just wish to preserve the raster values as much as possible.\n", - "To do that, `rasterio.warp.calculate_default_transform` \"tries\" to keep the extent and resolution of the destination raster as similar as possible to the source.\n", - "In other situations, however, we need to reproject a raster into a specific \"template\", so that it corresponds, for instance, with other rasters we use in the analysis.\n", - "In the following code examples, we reproject the `nlcd.tif` raster, again, but this time using the `nlcd_4326.tif` reprojection result as the \"template\" to demonstrate this alternative workflow.\n", + "To do that, `rasterio.warp.calculate_default_transform` 'tries' to keep the extent and resolution of the destination raster as similar as possible to the source.\n", + "In other situations, however, we need to reproject a raster into a specific 'template', so that it corresponds, for instance, with other rasters we use in the analysis.\n", + "In the following code examples, we reproject the `nlcd.tif` raster, again, but this time using the `nlcd_4326.tif` reprojection result as the 'template' to demonstrate this alternative workflow.\n", "\n", - "First, we create a connection to our \"template\" raster to read its metadata." + "First, we create a connection to our 'template' raster to read its metadata." ] }, { @@ -1148,9 +1117,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Then, we create a write-mode connection to our destination raster, using this exact metadata, meaning that as the resampling result is going to have identical properties as the \"template\".\n", - "\n", - "" + "Then, we create a write-mode connection to our destination raster, using this exact metadata, meaning that the resampling result is going to have properties identical to those of the 'template'."
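A minimal sketch of these two steps, assuming the 'template' is the `nlcd_4326.tif` file created above and an illustrative output name:

```python
# Read the template metadata...
template = rasterio.open('output/nlcd_4326.tif')
dst_kwargs = template.meta.copy()
# ...and open a write-mode destination file that inherits it exactly
dst_nlcd_2 = rasterio.open('output/nlcd_4326_2.tif', 'w', **dst_kwargs)
```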
] }, { @@ -1210,10 +1177,10 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The difference is that in the first example we calculated the template automatically, using `rasterio.warp.calculate_default_transform`, while in the second example we used an existing raster as the \"template\".\n", + "The difference is that in the first example we calculated the template automatically, using `rasterio.warp.calculate_default_transform`, while in the second example we used an existing raster as the 'template'.\n", "\n", - "Importantly, when the template raster has much more \"coarse\" resolution than the source raster, the `rasterio.enums.Resampling.average` (for continuous rasters) or `rasterio.enums.Resampling.mode` (for categorical rasters) resampling methods should be used, instead of `rasterio.enums.Resampling.nearest`.\n", - "Otherwise, much of the data will be lost, as the \"nearest\" method can capture one pixel value only for each destination raster pixel.\n", + "Importantly, when the template raster has a much coarser resolution than the source raster, the `rasterio.enums.Resampling.average` (for continuous rasters) or `rasterio.enums.Resampling.mode` (for categorical rasters) resampling methods should be used, instead of `rasterio.enums.Resampling.nearest`.\n", + "Otherwise, much of the data will be lost, as the 'nearest' method can capture only one pixel value for each destination raster pixel.\n", "\n", "Reprojecting continuous rasters (with numeric or, in this case, integer values) follows an almost identical procedure.\n", "This is demonstrated below with `srtm.tif` from the Shuttle Radar Topography Mission (SRTM), which represents height in meters above sea level (elevation) with the WGS84 CRS.\n", "We will reproject this dataset into a projected CRS, but not with the nearest neighbor method.\n", "Instead, we will use the bilinear method which computes the output cell value based on the four nearest cells in the original raster.\n", "The values in the projected dataset are the distance-weighted average of the values from these four cells: the closer the input cell is to the center of the output cell, the greater its weight.\n", - "The following code section create a text string representing WGS 84 / UTM zone 12N, and reproject the raster into this CRS, using the bilinear method.\n", - "The code is practically the same, except for changing the source and destination file names, and replacing `rasterio.enums.Resampling.nearest` with `rasterio.enums.Resampling.bilinear`." + "The following code section creates a text string representing WGS 84 / UTM zone 12N, and reprojects the raster into this CRS, using the bilinear method.\n", + "The code is practically the same as in the first example in this section, except for changing the source and destination file names, and replacing `rasterio.enums.Resampling.nearest` with `rasterio.enums.Resampling.bilinear`."
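A sketch of that workflow, under the assumption that the input is `data/srtm.tif`, that 'WGS 84 / UTM zone 12N' is expressed through its EPSG code (`32612`), and with an output file name chosen here for illustration:

```python
dst_crs = 'EPSG:32612'  # WGS 84 / UTM zone 12N
src_srtm = rasterio.open('data/srtm.tif')
# Calculate the destination grid...
dst_transform, dst_width, dst_height = \
    rasterio.warp.calculate_default_transform(
        src_srtm.crs, dst_crs,
        src_srtm.width, src_srtm.height,
        *src_srtm.bounds
    )
# ...update the source metadata accordingly...
dst_kwargs = src_srtm.meta.copy()
dst_kwargs.update({
    'crs': dst_crs,
    'transform': dst_transform,
    'width': dst_width,
    'height': dst_height
})
# ...and reproject band 1 using bilinear resampling
dst_srtm = rasterio.open('output/srtm_32612.tif', 'w', **dst_kwargs)
rasterio.warp.reproject(
    source=rasterio.band(src_srtm, 1),
    destination=rasterio.band(dst_srtm, 1),
    src_transform=src_srtm.transform,
    src_crs=src_srtm.crs,
    dst_transform=dst_transform,
    dst_crs=dst_crs,
    resampling=rasterio.enums.Resampling.bilinear
)
dst_srtm.close()
```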
] }, { @@ -1293,8 +1260,8 @@ "@sec-which-crs-to-use mentioned reasons for using custom CRSs, and provided several possible approaches.\n", "Here, we show how to apply these ideas in Python.\n", "\n", - "One is to take an existing WKT definition of a CRS, modify some of its elements, and then use the new definition for reprojecting, using the reprojection methods shown above for vector layers (@sec-reprojecting-vector-geometries) and rasters (@sec-reprojecting-raster-geometries).\n", - "For example, let's transforms the `zion.gpkg` vector layer to a custom azimuthal equidistant (AEQD) CRS.\n", + "One approach is to take an existing WKT definition of a CRS, modify some of its elements, and then use the new definition for reprojecting, using the reprojection methods shown above for vector layers (@sec-reprojecting-vector-geometries) and rasters (@sec-reprojecting-raster-geometries).\n", + "For example, let's transform the `zion.gpkg` vector layer to a custom azimuthal equidistant (AEQD) CRS.\n", "Using a custom AEQD CRS requires knowing the coordinates of the center point of a dataset in degrees (geographic CRS).\n", "In our case, this information can be extracted by calculating the centroid of the `zion` layer transformed into WGS84:" ] @@ -1303,7 +1270,7 @@ "cell_type": "code", "metadata": {}, "source": [ - "lon, lat = zion.to_crs(4326).unary_union.centroid.coords[0]\n", + "lon, lat = zion.to_crs(4326).union_all().centroid.coords[0]\n", "lon, lat" ], "execution_count": null, @@ -1314,9 +1281,7 @@ "metadata": {}, "source": [ "Next, we can use the obtained lon/lat coordinates in `coords` to update the WKT definition of the azimuthal equidistant (AEQD) CRS seen below.\n", - "Notice that we modified just two values below---`\"Central_Meridian\"` to the longitude and `\"Latitude_Of_Origin\"` to the latitude of our centroid.\n", - "\n", - "" + "Notice that we modified just two values below---`\"Central_Meridian\"` to the longitude and `\"Latitude_Of_Origin\"` to the latitude of our centroid." 
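The code cell that follows in the book is not reproduced in this diff; a sketch of the idea, with an illustrative AEQD WKT template whose `Central_Meridian` and `Latitude_Of_Origin` parameters are filled from the `lon` and `lat` values computed above, could be:

```python
# Custom AEQD WKT centered on the layer centroid (illustrative template)
my_wkt = f'''PROJCS["Custom_AEQD",
    GEOGCS["GCS_WGS_1984",
        DATUM["WGS_1984",
            SPHEROID["WGS_1984",6378137.0,298.257223563]],
        PRIMEM["Greenwich",0.0],
        UNIT["Degree",0.0174532925199433]],
    PROJECTION["Azimuthal_Equidistant"],
    PARAMETER["Central_Meridian",{lon}],
    PARAMETER["Latitude_Of_Origin",{lat}],
    UNIT["Meter",1.0]]'''
# Reproject using the custom definition
zion_aeqd = zion.to_crs(my_wkt)
```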
] }, { @@ -1343,7 +1308,7 @@ "metadata": {}, "source": [ "::: callout-note\n", - "The above expression uses the so-called [\"f-strings\"](https://docs.python.org/3/tutorial/inputoutput.html#tut-f-strings) syntax, which is one of several Python techniques to embed values inside a string (as alternatives to concatenating with `+`).\n", + "The above expression uses the so-called 'f-strings' syntax, which is one of several Python techniques to embed values inside a string (as alternatives to concatenating with `+`).\n", "For example, given:\n", "```\n", "x = 5\n", @@ -1375,15 +1340,17 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Custom projections can also be made interactively, for example, using the [Projection Wizard](https://projectionwizard.org/#) web application [@savric_projection_2016].\n", + "Custom projections can also be made interactively, for example, using the Projection Wizard[^projection_wizard] web application [@savric_projection_2016].\n", "This website allows you to select a spatial extent of your data and a distortion property, and returns a list of possible projections.\n", "The list also contains WKT definitions of the projections that you can copy and use for reprojections.\n", - "See Open Geospatial Consortium ([2019](https://r.geocompx.org/references.html#ref-opengeospatialconsortium_wellknown_2019)) for details on creating custom CRS definitions with WKT strings.\n", + "See Open Geospatial Consortium [@opengeospatialconsortium_wellknown_2019] for details on creating custom CRS definitions with WKT strings.\n", + "\n", + "[^projection_wizard]: [https://projectionwizard.org/#](https://projectionwizard.org/#)\n", "\n", "PROJ strings can also be used to create custom projections, accepting the limitations inherent to projections, especially of geometries covering large geographic areas, as mentioned in @sec-coordinate-reference-systems.\n", - "Many projections have been developed and can be set with the `+proj=` element of PROJ strings, with dozens of projects described in detail on the [PROJ website](https://proj.org/operations/projections/index.html) alone.\n", + "Many projections have been developed and can be set with the `+proj=` element of PROJ strings, with dozens of projections described in detail on the PROJ website alone.\n", "\n", - "When mapping the world while preserving area relationships the Mollweide projection, illustrated in @fig-mollweide, is a popular and often sensible choice [@jenny_guide_2017].\n", + "When mapping the world while preserving area relationships, the Mollweide projection, illustrated in @fig-mollweide, is a popular and often sensible choice [@jenny_guide_2017].\n", "To use this projection, we need to specify it using the proj-string element, `'+proj=moll'`, in the `.to_crs` method:" ] }, @@ -1403,7 +1370,7 @@ "metadata": {}, "source": [ "It is often desirable to minimize distortion for all spatial properties (area, direction, distance) when mapping the world.\n", - "One of the most popular projections to achieve this is [Winkel tripel](http://www.winkel.org/other/Winkel%20Tripel%20Projections.htm) (`'+proj=wintri'`), illustrated in @fig-wintri." + "One of the most popular projections to achieve this is Winkel tripel (`'+proj=wintri'`), illustrated in @fig-wintri." 
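As a minimal sketch, assuming a `world` layer of country polygons in WGS84 (as used elsewhere in the book), both proj-string reprojections take a single `.to_crs` call:

```python
world_moll = world.to_crs('+proj=moll')      # Mollweide (equal-area)
world_wintri = world.to_crs('+proj=wintri')  # Winkel tripel (compromise)
```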
] }, { @@ -1421,7 +1388,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Moreover, proj-string parameters can be modified in most CRS definitions, for example the center of the projection can be adjusted using the `+lon_0` and `+lat_0` parameters.\n", + "Moreover, proj-string parameters can be modified in most CRS definitions, for example, the center of the projection can be adjusted using the `+lon_0` and `+lat_0` parameters.\n", "The below code transforms the coordinates to the Lambert azimuthal equal-area projection centered on the longitude and latitude of New York City (@fig-azimuthal-equal-area)." ] }, @@ -1441,19 +1408,19 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "More information on CRS modifications can be found in the [Using PROJ](https://proj.org/usage/index.html) documentation.\n", + "More information on CRS modifications can be found in the Using PROJ documentation[^using_proj].\n", "\n", - "## Exercises\n", + "[^using_proj]: [https://proj.org/usage/index.html](https://proj.org/usage/index.html)\n", "\n", - "## References" + "\n" ] } ], "metadata": { "kernelspec": { - "name": "python3", + "display_name": "Python 3", "language": "python", - "display_name": "Python 3 (ipykernel)" + "name": "python3" } }, "nbformat": 4, diff --git a/ipynb/07-read-write.ipynb b/ipynb/07-read-write.ipynb index dd54bf16..810f9d43 100644 --- a/ipynb/07-read-write.ipynb +++ b/ipynb/07-read-write.ipynb @@ -14,12 +14,29 @@ "metadata": {}, "source": [ "#| echo: false\n", - "import matplotlib.pyplot as plt\n", - "import pandas as pd\n", - "pd.options.display.max_rows = 6\n", - "pd.options.display.max_columns = 6\n", - "pd.options.display.max_colwidth = 35\n", - "plt.rcParams['figure.figsize'] = (5, 5)" + "#| include: false\n", + "#| error: true\n", + "import map_to_png" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "#| echo: false\n", + "import book_options" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "#| echo: false\n", + "import book_options_pdf" ], "execution_count": null, "outputs": [] @@ -38,8 +55,10 @@ "import urllib.request\n", "import zipfile\n", "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import pandas as pd\n", "import shapely\n", - "import fiona\n", + "import pyogrio\n", "import geopandas as gpd\n", "import rasterio\n", "import rasterio.plot\n", @@ -78,37 +97,49 @@ "Taken together, these processes of input/output can be referred to as data I/O.\n", "\n", "Geographic data I/O is often done with few lines of code at the beginning and end of projects.\n", - "It is often overlooked as a simple one step process.\n", + "It is often overlooked as a simple one-step process.\n", "However, mistakes made at the outset of projects (e.g., using an out-of-date or in some way faulty dataset) can lead to large problems later down the line, so it is worth putting considerable time into identifying which datasets are available, where they can be found and how to retrieve them.\n", - "These topics are covered in @sec-retrieving-open-data, which describes various geoportals, which collectively contain many terabytes of data, and how to use them.\n", - "To further ease data access, a number of packages for downloading geographic data have been developed, as described in @sec-geographic-data-packages.\n", + "These topics are covered in @sec-retrieving-open-data, which describes several geoportals, which collectively contain many 
terabytes of data, and how to use them.\n", + "To further ease data access, a number of packages for downloading geographic data have been developed, as demonstrated in @sec-geographic-data-packages.\n", "\n", "There are many geographic file formats, each of which has pros and cons, described in @sec-file-formats.\n", "The process of reading and writing files efficiently is covered in Sections @sec-data-input and @sec-data-output, respectively.\n", "\n", "## Retrieving open data {#sec-retrieving-open-data}\n", "\n", - "A vast and ever-increasing amount of geographic data is available on the internet, much of which is free to access and use (with appropriate credit given to its providers).[^07-read-write-plot-1]\n", + "A vast and ever-increasing amount of geographic data is available on the internet, much of which is free to access and use (with appropriate credit given to its providers)[^07-read-write-plot-1].\n", "In some ways there is now too much data, in the sense that there are often multiple places to access the same dataset.\n", "Some datasets are of poor quality.\n", "In this context, it is vital to know where to look, so the first section covers some of the most important sources.\n", - "Various 'geoportals' (web services providing geospatial datasets such as [Data.gov](https://catalog.data.gov/dataset?metadata_type=geospatial)) are a good place to start, providing a wide range of data but often only for specific locations (as illustrated in the updated [Wikipedia page](https://en.wikipedia.org/wiki/Geoportal) on the topic).\n", + "Various 'geoportals' (web services providing geospatial datasets, such as Data.gov[^data_gov]) are a good place to start, providing a wide range of data but often only for specific locations (as illustrated in the updated Wikipedia page[^wiki_geoportal] on the topic).\n", "\n", "[^07-read-write-plot-1]: For example, visit for a vast list of websites with freely available geographic datasets.\n", + "[^data_gov]: <https://catalog.data.gov/dataset?metadata_type=geospatial>\n", + "[^wiki_geoportal]: <https://en.wikipedia.org/wiki/Geoportal>\n", "\n", "Some global geoportals overcome this issue.\n", - "The [GEOSS portal](http://www.geoportal.org/) and the [Copernicus Open Access Hub](https://scihub.copernicus.eu/), for example, contain many raster datasets with global coverage.\n", - "A wealth of vector datasets can be accessed from the [SEDAC](http://sedac.ciesin.columbia.edu/) portal run by the National Aeronautics and Space Administration (NASA) and the European Union's [INSPIRE geoportal](http://inspire-geoportal.ec.europa.eu/), with global and regional coverage.\n", + "The GEOSS portal[^geoss_portal] and the Copernicus Data Space Ecosystem[^copernicus], for example, contain many raster datasets with global coverage.\n", + "A wealth of vector datasets can be accessed from the SEDAC[^sedac] portal run by the National Aeronautics and Space Administration (NASA) and the European Union's INSPIRE geoportal[^inspire_geoportal], with global and regional coverage.\n", "\n", + "[^geoss_portal]: <http://www.geoportal.org/>\n", + "[^copernicus]: <https://dataspace.copernicus.eu/>\n", + "[^sedac]: <http://sedac.ciesin.columbia.edu/>\n", + "[^inspire_geoportal]: <http://inspire-geoportal.ec.europa.eu/>\n", + "\n", - "Most geoportals provide a graphical interface allowing datasets to be queried based on characteristics such as spatial and temporal extent, the United States Geological Survey's [EarthExplorer](https://earthexplorer.usgs.gov/) being a prime example.\n", + "Most geoportals provide a graphical interface allowing datasets to be queried based on characteristics such as spatial and temporal extent, the United States Geological Survey's EarthExplorer[^earthexplorer] and NASA's EarthData 
Search[^earthdata_search] being prime examples.\n", "Exploring datasets interactively on a browser is an effective way of understanding available layers.\n", "From reproducibility and efficiency perspectives, downloading data is, however, best done with code.\n", - "Downloads can be initiated from the command line using a variety of techniques, primarily via URLs and APIs (see the [Sentinel API](https://scihub.copernicus.eu/twiki/do/view/SciHubWebPortal/APIHubDescription), for example).\n", - "Files hosted on static URLs can be downloaded with the following method, as illustrated in the code chunk below which accesses the [Natural Earth Data](https://www.naturalearthdata.com/) website to download the world airports layer zip file and to extract the contained ESRI Shapefile.\n", + "Downloads can be initiated from the command line using a variety of techniques, primarily via URLs and APIs (see the Sentinel API[^sentinel_api], for example).\n", + "\n", + "[^earthexplorer]: <https://earthexplorer.usgs.gov/>\n", + "[^earthdata_search]: <https://search.earthdata.nasa.gov/>\n", + "[^sentinel_api]: <https://scihub.copernicus.eu/twiki/do/view/SciHubWebPortal/APIHubDescription>\n", + "\n", + "Files hosted on static URLs can be downloaded with the following method, as illustrated in the code chunk below which accesses the Natural Earth Data[^natural_earth_data] website to download the world airports layer zip file and to extract the contained ESRI Shapefile.\n", "Note that the download code is complicated by the fact that the server checks the `User-agent` header of the request, basically to make sure that the download takes place through a browser.\n", "To overcome this, we add a header corresponding to a request coming from a browser (such as Firefox) in our code.\n", - "\n", - "" + "\n", + "[^natural_earth_data]: <https://www.naturalearthdata.com/>" ] }, { "cell_type": "code", "metadata": {}, "source": [ "#| eval: false\n", "# Set URL+filename\n", - "url = 'https://www.naturalearthdata.com/http//www.naturalearthdata.com/download/10m/cultural/ne_10m_airports.zip'\n", + "url = 'https://www.naturalearthdata.com/http//www.naturalearthdata.com/'\n", + "url += 'download/10m/cultural/ne_10m_airports.zip'\n", "filename = 'output/ne_10m_airports.zip'\n", "# Download\n", "opener = urllib.request.build_opener()\n", - "opener.addheaders = [('User-agent', 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/116.0')]\n", + "opener.addheaders = [(\n", + "    'User-agent', \n", + "    'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) ' +\n", + "    'Gecko/20100101 Firefox/116.0'\n", + ")]\n", "urllib.request.install_opener(opener)\n", "urllib.request.urlretrieve(url, filename)\n", "# Extract\n", @@ -167,23 +203,11 @@ "source": [ "## Geographic data packages {#sec-geographic-data-packages}\n", "\n", - "Many Python packages have been developed for accessing geographic data, two of which are presented in @tbl-data-packages and demonstrated below.\n", + "Several Python packages have been developed for accessing geographic data, two of which are demonstrated below.\n", "These provide interfaces to one or more spatial libraries or geoportals and aim to make data access even quicker from the command line.\n", - "\n", - "\n", - "| Package | Description |\n", - "|-------------|---------------------------------------------------------------------------------------------------|\n", - "| **cartopy** | Download layers from [Natural Earth Data](https://www.naturalearthdata.com/downloads/) |\n", - "| **osmnx** | Access to [OpenStreetMap](https://www.openstreetmap.org/) data and conversion to spatial networks |\n", - "\n", - ": Selected Python packages for geographic data retrieval 
{#tbl-data-packages}\n", - "\n", - "Each data package has its own syntax for accessing data.\n", - "This diversity is demonstrated in the subsequent code chunks, which show how to get data using the packages from @tbl-data-packages.\n", "\n", "Administrative borders are often useful in spatial analysis.\n", - "These can be accessed with the [`cartopy.io.shapereader.natural_earth`](https://scitools.org.uk/cartopy/docs/latest/reference/generated/cartopy.io.shapereader.natural_earth.html) function from the **cartopy** package [@cartopy].\n", + "These can be accessed with the `cartopy.io.shapereader.natural_earth` function from the **cartopy** package [@cartopy].\n", "For example, the following code loads the `'admin_2_counties'` dataset of US counties into a `GeoDataFrame`." ] }, @@ -224,27 +248,25 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Note that @fig-ne-counties x-axis spans the entire range of longitues, between `-180` and `180`, since the Aleutian Islands county (which is small and difficult to see on the map) crosses the [International Date Line](https://en.wikipedia.org/wiki/International_Date_Line).\n", - "\n", - "\n", + "Note that the x-axis in @fig-ne-counties spans the entire range of longitudes, between `-180` and `180`, since the Aleutian Islands county (which is small and difficult to see on the map) crosses the International Date Line.\n", "\n", - "Other layers can be accessed the same way.\n", - "You need to specify the `resolution`, `category`, and `name` of the requested dataset in [Natural Earth Data](https://www.naturalearthdata.com/downloads/), then run the `cartopy.io.shapereader.natural_earth`, which downloads the file(s) and returns the path, and read the file into the Python environment, e.g., using `gpd.read_file`.\n", - "This is an alternative approach to \"directly\" downloading files as shown earlier (@sec-retrieving-open-data).\n", + "Other layers from Natural Earth Data can be accessed in the same way.\n", + "You need to specify the `resolution`, `category`, and `name` of the requested dataset in Natural Earth Data, then run the `cartopy.io.shapereader.natural_earth` function, which downloads the file(s) and returns the path, and read the file into the Python environment, e.g., using `gpd.read_file`.\n", + "This is an alternative approach to 'directly' downloading files as shown earlier (@sec-retrieving-open-data).\n", "\n", "The second example uses the **osmnx** package [@osmnx] to find parks from the OpenStreetMap (OSM) database.\n", - "As illustrated in the code-chunk below, OpenStreetMap data can be obtained using the `ox.features.features_from_place` function.\n", + "As illustrated in the code chunk below, OpenStreetMap data can be obtained using the `ox.features.features_from_place` function.\n", "The first argument is a string which is geocoded to a polygon (the `ox.features.features_from_bbox` and `ox.features.features_from_polygon` can also be used to query a custom area of interest).\n", - "The second argument specifies the OSM [tag(s)](https://wiki.openstreetmap.org/wiki/Map_features), selecting which OSM elements we're interested in (parks, in this case), represented by key-value pairs.\n", - "\n", - "" + "The second argument specifies the OSM tag(s)[^osm_tags], selecting which OSM elements we're interested in (parks, in this case), represented by key-value pairs.\n", + "\n", + "[^osm_tags]: <https://wiki.openstreetmap.org/wiki/Map_features>" ] }, { "cell_type": "code", "metadata": {}, "source": [ + "#| warning: false\n", "parks = ox.features.features_from_place(\n", "    query='leeds uk', \n", "    
tags={'leisure': 'park'}\n", ")\n", "parks" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The result is a `GeoDataFrame` with the parks in Leeds.\n", - "Now, we can plots the geometries with the `name` property in the tooltips using `explore` (@fig-ox-features)." + "Now, we can plot the geometries with the `name` property in the tooltips using `explore` (@fig-ox-features).\n", + "\n", + "::: {.content-visible when-format=\"html\"}" ] }, { "cell_type": "code", "metadata": {}, "source": [ "#| label: fig-ox-features\n", "#| fig-cap: Parks in Leeds, based on OpenStreetMap data, downloaded using package **osmnx**\n", "parks[['name', 'geometry']].explore()" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "It should be noted that the **osmnx** package downloads OSM data from the [Overpass API](https://wiki.openstreetmap.org/wiki/Overpass_API), which is rate limited and therefore unsuitable for queries covering very large areas.\n", - "To overcome this limitation, you can download OSM data extracts, such as in Shapefile format from [Geofabrik](https://download.geofabrik.de/), and then load them from the file into the Python environment.\n", - "\n", - "\n", + ":::\n", "::: {.content-visible when-format=\"pdf\"}" ] }, { "cell_type": "code", "metadata": {}, "source": [ "#| eval: false\n", "parks[['name', 'geometry']].explore()" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": {}, "source": [ "#| echo: false\n", "#| output: false\n", "#| error: true\n", "map_to_png.map_to_png(parks[['name', 'geometry']].explore(), 'fig-ox-features')" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "![Parks in Leeds, based on OpenStreetMap data, downloaded using package **osmnx**](images/fig-ox-features.png){#fig-ox-features}\n", ":::\n", "\n", "It should be noted that the **osmnx** package downloads OSM data from the Overpass API[^overpass_api], which is rate limited and therefore unsuitable for queries covering very large areas.\n", "To overcome this limitation, you can download OSM data extracts, such as in Shapefile format from Geofabrik[^geofabrik], and then load them from the file into the Python environment.\n", "\n", "[^overpass_api]: <https://wiki.openstreetmap.org/wiki/Overpass_API>\n", "[^geofabrik]: <https://download.geofabrik.de/>\n", "\n", - "OpenStreetMap is a vast global database of crowd-sourced data, is growing daily, and has a wider ecosystem of tools enabling easy access to the data, from the [Overpass turbo](https://overpass-turbo.eu/) web service for rapid development and testing of OSM queries to [osm2pgsql](https://osm2pgsql.org/) for importing the data into a PostGIS database.\n", + "OpenStreetMap is a vast global database of crowd-sourced data that is growing daily and has a wider ecosystem of tools enabling easy access to the data, from the Overpass turbo[^overpass_turbo] web service for rapid development and testing of OSM queries to `osm2pgsql` for importing the data into a PostGIS database.\n", "Although the quality of datasets derived from OSM varies, the data source and wider OSM ecosystems have many advantages: they provide datasets that are available globally, free of charge, and constantly improving thanks to an army of volunteers.\n", - "Using OSM encourages 'citizen science' and contributions back to the digital commons (you can start editing data representing a part of the world you know well at [www.openstreetmap.org](https://www.openstreetmap.org/)).\n", + "Using OSM encourages 'citizen science' and contributions back to the digital commons (you can start editing data representing a part of the world you know 
well at <https://www.openstreetmap.org/>).\n", "\n", "[^overpass_turbo]: <https://overpass-turbo.eu/>\n", "\n", "One way to obtain spatial information is to perform geocoding---transform a description of a location, usually an address, into a set of coordinates.\n", "This is typically done by sending a query to an online service and getting the location as a result.\n", "Many such services exist that differ in the used method of geocoding, usage limitations, costs, or API key requirements.\n", - "[Nominatim](https://nominatim.openstreetmap.org/ui/about.html) is a well-known free service, based on OpenStreetMap data, and there are many other free and commercial geocoding services.\n", + "Nominatim[^nominatim] is a well-known free service, based on OpenStreetMap data, and there are many other free and commercial geocoding services.\n", "\n", - "**geopandas** provides the [`gpd.tools.geocode`](https://geopandas.org/en/stable/docs/reference/api/geopandas.tools.geocode.html), which can geocode addresses to a `GeoDataFrame`.\n", + "[^nominatim]: <https://nominatim.openstreetmap.org/ui/about.html>\n", + "\n", + "**geopandas** provides the `gpd.tools.geocode` function, which can geocode addresses to a `GeoDataFrame`.\n", "Internally it uses the **geopy** package, supporting several providers through the `provider` parameter (use `geopy.geocoders.SERVICE_TO_GEOCODER` to see possible options).\n", - "\n", - "\n", - "\n", - "The example below searches for [John Snow blue plaque](https://en.m.wikipedia.org/wiki/John_Snow_(public_house)) coordinates located on a building in the Soho district of London.\n", + "The example below searches for John Snow blue plaque[^john_snow_blue_plaque] coordinates located on a building in the Soho district of London.\n", "The result is a `GeoDataFrame` with the address we passed to `gpd.tools.geocode`, and the detected point location.\n", - "\n", - "" + "\n", + "[^john_snow_blue_plaque]: <https://en.m.wikipedia.org/wiki/John_Snow_(public_house)>" ] }, { "cell_type": "code", "metadata": {}, "source": [ - "result = gpd.tools.geocode('54 Frith St, London W1D 4SJ, UK')\n", + "result = gpd.tools.geocode('54 Frith St, London W1D 4SJ, UK', timeout=10)\n", "result" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Importantly, (1) we can pass a `list` of multiple addresses instead of just one, resulting in a `GeoDataFrame` with corresponding multiple rows, and (2) \"No Results\" responses are represented by `POINT EMPTY` geometries, as shown in the following example." + "Importantly, (1) we can pass a `list` of multiple addresses instead of just one, resulting in a `GeoDataFrame` with corresponding multiple rows, and (2) 'No Results' responses are represented by `POINT EMPTY` geometries, as shown in the following example." ] }, { "cell_type": "code", "metadata": {}, "source": [ - "result = gpd.tools.geocode([\n", - "    '54 Frith St, London W1D 4SJ, UK', \n", - "    'abcdefghijklmnopqrstuvwxyz'\n", - "])\n", + "result = gpd.tools.geocode(\n", + "    ['54 Frith St, London W1D 4SJ, UK', 'abcdefghijklmnopqrstuvwxyz'], \n", + "    timeout=10\n", + ")\n", "result" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", "The result is visualized in @fig-ox-geocode using the `.explore` function. 
+ "We are using the `marker_kwds` parameter of `.explore` to make the marker larger (see @sec-interactive-styling).\n", + "\n", + "::: {.content-visible when-format=\"html\"}" ] }, { @@ -379,28 +415,56 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "\n", - "\n", - "\n", - "\n", + ":::\n", + "::: {.content-visible when-format=\"pdf\"}" + ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "#| eval: false\n", + "result.iloc[[0]].explore(color='red', marker_kwds={'radius':20})" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "#| echo: false\n", + "#| output: false\n", + "#| error: true\n", + "map_to_png.map_to_png(result.iloc[[0]].explore(color='red', marker_kwds={'radius':20}), 'fig-ox-geocode')" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![Specific address in London, geocoded into a `GeoDataFrame`](images/fig-ox-geocode.png){#fig-ox-geocode}\n", + ":::\n", "\n", "## File formats {#sec-file-formats}\n", "\n", "Geographic datasets are usually stored as files or in spatial databases.\n", - "File formats usually can either store vector or raster data, while spatial databases such as [PostGIS](https://postgis.net/) can store both.\n", - "The large variety of file formats may seem bewildering, but there has been much consolidation and standardization since the beginnings of GIS software in the 1960s when the first widely distributed program ([SYMAP](https://news.harvard.edu/gazette/story/2011/10/the-invention-of-gis/)) for spatial analysis was created at Harvard University [@coppock_history_1991].\n", + "File formats usually can either store vector or raster data, while spatial databases such as PostGIS can store both.\n", + "The large variety of file formats may seem bewildering, but there has been much consolidation and standardization since the beginnings of GIS software in the 1960s when the first widely distributed program SYMAP for spatial analysis was created at Harvard University [@coppock_history_1991].\n", "\n", - "GDAL (which originally was pronounced as \"goo-dal\", with the double \"o\" making a reference to object-orientation), the Geospatial Data Abstraction Library, has resolved many issues associated with incompatibility between geographic file formats since its release in 2000.\n", + "GDAL (which originally was pronounced as 'goo-dal', with the double 'o' making a reference to object-orientation), the Geospatial Data Abstraction Library, has resolved many issues associated with incompatibility between geographic file formats since its release in 2000.\n", "GDAL provides a unified and high-performance interface for reading and writing of many raster and vector data formats.\n", "Many open and proprietary GIS programs, including GRASS, ArcGIS and QGIS, use GDAL behind their GUIs for doing the legwork of ingesting and spitting out geographic data in appropriate formats.\n", - "Most Pyhton packages for working with spatial data, including **geopandas** and **rasterio** used in this book, also rely on GDAL for importing and exporting spatial data files.\n", + "Most Python packages for working with spatial data, including **geopandas** and **rasterio** used in this book, also rely on GDAL for importing and exporting spatial data files.\n", "\n", "GDAL provides access to more than 200 vector and raster data formats.\n", - "@tbl-file-formats presents some basic information about selected and often used spatial file 
formats.\n", + "@tbl-file-formats presents some basic information about selected and often-used spatial file formats.\n", "\n", "| Name | Extension | Info | Type | Model |\n", "|-------------------|------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------------------------------|----------------|\n", - "| ESRI Shapefile | `.shp` (the main file) | Popular format consisting of at least three files. No support for: files \\> 2GB;mixed types; names \\> 10 chars; cols \\> 255. | Vector | Partially open |\n", + "| ESRI Shapefile | `.shp` (the main file) | Popular format consisting of at least three files. No support for: files \\> 2GB; mixed types; names \\> 10 chars; cols \\> 255. | Vector | Partially open |\n", "| GeoJSON | `.geojson` | Extends the JSON exchange format by including a subset of the simple feature representation; mostly used for storing coordinates in longitude and latitude; it is extended by the TopoJSON format. | Vector | Open |\n", "| KML | `.kml` | XML-based format for spatial visualization, developed for use with Google Earth. Zipped KML file forms the KMZ format. | Vector | Open |\n", "| GPX | `.gpx` | XML schema created for exchange of GPS data. | Vector | Open |\n", @@ -409,23 +473,21 @@ "| Arc ASCII | `.asc` | Text format where the first six lines represent the raster header, followed by the raster cell values arranged in rows and columns. | Raster | Open |\n", "| SQLite/SpatiaLite | `.sqlite` | Standalone relational database, SpatiaLite is the spatial extension of SQLite. | Vector and raster | Open |\n", "| ESRI FileGDB | `.gdb` | Spatial and nonspatial objects created by ArcGIS. Allows: multiple feature classes; topology. Limited support from GDAL. | Vector and raster | Proprietary |\n", - "| GeoPackage | `.gpkg` | Lightweight database container based on SQLite allowing an easy and platform-independent exchange of geodata | Vector and (very limited) raster | Open |\n", + "| GeoPackage | `.gpkg` | Lightweight database container based on SQLite allowing an easy and platform-independent exchange of geodata. 
| Vector and (very limited) raster | Open |\n", "\n", - ": Commonly used spatial data file formats {#tbl-file-formats}\n", + ": Commonly used spatial data file formats {#tbl-file-formats tbl-colwidths=\"[23, 13, 54, 15, 15]\"}\n", "\n", - "An important development ensuring the standardization and open-sourcing of file formats was the founding of the Open Geospatial Consortium ([OGC](http://www.opengeospatial.org/)) in 1994.\n", + "An important development ensuring the standardization and open-sourcing of file formats was the founding of the Open Geospatial Consortium (OGC) in 1994.\n", "Beyond defining the Simple Features data model (see @sec-simple-features), the OGC also coordinates the development of open standards, for example as used in file formats such as KML and GeoPackage.\n", - "\n", - "\n", "Open file formats of the kind endorsed by the OGC have several advantages over proprietary formats: the standards are published, ensure transparency and open up the possibility for users to further develop and adjust the file formats to their specific needs.\n", "\n", - "ESRI Shapefile is the most popular vector data exchange format; however, it is not an fully open format (though its specification is open).\n", + "ESRI Shapefile is the most popular vector data exchange format; however, it is not a fully open format (though its specification is open).\n", "It was developed in the early 1990s and, from a modern standpoint, has a number of limitations.\n", "First of all, it is a multi-file format, which consists of at least three files.\n", "It also only supports 255 columns, its column names are restricted to ten characters and the file size limit is 2 GB.\n", "Furthermore, ESRI Shapefile does not support all possible geometry types, for example, it is unable to distinguish between a polygon and a multipolygon.\n", "Despite these limitations, a viable alternative had been missing for a long time.\n", - "In 2014, [GeoPackage](https://www.geopackage.org/) emerged, and seems to be a more than suitable replacement candidate for ESRI Shapefile.\n", + "In 2014, GeoPackage emerged, and seems to be a more than suitable replacement candidate for ESRI Shapefile.\n", "GeoPackage is a format for exchanging geospatial information and an OGC standard.\n", "This standard describes the rules on how to store geospatial information in a tiny SQLite container.\n", "Hence, GeoPackage is a lightweight spatial database container, which allows the storage of vector and raster data but also of non-spatial data and extensions.\n", @@ -435,42 +497,36 @@ "It allows spatial information, such as the CRS definition and the transformation matrix (see @sec-using-rasterio), to be embedded within a TIFF file.\n", "Similar to ESRI Shapefile, this format was firstly developed in the 1990s, but as an open format.\n", "Additionally, GeoTIFF is still being expanded and improved.\n", - "One of the most significant recent addition to the GeoTIFF format is its variant called COG (Cloud Optimized GeoTIFF).\n", + "One of the most significant recent additions to the GeoTIFF format is its variant called COG (Cloud Optimized GeoTIFF).\n", "Raster objects saved as COGs can be hosted on HTTP servers, so other people can read only parts of the file without downloading the whole file (@sec-input-raster).\n", "\n", "There is also a plethora of other spatial data formats that we do not explain in detail or mention in @tbl-file-formats due to the book limits.\n", - "If you need to use other formats, we encourage you to read the GDAL 
documentation about [vector](https://gdal.org/drivers/vector/index.html) and [raster](https://gdal.org/drivers/raster/index.html) drivers.\n", + "If you need to use other formats, we encourage you to read the GDAL documentation about vector and raster drivers.\n", "Additionally, some spatial data formats can store other data models (types) than vector or raster.\n", - "It includes LAS and LAZ formats for storing lidar point clouds, and NetCDF and HDF for storing multidimensional arrays.\n", + "Two examples are LAS and LAZ formats for storing lidar point clouds, and NetCDF and HDF for storing multidimensional arrays.\n", "\n", - "Finally, spatial data is also often stored using tabular (non-spatial) text formats, including CSV files or Excel spreadsheets.\n", + "Finally, spatial data are also often stored using tabular (non-spatial) text formats, including CSV files or Excel spreadsheets.\n", "This can be convenient to share spatial (point) datasets with people who, or software that, struggle with spatial data formats.\n", "If necessary, the table can be converted to a point layer (see examples in @sec-vector-layer-from-scratch and @sec-spatial-joining).\n", "\n", "## Data input (I) {#sec-data-input}\n", "\n", - "Executing commands such as `geopandas.read_file` (the main function we use for loading vector data) or `rasterio.open`+`.read` (the main group of functions used for loading raster data) silently sets off a chain of events that reads data from files.\n", + "Executing commands such as `gpd.read_file` (the main function we use for loading vector data) or `rasterio.open`+`.read` (the main group of functions used for loading raster data) silently sets off a chain of events that reads data from files.\n", "Moreover, there are many Python packages containing a wide range of geographic data or providing simple access to different data sources.\n", "All of them load the data into the Python environment or, more precisely, assign objects to your workspace, stored in RAM and accessible within the Python session.\n", "The latter is the most straightforward approach, suitable when RAM is not a limiting factor. \n", "For large vector layers and rasters, partial reading may be required. \n", "For vector layers, we will demonstrate how to read subsets of vector layers, filtered by attributes or by location (@sec-input-vector). 
\n", "For rasters, we already showed earlier in the book how the user can choose which specific bands to read (@sec-using-rasterio), or read resampled data to a lower resolution (@sec-raster-agg-disagg).\n", - "In this section, we also show how to read specific rectangular extents (\"windows\") from a raster file (@sec-input-raster).\n", - "\n", - "\n", + "In this section, we also show how to read specific rectangular extents ('windows') from a raster file (@sec-input-raster).\n", "\n", "### Vector data {#sec-input-vector}\n", "\n", "Spatial vector data comes in a wide variety of file formats.\n", "Most popular representations such as `.shp`, `.geojson`, and `.gpkg` files can be imported and exported with **geopandas** functions `read_file` and `to_file` (covered in @sec-data-output), respectively.\n", "\n", - "**geopandas** uses GDAL to read and write data, via **fiona** (the [default](https://github.com/geopandas/geopandas/issues/2217)) or **pyogrio** packages (a recently developed alternative to **fiona**, which will become the default in the future, see [note](https://geopandas.org/en/stable/docs/user_guide/io.html) in \"Reading and writing files\" tutorial).\n", - "\n", - "\n", - "\n", - "\n", - "After **fiona** is imported, the command `fiona.supported_drivers` can be used to list drivers available to GDAL, including whether they can (`'r'`), append (`'a'`), or write (`'w'`) data, or all three." + "**geopandas** uses GDAL to read and write data, via **pyogrio** since `geopandas` version `1.0.0` (previously via **fiona**).\n", + "After **pyogrio** is imported, `pyogrio.list_drivers` can be used to list drivers available to GDAL, including whether they can read (`'r'`), append (`'a'`), or write (`'w'`) data, or all three." ] }, { @@ -478,7 +534,7 @@ "metadata": {}, "source": [ "#| eval: false\n", - "fiona.supported_drivers" + "pyogrio.list_drivers()" ], "execution_count": null, "outputs": [] @@ -488,15 +544,13 @@ "metadata": {}, "source": [ "```\n", - "{'DXF': 'rw',\n", - " 'CSV': 'raw',\n", - " ... 
\n", - " 'TopoJSON': 'r',\n", - " 'LIBKML': 'r'}\n", + "{'PCIDSK': 'rw',\n", + " 'PDS4': 'rw',\n", + " ...\n", + " 'AVCE00': 'r',\n", + " 'HTTP': 'r'}\n", "```\n", "\n", - "Other, less common, drivers can be [\"activated\"](https://geopandas.org/en/stable/docs/user_guide/io.html) by manually supplementing `fiona.supported_drivers`.\n", - "\n", "The first argument of the **geopandas** versatile data import function `gpd.read_file` is `filename`, which is typically a string, but can also be a file connection.\n", "The content of a string could vary between different drivers.\n", "In most cases, as with the ESRI Shapefile (`.shp`) or the GeoPackage format (`.gpkg`), the `filename` argument would be a path or a URL to an actual file, such as `geodata.gpkg`.\n", @@ -536,7 +590,7 @@ "Some vector formats, such as GeoPackage, can store multiple data layers.\n", "By default, `gpd.read_file` reads the first layer of the file specified in `filename`.\n", "However, using the `layer` argument you can specify any other layer.\n", - "To list the available layers, we can use function `fiona.listlayers` or `pyogrio.list_layers`.\n", + "To list the available layers, we can use function `gpd.list_layers` (or `pyogrio.list_layers`).\n", "\n", "The `gpd.read_file` function also allows for reading just parts of the file into RAM with two possible mechanisms.\n", "The first one is related to the `where` argument, which allows specifying what part of the data to read using an SQL `WHERE` expression.\n", @@ -558,14 +612,24 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "If you do not know the names of the available columns, a good approach is to just read one row of the data using the `rows` argument, which can be used to read the first N rows, then use the `.columns` property to examine the column names:" + "If you do not know the names of the available columns, a good approach is to read the layer metadata using `pyogrio.read_info`. The resulting object contains, among other properties, the column names (`fields`) and data types (`dtypes`): " ] }, { "cell_type": "code", "metadata": {}, "source": [ - "gpd.read_file('data/world.gpkg', rows=1).columns" + "info = pyogrio.read_info('data/world.gpkg')\n", + "info['fields']" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "info['dtypes']" ], "execution_count": null, "outputs": [] @@ -577,7 +641,7 @@ "The second mechanism uses the `mask` argument to filter data based on intersection with an existing geometry.\n", "This argument expects a geometry (`GeoDataFrame`, `GeoSeries`, or `shapely` geometry) representing the area where we want to extract the data.\n", "Let's try it using a small example---we want to read polygons from our file that intersect with the buffer of 50,000 $m$ of Tanzania's borders.\n", - "To do it, we need to transform the geometry to a projected CRS (such as `EPSG:32736`), prepare our \"filter\" by creating the buffer (@sec-buffers), and transform back to the original CRS to be used as a mask (@fig-read-shp-query (a))." + "To do it, we need to transform the geometry to a projected CRS (such as `EPSG:32736`), prepare our 'filter' by creating the buffer (@sec-buffers), and transform back to the original CRS to be used as a mask (@fig-read-shp-query (a))." ] }, { @@ -593,7 +657,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Now, we can pass the \"filter\" geometry `tanzania_buf` to the `mask` argument of `gpd.read_file`." 
+ "Now, we can pass the 'filter' geometry `tanzania_buf` to the `mask` argument of `gpd.read_file`." ] }, { @@ -626,14 +690,18 @@ "# Using 'where'\n", "fig, ax = plt.subplots()\n", "tanzania.plot(ax=ax, color='lightgrey', edgecolor='grey')\n", - "tanzania.apply(lambda x: ax.annotate(text=x['name_long'], \n", - " xy=x.geometry.centroid.coords[0], ha='center'), axis=1);\n", + "tanzania.apply(\n", + " lambda x: ax.annotate(text=x['name_long'], \n", + " xy=x.geometry.centroid.coords[0], ha='center'), axis=1\n", + ");\n", "# Using 'mask'\n", "fig, ax = plt.subplots()\n", "tanzania_neigh.plot(ax=ax, color='lightgrey', edgecolor='grey')\n", "tanzania_buf.plot(ax=ax, color='none', edgecolor='red')\n", - "tanzania_neigh.apply(lambda x: ax.annotate(text=x['name_long'],\n", - " xy=x.geometry.centroid.coords[0], ha='center'), axis=1);" + "tanzania_neigh.apply(\n", + " lambda x: ax.annotate(text=x['name_long'],\n", + " xy=x.geometry.centroid.coords[0], ha='center'), axis=1\n", + ");" ], "execution_count": null, "outputs": [] @@ -655,6 +723,7 @@ "source": [ "#| label: fig-cycle_hire_xy-layer\n", "#| fig-cap: The `cycle_hire_xy.csv` table transformed to a point layer\n", + "#| warning: false\n", "cycle_hire = pd.read_csv('data/cycle_hire_xy.csv')\n", "geom = gpd.points_from_xy(cycle_hire['X'], cycle_hire['Y'], crs=4326)\n", "geom = gpd.GeoSeries(geom)\n", @@ -668,7 +737,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Instead of columns describing 'XY' coordinates, a single column can also contain the geometry information, not necessarily points but possible any other geometry type.\n", + "Instead of columns describing 'XY' coordinates, a single column can also contain the geometry information, not necessarily points but possibly any other geometry type.\n", "Well-known text (WKT), well-known binary (WKB), and GeoJSON are examples of formats used to encode geometry in such a column.\n", "For instance, the `world_wkt.csv` file has a column named `'WKT'`, representing polygons of the world's countries (in WKT format).\n", "When importing the CSV file into a `DataFrame`, the `'WKT'` column is interpreted just like any other string column." @@ -688,14 +757,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "To convert it to a `GeoDataFrame`, we can apply the `shapely.from_wkt` function (@sec-geometries) on the WKT strings, to convert them into `shapely` geometries (also see note about the `.apply` method in @sec-topological-relations)." + "To convert it to a `GeoDataFrame`, we can apply the `gpd.GeoSeries.from_wkt` function (which is analogous to `shapely`'s `shapely.from_wkt`, see @sec-geometries) on the WKT strings, to convert the series of WKT strings into a `GeoSeries` with the geometries. " ] }, { "cell_type": "code", "metadata": {}, "source": [ - "world_wkt['geometry'] = world_wkt['WKT'].apply(shapely.from_wkt)\n", + "world_wkt['geometry'] = gpd.GeoSeries.from_wkt(world_wkt['WKT'])\n", "world_wkt = gpd.GeoDataFrame(world_wkt)\n", "world_wkt" ], @@ -706,9 +775,6 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "\n", - "\n", - "\n", "The resulting layer is shown in @fig-world_wkt-layer." 
] }, @@ -718,6 +784,7 @@ "source": [ "#| label: fig-world_wkt-layer\n", "#| fig-cap: The `world_wkt.csv` table transformed to a polygon layer\n", + "#| warning: false\n", "world_wkt.plot();" ], "execution_count": null, @@ -727,39 +794,11 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", "As a final example, we will show how **geopandas** also reads KML files.\n", "A KML file stores geographic information in XML format---a data format for the creation of web pages and the transfer of data in an application-independent way [@nolan_xml_2014].\n", "Here, we access a KML file from the web.\n", - "First, if necessary, we may need to \"activate\" the `KML` driver, which is not always available by default (just one of these expressions should be sufficient, depending on your system)." - ] - }, - { - "cell_type": "code", - "metadata": {}, - "source": [ - "fiona.supported_drivers['KML'] = 'r'\n", - "fiona.supported_drivers['LIBKML'] = 'r'" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The sample KML file `KML_Samples.kml` contains more than one layer.\n", - "\n", - "" + "\n", + "The sample KML file `KML_Samples.kml` contains more than one layer." ] }, { @@ -767,7 +806,7 @@ "metadata": {}, "source": [ "u = 'https://developers.google.com/kml/documentation/KML_Samples.kml'\n", - "fiona.listlayers(u)" + "gpd.list_layers(u)" ], "execution_count": null, "outputs": [] @@ -823,7 +862,9 @@ "cell_type": "code", "metadata": {}, "source": [ - "src = rasterio.open('https://zenodo.org/record/5774954/files/clm_snow.prob_esacci.dec_p.90_500m_s0..0cm_2000..2012_v2.0.tif')\n", + "url = 'https://zenodo.org/record/5774954/files/'\n", + "url += 'clm_snow.prob_esacci.dec_p.90_500m_s0..0cm_2000..2012_v2.0.tif'\n", + "src = rasterio.open(url)\n", "src" ], "execution_count": null, @@ -839,7 +880,7 @@ "This is very useful when working with large datasets hosted online from resource-constrained computing environments such as laptops.\n", "\n", "For example, we can read a specified rectangular extent of the raster.\n", - "With **rasterio**, this is done using the so-called [windowed reading](https://rasterio.readthedocs.io/en/latest/topics/windowed-rw.html) capabilities.\n", + "With **rasterio**, this is done using the so-called *windowed reading* capabilities.\n", "Note that, with windowed reading, we import just a subset of the raster extent into an `ndarray` covering any partial extent.\n", "Windowed reading is therefore memory- (and, in this case, bandwidth-) efficient, since it avoids reading the entire raster into memory.\n", "It can also be considered an alternative pathway to *cropping* (@sec-raster-cropping).\n", @@ -864,8 +905,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Using the extent coordinates along with the raster transformation matrix, we create a window object, using the [`rasterio.windows.from_bounds`](https://rasterio.readthedocs.io/en/stable/api/rasterio.windows.html#rasterio.windows.from_bounds) function.\n", - "This function basically \"translates\" the extent from coordinates, to row/column ranges." + "Using the extent coordinates along with the raster transformation matrix, we create a window object, using the `rasterio.windows.from_bounds` function.\n", + "This function basically 'translates' the extent from coordinates, to row/column ranges." 
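, "\n", "Schematically, with a placeholder file name and made-up bounds, windowed reading looks as follows:\n", "\n", "```python\n", "import rasterio\n", "from rasterio.windows import from_bounds\n", "src = rasterio.open('input.tif')  # illustrative path\n", "w = from_bounds(left=0, bottom=45, right=2, top=47, transform=src.transform)\n", "r = src.read(1, window=w)  # array covering just the requested extent\n", "```"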
] }, { @@ -888,9 +929,6 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "\n", - "\n", - "\n", "Now we can read the partial array, according to the specified window `w`, by passing it to the `.read` method." ] }, @@ -1034,9 +1072,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Instead of overwriting the file, we could add new rows to the file with `mode='a'` (\"append\" mode, as opposed to the default `mode='w'` for the \"write\" mode).\n", - "\n", - "\n", + "Instead of overwriting the file, we could add new rows to the file with `mode='a'` ('append' mode, as opposed to the default `mode='w'` for the 'write' mode).\n", "Appending is supported by several spatial formats, including GeoPackage." ] }, { @@ -1054,13 +1090,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Now, `w_many_features.gpkg` contains a polygonal layer named `world` with two \"copies\" of each country (that is 177×2=354 features, whereas the `world` layer has 177 features)." + "Now, `w_many_features.gpkg` contains a polygonal layer named `world` with two 'copies' of each country (that is 177×2=354 features, whereas the `world` layer has 177 features)." ] }, { "cell_type": "code", "metadata": {}, "source": [ + "#| warning: false\n", "gpd.read_file('output/w_many_features.gpkg').shape" ], "execution_count": null, "outputs": [] }, @@ -1087,8 +1124,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "In this case, `w_many_layers.gpkg` has two \"layers\": `w_many_layers` (same as the file name, when `layer` is unspecified) and `world2`.\n", - "Incidentally, the contents of the two layers is identical, but this does not have to be.\n", + "In this case, `w_many_layers.gpkg` has two 'layers': `w_many_layers` (same as the file name, when `layer` is unspecified) and `world2`.\n", + "Incidentally, the contents of the two layers are identical, but this does not have to be the case.\n", "Each layer from such a file can be imported separately using the `layer` argument of `gpd.read_file`." ] }, @@ -1119,15 +1156,15 @@ "- `height`---Number of rows\n", "- `width`---Number of columns\n", "- `count`---Number of bands\n", - "- `nodata`---The value which represents \"No Data\", if any\n", + "- `nodata`---The value which represents 'No Data', if any\n", "- `dtype`---The raster data type, one of **numpy** types supported by the `driver` (e.g., `np.int64`) (see @tbl-numpy-data-types)\n", "- `crs`---The CRS, e.g., using an EPSG code (such as `4326`)\n", "- `transform`---The transform matrix\n", "- `compress`---A compression method to apply, such as `'lzw'`. This is optional and most useful for large rasters. Note that, at the time of writing, this [does not work well](https://gis.stackexchange.com/questions/404738/why-does-rasterio-compression-reduces-image-size-with-single-band-but-not-with-m) for writing multiband rasters\n", "\n", - "```{note}\n", - "Note that `'GTiff` (GeoTIFF, `.tif`), which is the recommended driver, [supports](https://gdal.org/drivers/raster/gtiff.html) just some of the possible **numpy** data types (see @tbl-numpy-data-types). Importantly, it does not support `np.int64`, the default `int` type. The recommendation in such case it to use `np.int32` (if the range is sufficient), or `np.float64`. \n", - "```\n", + "::: callout-note\n", + "Note that `'GTiff'` (GeoTIFF, `.tif`), which is the recommended driver, supports just some of the possible **numpy** data types (see @tbl-numpy-data-types). Importantly, it does not support `np.int64`, the default `int` type. 
The recommendation in such cases is to use `np.int32` (if the range is sufficient), or `np.float64`. \n", + ":::\n", "\n", "Once the file connection with the right metadata is ready, we do the actual writing using the `.write` method of the file connection.\n", "If there are several bands we may execute the `.write` method several times, as in `.write(a,n)`, where `a` is a two-dimensional array representing a single band, and `n` is the band index (starting from `1`, see below).\n", "When done, we close the file connection.\n", "\n", "Most of the properties are either straightforward to choose, based on our aims (e.g., `driver`, `crs`, `compress`, `nodata`), or directly derived from the array with the raster values itself (e.g., `height`, `width`, `count`, `dtype`).\n", "The most complicated property is the `transform`, which specifies the raster origin and resolution.\n", - "The `transform` is typically either obtained from an existing raster (serving as a \"template\"), created from scratch based on manually specified origin and resolution values (e.g., using `rasterio.transform.from_origin`), or calculated automatically (e.g., using `rasterio.warp.calculate_default_transform`), as shown in previous chapters.\n", + "The `transform` is typically either obtained from an existing raster (serving as a 'template'), created from scratch based on manually specified origin and resolution values (e.g., using `rasterio.transform.from_origin`), or calculated automatically (e.g., using `rasterio.warp.calculate_default_transform`), as shown in previous chapters.\n", "\n", "Earlier in the book, we have already demonstrated five common scenarios of writing rasters, covering the above-mentioned considerations:\n", - "\n", - "\n", "\n", "- Creating from scratch (@sec-raster-from-scratch)---we created and wrote two rasters from scratch by associating the `elev` and `grain` arrays with an arbitrary spatial extent. The custom arbitrary transformation matrix was created using `rasterio.transform.from_origin`\n", "- Aggregating (@sec-raster-agg-disagg)---we wrote an aggregated raster, by resampling from an existing raster file, then updating the transformation matrix using `.transform.scale`\n", @@ -1199,7 +1234,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Then, we establish the writing-mode file connection to `r.tif`, which will be eithe created or overwritten." + "Then, we establish the writing-mode file connection to `r.tif`, which will be either created or overwritten."
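, "\n", "Schematically, establishing such a connection might look as follows (a self-contained sketch; the array values and the London-area origin coordinates are illustrative):\n", "\n", "```python\n", "import numpy as np\n", "import rasterio\n", "from rasterio.transform import from_origin\n", "r = np.array([1, 2, 3, 4]).reshape(2, 2).astype(np.uint8)\n", "dst = rasterio.open(\n", "    'output/r.tif', 'w', driver='GTiff',\n", "    height=r.shape[0], width=r.shape[1], count=1,\n", "    dtype=r.dtype, crs=4326,\n", "    transform=from_origin(-0.5, 51.5, 2, 2)\n", ")\n", "```"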
] }, { @@ -1260,18 +1295,18 @@ "source": [ "These expressions, taken together, create a new file `output/r.tif`, which is a $2 \\times 2$ raster, having a 2 decimal degree resolution, with the top-left corner placed over London.\n", "\n", - "To make the picture of raster export complete, there are three important concepts we have not covered yet: array and raster data types, writing multiband rasters, and handling \"No Data\" values.\n", + "To make the picture of raster export complete, there are three important concepts we have not covered yet: array and raster data types, writing multiband rasters, and handling 'No Data' values.\n", "\n", "Arrays (i.e., `ndarray` objects defined in package **numpy**) are used to store raster values when reading them from file, using `.read` (@sec-using-rasterio).\n", "All values in an array are of the same type, whereas the **numpy** package supports numerous numeric data types of various precision (and, accordingly, memory footprint).\n", "Raster formats, such as GeoTIFF, support (a subset of) exactly the same data types as **numpy**, which means that reading a raster file uses as little RAM as possible.\n", - "The most useful types for raster data, and thir support in GeoTIFF are summarized in @tbl-numpy-data-types.\n", + "The most useful types for raster data, and their support in GeoTIFF are summarized in @tbl-numpy-data-types.\n", "\n", "| Data type | Description | GeoTIFF |\n", "|-----------|----------------------------------------------------------------------|:--------:|\n", - "| `int8` | Integer in a single byte (`-128` to `127`) | + |\n", + "| `int8` | Integer in a single byte (`-128` to `127`) | |\n", "| `int16` | Integer in 16 bits (`-32768` to `32767`) | + |\n", - "| `int32` | Integer in 32 bits (`-2147483648` to `2147483647`) | |\n", + "| `int32` | Integer in 32 bits (`-2147483648` to `2147483647`) | + |\n", "| `int64` | Integer in 64 bits (`-9223372036854775808` to `9223372036854775807`) | |\n", "| `uint8` | Unsigned integer in 8 bits (`0` to `255`) | + |\n", "| `uint16` | Unsigned integer in 16 bits (`0` to `65535`) | + |\n", @@ -1334,9 +1369,6 @@ "source": [ "These code sections demonstrate the agreement between GeoTIFF (and other file formats) data types, which are universal and understood by many programs and programming languages, and the corresponding `ndarray` data types which are defined by **numpy** (@tbl-numpy-data-types).\n", "\n", - "\n", - "\n", - "\n", "Writing multiband rasters is similar to writing single-band rasters, only that we need to:\n", "\n", "- Define a number of bands other than `count=1`, according to the number of bands we are going to write\n", @@ -1378,7 +1410,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Finally, we can create a file connection using the updated metadata, write the values of the three bands, and close the connection (note that we are switching to the \"keyword argument\" syntax of Python function calls here; see note in @sec-raster-agg-disagg)." + "Finally, we can create a file connection using the updated metadata, write the values of the three bands, and close the connection (note that we are switching to the 'keyword argument' syntax of Python function calls here; see note in @sec-raster-agg-disagg)." 
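, "\n", "For instance, a sketch of this pattern, with three made-up $2 \\times 2$ bands and illustrative metadata:\n", "\n", "```python\n", "import numpy as np\n", "import rasterio\n", "from rasterio.transform import from_origin\n", "b = np.ones((2, 2), dtype=np.uint8)\n", "dst = rasterio.open(\n", "    fp='output/r3.tif', mode='w', driver='GTiff',\n", "    height=2, width=2, count=3, dtype='uint8',\n", "    crs=4326, transform=from_origin(-0.5, 51.5, 2, 2)\n", ")\n", "dst.write(b, 1)      # band 1\n", "dst.write(b * 2, 2)  # band 2\n", "dst.write(b * 3, 3)  # band 3\n", "dst.close()\n", "```"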
] }, { @@ -1400,16 +1432,16 @@ "source": [ "As a result, a three-band raster named `r3.tif` is created.\n", "\n", - "Rasters often contain \"No Data\" values, representing missing data, for example, unreliable measurements due to clouds or pixels outside of the photographed extent.\n", - "In a **numpy** `ndarray` object, \"No Data\" values may be represented by the special `np.nan` value.\n", + "Rasters often contain 'No Data' values, representing missing data, for example, unreliable measurements due to clouds or pixels outside of the photographed extent.\n", + "In a **numpy** `ndarray` object, 'No Data' values may be represented by the special `np.nan` value.\n", "However, due to computer memory limitations, only arrays of type `float` can contain `np.nan`, while arrays of type `int` cannot.\n", - "For `int` rasters containing \"No Data\", we typically mark missing data with a specific value beyond the valid range (e.g., `-9999`).\n", - "The missing data \"flag\" definition is stored in the file (set through the `nodata` property of the file connection, see above).\n", - "When reading an `int` raster with \"No Data\" back into Python, we need to be aware of the flag, if any.\n", + "For `int` rasters containing 'No Data', we typically mark missing data with a specific value beyond the valid range (e.g., `-9999`).\n", + "The missing data 'flag' definition is stored in the file (set through the `nodata` property of the file connection, see above).\n", + "When reading an `int` raster with 'No Data' back into Python, we need to be aware of the flag, if any.\n", "Let's demonstrate it through examples.\n", "\n", "We will start with the simpler case, rasters of type `float`.\n", - "Since `float` arrays may contain the \"native\" value `np.nan`, representing \"No Data\" is straightforward.\n", + "Since `float` arrays may contain the 'native' value `np.nan`, representing 'No Data' is straightforward.\n", "For example, suppose that we have a `float` array of size $2 \\times 2$ containing one `np.nan` value." ] }, @@ -1436,7 +1468,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "When writing this type of array to a raster file, we do not need to specify any particular `nodata` \"flag\" value." + "When writing this type of array to a raster file, we do not need to specify any particular `nodata` 'flag' value." ] }, { @@ -1495,7 +1527,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Now, conversely, suppose that we have an `int` array with missing data, where the \"missing\" value must inevitably be marked using a specific `int` \"flag\" value, such as `-9999` (remember that we can't store `np.nan` in an `int` array!)." + "Now, conversely, suppose that we have an `int` array with missing data, where the 'missing' value must inevitably be marked using a specific `int` 'flag' value, such as `-9999` (remember that we can't store `np.nan` in an `int` array!)." ] }, { @@ -1521,7 +1553,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "When writing the array to file, we must specify `nodata=-9999` to keep track of our \"No Data\" flag." + "When writing the array to file, we must specify `nodata=-9999` to keep track of our 'No Data' flag." 
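, "\n", "A minimal sketch of such an export (the transform is illustrative; the file name and path are assumed to match the file read back below):\n", "\n", "```python\n", "import numpy as np\n", "import rasterio\n", "from rasterio.transform import from_origin\n", "r = np.array([[-9999, 1], [2, 4]], dtype=np.int32)\n", "dst = rasterio.open(\n", "    'output/r_nodata_int.tif', 'w', driver='GTiff',\n", "    height=2, width=2, count=1, dtype='int32', crs=4326,\n", "    transform=from_origin(-0.5, 51.5, 2, 2), nodata=-9999\n", ")\n", "dst.write(r, 1)\n", "dst.close()\n", "```"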
] }, { @@ -1566,7 +1598,7 @@ "metadata": {}, "source": [ "If you try to open the file in GIS software, such as QGIS, you will see the missing data interpreted (e.g., the pixel shown as blank), meaning that the software is aware of the flag.\n", - "However, reading the data back into Python reproduces an `int` array with `-9999`, due to the limitation of `int` arrays stated before/" + "However, reading the data back into Python reproduces an `int` array with `-9999`, due to the limitation of `int` arrays stated before." ] }, { @@ -1584,8 +1616,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The Python user must therefore be mindful of \"No Data\" `int` rasters, for example to avoid interpreting the value `-9999` literally.\n", - "For instance, if we \"forget\" about the `nodata` flag, the literal calculation of the `.mean` would incorrectly include the value `-9999`." + "The Python user must therefore be mindful of 'No Data' `int` rasters, for example to avoid interpreting the value `-9999` literally.\n", + "For instance, if we 'forget' about the `nodata` flag, the literal calculation of the `.mean` would incorrectly include the value `-9999`." ] }, { @@ -1601,7 +1633,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "There are two basic ways to deal with the situation: either converting the raster to `float`, or using a \"No Data\" mask.\n", + "There are two basic ways to deal with the situation: either converting the raster to `float`, or using a 'No Data' mask.\n", "The first approach, simple and particularly relevant for small rasters where memory constraints are irrelevant, is to go from `int` to `float`, to gain the ability of the natural `np.nan` representation.\n", "Here is how we can do this with `r_nodata_int.tif`.\n", "We detect the missing data flag, convert the raster to `float`, then assign `np.nan` into the cells that are supposed to be missing." @@ -1623,7 +1655,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "From there on, we deal with `np.nan` the usual way, such as using `np.nanmean` to calculate the mean excluding \"No Data\"." + "From there on, we deal with `np.nan` the usual way, such as using `np.nanmean` to calculate the mean excluding 'No Data'." ] }, { @@ -1639,8 +1671,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The second approach is to read the values into a so-called [\"masked\" array](https://numpy.org/doc/stable/reference/maskedarray.generic.html#what-is-a-masked-array), using the argument `masked=True` of the `.read` method.\n", - "A masked array can be thought of as an extended `ndarray`, with two components: `.data` (the values) and `.mask` (a corresponding boolean array marking \"No Data\" values)." + "The second approach is to read the values into a so-called *'masked' array*, using the argument `masked=True` of the `.read` method.\n", + "A masked array can be thought of as an extended `ndarray`, with two components: `.data` (the values) and `.mask` (a corresponding boolean array marking 'No Data' values)." 
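, "\n", "For example, assuming the `r_nodata_int.tif` file written above:\n", "\n", "```python\n", "import rasterio\n", "src = rasterio.open('output/r_nodata_int.tif')\n", "r = src.read(masked=True)\n", "r.mask    # True where the value equals the 'No Data' flag\n", "r.data    # the raw values, still including -9999\n", "r.mean()  # ignores masked cells, unlike a plain ndarray mean\n", "```"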
] }, { @@ -1658,7 +1690,7 @@ "metadata": {}, "source": [ "Complete treatment of masked arrays is beyond the scope of this book.\n", - "However, the basic idea is that many **numpy** operations \"honor\" the mask, so that the user does not have to keep track of the way that \"No Data\" values are marked, similarly to the natural `np.nan` representation and regardless of the data type.\n", + "However, the basic idea is that many **numpy** operations 'honor' the mask, so that the user does not have to keep track of the way that 'No Data' values are marked, similarly to the natural `np.nan` representation and regardless of the data type.\n", "For example, the `.mean` of a masked array ignores the value `-9999`, because it is masked, taking into account just the valid values `1`, `2`, and `4`." ] }, @@ -1678,23 +1710,18 @@ "Switching to `float` and assigning `np.nan` is the simpler approach, since that way we can keep working with the familiar `ndarray` data structure for all raster types, whether `int` or `float`.\n", "Nevertheless, learning how to work with masked arrays can be beneficial when we have good reasons to keep our raster data in `int` arrays (for example, due to RAM limits) and still perform operations that take missing values into account.\n", "\n", - "\n", - "\n", - "\n", - "Finally, keep in mind that, confusingly, `float` rasters may represent \"No Data\" using a specific \"flag\" (such as `-9999.0`), instead, or in addition to (!), the native `np.nan` representation.\n", + "Finally, keep in mind that, confusingly, `float` rasters may represent 'No Data' using a specific 'flag' (such as `-9999.0`), instead of, or in addition to (!), the native `np.nan` representation.\n", "In such cases, the same considerations shown for `int` apply to `float` rasters as well.\n", "\n", - "## Exercises\n", - "\n", - "## References" + "\n" ] } ], "metadata": { "kernelspec": { - "name": "venv", + "display_name": "Python 3", "language": "python", - "display_name": "test" + "name": "python3" } }, "nbformat": 4, diff --git a/ipynb/08-mapping.ipynb b/ipynb/08-mapping.ipynb index 2a99f686..bd7666ef 100644 --- a/ipynb/08-mapping.ipynb +++ b/ipynb/08-mapping.ipynb @@ -14,12 +14,29 @@ "metadata": {}, "source": [ "#| echo: false\n", - "import matplotlib.pyplot as plt\n", - "import pandas as pd\n", - "pd.options.display.max_rows = 6\n", - "pd.options.display.max_columns = 6\n", - "pd.options.display.max_colwidth = 35\n", - "plt.rcParams['figure.figsize'] = (5, 5)" + "#| include: false\n", + "#| error: true\n", + "import map_to_png" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": {}, "source": [ "#| echo: false\n", + "import book_options" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": {}, "source": [ "#| echo: false\n", + "import book_options_pdf" ], "execution_count": null, "outputs": [] }, @@ -94,9 +111,7 @@ "## Introduction\n", "\n", "\n", - "\n", "\n", - "\n", "\n", "\n", "A satisfying and important aspect of geographic research is communicating the results.\n", "Map making---the art of cartography---is an ancient skill that involves communication, intuition, and an element of creativity.\n", "Historic examples include maps of buildings and land ownership in the Old Babylonian dynasty more than 3000 years ago and Ptolemy's world map in his masterpiece Geography nearly 2000 years ago [@talbert_ancient_2014].\n", "\n", "Map making has historically been an activity undertaken only by, or on behalf of, the elite.\n", - "This has changed with the emergence of open source mapping software such as mapping packages in Python, R, 
and other languages, and the \"print composer\" in QGIS, which enable anyone to make high-quality maps, enabling \"citizen science\".\n", + "This has changed with the emergence of open-source mapping software such as mapping packages in Python, R, and other languages, and the 'print composer' in QGIS, which enable anyone to make high-quality maps, enabling 'citizen science'.\n", "Maps are also often the best way to present the findings of geocomputational research in a way that is accessible.\n", "Map making is therefore a critical part of geocomputation and its emphasis not only on describing, but also changing the world.\n", "\n", @@ -120,27 +135,21 @@ "Other, more advanced uses of these methods, were also encountered in subsequent chapters, when demonstrating the various outputs we got.\n", "In this chapter, we provide a comprehensive summary of the most useful workflows of these two methods for creating static maps (@sec-static-maps).\n", "Static maps can be easily shared and viewed (whether digitally or in print), however they can only convey as much information as a static image can.\n", - "Interactive maps provide much more flexibilty in terms of user experience and amount of information, however they often require more work to design and effectively share.\n", + "Interactive maps provide much more flexibility in terms of user experience and amount of information, however they often require more work to design and effectively share.\n", "Thus, in @sec-interactive-maps, we move on to elaborate on the `.explore` method for creating interactive maps, which was also briefly introduced earlier in @sec-vector-layers.\n", "\n", "## Static maps {#sec-static-maps}\n", "\n", - "\n", - "\n", - "\n", "Static maps are the most common type of visual output from geocomputation. \n", "For example, we have been using `.plot` and `rasterio.plot.show` throughout the book, to display **geopandas** and **rasterio** geocomputation results, respectively. \n", - "In this section we systematically review and elaborate on the various properties that can be customized when using those functions.\n", + "In this section, we systematically review and elaborate on the various properties that can be customized when using those functions.\n", "\n", "A static map is basically a digital image. \n", "When stored in a file, standard formats include `.png` and `.pdf` for graphical raster and vector outputs, respectively. \n", "Thanks to their simplicity, static maps can be shared in a wide variety of ways: in print, through files sent by e-mail, embedded in documents and web pages, etc.\n", "\n", "Nevertheless, there are many aesthetic considerations when making a static map, and there is also a wide variety of ways to create static maps using novel presentation methods. 
\n", - "This is the focus of the field of [cartography](https://en.wikipedia.org/wiki/Cartography), and beyond the scope of this book.\n", - "\n", - "\n", - "\n", + "This is the focus of the field of cartography, and beyond the scope of this book.\n", "\n", "\n", "\n", @@ -215,9 +224,7 @@ "metadata": {}, "source": [ "The next example uses `markersize` to get larger points (@fig-basic-plot-markersize).\n", - "It also demonstrates how to control the overall [figure size](https://matplotlib.org/stable/gallery/subplots_axes_and_figures/figure_size_units.html), such as $4 \\times 4$ $in$ in this case, using [`plt.subplots`](https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.subplots.html) to initialize the plot and its `figsize` parameter to specify dimensions.\n", - "\n", - "" + "It also demonstrates how to control the overall figure size, such as $4 \\times 4$ $in$ in this case, using `plt.subplots` to initialize the plot and its `figsize` parameter to specify dimensions." ] }, { @@ -238,7 +245,7 @@ "source": [ "::: callout-note\n", "As you have probably noticed throughout the book, the `plt.subplots` function is used to initialize a **maptplotlib** plot layout, possibly also specifying image size (e.g., @fig-basic-plot-markersize) and multi-panel layout (e.g., @fig-faceted-map).\n", - "The returned value is a `tuple` of [`Figure`](https://matplotlib.org/stable/api/figure_api.html#matplotlib.figure.Figure) and [`Axes`](https://matplotlib.org/stable/api/_as_gen/matplotlib.axes.Axes.html#matplotlib.axes.Axes) objects, which we conventionally unpack to variables named `fig` and `ax`. \n", + "The returned value is a `tuple` of `Figure` and `Axes` objects, which we conventionally unpack to variables named `fig` and `ax`. \n", "These two variables represent the entire figure, and the elements of individual sub-figures, respectively.\n", "\n", "For our purposes in this book, we have been using just the `ax` object, passing it to the `ax` parameter in further function calls, in order to add subsequent layers (e.g., @fig-plot-raster-and-vector) or other elements (e.g., @fig-plot-symbology-colors-r-scale) into the same panel.\n", @@ -254,8 +261,6 @@ "- `column`---a column name\n", "- `legend`---whether to show a legend\n", "- `cmap`---color map, a.k.a. color scale, a palette from which the colors are sampled\n", - "\n", - "\n", "\n", "For example, @fig-plot-symbology shows the `nz` polygons colored according to the `'Median_income'` attribute (column), with a legend." ] @@ -276,11 +281,14 @@ "metadata": {}, "source": [ "The default color scale which you see in @fig-plot-symbology is `cmap='viridis'`.\n", - "The `cmap` (\"color map\") argument can be used to specify one of countless color scales.\n", - "A first safe choice is often the [ColorBrewer](https://colorbrewer2.org/#type=sequential&scheme=BuGn&n=3) collection of color scales, specifically designed for mapping.\n", + "The `cmap` ('color map') argument can be used to specify one of countless color scales.\n", + "A first safe choice is often the ColorBrewer[^colorbrewer] collection of color scales, specifically designed for mapping.\n", "Any color scale can be reversed, using the `_r` suffix.\n", - "Finally, other color scales are available: see the **matplotlib** [colormaps article](https://matplotlib.org/stable/tutorials/colors/colormaps.html) for details.\n", - "The following code sections demonstrates three color scale specifications other than the default (@fig-plot-symbology-colors)." 
+ "Finally, other color scales are available: see the **matplotlib** colormaps article[^matplotlib_colormaps] for details.\n", + "The following code section demonstrates three-color scale specifications other than the default (@fig-plot-symbology-colors).\n", + "\n", + "[^colorbrewer]: \n", + "[^matplotlib_colormaps]: " ] }, { @@ -305,9 +313,6 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "\n", - "\n", - "\n", "Categorical symbology is also supported, such as when `column` points to an `str` attribute.\n", "For categorical variables, it makes sense to use a qualitative color scale, such as `'Set1'` from ColorBrewer.\n", "For example, the following expression sets symbology according to the `'Island'` column (@fig-plot-symbology-categorical)." @@ -372,11 +377,8 @@ "metadata": {}, "source": [ "Unfortunately, there is no built-in option to display a legend in `rasterio.plot.show`.\n", - "The following [workaround](https://stackoverflow.com/questions/61327088/rio-plot-show-with-colorbar), reverting to **matplotlib** methods, can be used to acheive it instead (@fig-plot-symbology-colors-r-scale).\n", - "Basically, the code reverts to the **matplotlib** [`.colorbar`](https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.colorbar.html) method to add a legend, using the [`plt.imshow`](https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.imshow.html) function that draws an image of a **numpy** array (which `rasterio.plot.show` is a wrapper of).\n", - "\n", - "\n", - "" + "The following workaround, reverting to **matplotlib** methods, can be used to acheive it instead (@fig-plot-symbology-colors-r-scale).\n", + "Basically, the code reverts to the **matplotlib** `.colorbar` method to add a legend, using the `plt.imshow` function that draws an image of a **numpy** array (which `rasterio.plot.show` is a wrapper of)." ] }, { @@ -401,7 +403,7 @@ "\n", "Labels are often useful to annotate maps and identify the location of specific features. \n", "GIS software, as opposed to **matplotlib**, has specialized algorithms for label placement, e.g., to avoid overlaps between adjacent labels.\n", - "Furthermore, editing in graphical editing software is sometimes used for fine tuning of label placement.\n", + "Furthermore, editing in graphical editing software is sometimes used for fine-tuning of label placement.\n", "Nevertheless, simple labels added within the Python environment can be a good starting point, both for interactive exploration and sharing analysis results.\n", "\n", "To demonstrate it, suppose that we have a layer `nz1` of regions comprising the New Zealand southern Island." @@ -420,13 +422,11 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "To add a label in **matplotlib**, we use the [`.annotate`](https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.annotate.html) method where the important arguments are the label string and the placement (a `tuple` of the form `(x,y)`). \n", + "To add a label in **matplotlib**, we use the `.annotate` method where the important arguments are the label string and the placement (a `tuple` of the form `(x,y)`). \n", "When labeling vector layers, we typically want to add numerous labels, based on (one or more) attribute of each feature. 
\n", "To do that, we can run a `for` loop, or use the `.apply` method, to pass the label text and the coordinates of each feature to `.annotate`.\n", "In the following example, we use the `.apply` method the pass the region name (`'Name'` attribute) and the geometry centroid coordinates, for each region, to `.annotate`.\n", - "We are also using `ha`, short for `horizontalalignment`, with `'center'` (other options are `'right'` and `'left'`, see [Text properties and layout](https://matplotlib.org/stable/users/explain/text/text_props.html) reference for **matplotlib**) (@fig-labels-polygon).\n", - "\n", - "" + "We are also using `ha`, short for `horizontalalignment`, with `'center'` (other options are `'right'` and `'left'`) (@fig-labels-polygon)." ] }, { @@ -474,7 +474,7 @@ "source": [ "Then, we again use `.apply`, combined with `.annotate`, to add the text labels. \n", "The main difference compared to the previous example (@fig-labels-polygon) is that we are directly passing the geometry coordinates (`.geometry.coords[0]`), since the geometries are points rather than polygons.\n", - "We are also using the `weight='bold'` argument to use bold font (see [Text properties and layout](https://matplotlib.org/stable/users/explain/text/text_props.html) reference for **matplotlib**) for list of other options) (@fig-labels-points1).\n", + "We are also using the `weight='bold'` argument to use bold font (@fig-labels-points1).\n", "\n", "" ] @@ -504,11 +504,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "It should be noted that sometimes we wish to add text labels \"manually\", one by one, rather than use a loop or `.apply`. \n", + "It should be noted that sometimes we wish to add text labels 'manually', one by one, rather than use a loop or `.apply`. \n", "For example, we may want to add labels of specific locations not stored in a layer, or to have control over the specific properties of each label. \n", - "To add text labels manually, we can run the `.annotate` expressions one at a time, as shown in the code section below recreating the last result with the \"manual\" approach (@fig-labels-points2).\n", - "\n", - "" + "To add text labels manually, we can run the `.annotate` expressions one at a time, as shown in the code section below recreating the last result with the 'manual' approach (@fig-labels-points2)." ] }, { @@ -531,7 +529,7 @@ "source": [ "### Layers {#sec-plot-static-layers}\n", "\n", - "To display more than one layer in the same static map, we need to:\n", + "To display more than one layer in the same static map, we can:\n", "\n", "1. Store the first plot in a variable (e.g., `base`)\n", "2. Pass it as the `ax` argument of any subsequent plot(s) (e.g., `ax=base`)\n", @@ -551,19 +549,40 @@ "execution_count": null, "outputs": [] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Alternatively (see note in @sec-static-styling), we can:\n", + "\n", + "1. Initialize the plot using `fig,ax=plt.subplots()`\n", + "2. 
Pass `ax` to any subsequent plot" + ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "#| label: fig-two-layers2\n", + "#| fig-cap: Plotting two layers, `nz` (polygons) and `nz_height` (points), using `plt.subplots`\n", + "fig, ax = plt.subplots()\n", + "nz.plot(ax=ax, color='none')\n", + "nz_height.plot(ax=ax, color='red');" + ], + "execution_count": null, + "outputs": [] + }, { "cell_type": "markdown", "metadata": {}, "source": [ "We can combine rasters and vector layers in the same plot as well, which we already used earlier in the book, for example when explaining masking and cropping (@fig-raster-crop).\n", "The technique is to initialize a plot with `fig,ax=plt.subplots()`, then pass `ax` to any of the separate plots, making them appear together.\n", - "\n", - "\n", "\n", "For example, @fig-plot-raster-and-vector demonstrated plotting a raster with increasingly complicated additions:\n", "\n", "- Panel (a) shows a raster (New Zealand elevation) and a vector layer (New Zealand administrative division)\n", - "- Panel (b) shows the raster with a buffer of 22.2 $km$ around the dissolved administrative borders, representing New Zealand's [territorial waters](https://en.wikipedia.org/wiki/Territorial_waters) (see @sec-global-operations-and-distances)\n", + "- Panel (b) shows the raster with a buffer of 22.2 $km$ around the dissolved administrative borders, representing New Zealand's territorial waters (see @sec-global-operations-and-distances)\n", "- Panel (c) shows the raster with two vector layers: the territorial waters (in red) and elevation measurement points (in yellow)" ] }, @@ -585,7 +604,7 @@ "# Raster + computed vector layer\n", "fig, ax = plt.subplots(figsize=(5, 5))\n", "rasterio.plot.show(nz_elev, ax=ax)\n", - "gpd.GeoSeries(nz.unary_union, crs=nz.crs) \\\n", + "gpd.GeoSeries(nz.union_all(), crs=nz.crs) \\\n", " .to_crs(nz_elev.crs) \\\n", " .buffer(22200) \\\n", " .exterior \\\n", @@ -593,7 +612,7 @@ "# Raster + two vector layers\n", "fig, ax = plt.subplots(figsize=(5, 5))\n", "rasterio.plot.show(nz_elev, ax=ax)\n", - "gpd.GeoSeries(nz.unary_union, crs=nz.crs) \\\n", + "gpd.GeoSeries(nz.union_all(), crs=nz.crs) \\\n", " .to_crs(nz_elev.crs) \\\n", " .buffer(22200) \\\n", " .exterior \\\n", @@ -607,13 +626,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "\n", - "\n", - "\n", - "\n", - "\n", "::: callout-note\n", - "Note that the drawing order of layers is not necessarily according to the order of expressions, in the code, but according to layer *type*. For example, by [default](https://matplotlib.org/stable/gallery/misc/zorder_demo.html) line layers are drawn on top of point layers. To override the default plotting order, we can use the `zorder` argument of `.plot`. Layers with higher `zorder` values will be drawn on top. For example, the following would draw `layer2` on top of `layer1` (regaredless of their types).\n", + "Note that the drawing order of layers is not necessarily according to the order of expressions, in the code, but according to layer *type*. For example, by default line layers are drawn on top of point layers. To override the default plotting order, we can use the `zorder` argument of `.plot`. Layers with higher `zorder` values will be drawn on top. 
For example, the following would draw `layer2` on top of `layer1` (regardless of their types).\n", "\n", "```python\n", "base = layer1.plot(zorder=1)\n", "layer2.plot(ax=base, zorder=2);\n", "```\n", ":::\n", "\n", "### Basemaps\n", "\n", - "Basemaps, or background layers, are often useful to provide context to the displayed layers (which are in the \"foreground\").\n", + "Basemaps, or background layers, are often useful to provide context to the displayed layers (which are in the 'foreground').\n", "Basemaps are ubiquitous in interactive maps (see @sec-interactive-maps).\n", "However, they are often useful in static maps too.\n", "\n", - "Basemaps can be added to **geopandas** static plots using the [**contextily**](https://contextily.readthedocs.io/en/latest/index.html) package.\n", - "A preliminary step is to convert our layers to `EPSG:3857` ([\"Web Mercator\"](https://en.wikipedia.org/wiki/Web_Mercator_projection)), to be in agreement with the basemaps, which are typically provided in this CRS.\n", - "For example, let's take the small `\"Nelson\"` polygon from `nz`, and reproject it to `3857`." + "Basemaps can be added to **geopandas** static plots using the **contextily** package.\n", + "A preliminary step is to convert our layers to `EPSG:3857` ('Web Mercator'), to be in agreement with the basemaps, which are typically provided in this CRS[^reproject_tiles].\n", + "For example, let's take the small `\"Nelson\"` polygon from `nz`, and reproject it to `3857`.\n", + "\n", + "[^reproject_tiles]: Another option is to reproject the tiles to match the CRS of the foreground layers; this is a less commonly used workflow, as it may lead to a distorted appearance of the background layer. " ] }, { "cell_type": "code", "metadata": {}, "source": [ "nzw = nz[nz['Name'] == 'Nelson'].to_crs(epsg=3857)" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "To add a basemap, we use the `contextily.add_basemap` function, similarly to the way we added multiple layers (@sec-plot-static-layers).\n", - "The default basemap is \"OpenStreetMap\".\n", + "The default basemap is 'OpenStreetMap'.\n", "You can specify a different basemap using the `source` parameter, with one of the values in `cx.providers` (@fig-basemap)."
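, "\n", "Schematically, assuming the reprojected polygon from the previous step is named `nzw`:\n", "\n", "```python\n", "import contextily as cx\n", "# Plot the layer (already in EPSG:3857) and add web tiles behind it\n", "ax = nzw.plot(color='none', edgecolor='blue')\n", "cx.add_basemap(ax, source=cx.providers.CartoDB.Positron)\n", "```"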
] }, { "cell_type": "code", "metadata": {}, "source": [ "#| label: fig-basemap\n", - "#| fig-cap: Adding a basemap to a static map, using `contextily`\n", - "#| layout-ncol: 3\n", + "#| fig-cap: Adding a basemap to a static map, using **contextily**\n", + "#| layout-ncol: 2\n", "#| fig-subcap:\n", "#| - \"'OpenStreetMap' basemap\"\n", "#| - \"'CartoDB Positron' basemap\"\n", ... ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Check out the [gallery](https://xyzservices.readthedocs.io/en/stable/gallery.html) for more possible basemaps.\n", - "Custom basemaps (such as from your own raster tile server) can be also specified using a [URL](https://contextily.readthedocs.io/en/latest/providers_deepdive.html#Manually-specifying-a-provider).\n", - "Finally, you may read the [Adding a background map to plots](https://geopandas.org/en/stable/gallery/plotting_basemap_background.html) tutorial for more examples.\n", + "Check out the gallery[^xyzservices_gallery] for more possible basemaps.\n", + "Custom basemaps (such as from your own raster tile server) can be also specified using a URL.\n", + "Finally, you may read the *Adding a background map to plots*[^basemaps_tutorial] tutorial for more examples.\n", + "\n", + "[^xyzservices_gallery]: <https://xyzservices.readthedocs.io/en/stable/gallery.html>\n", + "[^basemaps_tutorial]: <https://geopandas.org/en/stable/gallery/plotting_basemap_background.html>\n", "\n", "### Faceted maps {#sec-faceted-maps}\n", ... "We may want to plot them all in a faceted map, that is, four small maps of `nz` with the different variables.\n", - "To do that, we initialize the plot with the expected number of panels, such as `ncols=len(vars)` if we wish to have one row and four columns, and then go over the variables in a `for` loop, each time plotting `vars[i]` into the `ax[i]` panel (@fig-faceted-map).\n", - "\n", - "" + "To do that, we initialize the plot with the expected number of panels, such as `ncols=len(vars)` if we wish to have one row and four columns, and then go over the variables in a `for` loop, each time plotting `vars[i]` into the `ax[i]` panel (@fig-faceted-map)." ] }, { ... }, { "cell_type": "markdown", "metadata": {}, "source": [ - "In case we prefer a specific layout, rather than one row or one column, we can initialize the required number or rows and columns, as in `plt.subplots(nrows,ncols)`, \"flatten\" `ax`, so that the facets are still accessible using a single index `ax[i]` (rather than the default `ax[i][j]`), and plot into `ax[i]`.\n", + "In case we prefer a specific layout, rather than one row or one column, we can initialize the required number of rows and columns, as in `plt.subplots(nrows,ncols)`, 'flatten' `ax`, so that the facets are still accessible using a single index `ax[i]` (rather than the default `ax[i][j]`), and plot into `ax[i]`.\n", "For example, here is how we can reproduce the last plot, this time in a $2 \\times 2$ layout, instead of a $1 \\times 4$ layout (@fig-faceted-map2).\n", - "One more modification we are doing here is hiding the axis ticks and labels, to make the map less \"crowded\", using `ax[i].xaxis.set_visible(False)` (and same for `.yaxis`)." + "One more modification we are doing here is hiding the axis ticks and labels, to make the map less 'crowded', using `ax[i].xaxis.set_visible(False)` (and same for `.yaxis`)."
] }, { @@ -735,8 +752,8 @@ "metadata": {}, "source": [ "#| label: fig-faceted-map2\n", - "#| fig-cap: 2D layout in a faceted map, using a `for` loop\n", - "fig, ax = plt.subplots(ncols=2, nrows=int(len(vars)/2), figsize=(6, 6))\n", + "#| fig-cap: Two-dimensional layout in a faceted map, using a `for` loop\n", + "fig, ax = plt.subplots(nrows=int(len(vars)/2), ncols=2, figsize=(6, 6))\n", "ax = ax.flatten()\n", "for i in range(len(vars)):\n", " nz.plot(ax=ax[i], column=vars[i], legend=True)\n", @@ -751,7 +768,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "It is also possible to \"manually\" specify the properties of each panel, and which row/column it goes in (e.g., @fig-spatial-aggregation-different-functions).\n", + "It is also possible to 'manually' specify the properties of each panel, and which row/column it goes in (e.g., @fig-spatial-aggregation-different-functions).\n", "This can be useful when the various panels have different components, or even completely different types of plots (e.g., @fig-zion-transect), making automation with a `for` loop less applicable.\n", "For example, here is a plot similar to @fig-faceted-map2, but specifying each panel using a separate expression instead of using a `for` loop (@fig-faceted-map3)." ] @@ -761,7 +778,7 @@ "metadata": {}, "source": [ "#| label: fig-faceted-map3\n", - "#| fig-cap: 2D layout in a faceted map, using \"manual\" specification of the panels \n", + "#| fig-cap: Two-dimensional layout in a faceted map, using 'manual' specification of the panels \n", "fig, ax = plt.subplots(ncols=2, nrows=int(len(vars)/2), figsize=(6, 6))\n", "nz.plot(ax=ax[0][0], column=vars[0], legend=True)\n", "ax[0][0].set_title(vars[0])\n", @@ -783,10 +800,8 @@ "\n", "### Exporting {#sec-exporting-static-maps}\n", "\n", - "Static maps can be exported to a file using the [`matplotlib.pyplot.savefig`](https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.savefig.html) function.\n", - "For example, the following code section recreates fig-two-layers, but this time the last expression saves the image to a JPG image named `plot_geopandas.jpg`.\n", - "\n", - "" + "Static maps can be exported to a file using the `matplotlib.pyplot.savefig` function.\n", + "For example, the following code section recreates @fig-two-layers, but this time the last expression saves the image to a JPG image named `plot_geopandas.jpg`." ] }, { @@ -847,28 +862,25 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "\n", - "\n", "## Interactive maps {#sec-interactive-maps}\n", "\n", - "\n", - "\n", - "\n", "While static maps can enliven geographic datasets, interactive maps can take them to a new level. 
\n", "Interactivity can take many forms, the most common and useful of which is the ability to pan around and zoom into any part of a geographic dataset overlaid on a 'web map' to show context.\n", "Less advanced interactivity levels include popups which appear when you click on different features, a kind of interactive label.\n", - "More advanced levels of interactivity include the ability to tilt and rotate maps, and the provision of \"dynamically linked\" sub-plots which automatically update when the user pans and zooms [@pezanowski_senseplace3_2018].\n", + "More advanced levels of interactivity include the ability to tilt and rotate maps, and the provision of 'dynamically linked' sub-plots which automatically update when the user pans and zooms [@pezanowski_senseplace3_2018].\n", "\n", "The most important type of interactivity, however, is the display of geographic data on interactive or 'slippy' web maps.\n", - "Significant features of web maps are that (1) they eventually comprise static HTML files, easily shared and accessed by a wide audience, and (2) they can \"grab\" content (e.g., basemaps) or use services from other locations on the internet, that way providing detailed context without much requiring much effort from the person who created the map.\n", - "The most popular approaches for web mapping, in Python and elsewhere, are based on the [Leaflet](https://leafletjs.com/) JavaScript library [@dorman2020introduction]. \n", - "The [**folium**](https://python-visualization.github.io/folium/latest/) Python package provides an extensive interface to create customized web maps based on Leaflet; it is recommended for highly-custimized maps.\n", + "Significant features of web maps are that (1) they eventually comprise static HTML files, easily shared and accessed by a wide audience, and (2) they can 'grab' content (e.g., basemaps) or use services from other locations on the internet, that way providing detailed context without much requiring much effort from the person who created the map.\n", + "The most popular approaches for web mapping, in Python and elsewhere, are based on the Leaflet JavaScript library [@dorman2020introduction]. \n", + "The **folium** Python package provides an extensive interface to create customized web maps based on Leaflet; it is recommended for highly customized maps.\n", "However, the **geopandas** wrapper `.explore`, introduced in @sec-vector-layers, can be used for a wide range of scenarios which are often sufficient.\n", "This is what we cover in this section.\n", "\n", "### Minimal example\n", "\n", - "An interactive map of a `GeoSeries` or `GeoDataFrame` can be created with `.explore` (@sec-vector-layers)." 
+ "An interactive map of a `GeoSeries` or `GeoDataFrame` can be created with `.explore` (@sec-vector-layers).\n", + "\n", + "::: {.content-visible when-format=\"html\"}" ] }, { @@ -886,6 +898,39 @@ "cell_type": "markdown", "metadata": {}, "source": [ + ":::\n", + "::: {.content-visible when-format=\"pdf\"}" + ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "#| eval: false\n", + "nz.explore()" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "#| echo: false\n", + "#| output: false\n", + "#| error: true\n", + "map_to_png.map_to_png(nz.explore(), 'fig-explore')" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![Minimal example of an interactive vector layer plot with `.explore`](images/fig-explore.png){#fig-explore}\n", + ":::\n", + "\n", "### Styling {#sec-interactive-styling}\n", "\n", "The `.explore` method has a `color` parameter which affects both the fill and outline color.\n", @@ -901,7 +946,9 @@ "- `fillColor`---Fill color\n", "- `fillOpacity`---Fill opacity (from `0` to `1`)\n", "\n", - "For example, here is how we can set green fill color and 30% opaque black outline of `nz` polygons in `.explore` (@fig-explore-styling-polygons):" + "For example, here is how we can set green fill color and 30% opaque black outline of `nz` polygons in `.explore` (@fig-explore-styling-polygons).\n", + "\n", + "::: {.content-visible when-format=\"html\"}" ] }, { @@ -919,18 +966,53 @@ "cell_type": "markdown", "metadata": {}, "source": [ + ":::\n", + "::: {.content-visible when-format=\"pdf\"}" + ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "#| eval: false\n", + "nz.explore(color='green', style_kwds={'color':'black', 'opacity':0.3})" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "#| echo: false\n", + "#| output: false\n", + "#| error: true\n", + "map_to_png.map_to_png(nz.explore(color='green', style_kwds={'color':'black', 'opacity':0.3}), 'fig-explore-styling-polygons')" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![Styling of polygons in `.explore`](images/fig-explore-styling-polygons.png){#fig-explore-styling-polygons}\n", + ":::\n", + "\n", "The `dict` passed to `marker_kwds` controls the way that points are displayed:\n", "\n", - "- `radius`---Curcle radius (in $m$ for `circle`, see below) or in pixels (for `circle_marker`)\n", + "- `radius`---Curcle radius, in $m$ for `circle` (see below), or in pixels for `circle_marker`\n", "- `fill`---Whether to draw fill (for `circle` or `circle_marker`)\n", "\n", - "Additionally, for points, we can set the `marker_type`, to one of:\n", + "Accordingly, for points, we can set the `marker_type`, to one of:\n", "\n", "- `'marker'`---A PNG image of a marker\n", "- `'circle'`---A vector circle with radius specified in $m$\n", "- `'circle_marker'`---A vector circle with radius specified in pixels (the default)\n", "\n", - "For example, the following expression draws `'circe_marker`' points with 20 pixel radius, green fill, and black outline (@fig-explore-styling-points)." 
+ "For example, the following expression draws `'circe_marker`' points with 20-pixel radius, green fill, and black outline (@fig-explore-styling-points).\n", + "\n", + "::: {.content-visible when-format=\"html\"}" ] }, { @@ -952,8 +1034,51 @@ "cell_type": "markdown", "metadata": {}, "source": [ + ":::\n", + "::: {.content-visible when-format=\"pdf\"}" + ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "#| eval: false\n", + "nz_height.explore(\n", + " color='green', \n", + " style_kwds={'color':'black', 'opacity':0.5, 'fillOpacity':0.1}, \n", + " marker_kwds={'radius':20}\n", + ")" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "#| echo: false\n", + "#| output: false\n", + "#| error: true\n", + "map_to_png.map_to_png(nz_height.explore(\n", + " color='green', \n", + " style_kwds={'color':'black', 'opacity':0.5, 'fillOpacity':0.1}, \n", + " marker_kwds={'radius':20}\n", + "), 'fig-explore-styling-points')" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![Styling of points in `.explore` (using `circle_marker`)](images/fig-explore-styling-points.png){#fig-explore-styling-points}\n", + ":::\n", + "\n", "@fig-explore-styling-points2 demonstrates the `'marker_type'` option.\n", - "Note that the above-mentioned styling properties (other then `opacity`) are not applicable when using `marker_type='marker'`, because the markers are fixed PNG images." + "Note that the above-mentioned styling properties (other than `opacity`) are not applicable when using `marker_type='marker'`, because the markers are fixed PNG images.\n", + "\n", + "::: {.content-visible when-format=\"html\"}" ] }, { @@ -971,12 +1096,44 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "\n", - "\n", + ":::\n", + "::: {.content-visible when-format=\"pdf\"}" + ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "#| eval: false\n", + "nz_height.explore(marker_type='marker')" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "#| echo: false\n", + "#| output: false\n", + "#| error: true\n", + "map_to_png.map_to_png(nz_height.explore(marker_type='marker'), 'fig-explore-styling-points2')" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![Styling of points in `.explore` (using `marker`)](images/fig-explore-styling-points2.png){#fig-explore-styling-points2}\n", + ":::\n", "\n", "### Layers\n", "\n", - "To display multiple layers, one on top of another, with `.explore`, we use the `m` argument, which stands for the previous map (@fig-explore-layers)." 
+ "To display multiple layers, one on top of another, with `.explore`, we use the `m` argument, which stands for the previous map (@fig-explore-layers).\n", + "\n", + "::: {.content-visible when-format=\"html\"}" ] }, { @@ -995,10 +1152,47 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "One of the advantages of interactive maps is the ability to turn layers \"on\" and \"off\".\n", - "This capability is implemented in [`folium.LayerControl`](https://python-visualization.github.io/folium/latest/user_guide/ui_elements/layer_control.html#LayerControl) from package **folium**, which the **geopandas** `.explore` method is a wrapper of.\n", + ":::\n", + "::: {.content-visible when-format=\"pdf\"}" + ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "#| eval: false\n", + "m = nz.explore()\n", + "nz_height.explore(m=m, color='red')" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "#| echo: false\n", + "#| output: false\n", + "#| error: true\n", + "m = nz.explore()\n", + "map_to_png.map_to_png(nz_height.explore(m=m, color='red'), 'fig-explore-layers')" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![Displaying multiple layers in an interactive map with `.explore`](images/fig-explore-layers.png){#fig-explore-layers}\n", + ":::\n", + "\n", + "One of the advantages of interactive maps is the ability to turn layers 'on' and 'off'.\n", + "This capability is implemented in `folium.LayerControl` from package **folium**, which the **geopandas** `.explore` method is a wrapper of.\n", "For example, this is how we can add a layer control for the `nz` and `nz_height` layers (@fig-explore-layers-controls).\n", - "Note the `name` properties, used to specify layer names in the control, and the `collapsed` property, used to specify whether the control is fully visible at all times (`False`), or on mouse hover (`True`, the default)." + "Note the `name` properties, used to specify layer names in the control, and the `collapsed` property, used to specify whether the control is fully visible at all times (`False`), or only on mouse hover (`True`, the default).\n", + "\n", + "::: {.content-visible when-format=\"html\"}" ] }, { @@ -1019,10 +1213,51 @@ "cell_type": "markdown", "metadata": {}, "source": [ + ":::\n", + "::: {.content-visible when-format=\"pdf\"}" + ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "#| eval: false\n", + "m = nz.explore(name='Polygons (adm. areas)')\n", + "nz_height.explore(m=m, color='red', name='Points (elevation)')\n", + "folium.LayerControl(collapsed=False).add_to(m)\n", + "m" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "#| echo: false\n", + "#| output: false\n", + "#| error: true\n", + "m = nz.explore(name='Polygons (adm. 
areas)')\n", + "nz_height.explore(m=m, color='red', name='Points (elevation)')\n", + "folium.LayerControl(collapsed=False).add_to(m)\n", + "map_to_png.map_to_png(m, 'fig-explore-layers-controls')" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![Displaying multiple layers in an interactive map with `.explore`](images/fig-explore-layers-controls.png){#fig-explore-layers-controls}\n", + ":::\n", + "\n", "### Symbology {#sec-explore-symbology}\n", "\n", "Symbology can be specified in `.explore` using similar arguments as in `.plot` (@sec-plot-symbology).\n", - "For example, @fig-explore-symbology is an interactive version of @fig-plot-symbology-colors (a)." + "For example, @fig-explore-symbology is an interactive version of @fig-plot-symbology-colors (a).\n", + "\n", + "::: {.content-visible when-format=\"html\"}" ] }, { @@ -1040,9 +1275,44 @@ "cell_type": "markdown", "metadata": {}, "source": [ + ":::\n", + "::: {.content-visible when-format=\"pdf\"}" + ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "#| eval: false\n", + "nz.explore(column='Median_income', legend=True, cmap='Reds')" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "#| echo: false\n", + "#| output: false\n", + "#| error: true\n", + "map_to_png.map_to_png(nz.explore(column='Median_income', legend=True, cmap='Reds'), 'fig-explore-symbology')" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![Symbology in an interactive map of a vector layer, created with `.explore`](images/fig-explore-symbology.png){#fig-explore-symbology}\n", + ":::\n", + "\n", "Fixed styling (@sec-explore-symbology) can be combined with symbology settings.\n", "For example, polygon outline colors in @fig-explore-symbology are styled according to `'Median_income'`, however, this layer has overlapping outlines and their color is arbitrarily set according to the order of features (top-most features), which may be misleading and confusing.\n", - "To specify fixed outline colors (e.g., black), we can use the `color` and `weight` properties of `style_kwds` (@fig-explore-symbology2):" + "To specify fixed outline colors (e.g., black), we can use the `color` and `weight` properties of `style_kwds` (@fig-explore-symbology2):\n", + "\n", + "::: {.content-visible when-format=\"html\"}" ] }, { @@ -1051,7 +1321,12 @@ "source": [ "#| label: fig-explore-symbology2\n", "#| fig-cap: 'Symbology combined with fixed styling in `.explore`'\n", - "nz.explore(column='Median_income', legend=True, cmap='Reds', style_kwds={'color':'black', 'weight': 0.5})" + "nz.explore(\n", + " column='Median_income', \n", + " legend=True, \n", + " cmap='Reds', \n", + " style_kwds={'color':'black', 'weight': 0.5}\n", + ")" ], "execution_count": null, "outputs": [] @@ -1060,6 +1335,44 @@ "cell_type": "markdown", "metadata": {}, "source": [ + ":::\n", + "::: {.content-visible when-format=\"pdf\"}" + ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "#| eval: false\n", + "nz.explore(\n", + " column='Median_income', \n", + " legend=True, \n", + " cmap='Reds',\n", + " style_kwds={'color':'black', 'weight': 0.5}\n", + ")" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "#| echo: false\n", + "#| output: false\n", + "#| error: true\n", + "map_to_png.map_to_png(nz.explore(column='Median_income', 
legend=True, cmap='Reds', style_kwds={'color':'black', 'weight': 0.5}), 'fig-explore-symbology2')"
+ ],
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "![Symbology combined with fixed styling in `.explore`](images/fig-explore-symbology2.png){#fig-explore-symbology2}\n",
+ ":::\n",
+ "\n",
"### Basemaps\n",
"\n",
"The basemap in `.explore` can be specified using the `tiles` argument.\n",
@@ -1069,8 +1382,10 @@
"- `'CartoDB positron'`\n",
"- `'CartoDB dark_matter'`\n",
"\n",
- "Other basemaps are available through the **xyzservices** package, which needs to be installed (see `xyzservices.providers` for a list), or using a custom tile server URL.\n",
- "For example, the following expression displays the `'CartoDB positron'` tiles in an `.explore` map (@fig-explore-basemaps)."
+ "Other basemaps are available through the **xyzservices** package (see `xyzservices.providers` for a list), or using a custom tile server URL.\n",
+ "For example, the following expression displays the `'CartoDB positron'` tiles in an `.explore` map (@fig-explore-basemaps).\n",
+ "\n",
+ "::: {.content-visible when-format=\"html\"}"
]
},
{
@@ -1088,11 +1403,46 @@
"cell_type": "markdown",
"metadata": {},
"source": [
+ ":::\n",
+ "::: {.content-visible when-format=\"pdf\"}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {},
+ "source": [
+ "#| eval: false\n",
+ "nz.explore(tiles='CartoDB positron')"
+ ],
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "metadata": {},
+ "source": [
+ "#| echo: false\n",
+ "#| output: false\n",
+ "#| error: true\n",
+ "map_to_png.map_to_png(nz.explore(tiles='CartoDB positron'), 'fig-explore-basemaps')"
+ ],
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "![Specifying the basemap in `.explore`](images/fig-explore-basemaps.png){#fig-explore-basemaps}\n",
+ ":::\n",
+ "\n",
"### Exporting\n",
"\n",
"An interactive map can be exported to an HTML file using the `.save` method of the `map` object.\n",
- "The HTML file can then be shared with other people, or published on a server and shared through a URL.\n",
- "A good free option for publishing a web map is through [GitHub Pages](https://pages.github.com/).\n",
- "For example, here is how we can export the map shown in @fig-explore-layers-controls, to a file named `map.html`."
+ "The HTML file can then be shared with other people, or published on a server and shared through a URL[^leaflet_size].\n",
+ "A good free option for publishing a web map is GitHub Pages.\n",
+ "\n",
+ "[^leaflet_size]: The GeoJSON representation of the data is embedded in the HTML file, which means that the file size can get large, and the web map may become unusable due to browser performance limitations.\n",
+ "\n",
+ "For example, here is how we can export the map shown in @fig-explore-layers-controls to a file named `map.html`.\n",
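+ "A minimal sketch of that export step, assuming the layered map object from @fig-explore-layers-controls is stored in the variable `m` as in the example above:\n",
+ "\n",
+ "```python\n",
+ "# Save the Leaflet map to a standalone HTML file; the GeoJSON data is\n",
+ "# embedded in the file, so it can get large (see the footnote above)\n",
+ "m.save('map.html')\n",
+ "```"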
] @@ -1114,21 +1464,19 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "\n", + "\n", "\n", "\n", "\n", - "## Exercises\n", - "\n", - "## References" + "" ] } ], "metadata": { "kernelspec": { - "name": "venv", + "display_name": "Python 3", "language": "python", - "display_name": "test" + "name": "python3" } }, "nbformat": 4, diff --git a/ipynb/README.ipynb b/ipynb/README.ipynb deleted file mode 100644 index 1545b030..00000000 --- a/ipynb/README.ipynb +++ /dev/null @@ -1,235 +0,0 @@ -{ - "cells": [ - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "---\n", - "format: gfm\n", - "---" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# geocompy\n", - "\n", - "[![Render](https://github.com/geocompx/geocompy/actions/workflows/main.yaml/badge.svg)](https://github.com/geocompx/geocompy/actions/workflows/main.yaml)\n", - "\n", - "[![Open in GitHub Codespaces](https://github.com/codespaces/badge.svg)](https://github.com/codespaces/new?hide_repo_select=true&ref=main&repo=447558863)\n", - "\n", - "\n", - "\n", - "Running the code in this book requires the following:\n", - "\n", - "1. Python dependencies, which can be installed with [`pip`](https://pypi.org/project/pip/), a package manager or a [Docker](https://docs.docker.com/get-docker/) container (see below)\n", - "2. An integrated development environment (IDE) such as [VS Code](https://code.visualstudio.com/) (running locally or on [Codespaces](https://github.com/codespaces/new?hide_repo_select=true&ref=main&repo=447558863)/other host) or [Jupyter Notebook](https://github.com/geocompx/geocompy/tree/main/ipynb) for running and exploring the Python code interactively\n", - "3. [Quarto](https://quarto.org/docs/get-started/), which is used to generate the book\n", - "\n", - " \n", - "\n", - "## Reproduce the book with GitHub Codespaces\n", - "\n", - "GitHub [Codespaces](https://github.com/features/codespaces) minimise set-up costs by providing access to a modern IDE (VS Code) plus dependencies in your browser.\n", - "This can save time on package installation.\n", - "Codespaces allow you to make and commit changes, providing a way to test changes and contribute fixes in an instant.\n", - "\n", - "To run the book in Codespaces, click on the link below.\n", - "\n", - "[![Open in GitHub Codespaces](https://github.com/codespaces/badge.svg)](https://github.com/codespaces/new?hide_repo_select=true&ref=main&repo=447558863)\n", - "\n", - "You should [see](https://github.com/geocompx/geocompy/issues/114) something like this, the result of running all the code in the book by opening the terminal (e.g. with the command Ctrl+J) and entering the following command:\n", - "\n", - "```\n", - "quarto preview\n", - "```\n", - "\n", - "![](https://user-images.githubusercontent.com/1825120/202933280-e313c076-f188-4efd-9de1-5625eb169045.png)\n", - "\n", - "## Reproduce the book with Docker (devcontainer)\n", - "\n", - "If you can install [Docker](https://docs.docker.com/desktop/install/) this is likely to be the quickest way to reproduce the contents of this book.\n", - "To do this from within VS Code:\n", - "\n", - "1. Install Microsoft's official [Dev Container](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.remote-containers) extension\n", - "2. 
Open the folder containing the repo in VS Code and click on the 'Reopen in container' button that should appear, as shown below (you need to have Docker installed on your computer for this to work)\n", - "\n", - "![](https://user-images.githubusercontent.com/1825120/202933928-eb6de086-f9a5-43cd-9932-e6ec84746d45.png)\n", - "\n", - "Edit the code in the containerised instance of VS Code that will appear 🎉\n", - "\n", - "See details below for other ways to get the dependencies and reproduce the book.\n", - "\n", - "## Install dependencies with pip\n", - "\n", - "
\n", - "\n", - "Use `pip` to install the dependencies as follows, after cloning the repo and opening a terminal in the root folder of the repo.\n", - "\n", - "First we'll set-up a virtual environment to install the dependencies in:\n", - "\n", - "```sh\n", - "# Create a virtual environment called geocompy\n", - "python -m venv geocompy\n", - "# Activate the virtual environment\n", - "source geocompy/bin/activate\n", - "```\n", - "\n", - "Then install the dependencies (with the optional [`python -m`](https://fosstodon.org/deck/@hugovk@mastodon.social/111311327842154267) prefix specifying the Python version):\n", - "\n", - "```sh\n", - "# Install dependencies from the requirements.txt file\n", - "python -m pip install -r requirements.txt\n", - "```\n", - "You can also install packages individually, e.g.:\n", - "\n", - "```sh\n", - "pip install jupyter-book\n", - "```\n", - "\n", - "Deactivate the virtual environment when you're done:\n", - "\n", - "```sh\n", - "deactivate\n", - "```\n", - "\n", - "
\n", - "\n", - "\n", - "## Install dependencies with a package manager\n", - "\n", - "
\n", - "\n", - "The [`environment.yml`](environment.yml) file contains a list of dependencies that can be installed with a package manager such as `conda`, `mamba` or `micromamba`.\n", - "The instructions below are for [micromamba](https://mamba.readthedocs.io/en/latest/installation.html#micromamba) but should work for any package manager.\n", - "\n", - "```bash\n", - "# For Linux, the default shell is bash:\n", - "curl micro.mamba.pm/install.sh | bash\n", - "# For macOS, the default shell is zsh:\n", - "curl micro.mamba.pm/install.sh | zsh\n", - "```\n", - "After answering the questions, install dependencies with the following command:\n", - "\n", - "```bash\n", - "micromamba env create -f environment.yml\n", - "```\n", - "\n", - "Activate the environment as follows:\n", - "\n", - "```bash\n", - "micromamba activate geocompy\n", - "```\n", - "\n", - "Install kernel, this will allow you to select the environment in vscode or IPython as follows:\n", - "\n", - "```bash\n", - "python -m ipykernel install --user\n", - "```\n", - "\n", - "You can now reproduce the book (requires quarto to be installed):\n", - "\n", - "```bash\n", - "micromamba run -n geocompy quarto preview\n", - "```\n", - "\n", - "
\n", - "\n", - "### Reproduce chapters with jupyter\n", - "\n", - "
\n", - "\n", - "VS Code's `quarto.quarto` plugin can Python code in the chunks in the .qmd files in this book interactively.\n", - "\n", - "However, you can also run any of the chapters in a Jupyter Notebook, e.g. as follows:\n", - "\n", - "```sh\n", - "cd ipynb\n", - "# jupyter notebook . # open a notebook showing all chapters\n", - "jupyter notebook 02-spatial-data.ipynb\n", - "```\n", - "\n", - "You should see something like this: \n", - "\n", - "![](https://user-images.githubusercontent.com/1825120/176920562-d2e7f9af-84b4-4352-8a50-9d9946084c66.png)\n", - "\n", - "See documentation on running and developing Python code in a Jupyter notebook at [docs.jupyter.org](https://docs.jupyter.org/en/latest/).\n", - "\n", - "
\n", - "\n", - "# Additional information\n", - "\n", - "If you're interested in how to auto-generate and run the .py and .ipynb files from the .qmd files, see below.\n", - "\n", - "
\n", - "\n", - "## Updating the .py and .ipynb files\n", - "\n", - "The Python scripts and IPython notebook files stored in the [code](code) and [ipynb](ipynb) folders are generated from the .qmd files.\n", - "To regenerate them, you can use the following commands, to generate .ipynb and .py files for local versions of Chapter 2, for example:\n", - "\n", - "```bash\n", - "quarto convert 02-spatial-data.qmd # generate .ipynb file\n", - "jupytext --to py *.ipynb # generate .py files .ipynb files\n", - "```\n", - "\n", - "Do this for all chapters with the following bash script in the repo:\n", - "\n", - "```bash\n", - "./convert.sh\n", - "```\n", - "\n", - "## Updating .py and .ipynb files with GitHub Actions\n", - "\n", - "We have set-up a GitHub Action to do this automatically: every commit message that contains the text string 'convert' will create and push updated .ipynb and .py files.\n", - "\n", - "\n", - "## Executing the .py and .ipynb files\n", - "\n", - "Running the code chunks in the .qmd files in an IDE such as VSCode or directly with quarto is the main way code in this book is designed to be run interactively, but you can also execute the .py and .ipynb files directly.\n", - "To run the code for chapter 2, for example, you can run one of the following commands from your system shell:\n", - "\n", - "```bash\n", - "python code/chapters/02-spatial-data.py # currently requires manual intervention to complete, see #71\n", - "ipython ipynb/02-spatial-data.ipynb # currently requires manual intervention to complete, see #71\n", - "bash ./run-code.sh # run all .python files\n", - "```\n", - "\n", - "## Updating packages\n", - "\n", - "We pin package versions in the [environment.yml](environment.yml) and [requirements.txt](requirements.txt) files to ensure reproducibility.\n", - "\n", - "To update the `requirements.txt` run the following:\n", - "\n", - "```bash\n", - "python -m pip install pur\n", - "pur -r requirements.txt\n", - "python -m pip install -r requirements.txt\n", - "```\n", - "\n", - "To update the `environment.yml` file in the same way based on your newly installed packages, run the following:\n", - "\n", - "```bash\n", - "micromamba list export > environment.yml\n", - "```\n", - "\n", - "
" - ] - } - ], - "metadata": { - "kernelspec": { - "name": "venv", - "language": "python", - "display_name": "test" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} \ No newline at end of file diff --git a/ipynb/index.ipynb b/ipynb/index.ipynb index 143b3df1..75827358 100644 --- a/ipynb/index.ipynb +++ b/ipynb/index.ipynb @@ -1,7 +1,7 @@ { "cells": [ { - "cell_type": "raw", + "cell_type": "markdown", "metadata": {}, "source": [ "---\n", @@ -10,15 +10,11 @@ " It's a short and practical open source book in which you'll\n", " develop and use geocomputation to solve practical\n", " problems and lay the foundations for advanced geospatial applications.\n", - "---" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ + "---\n", + "\n", "::: {.content-visible when-format=\"html\"}\n", - "## Welcome {.unnumbered}\n", + "\n", + "# Welcome {.unnumbered}\n", "\n", "This is the online home of *Geocomputation with Python*, a book on reproducible geographic data analysis with open source software.\n", "\n", @@ -37,58 +33,15 @@ "\n", "For details on reproducing the book, see the README in the project's GitHub repo: .\n", "\n", - "````{=html}\n", - "\n", - "````\n", - ":::" + ":::\n" ] } ], "metadata": { "kernelspec": { - "name": "venv", + "display_name": "Python 3", "language": "python", - "display_name": "test" + "name": "python3" } }, "nbformat": 4, diff --git a/ipynb/preface.ipynb b/ipynb/preface.ipynb index ae52c155..66f7bf2a 100644 --- a/ipynb/preface.ipynb +++ b/ipynb/preface.ipynb @@ -4,49 +4,220 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Preface {.unnumbered}\n", + "# Preface {.unnumbered}\n", "\n", - "**Geocomputation with Python** (*geocompy*) is motivated by the need for an introductory, yet rigorous and up-to-date, resource for working with geographic data with the most popular programming language in the world.\n", + "**Geocomputation with Python** (*geocompy*) is motivated by the need for an introductory resource for working with geographic data with the most popular programming language in the world.\n", "A unique selling point of the book is its cohesive and joined-up coverage of *both vector and raster* geographic data models and consistent learning curve.\n", "We aim to *minimize surprises*, with each section and chapter building on the previous.\n", "If you're just starting out with Python for working with geographic data, this book is an excellent place to start.\n", "\n", "There are many resources on Python on 'GeoPython' but none that fill this need for an introductory resource that provides strong foundations for future work.\n", - "We want to avoid reinventing the wheel and provide something that fills an 'ecological niche' in the wider free and open source software for geospatial (FOSS4G) ecosystem.\n", + "We want to avoid reinventing the wheel and provide something that fills an 'ecological niche' in the wider free and open-source software for geospatial (FOSS4G) ecosystem.\n", "Key features include:\n", "\n", "1. Doing basic operations well\n", - "2. Integration of vector and raster datasets operations\n", - "3. Clear explanation of each line of code in the book to minimize surprises\n", - "4. Exercises at the end of each chapter with reproducible and open solutions\n", - "5. Provision of lucid example datasets and meaningful operations to illustrate the applied nature of geographic research\n", + "1. Integration of vector and raster datasets operations\n", + "1. 
Clear explanation of each line of code in the book to minimize surprises\n",
+ "1. Provision of lucid example datasets and meaningful operations to illustrate the applied nature of geographic research\n",
"\n",
- "This book is complementary with, and adds value to, other projects in the ecosystem, as highlighted in the following comparison between *Geocomputation with Python* and related GeoPython books:\n",
+ "This book complements and adds value to other projects in the ecosystem, as highlighted in the following comparison between *Geocomputation with Python* and related GeoPython books:\n",
"\n",
- "- [Learning Geospatial Analysis with Python](https://www.packtpub.com/product/learning-geospatial-analysis-with-python/9781783281138) and [Geoprocessing with Python](https://www.manning.com/books/geoprocessing-with-python) are books in this space that focus on processing spatial data using low-level Python interfaces for GDAL, such as the **gdal**, **gdalnumeric**, and **ogr** [packages](https://gdal.org/api/python_bindings.html) from **osgeo**. This approach requires writing more lines of code. We believe our approach is more [\"Pythonic\"](https://rasterio.readthedocs.io/en/latest/intro.html#philosophy) and future-proof, in light of development of packages such as **geopandas** and **rasterio**.\n",
- "- [Introduction to Python for Geographic Data Analysis](https://pythongis.org/) (in progress) seeks to provide a general introduction to 'GIS in Python', with parts focusing on Python essentials, using Python with GIS, and case studies. Compared with this book, which is also open source, and is hosted at pythongis.org, *Geocomputation with Python* has a narrower scope (not covering [spatial network analysis](https://pythongis.org/part3/chapter-11/index.html), for example) and more coverage of raster data processing and raster-vector interoperability.\n",
- "- [Geographic Data Science with Python](https://geographicdata.science/book/intro.html) is an ambitious project with chapters dedicated to advanced topics, with Chapter 4 on [Spatial Weights](https://geographicdata.science/book/notebooks/04_spatial_weights.html) getting into complex topics relatively early, for example.\n",
- "- [Python for Geospatial Data Analysis](https://www.oreilly.com/library/view/python-for-geospatial/9781098104788/) introduces a wide range of approaches to working with geospatial data using Python, including automation of proprietary and open-source GIS software, as well as standalone open source Python packages (which is what we focus on and explain comprehensively in our book). Geocompy is shorter, simpler and more introductory, and cover raster and vector data with equal importance.\n",
+ "- *Learning Geospatial Analysis with Python*[^book_1] and *Geoprocessing with Python*[^book_2] are books in this space that focus on processing spatial data using low-level Python interfaces for GDAL, such as the **gdal**, **gdalnumeric**, and **ogr** packages from **osgeo**. This approach requires writing more lines of code. We believe our approach is more 'Pythonic' and future-proof, in light of development of packages such as **geopandas** and **rasterio**.\n",
+ "- *Introduction to Python for Geographic Data Analysis*[^book_3] (in progress) seeks to provide a general introduction to 'GIS in Python', with parts focusing on Python essentials, using Python with GIS, and case studies. 
Compared with that book, which is also open source and is hosted at pythongis.org, *Geocomputation with Python* has a narrower scope (not covering spatial network analysis, for example) and more coverage of raster data processing and raster-vector interoperability.\n",
+ "- *Geographic Data Science with Python*[^book_4] is an ambitious project with chapters dedicated to advanced topics, with Chapter 4 on Spatial Weights getting into complex topics relatively early, for example.\n",
+ "- *Python for Geospatial Data Analysis*[^book_5] introduces a wide range of approaches to working with geospatial data using Python, including automation of proprietary and open-source GIS software, as well as standalone open-source Python packages (which is what we focus on and explain comprehensively in our book). Geocompy is shorter, simpler and more introductory, and covers raster and vector data with equal importance.\n",
+ "\n",
+ "[^book_1]: [https://www.packtpub.com/product/learning-geospatial-analysis-with-python/9781783281138](https://www.packtpub.com/product/learning-geospatial-analysis-with-python/9781783281138)\n",
+ "\n",
+ "[^book_2]: [https://www.manning.com/books/geoprocessing-with-python](https://www.manning.com/books/geoprocessing-with-python)\n",
+ "\n",
+ "[^book_3]: [https://pythongis.org](https://pythongis.org)\n",
+ "\n",
+ "[^book_4]: [https://geographicdata.science/book/intro.html](https://geographicdata.science/book/intro.html)\n",
+ "\n",
+ "[^book_5]: [https://www.oreilly.com/library/view/python-for-geospatial/9781098104788/](https://www.oreilly.com/library/view/python-for-geospatial/9781098104788/)\n",
"\n",
"Another unique feature of the book is that it is part of a wider community.\n",
- "*Geocomputation with Python* is a sister project of [Geocomputation with R](https://r.geocompx.org/), a book on geographic data analysis, visualization, and modeling using the R programming language that has 60+ contributors and an active community, not least in the associated [Discord group](https://discord.gg/PMztXYgNxp).\n",
- "Links with the vibrant 'R-spatial' community, and other communities such as [GeoRust](https://georust.org/) and [JuliaGeo](https://juliageo.org/), lead to many opportunities for mutual benefit across open source ecosystems.\n",
+ "*Geocomputation with Python* is a sister project of *Geocomputation with R*[^geocompr][@lovelace_geocomputation_2019], a book on geographic data analysis, visualization, and modeling using the R programming language that has 60+ contributors and an active community, not least in the associated Discord group[^geocompr_discord].\n",
+ "Links with the vibrant 'R-spatial' community, and other communities such as GeoRust and JuliaGeo, lead to many opportunities for mutual benefit across open-source ecosystems.\n",
+ "\n",
+ "[^geocompr]: [https://r.geocompx.org/](https://r.geocompx.org/)\n",
+ "\n",
+ "[^geocompr_discord]: [https://discord.gg/PMztXYgNxp](https://discord.gg/PMztXYgNxp)\n",
+ "\n",
+ "## Prerequisites\n",
+ "\n",
+ "We assume that the reader: \n",
+ "\n",
+ "* is familiar with the Python language, \n",
+ "* is capable of running Python code and installing Python packages, and \n",
+ "* is familiar with the `numpy` and `pandas` packages for working with data in Python. \n",
+ "\n",
+ "From that starting point, the book introduces working with *spatial data* in Python through dedicated third-party packages---most importantly `geopandas` and `rasterio`. 
\n", + "\n", + "We also assume familiarity with theoretical concepts of geographic data and GIS, such as coordinate systems, projections, spatial layer file formats, etc., which is necessary for understanding the reasoning of the examples.\n", + "\n", + "## Code and sample data\n", + "\n", + "To run the code examples, you can download[^book_data_zip] the ZIP file of the GitHub repository. In the ZIP file, the `ipynb` directory contains the source files of the chapters in Jupyter Notebook format, the `data` directory contains the sample data files, and the `output` directory contains the files created in code examples (some of which are also used as inputs in other code sections). Place them together as follows to run the code:\n", + "\n", + "[^book_data_zip]: [https://github.com/geocompx/geocompy/zipball/master](https://github.com/geocompx/geocompy/zipball/master)\n", + "\n", + "```text\n", + "├── data\n", + "│ ├── aut.tif\n", + "│ ├── ch.tif\n", + "│ ├── coffee_data.csv\n", + "│ ├── cycle_hire.gpkg\n", + "│ ├── cycle_hire_osm.gpkg\n", + "│ ├── cycle_hire_xy.csv\n", + "│ ├── dem.tif\n", + "│ ├── landsat.tif\n", + "│ ├── nlcd.tif\n", + "│ ├── nz_elev.tif\n", + "│ ├── nz.gpkg\n", + "│ ├── nz_height.gpkg\n", + "│ ├── seine.gpkg\n", + "│ ├── srtm.tif\n", + "│ ├── us_states.gpkg\n", + "│ ├── world.gpkg\n", + "│ ├── world_wkt.csv\n", + "│ ├── zion.gpkg\n", + "│ └── zion_points.gpkg\n", + "├── output\n", + "│ ├── cycle_hire_xy.csv\n", + "│ ├── dem_agg5.tif\n", + "│ ├── dem_contour.gpkg\n", + "│ ├── dem_resample_maximum.tif\n", + "│ ├── dem_resample_nearest.tif\n", + "│ ├── elev.tif\n", + "│ ├── grain.tif\n", + "│ ├── map.html\n", + "│ ├── ne_10m_airports.cpg\n", + "│ ├── ne_10m_airports.dbf\n", + "│ ├── ne_10m_airports.prj\n", + "│ ├── ne_10m_airports.README.html\n", + "│ ├── ne_10m_airports.shp\n", + "│ ├── ne_10m_airports.shx\n", + "│ ├── ne_10m_airports.VERSION.txt\n", + "│ ├── ne_10m_airports.zip\n", + "│ ├── nlcd_4326_2.tif\n", + "│ ├── nlcd_4326.tif\n", + "│ ├── nlcd_modified_crs.tif\n", + "│ ├── plot_geopandas.jpg\n", + "│ ├── plot_rasterio2.svg\n", + "│ ├── plot_rasterio.jpg\n", + "│ ├── r3.tif\n", + "│ ├── r_nodata_float.tif\n", + "│ ├── r_nodata_int.tif\n", + "│ ├── r.tif\n", + "│ ├── srtm_32612_aspect.tif\n", + "│ ├── srtm_32612_slope.tif\n", + "│ ├── srtm_32612.tif\n", + "│ ├── srtm_masked_cropped.tif\n", + "│ ├── srtm_masked.tif\n", + "│ ├── w_many_features.gpkg\n", + "│ ├── w_many_layers.gpkg\n", + "│ └── world.gpkg\n", + "├── 01-spatial-data.ipynb\n", + "├── 02-attribute-operations.ipynb\n", + "├── 03-spatial-operations.ipynb\n", + "├── 04-geometry-operations.ipynb\n", + "├── 05-raster-vector.ipynb\n", + "├── 06-reproj.ipynb\n", + "├── 07-read-write.ipynb\n", + "└── 08-mapping.ipynb\n", + "```\n", + "\n", + "## Software\n", + "\n", + "Python version used when rendering the book:" + ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "#| echo: false\n", + "import sys\n", + "print(sys.version)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Versions of the main packages used in the book:" + ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "#| echo: false\n", + "import subprocess\n", + "packages = [\n", + " 'numpy',\n", + " 'pandas',\n", + " 'shapely',\n", + " 'geopandas',\n", + " 'rasterio',\n", + " 'matplotlib',\n", + " 'rasterstats'\n", + "]\n", + "result = ''\n", + "for i in packages:\n", + " x = 'pip freeze | grep ^%s==' % i\n", + " result += subprocess.run(x, shell=True, 
executable='/bin/bash', capture_output=True, text=True).stdout\n", + "print(result)\n", + "\n", + "# Run micromamba list to get all installed packages and their versions\n", + "# cmd = \"micromamba list -n geocompy\"\n", + "# result = subprocess.run(cmd, shell=True, executable='/bin/bash', capture_output=True, text=True).stdout\n", + "\n", + "# # Filter the result for the packages of interest\n", + "# filtered_result = \"\"\n", + "# lines = result.split('\\n')\n", + "# # Skip all lines up to and including the header separator line\n", + "# start_index = next((i for i, line in enumerate(lines) if '─' in line), -1) + 1\n", + "\n", + "# for line in lines[start_index:]:\n", + "# if line.strip(): # Ensure the line is not empty\n", + "# parts = line.split()\n", + "# package_name = parts[0] # Assuming package name is the first element\n", + "# if package_name in packages:\n", + "# version = parts[1] if len(parts) > 1 else \"Unknown Version\"\n", + "# filtered_result += f\"{package_name} {version}\\n\"\n", + "\n", + "# print(filtered_result)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Acknowledgements\n", "\n", - "### Acknowlegements\n", + "We acknowledge Robin Lovelace, Jakub Nowosad, and Jannes Muenchow---authors of *Geocomputation with R* (Robin and Jakub also author the present book), a book on the same topic for a different programming language (R). The structure, topics, and most of the theoretical discussions were adapted from that earlier publication. \n", "\n", - "We acknowledge Robin Lovelace, Jakub Nowosad, and Jannes Muenchow---authors of [Geocomputation with R](https://r.geocompx.org/) (Robin and Jakub also author the present book), a book on the same topic for a different programming language (R). The structure, topics, and most of the theoretical discussions were adapted from that earlier publication. \n", + "We thank the authors of the Python language, and the authors of the **numpy**, **pandas**, **shapely**, **geopandas**, and **rasterio** packages which are used extensively in the book, for building these wonderful tools. \n", "\n", - "We thank the authors of the Python language, and the authors of the **shapely**, **pandas**, **geopandas**, and **rasterio** packages which are used extensively in the book, for building these wonderful tools. \n", + "We acknowledge GitHub users Will Deakin, Sean Gillies, Josh Cole, and Jt Miclat (at the time of writing; full list on GitHub[^github_contributors]) for their contributions during the open-source development of the book.\n", "\n", - "We acknowledge of GitHub users [Will Deakin](https://github.com/anisotropi4), [Sean Gillies](https://github.com/sgillies), [Josh Cole](https://github.com/JoshCole-DTA), and [Jt Miclat](https://github.com/jtmiclat) (at the time of writing; full list [here](https://github.com/geocompx/geocompy/graphs/contributors)) for their contributions during the open-source development of the book. 
" + "[^github_contributors]: [https://github.com/geocompx/geocompy/graphs/contributors](https://github.com/geocompx/geocompy/graphs/contributors)" ] } ], "metadata": { "kernelspec": { - "name": "venv", + "display_name": "Python 3", "language": "python", - "display_name": "test" + "name": "python3" } }, "nbformat": 4, diff --git a/ipynb/references.ipynb b/ipynb/references.ipynb new file mode 100644 index 00000000..da34851f --- /dev/null +++ b/ipynb/references.ipynb @@ -0,0 +1,23 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# References {.unnumbered}\n", + "\n", + "::: {#refs}\n", + ":::" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} \ No newline at end of file