Update osmnx extraction

- the network is huge so try to optimize memory usage - rename output directory to be more consistent with step1 - output geopackage so for easy GIS viewing - output geojson directly from geodataframe to explore the difference between this version and what was output before
BayAreaMetro · Apr 7, 2022 · 4a89ae8 · 4a89ae8 · yuqiww · Apr 7, 2022
1 parent 0423895
commit 4a89ae8
Show file tree

Hide file tree

Showing 2 changed files with 55 additions and 22 deletions.
diff --git a/notebooks/pipeline/Readme.md b/notebooks/pipeline/Readme.md
@@ -39,9 +39,9 @@ Use OMNx to extract OSM data for the Bay Area and save as geojson files.
 * County shapefile, `[INPUT_DATA_DIR]/external/step0_boundaries/cb_2018_us_county_5m_BayArea.shp`
 * OpenStreetMap via [`osmnx.graph.graph_from_polygon()`](https://osmnx.readthedocs.io/en/stable/osmnx.html#osmnx.graph.graph_from_polygon)
 #### Output:
-* OSM link extract, `[OUTPUT_DATA_DIR]/external/step2_osmnx_extraction/link.geojson` with columns: 'osmid', 'oneway', 'lanes', 'ref', 'name', 'highway', 'maxspeed',
+* OSM link extract, `[OUTPUT_DATA_DIR]/external/step2_osmnx_extracts/link.geojson` with columns: 'osmid', 'oneway', 'lanes', 'ref', 'name', 'highway', 'maxspeed',
      'length', 'bridge', 'service', 'width', 'access', 'junction', 'tunnel', 'est_width', 'area', 'landuse', 'u', 'v', 'key', 'geometry'
-* OSM node extract, `[OUTPUT_DATA_DIR]/external/step2_osmnx_extraction/node.geojson` with columns: 'y', 'x', 'osmid', 'ref', 'highway', 'geometry'
+* OSM node extract, `[OUTPUT_DATA_DIR]/external/step2_osmnx_extracts/node.geojson` with columns: 'y', 'x', 'osmid', 'ref', 'highway', 'geometry'
 
 ### [Step 3: Process SharedStreets Extraction to Network Standard and Conflate with OSM](step3_join_shst_extraction_with_osm.py)
 

diff --git a/notebooks/pipeline/step2_osmnx_extraction.py b/notebooks/pipeline/step2_osmnx_extraction.py
@@ -5,18 +5,21 @@
 set INPUT_DATA_DIR, OUTPUT_DATA_DIR environment variable
 Input: polygon boundary file for the region, [INPUT_DATA_DIR]/external/step0_boundaries/cb_2018_us_county_5m_BayArea.shp
 Output: nodes and links data from OSMNX in geojson format,
-    [OUTPUT_DATA_DIR]/external/external/step2_osmnx_extraction/link.geojson,
-    [OUTPUT_DATA_DIR]/external/external/step2_osmnx_extraction/node.geojson
+    [OUTPUT_DATA_DIR]/external/step2_osmnx_extracts/link.geojson,
+    [OUTPUT_DATA_DIR]/external/step2_osmnx_extracts/node.geojson
+    Plus geopackage with these layers, [OUTPUT_DATA_DIR]/external/step2_osmnx_extraction/osmnx_extraction.gpkg
 """
-from methods import *
+import datetime, json, os
+import methods
+import geopandas as gpd
+import osmnx as ox
 from pyproj import CRS
 from network_wrangler import WranglerLogger, setupLogging
-from datetime import datetime
 
 #####################################
 # EPSG requirement
 # TARGET_EPSG = 4326
-lat_lon_epsg_str = 'epsg:{}'.format(str(LAT_LONG_EPSG))
+lat_lon_epsg_str = 'epsg:{}'.format(str(methods.LAT_LONG_EPSG))
 WranglerLogger.info('standard ESPG: ', lat_lon_epsg_str)
 
 #####################################
@@ -25,58 +28,88 @@
 INPUT_DATA_DIR  = os.environ['INPUT_DATA_DIR']
 OUTPUT_DATA_DIR = os.environ['OUTPUT_DATA_DIR']
 INPUT_POLYGON   = os.path.join(INPUT_DATA_DIR, 'external', 'step0_boundaries', 'cb_2018_us_county_5m_BayArea.shp')
-OUTPUT_DIR      = os.path.join(OUTPUT_DATA_DIR, 'external', 'step2_osmnx_extraction')
+OUTPUT_DIR      = os.path.join(OUTPUT_DATA_DIR, 'external', 'step2_osmnx_extracts')
+OUTPUT_GPKG     = os.path.join(OUTPUT_DIR, "osmnx_extracts.gpkg")
 
 
 if __name__ == '__main__':
     # create output folder if not exist
     if not os.path.exists(OUTPUT_DIR):
-        WranglerLogger.info('create output folder')
+        # need to print since logger isn't setup yet
+        print('creating output folder {}'.format(OUTPUT_DIR))
         os.makedirs(OUTPUT_DIR)
 
     # setup logging
     LOG_FILENAME = os.path.join(
         OUTPUT_DIR,
-        "step2_osmnx_extraction_{}.info.log".format(datetime.now().strftime("%Y_%m_%d__%H_%M_%S")),
+        "step2_osmnx_extraction_{}.info.log".format(datetime.datetime.now().strftime("%Y_%m_%d__%H_%M_%S")),
     )
     setupLogging(LOG_FILENAME)
 
     # read polygon boundary
     county_polys_gdf = gpd.read_file(INPUT_POLYGON)
-    WranglerLogger.info('Input county boundary file uses projection: ' + str(county_polys_gdf.crs))
+    WranglerLogger.info('Input county boundary file {} uses projection: {}'.format(INPUT_POLYGON, county_polys_gdf.crs))
 
     # project to lat-long
     county_polys_gdf = county_polys_gdf.to_crs(CRS(lat_lon_epsg_str))
     WranglerLogger.info('converted to projection: ' + str(county_polys_gdf.crs))
 
     # dissolve into one polygon
-    WranglerLogger.info('dissolve into one polygon')
     boundary = county_polys_gdf.geometry.unary_union
+    WranglerLogger.info('dissolved into one polygon')
 
-    # OSM extraction
+    # OSM extraction - Note: this is memory intensive (~15GB) and time-consuming (~50 min)
     WranglerLogger.info('starting osmnx extraction')
-    G_drive = ox.graph_from_polygon(boundary, network_type='all', simplify=False)
+    osmnx_graph = ox.graph_from_polygon(boundary, network_type='all', simplify=False)
     WranglerLogger.info('finished osmnx extraction')
 
-    WranglerLogger.info('getting links and nodes from osmnx data')
-    link_gdf = ox.graph_to_gdfs(G_drive, nodes=False, edges=True)
+    # these are very large datasets to do links first and then delete, then nodes
+    WranglerLogger.info('getting links from osmnx data')
+    link_gdf = ox.graph_to_gdfs(osmnx_graph, nodes=False, edges=True)
     WranglerLogger.info('link_gdf has {} records, with columns: \n{}'.format(
         link_gdf.shape[0], list(link_gdf)))
-    node_gdf = ox.graph_to_gdfs(G_drive, nodes=True, edges=False)
-    WranglerLogger.info('node_gdf has {} records, with columns: \n{}'.format(
-        node_gdf.shape[0], list(node_gdf)))
 
     # writing out OSM link data to geojson
-    WranglerLogger.info('writing out OSM links and nodes to geojson at {}'.format(OUTPUT_DIR))
+    WranglerLogger.info('writing out OSM links to geojson at {}'.format(OUTPUT_DIR))
+    # this is already a geodataframe, why not just use the geodatframe to_file() method?
+    # for now, have both methods to understand the difference in the output
+    # I will remove one of them shortly
+    link_gdf.to_file(os.path.join(OUTPUT_DIR, 'link2.geojson'), driver='GeoJSON')
+    WranglerLogger.info('writing out OSM links to gpkg at {}'.format(OUTPUT_DIR))
+    link_gdf.to_file(OUTPUT_GPKG, layer="link", driver="GPKG")
+
+    # this is the old way this was done
     link_prop = link_gdf.drop("geometry", axis=1).columns.tolist()
-    link_geojson = link_df_to_geojson(link_gdf, link_prop)
+    link_geojson = methods.link_df_to_geojson(link_gdf, link_prop)
     with open(os.path.join(OUTPUT_DIR, 'link.geojson'), "w") as f:
         json.dump(link_geojson, f)
+    del link_prop
+    del link_geojson
+    del link_gdf
+    WranglerLogger.info('link objects deleted')
+
+    # writing out OSM node data to geojson
+    WranglerLogger.info('getting nodes from osmnx data')
+    node_gdf = ox.graph_to_gdfs(osmnx_graph, nodes=True, edges=False)
+    WranglerLogger.info('node_gdf has {} records, with columns: \n{}'.format(
+        node_gdf.shape[0], list(node_gdf)))
 
     # writing out OSM node data to geojson
+    WranglerLogger.info('writing out OSM nodes to geojson at {}'.format(OUTPUT_DIR))
+    # this is already a geodataframe, why not just use the geodatframe to_file() method?
+    node_gdf.to_file(os.path.join(OUTPUT_DIR, 'node2.geojson'), driver="GeoJSON")
+    WranglerLogger.info('writing out OSM links to gpkg at {}'.format(OUTPUT_DIR))
+    node_gdf.to_file(OUTPUT_GPKG, layer="node", driver="GPKG")
+
+    # this is the old way this was done
     node_prop = node_gdf.drop("geometry", axis=1).columns.tolist()
-    node_geojson = point_df_to_geojson(node_gdf, node_prop)
+    node_geojson = methods.point_df_to_geojson(node_gdf, node_prop)
     with open(os.path.join(OUTPUT_DIR, 'node.geojson'), "w") as f:
         json.dump(node_geojson, f)
+    del node_prop
+    del node_geojson
+    del node_gdf
+    WranglerLogger.info('node objects deleted')
+
 
     WranglerLogger.info('finished writing out OSM links and nodes')