From 4a89ae8d2f7d76a8f7269c46b79c99527a0d0cbb Mon Sep 17 00:00:00 2001 From: Lisa Zorn Date: Thu, 7 Apr 2022 10:13:37 -0700 Subject: [PATCH] Update osmnx extraction - the network is huge so try to optimize memory usage - rename output directory to be more consistent with step1 - output geopackage so for easy GIS viewing - output geojson directly from geodataframe to explore the difference between this version and what was output before --- notebooks/pipeline/Readme.md | 4 +- notebooks/pipeline/step2_osmnx_extraction.py | 73 ++++++++++++++------ 2 files changed, 55 insertions(+), 22 deletions(-) diff --git a/notebooks/pipeline/Readme.md b/notebooks/pipeline/Readme.md index afd572f5..4798ca54 100644 --- a/notebooks/pipeline/Readme.md +++ b/notebooks/pipeline/Readme.md @@ -39,9 +39,9 @@ Use OMNx to extract OSM data for the Bay Area and save as geojson files. * County shapefile, `[INPUT_DATA_DIR]/external/step0_boundaries/cb_2018_us_county_5m_BayArea.shp` * OpenStreetMap via [`osmnx.graph.graph_from_polygon()`](https://osmnx.readthedocs.io/en/stable/osmnx.html#osmnx.graph.graph_from_polygon) #### Output: -* OSM link extract, `[OUTPUT_DATA_DIR]/external/step2_osmnx_extraction/link.geojson` with columns: 'osmid', 'oneway', 'lanes', 'ref', 'name', 'highway', 'maxspeed', +* OSM link extract, `[OUTPUT_DATA_DIR]/external/step2_osmnx_extracts/link.geojson` with columns: 'osmid', 'oneway', 'lanes', 'ref', 'name', 'highway', 'maxspeed', 'length', 'bridge', 'service', 'width', 'access', 'junction', 'tunnel', 'est_width', 'area', 'landuse', 'u', 'v', 'key', 'geometry' -* OSM node extract, `[OUTPUT_DATA_DIR]/external/step2_osmnx_extraction/node.geojson` with columns: 'y', 'x', 'osmid', 'ref', 'highway', 'geometry' +* OSM node extract, `[OUTPUT_DATA_DIR]/external/step2_osmnx_extracts/node.geojson` with columns: 'y', 'x', 'osmid', 'ref', 'highway', 'geometry' ### [Step 3: Process SharedStreets Extraction to Network Standard and Conflate with OSM](step3_join_shst_extraction_with_osm.py) diff --git a/notebooks/pipeline/step2_osmnx_extraction.py b/notebooks/pipeline/step2_osmnx_extraction.py index c5492916..80e4db39 100644 --- a/notebooks/pipeline/step2_osmnx_extraction.py +++ b/notebooks/pipeline/step2_osmnx_extraction.py @@ -5,18 +5,21 @@ set INPUT_DATA_DIR, OUTPUT_DATA_DIR environment variable Input: polygon boundary file for the region, [INPUT_DATA_DIR]/external/step0_boundaries/cb_2018_us_county_5m_BayArea.shp Output: nodes and links data from OSMNX in geojson format, - [OUTPUT_DATA_DIR]/external/external/step2_osmnx_extraction/link.geojson, - [OUTPUT_DATA_DIR]/external/external/step2_osmnx_extraction/node.geojson + [OUTPUT_DATA_DIR]/external/step2_osmnx_extracts/link.geojson, + [OUTPUT_DATA_DIR]/external/step2_osmnx_extracts/node.geojson + Plus geopackage with these layers, [OUTPUT_DATA_DIR]/external/step2_osmnx_extraction/osmnx_extraction.gpkg """ -from methods import * +import datetime, json, os +import methods +import geopandas as gpd +import osmnx as ox from pyproj import CRS from network_wrangler import WranglerLogger, setupLogging -from datetime import datetime ##################################### # EPSG requirement # TARGET_EPSG = 4326 -lat_lon_epsg_str = 'epsg:{}'.format(str(LAT_LONG_EPSG)) +lat_lon_epsg_str = 'epsg:{}'.format(str(methods.LAT_LONG_EPSG)) WranglerLogger.info('standard ESPG: ', lat_lon_epsg_str) ##################################### @@ -25,58 +28,88 @@ INPUT_DATA_DIR = os.environ['INPUT_DATA_DIR'] OUTPUT_DATA_DIR = os.environ['OUTPUT_DATA_DIR'] INPUT_POLYGON = os.path.join(INPUT_DATA_DIR, 'external', 'step0_boundaries', 'cb_2018_us_county_5m_BayArea.shp') -OUTPUT_DIR = os.path.join(OUTPUT_DATA_DIR, 'external', 'step2_osmnx_extraction') +OUTPUT_DIR = os.path.join(OUTPUT_DATA_DIR, 'external', 'step2_osmnx_extracts') +OUTPUT_GPKG = os.path.join(OUTPUT_DIR, "osmnx_extracts.gpkg") if __name__ == '__main__': # create output folder if not exist if not os.path.exists(OUTPUT_DIR): - WranglerLogger.info('create output folder') + # need to print since logger isn't setup yet + print('creating output folder {}'.format(OUTPUT_DIR)) os.makedirs(OUTPUT_DIR) # setup logging LOG_FILENAME = os.path.join( OUTPUT_DIR, - "step2_osmnx_extraction_{}.info.log".format(datetime.now().strftime("%Y_%m_%d__%H_%M_%S")), + "step2_osmnx_extraction_{}.info.log".format(datetime.datetime.now().strftime("%Y_%m_%d__%H_%M_%S")), ) setupLogging(LOG_FILENAME) # read polygon boundary county_polys_gdf = gpd.read_file(INPUT_POLYGON) - WranglerLogger.info('Input county boundary file uses projection: ' + str(county_polys_gdf.crs)) + WranglerLogger.info('Input county boundary file {} uses projection: {}'.format(INPUT_POLYGON, county_polys_gdf.crs)) # project to lat-long county_polys_gdf = county_polys_gdf.to_crs(CRS(lat_lon_epsg_str)) WranglerLogger.info('converted to projection: ' + str(county_polys_gdf.crs)) # dissolve into one polygon - WranglerLogger.info('dissolve into one polygon') boundary = county_polys_gdf.geometry.unary_union + WranglerLogger.info('dissolved into one polygon') - # OSM extraction + # OSM extraction - Note: this is memory intensive (~15GB) and time-consuming (~50 min) WranglerLogger.info('starting osmnx extraction') - G_drive = ox.graph_from_polygon(boundary, network_type='all', simplify=False) + osmnx_graph = ox.graph_from_polygon(boundary, network_type='all', simplify=False) WranglerLogger.info('finished osmnx extraction') - WranglerLogger.info('getting links and nodes from osmnx data') - link_gdf = ox.graph_to_gdfs(G_drive, nodes=False, edges=True) + # these are very large datasets to do links first and then delete, then nodes + WranglerLogger.info('getting links from osmnx data') + link_gdf = ox.graph_to_gdfs(osmnx_graph, nodes=False, edges=True) WranglerLogger.info('link_gdf has {} records, with columns: \n{}'.format( link_gdf.shape[0], list(link_gdf))) - node_gdf = ox.graph_to_gdfs(G_drive, nodes=True, edges=False) - WranglerLogger.info('node_gdf has {} records, with columns: \n{}'.format( - node_gdf.shape[0], list(node_gdf))) # writing out OSM link data to geojson - WranglerLogger.info('writing out OSM links and nodes to geojson at {}'.format(OUTPUT_DIR)) + WranglerLogger.info('writing out OSM links to geojson at {}'.format(OUTPUT_DIR)) + # this is already a geodataframe, why not just use the geodatframe to_file() method? + # for now, have both methods to understand the difference in the output + # I will remove one of them shortly + link_gdf.to_file(os.path.join(OUTPUT_DIR, 'link2.geojson'), driver='GeoJSON') + WranglerLogger.info('writing out OSM links to gpkg at {}'.format(OUTPUT_DIR)) + link_gdf.to_file(OUTPUT_GPKG, layer="link", driver="GPKG") + + # this is the old way this was done link_prop = link_gdf.drop("geometry", axis=1).columns.tolist() - link_geojson = link_df_to_geojson(link_gdf, link_prop) + link_geojson = methods.link_df_to_geojson(link_gdf, link_prop) with open(os.path.join(OUTPUT_DIR, 'link.geojson'), "w") as f: json.dump(link_geojson, f) + del link_prop + del link_geojson + del link_gdf + WranglerLogger.info('link objects deleted') + + # writing out OSM node data to geojson + WranglerLogger.info('getting nodes from osmnx data') + node_gdf = ox.graph_to_gdfs(osmnx_graph, nodes=True, edges=False) + WranglerLogger.info('node_gdf has {} records, with columns: \n{}'.format( + node_gdf.shape[0], list(node_gdf))) # writing out OSM node data to geojson + WranglerLogger.info('writing out OSM nodes to geojson at {}'.format(OUTPUT_DIR)) + # this is already a geodataframe, why not just use the geodatframe to_file() method? + node_gdf.to_file(os.path.join(OUTPUT_DIR, 'node2.geojson'), driver="GeoJSON") + WranglerLogger.info('writing out OSM links to gpkg at {}'.format(OUTPUT_DIR)) + node_gdf.to_file(OUTPUT_GPKG, layer="node", driver="GPKG") + + # this is the old way this was done node_prop = node_gdf.drop("geometry", axis=1).columns.tolist() - node_geojson = point_df_to_geojson(node_gdf, node_prop) + node_geojson = methods.point_df_to_geojson(node_gdf, node_prop) with open(os.path.join(OUTPUT_DIR, 'node.geojson'), "w") as f: json.dump(node_geojson, f) + del node_prop + del node_geojson + del node_gdf + WranglerLogger.info('node objects deleted') + WranglerLogger.info('finished writing out OSM links and nodes')