Skip to content

Commit

Permalink
Update osmnx extraction
Browse files Browse the repository at this point in the history
- the network is huge so try to optimize memory usage
- rename output directory to be more consistent with step1
- output geopackage for easy GIS viewing
- output geojson directly from geodataframe to explore the difference between this version and what was output before
  • Loading branch information
lmz committed Apr 7, 2022
1 parent 0423895 commit 4a89ae8
Show file tree
Hide file tree
Showing 2 changed files with 55 additions and 22 deletions.
4 changes: 2 additions & 2 deletions notebooks/pipeline/Readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,9 +39,9 @@ Use OMNx to extract OSM data for the Bay Area and save as geojson files.
* County shapefile, `[INPUT_DATA_DIR]/external/step0_boundaries/cb_2018_us_county_5m_BayArea.shp`
* OpenStreetMap via [`osmnx.graph.graph_from_polygon()`](https://osmnx.readthedocs.io/en/stable/osmnx.html#osmnx.graph.graph_from_polygon)
#### Output:
* OSM link extract, `[OUTPUT_DATA_DIR]/external/step2_osmnx_extraction/link.geojson` with columns: 'osmid', 'oneway', 'lanes', 'ref', 'name', 'highway', 'maxspeed',
* OSM link extract, `[OUTPUT_DATA_DIR]/external/step2_osmnx_extracts/link.geojson` with columns: 'osmid', 'oneway', 'lanes', 'ref', 'name', 'highway', 'maxspeed',
'length', 'bridge', 'service', 'width', 'access', 'junction', 'tunnel', 'est_width', 'area', 'landuse', 'u', 'v', 'key', 'geometry'
* OSM node extract, `[OUTPUT_DATA_DIR]/external/step2_osmnx_extraction/node.geojson` with columns: 'y', 'x', 'osmid', 'ref', 'highway', 'geometry'
* OSM node extract, `[OUTPUT_DATA_DIR]/external/step2_osmnx_extracts/node.geojson` with columns: 'y', 'x', 'osmid', 'ref', 'highway', 'geometry'

### [Step 3: Process SharedStreets Extraction to Network Standard and Conflate with OSM](step3_join_shst_extraction_with_osm.py)

Expand Down
73 changes: 53 additions & 20 deletions notebooks/pipeline/step2_osmnx_extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,18 +5,21 @@
set INPUT_DATA_DIR, OUTPUT_DATA_DIR environment variable
Input: polygon boundary file for the region, [INPUT_DATA_DIR]/external/step0_boundaries/cb_2018_us_county_5m_BayArea.shp
Output: nodes and links data from OSMNX in geojson format,
[OUTPUT_DATA_DIR]/external/external/step2_osmnx_extraction/link.geojson,
[OUTPUT_DATA_DIR]/external/external/step2_osmnx_extraction/node.geojson
[OUTPUT_DATA_DIR]/external/step2_osmnx_extracts/link.geojson,
[OUTPUT_DATA_DIR]/external/step2_osmnx_extracts/node.geojson
Plus geopackage with these layers, [OUTPUT_DATA_DIR]/external/step2_osmnx_extracts/osmnx_extracts.gpkg
"""
from methods import *
import datetime, json, os
import methods
import geopandas as gpd
import osmnx as ox
from pyproj import CRS
from network_wrangler import WranglerLogger, setupLogging
from datetime import datetime

#####################################
# EPSG requirement
# TARGET_EPSG = 4326
lat_lon_epsg_str = 'epsg:{}'.format(str(LAT_LONG_EPSG))
lat_lon_epsg_str = 'epsg:{}'.format(str(methods.LAT_LONG_EPSG))
# Log the lat/long EPSG string used as the standard projection for this step.
# The value is formatted into the message itself: passing it as a separate
# positional argument (as before) gives the logger a %-format argument with no
# placeholder to fill, so the value was never rendered (assuming WranglerLogger
# is a standard logging.Logger -- confirm against network_wrangler).
WranglerLogger.info('standard EPSG: {}'.format(lat_lon_epsg_str))

#####################################
Expand All @@ -25,58 +28,88 @@
INPUT_DATA_DIR = os.environ['INPUT_DATA_DIR']
OUTPUT_DATA_DIR = os.environ['OUTPUT_DATA_DIR']
INPUT_POLYGON = os.path.join(INPUT_DATA_DIR, 'external', 'step0_boundaries', 'cb_2018_us_county_5m_BayArea.shp')
OUTPUT_DIR = os.path.join(OUTPUT_DATA_DIR, 'external', 'step2_osmnx_extraction')
OUTPUT_DIR = os.path.join(OUTPUT_DATA_DIR, 'external', 'step2_osmnx_extracts')
OUTPUT_GPKG = os.path.join(OUTPUT_DIR, "osmnx_extracts.gpkg")


if __name__ == '__main__':
# create output folder if not exist
if not os.path.exists(OUTPUT_DIR):
WranglerLogger.info('create output folder')
# need to print since logger isn't setup yet
print('creating output folder {}'.format(OUTPUT_DIR))
os.makedirs(OUTPUT_DIR)

# setup logging
LOG_FILENAME = os.path.join(
OUTPUT_DIR,
"step2_osmnx_extraction_{}.info.log".format(datetime.now().strftime("%Y_%m_%d__%H_%M_%S")),
"step2_osmnx_extraction_{}.info.log".format(datetime.datetime.now().strftime("%Y_%m_%d__%H_%M_%S")),
)
setupLogging(LOG_FILENAME)

# read polygon boundary
county_polys_gdf = gpd.read_file(INPUT_POLYGON)
WranglerLogger.info('Input county boundary file uses projection: ' + str(county_polys_gdf.crs))
WranglerLogger.info('Input county boundary file {} uses projection: {}'.format(INPUT_POLYGON, county_polys_gdf.crs))

# project to lat-long
county_polys_gdf = county_polys_gdf.to_crs(CRS(lat_lon_epsg_str))
WranglerLogger.info('converted to projection: ' + str(county_polys_gdf.crs))

# dissolve into one polygon
WranglerLogger.info('dissolve into one polygon')
boundary = county_polys_gdf.geometry.unary_union
WranglerLogger.info('dissolved into one polygon')

# OSM extraction
# OSM extraction - Note: this is memory intensive (~15GB) and time-consuming (~50 min)
WranglerLogger.info('starting osmnx extraction')
G_drive = ox.graph_from_polygon(boundary, network_type='all', simplify=False)
osmnx_graph = ox.graph_from_polygon(boundary, network_type='all', simplify=False)
WranglerLogger.info('finished osmnx extraction')

WranglerLogger.info('getting links and nodes from osmnx data')
link_gdf = ox.graph_to_gdfs(G_drive, nodes=False, edges=True)
# these are very large datasets, so do links first and then delete them, then do nodes
WranglerLogger.info('getting links from osmnx data')
link_gdf = ox.graph_to_gdfs(osmnx_graph, nodes=False, edges=True)
WranglerLogger.info('link_gdf has {} records, with columns: \n{}'.format(
link_gdf.shape[0], list(link_gdf)))
node_gdf = ox.graph_to_gdfs(G_drive, nodes=True, edges=False)
WranglerLogger.info('node_gdf has {} records, with columns: \n{}'.format(
node_gdf.shape[0], list(node_gdf)))

# writing out OSM link data to geojson
WranglerLogger.info('writing out OSM links and nodes to geojson at {}'.format(OUTPUT_DIR))
WranglerLogger.info('writing out OSM links to geojson at {}'.format(OUTPUT_DIR))
# this is already a geodataframe, why not just use the geodataframe to_file() method?
# for now, have both methods to understand the difference in the output
# I will remove one of them shortly
link_gdf.to_file(os.path.join(OUTPUT_DIR, 'link2.geojson'), driver='GeoJSON')
WranglerLogger.info('writing out OSM links to gpkg at {}'.format(OUTPUT_DIR))
link_gdf.to_file(OUTPUT_GPKG, layer="link", driver="GPKG")

# this is the old way this was done
link_prop = link_gdf.drop("geometry", axis=1).columns.tolist()
link_geojson = link_df_to_geojson(link_gdf, link_prop)
link_geojson = methods.link_df_to_geojson(link_gdf, link_prop)
with open(os.path.join(OUTPUT_DIR, 'link.geojson'), "w") as f:
json.dump(link_geojson, f)
del link_prop
del link_geojson
del link_gdf
WranglerLogger.info('link objects deleted')

# writing out OSM node data to geojson
WranglerLogger.info('getting nodes from osmnx data')
node_gdf = ox.graph_to_gdfs(osmnx_graph, nodes=True, edges=False)
WranglerLogger.info('node_gdf has {} records, with columns: \n{}'.format(
node_gdf.shape[0], list(node_gdf)))

# writing out OSM node data to geojson
WranglerLogger.info('writing out OSM nodes to geojson at {}'.format(OUTPUT_DIR))
# this is already a geodataframe, why not just use the geodataframe to_file() method?
node_gdf.to_file(os.path.join(OUTPUT_DIR, 'node2.geojson'), driver="GeoJSON")
# write the node layer into the same geopackage file used for the link layer;
# the log message names nodes (it previously said "links", copied from the
# link section above)
WranglerLogger.info('writing out OSM nodes to gpkg at {}'.format(OUTPUT_DIR))
node_gdf.to_file(OUTPUT_GPKG, layer="node", driver="GPKG")

# this is the old way this was done
node_prop = node_gdf.drop("geometry", axis=1).columns.tolist()
node_geojson = point_df_to_geojson(node_gdf, node_prop)
node_geojson = methods.point_df_to_geojson(node_gdf, node_prop)
with open(os.path.join(OUTPUT_DIR, 'node.geojson'), "w") as f:
json.dump(node_geojson, f)
del node_prop
del node_geojson
del node_gdf
WranglerLogger.info('node objects deleted')


WranglerLogger.info('finished writing out OSM links and nodes')

3 comments on commit 4a89ae8

@yuqiww
Copy link
Member

@yuqiww yuqiww commented on 4a89ae8 Apr 7, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@lmz : regarding "geopandas.to_file()" versus "link_df_to_geojson", my understanding is that there are fields with mixed data types - string and list, then using gpd.to_file() would throw an error, both for geojson and geopackage. The method "link_df_to_geojson" bypasses the problem. It is based on this: https://geoffboeing.com/2015/10/exporting-python-data-geojson/.

@lmz
Copy link
Member Author

@lmz lmz commented on 4a89ae8 Apr 7, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My runs of geopackage.to_file() in this script has thus far been successful. Have you had problems with this call?

@yuqiww
Copy link
Member

@yuqiww yuqiww commented on 4a89ae8 Apr 7, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My runs of geopackage.to_file() in this script has thus far been successful. Have you had problems with this call?

The error happens starting from step3 when OSM data is conflated with SharedStreets data.
Also, the file generated from "link_df_to_geojson" is smaller than from "gpd.to_file()" - but I guess this is less important if we don't want to use geojson.

Please sign in to comment.