Skip to content

Commit

Permalink
TitlePattern geocoder (#162)
Browse files Browse the repository at this point in the history
* title-pattern geocoding

* be more cautious about matching semicolons

* log item ID and address when hitting network

* remove trailing dots

* strip space and dot

* test for matching alt_title

* try switching to &; seems like a loss

* use extended-grid from title-pattern

* update snapshots

* data, geocache updates

* fix test

* update random500

* seven decimals ought to be enough

* update lat-lons file
  • Loading branch information
danvk authored Nov 8, 2024
1 parent eba3df4 commit 7d70122
Show file tree
Hide file tree
Showing 11 changed files with 350 additions and 161 deletions.
22 changes: 11 additions & 11 deletions data/geocode/random500.csv

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion data/lat-lon-to-ids.json

Large diffs are not rendered by default.

Binary file modified geocache.tgz
Binary file not shown.
6 changes: 3 additions & 3 deletions oldnyc/geocode/coders/gpt.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

from oldnyc.geocode.coders.coder_utils import get_lat_lng_from_geocode
from oldnyc.geocode.geocode_types import Coder, Locatable
from oldnyc.geocode.geogpt.generate_batch import GptResponse
from oldnyc.geocode.geogpt.generate_batch import GptResponse, guess_borough
from oldnyc.item import Item


Expand Down Expand Up @@ -34,11 +34,11 @@ def codeRecord(self, r: Item):
return None
sys.stderr.write(f"GPT location: {r.id} {q}\n")

if q["type"] == "no_location":
if q["type"] == "not in NYC" or q["type"] == "no location information":
return None

loc: Locatable | None = None
boro = q["borough"]
boro = guess_borough(r)
if q["type"] == "place_name":
self.num_poi += 1
return None
Expand Down
112 changes: 112 additions & 0 deletions oldnyc/geocode/coders/title_pattern.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
"""Geocode by pattern matching against the title.
This matches extremely simple, common patterns that aren't worth a trip to GPT.
"""

import re
import sys

from oldnyc.geocode import grid
from oldnyc.geocode.boroughs import point_to_borough
from oldnyc.geocode.coders.coder_utils import get_lat_lng_from_geocode
from oldnyc.geocode.coders.extended_grid import parse_street_ave
from oldnyc.geocode.geocode_types import Coder, Locatable

# Non-capturing alternation of the borough names a title may start with.
# "Richmond" is accepted here as a synonym for Staten Island; codeRecord
# rewrites it to "Staten Island" after a match.
boroughs_pat = r"(?:Manhattan|Brooklyn|Queens|Bronx|Staten Island|Richmond)"

# Borough: str1 - str2
# Manhattan: 10th Street (East) - Broadway
# 711023f  (presumably the ID of an example record; verify against the data)
#
# Groups: (1) borough, (2) first street (non-greedy), (3) second street.
# Neither street may contain '-', ':', '[', ']' or ';', which keeps titles with
# extra dash- or semicolon-separated parts from matching. A single trailing
# period is allowed; because group 3 is greedy it may still capture trailing
# dots or spaces, so callers strip those themselves.
boro_int = re.compile(rf"^({boroughs_pat}): ([^-:\[\];]+?) - ([^-:\[\];]+)\.?$")


class TitlePatternCoder(Coder):
    """Geocode records whose title has the form ``Borough: street - street``.

    ``codeRecord`` turns a matching title (or alternate title) into an
    "intersection" locatable. ``getLatLonFromGeocode`` resolves that locatable
    to a coordinate, trying the Manhattan grid first and falling back to the
    supplied geocode response. Counters are kept for the ``finalize`` report.
    """

    # "and" as a whole word signals a compound such as "37th and 38th Streets",
    # which is not a simple two-street intersection. A plain substring test
    # would also reject names like "Grand Street" or "Cortlandt Street".
    _and_word = re.compile(r"\band\b")

    def __init__(self) -> None:
        """Zero the counters that ``finalize`` reports."""
        # Where the pattern matched.
        self.n_title = 0
        self.n_alt_title = 0
        self.n_match = 0

        # How each matched record was (or was not) resolved to a coordinate.
        self.n_grid = 0
        self.n_google_location = 0
        self.n_geocode_fail = 0
        self.n_boro_mismatch = 0

    def codeRecord(self, r):
        """Return an intersection locatable for ``r``, or None if nothing fits.

        ``r.title`` is tried first, then each entry of ``r.alt_title`` in
        order; the first string matching ``boro_int`` is used.

        Args:
            r: The record to code. Its ``title`` and ``alt_title`` are read.

        Returns:
            A dict with keys ``type`` ("intersection"), ``source`` (the string
            that matched), ``address`` and ``data`` (a
            ``(street, street, borough)`` tuple), or None when no title
            matches or either street contains the word "and".
        """
        m = boro_int.match(r.title)
        if m:
            self.n_title += 1
        else:
            # `or ()` tolerates a record whose alt_title is None.
            for alt_title in r.alt_title or ():
                m = boro_int.match(alt_title)
                if m:
                    self.n_alt_title += 1
                    break

        if not m:
            return None

        # Counted before the "and" rejection below, so that
        # n_match == n_title + n_alt_title.
        self.n_match += 1

        boro, str1, str2 = m.groups()
        if self._and_word.search(str1) or self._and_word.search(str2):
            return None
        str1 = str1.rstrip(". ")
        str2 = str2.rstrip(". ")
        (str1, str2) = sorted((str1, str2))  # try to increase cache coherence
        boro = boro.replace("Richmond", "Staten Island")

        out: Locatable = {
            "type": "intersection",
            # Match.string is the exact title / alt title that was matched.
            "source": m.string,
            "address": f"{str1} and {str2}, {boro}, NY",
            "data": (str1, str2, boro),
        }
        return out

    def getLatLonFromGeocode(self, geocode, data, record):
        """Resolve a locatable produced by ``codeRecord`` to ``(lat, lng)``.

        For Manhattan, the two streets are first parsed as an avenue/street
        pair and looked up on the grid. If that does not apply or yields
        nothing, the geocode response is used instead, and it is rejected when
        the point falls in a different borough than the title named.

        Args:
            geocode: Geocoder response, passed to ``get_lat_lng_from_geocode``.
            data: The locatable from ``codeRecord``; must contain ``data``.
            record: The record being coded; only ``record.id`` is read, for
                the mismatch log message.

        Returns:
            A ``(lat, lng)`` tuple, or None on geocode failure or borough
            mismatch.
        """
        assert "data" in data
        ssb: tuple[str, str, str] = data["data"]
        (str1, str2, boro) = ssb
        if boro == "Manhattan":
            try:
                avenue, street = parse_street_ave(str1, str2)
                latlon = grid.code(avenue, street)
                if latlon:
                    self.n_grid += 1
                    lat, lng = latlon
                    return round(float(lat), 7), round(float(lng), 7)  # they're numpy floats
            except ValueError:
                # Deliberate best-effort: not a parseable grid intersection,
                # so fall through to the geocode response.
                pass
            # TODO: use extended-grid coder if possible; would require more street/avenue parsing.

        tlatlng = get_lat_lng_from_geocode(geocode, data)
        if not tlatlng:
            self.n_geocode_fail += 1
            return None
        # The first element of the triple is not needed here.
        _, lat, lng = tlatlng
        geocode_boro = point_to_borough(lat, lng)
        if geocode_boro != boro:
            self.n_boro_mismatch += 1
            sys.stderr.write(
                f'Borough mismatch: {record.id}: {data["source"]} geocoded to {geocode_boro} not {boro}\n'
            )
            return None
        self.n_google_location += 1
        return (lat, lng)

    def finalize(self) -> None:
        """Write the match and geocoding counters to stderr."""
        sys.stderr.write(f" titles matched: {self.n_title}\n")
        sys.stderr.write(f"alt titles matched: {self.n_alt_title}\n")
        sys.stderr.write(f" total matches: {self.n_match}\n")
        sys.stderr.write(" geocoding results:\n")
        sys.stderr.write(f" grid: {self.n_grid}\n")
        sys.stderr.write(f" google: {self.n_google_location}\n")
        sys.stderr.write(f" boro mismatch: {self.n_boro_mismatch}\n")
        sys.stderr.write(f" failures: {self.n_geocode_fail}\n")

    def name(self) -> str:
        """Return the identifier under which this coder is registered."""
        return "title-pattern"
57 changes: 57 additions & 0 deletions oldnyc/geocode/coders/title_pattern_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
from oldnyc.geocode.coders.title_pattern import TitlePatternCoder
from oldnyc.item import blank_item


def test_title_pattern():
    """Titles of the form "Borough: A - B" code to an intersection; compounds do not."""
    coder = TitlePatternCoder()
    record = blank_item()

    # (title, expected result); None means the title must be rejected.
    cases = [
        (
            "Manhattan: 10th Street (East) - Broadway",
            {
                "type": "intersection",
                "source": "Manhattan: 10th Street (East) - Broadway",
                "address": "10th Street (East) and Broadway, Manhattan, NY",
                "data": ("10th Street (East)", "Broadway", "Manhattan"),
            },
        ),
        (
            # "Richmond" is reported as Staten Island.
            "Richmond: 3rd Street - New Dorp Lane",
            {
                "type": "intersection",
                "source": "Richmond: 3rd Street - New Dorp Lane",
                "address": "3rd Street and New Dorp Lane, Staten Island, NY",
                "data": ("3rd Street", "New Dorp Lane", "Staten Island"),
            },
        ),
        # A street range ("37th and 38th") is not a simple intersection.
        ("Manhattan: 6th Avenue - 37th and 38th Streets (West)", None),
    ]

    for heading, expected in cases:
        record.title = heading
        result = coder.codeRecord(record)
        if expected is None:
            assert result is None
        else:
            assert result == expected


def test_alt_title():
    """An alternate title is used when the main title does not match."""
    coder = TitlePatternCoder()
    record = blank_item()

    # The main title has no borough prefix; the alt title matches, and its
    # trailing " ." is stripped from the street name.
    matching_alt = "Manhattan: 1st Avenue - 112th Street ."
    record.title = "Feast of Our Lady of Mount Carmel."
    record.alt_title = [matching_alt]
    expected = {
        "type": "intersection",
        "source": matching_alt,
        "address": "112th Street and 1st Avenue, Manhattan, NY",
        "data": ("112th Street", "1st Avenue", "Manhattan"),
    }
    assert coder.codeRecord(record) == expected

    # 730343f
    # Semicolon-separated alternatives are currently rejected outright.
    record.title = "General view - Cedar Street - South."
    record.alt_title = [
        "Manhattan: Cedar Street - Pearl Street ; 1 Cedar Street ; Municipal Building."
    ]
    outcome = coder.codeRecord(record)
    assert outcome is None

    # TODO: split on ';' and try each one
    # == {
    #     "type": "intersection",
    #     "source": item.alt_title[0],
    #     "address": "Cedar Street and Pearl Street, Manhattan, NY",
    #     "data": ("Cedar Street", "Pearl Street", "Manhattan"),
    # }
28 changes: 17 additions & 11 deletions oldnyc/geocode/diff_geojson.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,10 +111,11 @@ def diff_geojson(
-geom: {len(dropped_geometry_ids):,}
""")

if added_ids and num_samples:
print("\nSample of additions:")
add_samples = min(num_samples, len(added_ids))
for k in random.sample([*added_ids], add_samples):
if (added_ids or added_geometry_ids) and num_samples:
print("\nSample of additions / +geom:")
# Sample from added IDs plus IDs that gained geometry. These keys are looked up
# in `new` below, so the set must be built from added_ids, not dropped_ids.
both = added_ids.union(added_geometry_ids)
add_samples = min(num_samples, len(both))
for k in random.sample([*both], add_samples):
props = new[k]["properties"]
b = props["geocode"]
title = (
Expand All @@ -126,18 +127,23 @@ def diff_geojson(
print(f" {k:6}: {title}")
print(f' + {b.get("lat"):.6f},{b.get("lng"):.6f} {b.get("technique")}')

if dropped_ids and num_samples:
print("\nSample of dropped:")
drop_samples = min(num_samples, len(dropped_ids))
for k in random.sample([*dropped_ids], drop_samples):
a = old[k]["properties"]["geocode"]
print(f' {k:6}: {a.get("original_title", "original title not found")}')
if (dropped_ids or dropped_geometry_ids) and num_samples:
both = dropped_ids.union(dropped_geometry_ids)
print("\nSample of dropped / -geom:")
drop_samples = min(num_samples, len(both))
for k in random.sample([*both], drop_samples):
props = old[k]["properties"]
a = props["geocode"]
title = props.get("original_title") or props.get("title") or "original title not found"
print(f" {k:6}: {title}")
print(f' - {a.get("lat"):.6f},{a.get("lng"):.6f} {a.get("technique")}')

if changed_ids and num_samples:
print("\nSample of changes:")
changed_samples = min(num_samples, len(changed_ids))
for k in random.sample([*changed_ids], changed_samples):
props = old[k]["properties"]
title = props.get("original_title") or props.get("title") or "original title not found"
a = old[k]["properties"]["geocode"]
b = new[k]["properties"]["geocode"]
a_lat = a.get("lat")
Expand All @@ -146,7 +152,7 @@ def diff_geojson(
b_lng = b.get("lng")
d_meters = haversine((a_lat, a_lng), (b_lat, b_lng)) * 1000

print(f' {k:6}: {a.get("original_title", "original title not found")}')
print(f" {k:6}: {title}")
print(f' - {a_lat:.6f},{a_lng:.6f} {a.get("technique")}')
print(f' + {b_lat:.6f},{b_lng:.6f} {b.get("technique")}')
print(f" Moved {d_meters:0,.0f} meters")
Expand Down
9 changes: 5 additions & 4 deletions oldnyc/geocode/geocode.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,12 @@
from dotenv import load_dotenv

from oldnyc.geocode import generate_js, geocoder
from oldnyc.geocode.coders import extended_grid, gpt, milstein, subjects
from oldnyc.geocode.coders import extended_grid, gpt, milstein, subjects, title_pattern
from oldnyc.geocode.geocode_types import Coder, Locatable, Location
from oldnyc.item import Item, load_items

CODERS: dict[str, Callable[[], Coder]] = {
"title-pattern": title_pattern.TitlePatternCoder,
"extended-grid": extended_grid.ExtendedGridCoder,
"milstein": milstein.MilsteinCoder,
"subjects": subjects.SubjectsCoder,
Expand All @@ -42,7 +43,7 @@
parser.add_argument(
"-c",
"--coders",
default="extended-grid,milstein,subjects",
default="title-pattern,extended-grid,milstein,subjects",
help="Set to a comma-separated list of coders. Coders run in the specified order.",
)

Expand Down Expand Up @@ -90,7 +91,7 @@

if args.geocode:
api_key = os.environ.get("GOOGLE_MAPS_API_KEY")
g = geocoder.Geocoder(args.use_network, 2) # 2s between geocodes
g = geocoder.Geocoder(args.use_network, 2, api_key) # 2s between geocodes
if args.use_network and not api_key:
raise ValueError("Must set GOOGLE_MAPS_API_KEY with --use_network")
else:
Expand Down Expand Up @@ -151,7 +152,7 @@
geocode_result = None
address = location_data["address"]
try:
geocode_result = g.Locate(address)
geocode_result = g.Locate(address, True, r.id)
except urllib.error.HTTPError as e:
if e.status == 400:
sys.stderr.write(f"Bad request: {address}\n")
Expand Down
10 changes: 6 additions & 4 deletions oldnyc/geocode/geocoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ def _cache_result(self, loc: str, result: bytes):
cache_file = _cache_file(loc)
open(cache_file, "wb").write(result)

def _fetch(self, url: str):
def _fetch(self, url: str, debug_txt: Optional[str]):
"""Attempts to fetch the URL. Does rate throttling. Returns XML."""
now = time.time()
diff = now - self._last_fetch
Expand All @@ -68,7 +68,7 @@ def _fetch(self, url: str):
time.sleep(self._wait_time - diff)
self._last_fetch = time.time()

sys.stderr.write("Fetching %s\n" % url)
sys.stderr.write(f"Fetching {url} ({debug_txt})\n")
assert self._api_key
# Note: API key is _not_ part of the cache key
f = urllib.request.urlopen(url + "&key=" + self._api_key)
Expand All @@ -81,7 +81,9 @@ def _check_for_lat_lon(self, address: str):
return FakeResponse % (m.group(1), m.group(2))

# TODO: get a more precise return type from the GMaps API
def Locate(self, address: str, check_cache=True) -> dict[str, Any] | None:
def Locate(
self, address: str, check_cache=True, debug_txt: Optional[str] = None
) -> dict[str, Any] | None:
"""Returns a maps API JSON response for the address or None.
Address should be a fully-qualified address, e.g.
Expand All @@ -103,7 +105,7 @@ def Locate(self, address: str, check_cache=True) -> dict[str, Any] | None:
sys.stderr.write(f"Would have geocoded with network: {address}\n")
# XXX this should probably raise instead
return None
data = self._fetch(url)
data = self._fetch(url, f"{debug_txt}: {address}")

if not data:
return None
Expand Down
Loading

0 comments on commit 7d70122

Please sign in to comment.