Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

TitlePattern geocoder #162

Merged
merged 14 commits into from
Nov 8, 2024
22 changes: 11 additions & 11 deletions data/geocode/random500.csv

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion data/lat-lon-to-ids.json

Large diffs are not rendered by default.

Binary file modified geocache.tgz
Binary file not shown.
6 changes: 3 additions & 3 deletions oldnyc/geocode/coders/gpt.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

from oldnyc.geocode.coders.coder_utils import get_lat_lng_from_geocode
from oldnyc.geocode.geocode_types import Coder, Locatable
from oldnyc.geocode.geogpt.generate_batch import GptResponse
from oldnyc.geocode.geogpt.generate_batch import GptResponse, guess_borough
from oldnyc.item import Item


Expand Down Expand Up @@ -34,11 +34,11 @@ def codeRecord(self, r: Item):
return None
sys.stderr.write(f"GPT location: {r.id} {q}\n")

if q["type"] == "no_location":
if q["type"] == "not in NYC" or q["type"] == "no location information":
return None

loc: Locatable | None = None
boro = q["borough"]
boro = guess_borough(r)
if q["type"] == "place_name":
self.num_poi += 1
return None
Expand Down
112 changes: 112 additions & 0 deletions oldnyc/geocode/coders/title_pattern.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
"""Geocode by pattern matching against the title.

This matches extremely simple, common patterns that aren't worth a trip to GPT.
"""

import re
import sys

from oldnyc.geocode import grid
from oldnyc.geocode.boroughs import point_to_borough
from oldnyc.geocode.coders.coder_utils import get_lat_lng_from_geocode
from oldnyc.geocode.coders.extended_grid import parse_street_ave
from oldnyc.geocode.geocode_types import Coder, Locatable

# Non-capturing alternation of the borough names a title may start with.
boroughs_pat = r"(?:Manhattan|Brooklyn|Queens|Bronx|Staten Island|Richmond)"

# Titles shaped like "Borough: str1 - str2", for example
#   Manhattan: 10th Street (East) - Broadway
# (e.g. 711023f). Neither street part may contain "-", ":", "[", "]" or ";".
boro_int = re.compile(
    rf"^({boroughs_pat}): "  # group 1: the borough name
    r"([^-:\[\];]+?)"  # group 2: first street part (lazy)
    r" - "  # separator between the two parts
    r"([^-:\[\];]+)"  # group 3: second street part (greedy)
    r"\.?$"  # optional trailing period, then end of string
)


class TitlePatternCoder(Coder):
    """Coder for titles of the form "Borough: street 1 - street 2".

    `codeRecord` turns a matching title (or alternate title) into an
    "intersection" locatable; `getLatLonFromGeocode` resolves that locatable
    to a (lat, lng) pair. Counters collected along the way are reported by
    `finalize`.
    """

    # Matches "and" only as a whole word, so that street names which merely
    # contain those letters ("Grand Street", "Vanderbilt Avenue", ...) are
    # not mistaken for a range such as "37th and 38th Streets".
    _AND_WORD = re.compile(r"\band\b")

    def __init__(self):
        # Where the pattern was found.
        self.n_title = 0
        self.n_alt_title = 0
        self.n_match = 0

        # How each matched record was (or was not) resolved to a point.
        self.n_grid = 0
        self.n_google_location = 0
        self.n_geocode_fail = 0
        self.n_boro_mismatch = 0

    def codeRecord(self, r):
        """Return an intersection locatable for record `r`, or None.

        The record's title is tried first, then each alternate title in
        order; the first string matching `boro_int` is used. None is returned
        when nothing matches, or when either street part contains the word
        "and" (i.e. it names more than one street).
        """
        src = None
        m = boro_int.match(r.title)
        if m:
            src = r.title
            self.n_title += 1
        else:
            for alt_title in r.alt_title:
                m = boro_int.match(alt_title)
                if m:
                    src = alt_title
                    self.n_alt_title += 1
                    break

        if not m:
            return None

        # Counted before the "and" filter: this is the number of pattern
        # matches, not the number of locatables produced.
        self.n_match += 1

        boro, str1, str2 = m.groups()
        # Whole-word test; a plain substring test would also reject streets
        # like "Grand Street" or "Cortlandt Street".
        if self._AND_WORD.search(str1) or self._AND_WORD.search(str2):
            return None
        # The greedy last group may have absorbed a trailing " ." or ".".
        str1 = str1.rstrip(". ")
        str2 = str2.rstrip(". ")
        (str1, str2) = sorted((str1, str2))  # try to increase cache coherence
        boro = boro.replace("Richmond", "Staten Island")

        assert src
        out: Locatable = {
            "type": "intersection",
            "source": src,
            "address": f"{str1} and {str2}, {boro}, NY",
            "data": (str1, str2, boro),
        }
        return out

    def getLatLonFromGeocode(self, geocode, data, record):
        """Resolve a locatable produced by `codeRecord` to (lat, lng) or None.

        For Manhattan, the street grid (`parse_street_ave` + `grid.code`) is
        tried first. Otherwise, or if that yields nothing, the geocode result
        is used, but only when the resulting point lies in the borough named
        in the title.
        """
        assert "data" in data
        ssb: tuple[str, str, str] = data["data"]
        (str1, str2, boro) = ssb
        if boro == "Manhattan":
            try:
                avenue, street = parse_street_ave(str1, str2)
                latlon = grid.code(avenue, street)
                if latlon:
                    self.n_grid += 1
                    lat, lng = latlon
                    return round(float(lat), 7), round(float(lng), 7)  # they're numpy floats
            except ValueError:
                # Not expressible as an avenue/street pair; fall through to
                # the geocode result below.
                pass
            # TODO: use extended-grid coder if possible; would require more street/avenue parsing.

        tlatlng = get_lat_lng_from_geocode(geocode, data)
        if not tlatlng:
            self.n_geocode_fail += 1
            return None
        _, lat, lng = tlatlng
        geocode_boro = point_to_borough(lat, lng)
        if geocode_boro != boro:
            # The geocode landed outside the borough named in the title, so
            # it is rejected rather than trusted.
            self.n_boro_mismatch += 1
            sys.stderr.write(
                f'Borough mismatch: {record.id}: {data["source"]} geocoded to {geocode_boro} not {boro}\n'
            )
            return None
        self.n_google_location += 1
        return (lat, lng)

    def finalize(self):
        """Write the match and geocoding counters to stderr."""
        sys.stderr.write(f" titles matched: {self.n_title}\n")
        sys.stderr.write(f"alt titles matched: {self.n_alt_title}\n")
        sys.stderr.write(f" total matches: {self.n_match}\n")
        sys.stderr.write(" geocoding results:\n")
        sys.stderr.write(f" grid: {self.n_grid}\n")
        sys.stderr.write(f" google: {self.n_google_location}\n")
        sys.stderr.write(f" boro mismatch: {self.n_boro_mismatch}\n")
        sys.stderr.write(f" failures: {self.n_geocode_fail}\n")

    def name(self):
        """Return the identifier this coder is registered under."""
        return "title-pattern"
57 changes: 57 additions & 0 deletions oldnyc/geocode/coders/title_pattern_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
from oldnyc.geocode.coders.title_pattern import TitlePatternCoder
from oldnyc.item import blank_item


def test_title_pattern():
    """Check "Borough: A - B" titles, the Richmond alias and "and" ranges."""
    coder = TitlePatternCoder()
    record = blank_item()

    def intersection(source, first, second, borough):
        # Expected locatable for a matched title.
        return {
            "type": "intersection",
            "source": source,
            "address": f"{first} and {second}, {borough}, NY",
            "data": (first, second, borough),
        }

    # Simple two-street title.
    manhattan_title = "Manhattan: 10th Street (East) - Broadway"
    record.title = manhattan_title
    assert tp_result(coder, record) == intersection(
        manhattan_title, "10th Street (East)", "Broadway", "Manhattan"
    )

    # "Richmond" is reported as Staten Island.
    richmond_title = "Richmond: 3rd Street - New Dorp Lane"
    record.title = richmond_title
    assert tp_result(coder, record) == intersection(
        richmond_title, "3rd Street", "New Dorp Lane", "Staten Island"
    )

    # A part naming two streets ("37th and 38th") is not an intersection.
    record.title = "Manhattan: 6th Avenue - 37th and 38th Streets (West)"
    assert tp_result(coder, record) is None


def tp_result(coder, record):
    """Run the coder on a record; kept separate so the assertions read cleanly."""
    return coder.codeRecord(record)


def test_alt_title():
    """Check that alternate titles are used when the main title does not match."""
    coder = TitlePatternCoder()
    record = blank_item()

    # The main title is free text; the alternate title carries the pattern.
    record.title = "Feast of Our Lady of Mount Carmel."
    record.alt_title = ["Manhattan: 1st Avenue - 112th Street ."]
    expected = {
        "type": "intersection",
        "source": record.alt_title[0],
        "address": "112th Street and 1st Avenue, Manhattan, NY",
        "data": ("112th Street", "1st Avenue", "Manhattan"),
    }
    assert coder.codeRecord(record) == expected

    # 730343f: the alternate title has ';'-separated extras, so it is
    # currently not matched at all.
    record.title = "General view - Cedar Street - South."
    record.alt_title = [
        "Manhattan: Cedar Street - Pearl Street ; 1 Cedar Street ; Municipal Building."
    ]
    result = coder.codeRecord(record)
    assert result is None

    # TODO: split on ';' and try each one
    # == {
    #     "type": "intersection",
    #     "source": item.alt_title[0],
    #     "address": "Cedar Street and Pearl Street, Manhattan, NY",
    #     "data": ("Cedar Street", "Pearl Street", "Manhattan"),
    # }
28 changes: 17 additions & 11 deletions oldnyc/geocode/diff_geojson.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,10 +111,11 @@ def diff_geojson(
-geom: {len(dropped_geometry_ids):,}
""")

if added_ids and num_samples:
print("\nSample of additions:")
add_samples = min(num_samples, len(added_ids))
for k in random.sample([*added_ids], add_samples):
if (added_ids or added_geometry_ids) and num_samples:
print("\nSample of additions / +geom:")
# Additions are sampled from added_ids (not dropped_ids): the guard above tests added_ids and the loop below looks each id up in `new`.
both = added_ids.union(added_geometry_ids)
add_samples = min(num_samples, len(both))
for k in random.sample([*both], add_samples):
props = new[k]["properties"]
b = props["geocode"]
title = (
Expand All @@ -126,18 +127,23 @@ def diff_geojson(
print(f" {k:6}: {title}")
print(f' + {b.get("lat"):.6f},{b.get("lng"):.6f} {b.get("technique")}')

if dropped_ids and num_samples:
print("\nSample of dropped:")
drop_samples = min(num_samples, len(dropped_ids))
for k in random.sample([*dropped_ids], drop_samples):
a = old[k]["properties"]["geocode"]
print(f' {k:6}: {a.get("original_title", "original title not found")}')
if (dropped_ids or dropped_geometry_ids) and num_samples:
both = dropped_ids.union(dropped_geometry_ids)
print("\nSample of dropped / -geom:")
drop_samples = min(num_samples, len(both))
for k in random.sample([*both], drop_samples):
props = old[k]["properties"]
a = props["geocode"]
title = props.get("original_title") or props.get("title") or "original title not found"
print(f" {k:6}: {title}")
print(f' - {a.get("lat"):.6f},{a.get("lng"):.6f} {a.get("technique")}')

if changed_ids and num_samples:
print("\nSample of changes:")
changed_samples = min(num_samples, len(changed_ids))
for k in random.sample([*changed_ids], changed_samples):
props = old[k]["properties"]
title = props.get("original_title") or props.get("title") or "original title not found"
a = old[k]["properties"]["geocode"]
b = new[k]["properties"]["geocode"]
a_lat = a.get("lat")
Expand All @@ -146,7 +152,7 @@ def diff_geojson(
b_lng = b.get("lng")
d_meters = haversine((a_lat, a_lng), (b_lat, b_lng)) * 1000

print(f' {k:6}: {a.get("original_title", "original title not found")}')
print(f" {k:6}: {title}")
print(f' - {a_lat:.6f},{a_lng:.6f} {a.get("technique")}')
print(f' + {b_lat:.6f},{b_lng:.6f} {b.get("technique")}')
print(f" Moved {d_meters:0,.0f} meters")
Expand Down
9 changes: 5 additions & 4 deletions oldnyc/geocode/geocode.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,12 @@
from dotenv import load_dotenv

from oldnyc.geocode import generate_js, geocoder
from oldnyc.geocode.coders import extended_grid, gpt, milstein, subjects
from oldnyc.geocode.coders import extended_grid, gpt, milstein, subjects, title_pattern
from oldnyc.geocode.geocode_types import Coder, Locatable, Location
from oldnyc.item import Item, load_items

CODERS: dict[str, Callable[[], Coder]] = {
"title-pattern": title_pattern.TitlePatternCoder,
"extended-grid": extended_grid.ExtendedGridCoder,
"milstein": milstein.MilsteinCoder,
"subjects": subjects.SubjectsCoder,
Expand All @@ -42,7 +43,7 @@
parser.add_argument(
"-c",
"--coders",
default="extended-grid,milstein,subjects",
default="title-pattern,extended-grid,milstein,subjects",
help="Set to a comma-separated list of coders. Coders run in the specified order.",
)

Expand Down Expand Up @@ -90,7 +91,7 @@

if args.geocode:
api_key = os.environ.get("GOOGLE_MAPS_API_KEY")
g = geocoder.Geocoder(args.use_network, 2) # 2s between geocodes
g = geocoder.Geocoder(args.use_network, 2, api_key) # 2s between geocodes
if args.use_network and not api_key:
raise ValueError("Must set GOOGLE_MAPS_API_KEY with --use_network")
else:
Expand Down Expand Up @@ -151,7 +152,7 @@
geocode_result = None
address = location_data["address"]
try:
geocode_result = g.Locate(address)
geocode_result = g.Locate(address, True, r.id)
except urllib.error.HTTPError as e:
if e.status == 400:
sys.stderr.write(f"Bad request: {address}\n")
Expand Down
10 changes: 6 additions & 4 deletions oldnyc/geocode/geocoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ def _cache_result(self, loc: str, result: bytes):
cache_file = _cache_file(loc)
open(cache_file, "wb").write(result)

def _fetch(self, url: str):
def _fetch(self, url: str, debug_txt: Optional[str]):
"""Attempts to fetch the URL. Does rate throttling. Returns XML."""
now = time.time()
diff = now - self._last_fetch
Expand All @@ -68,7 +68,7 @@ def _fetch(self, url: str):
time.sleep(self._wait_time - diff)
self._last_fetch = time.time()

sys.stderr.write("Fetching %s\n" % url)
sys.stderr.write(f"Fetching {url} ({debug_txt})\n")
assert self._api_key
# Note: API key is _not_ part of the cache key
f = urllib.request.urlopen(url + "&key=" + self._api_key)
Expand All @@ -81,7 +81,9 @@ def _check_for_lat_lon(self, address: str):
return FakeResponse % (m.group(1), m.group(2))

# TODO: get a more precise return type from the GMaps API
def Locate(self, address: str, check_cache=True) -> dict[str, Any] | None:
def Locate(
self, address: str, check_cache=True, debug_txt: Optional[str] = None
) -> dict[str, Any] | None:
"""Returns a maps API JSON response for the address or None.

Address should be a fully-qualified address, e.g.
Expand All @@ -103,7 +105,7 @@ def Locate(self, address: str, check_cache=True) -> dict[str, Any] | None:
sys.stderr.write(f"Would have geocoded with network: {address}\n")
# XXX this should probably raise instead
return None
data = self._fetch(url)
data = self._fetch(url, f"{debug_txt}: {address}")

if not data:
return None
Expand Down
Loading