Skip to content

Commit

Permalink
Use GPT for geocoding (#164)
Browse files Browse the repository at this point in the history
* look at patterns

* clean/normalize alt_title

* clean/normalize alt_title data update

* skip boro:A-B pattern for GPT

* types

* unit test for title_pattern

* plug in title-pattern geocoder

* fix src, stub for extended-grid

* update logs

* drop boro_int filter

* rv irrelevant bits

* drop one more

* update GPT geocodes

* avoid feeding address into gpt

* update gpt geocodes

* try a new prompt

* old/bad geocodes analysis

* do not print geocodes

* Output added/dropped geometries

* match an address

* more tests

* update test data; no more milstein or extended-grid!

* adjust log format, add --print_geocodes

* rewrite directional streets for better geocoding

* cleanup

* try to match milstein punctuation; add special cases coder

* with careful punctuation, down to 48 / 87

* more special cases; 29 / 69

* slot in special cases coder

* update test data

* update geocache

* restore the prompt I actually used

* use subjects location for Columbus Circle

* update site data

* add sizes of new images
  • Loading branch information
danvk authored Nov 13, 2024
1 parent 1a15189 commit 35a419c
Show file tree
Hide file tree
Showing 19 changed files with 80,916 additions and 11,934 deletions.
91,123 changes: 79,567 additions & 11,556 deletions data/gpt-geocodes.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion data/lat-lon-to-ids.json

Large diffs are not rendered by default.

593 changes: 593 additions & 0 deletions data/self-hosted-sizes.txt

Large diffs are not rendered by default.

Binary file modified geocache.tgz
Binary file not shown.
66 changes: 66 additions & 0 deletions oldnyc/analysis/invalid_geocodes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
"""How many old geocodes are invalid?"""

import base64
import binascii
import json
import re
from pathlib import Path


def softdecode(s: str) -> str:
    """Base64-decode ``s``, tolerating up to three unusable trailing characters.

    The string is decoded as-is first; each time ``base64.b64decode`` rejects
    it with ``binascii.Error`` the last character is dropped and decoding is
    retried, for four attempts in total (``s``, ``s[:-1]``, ``s[:-2]``,
    ``s[:-3]``).

    Args:
        s: Base64 text, possibly cut off mid-quantum.

    Returns:
        The decoded bytes interpreted as latin-1 text.

    Raises:
        ValueError: If none of the four candidates decodes. The message
            carries the original input, not the trimmed remainder.
    """
    candidate = s
    for _ in range(4):
        try:
            # latin-1 maps every byte value to a code point, so the decode
            # step itself cannot fail; only b64decode can reject the input.
            return base64.b64decode(candidate).decode("latin-1")
        except binascii.Error:
            candidate = candidate[:-1]
    raise ValueError(f"Cannot decode: {s}")


def main():
    """Tally how cached intersection geocodes fared and dump the misses.

    Every file in the ``geocache`` directory is visited. Its file name is
    decoded with ``softdecode`` to recover the query, and only queries that
    contain ``&`` or the word ``and`` are treated as intersection queries.
    Each such cache file is loaded as JSON and classified as:

    * zero results (``status`` is ``ZERO_RESULTS``),
    * a hit (the first result's ``types`` include ``intersection``), or
    * a miss (anything else).

    Misses are further split by file modification time into old and new, and
    each miss is appended as one JSON line to ``/tmp/failed-geocodes.ndjson``.
    Summary counts are printed to stdout at the end.
    """
    geocache = Path("geocache")
    n_cache = 0
    n_int_q = 0
    n_hit, n_zero, n_miss = 0, 0, 0
    n_old_miss, n_new_miss = 0, 0
    # The context manager guarantees the NDJSON file is flushed and closed,
    # including when an assertion or a malformed cache entry aborts the scan.
    with open("/tmp/failed-geocodes.ndjson", "w") as out:
        for path in geocache.iterdir():
            n_cache += 1
            query = softdecode(path.name)
            if not re.search(r"&|\band\b", query):
                continue

            n_int_q += 1
            with open(path) as f:
                data = json.load(f)
            assert "results" in data, path
            if data["status"] == "ZERO_RESULTS":
                n_zero += 1
                continue
            assert len(data["results"]) > 0, path
            if "intersection" in data["results"][0]["types"]:
                n_hit += 1
                continue

            n_miss += 1
            is_old = path.stat().st_mtime < 1447339825  # 2015-11-12
            if is_old:
                n_old_miss += 1
            else:
                n_new_miss += 1
            out.write(json.dumps({"query": query, "data": data}) + "\n")

    print(f"Cache: {n_cache}")
    print(f"Intersection queries: {n_int_q}")
    print(f"Hit: {n_hit}")
    print(f"Zero results: {n_zero}")
    print(f"Miss: {n_miss}")
    print(f"{n_new_miss=}, {n_old_miss=}")


if __name__ == "__main__":
main()
26 changes: 26 additions & 0 deletions oldnyc/geocode/boroughs.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
"""Utility for mapping a lat/lon to a borough."""

import json
import re

from oldnyc.geocode import shape_utils
from oldnyc.ingest.util import BOROUGHS
from oldnyc.item import Item

BOROUGHS_JSON_FILE = "data/originals/borough-polygons.json"

Expand All @@ -24,3 +27,26 @@ def point_to_borough(lat: float, lon: float) -> str | None:
if shape_utils.PointInPolygon(pt, v):
return k
return None


boroughs_pat = r"(?:Manhattan|Brooklyn|Queens|Bronx|Staten Island|Richmond)"
borough_re = re.compile(rf"\b({boroughs_pat})\b")


def guess_borough(item: Item):
    """Return the borough an item most likely belongs to, or None.

    For each name in ``BOROUGHS``, in order, the item matches when any of
    these holds:

    * ``"<name> (New York, N.Y.)"`` appears in ``item.subject.geographic``;
    * ``item.source`` ends with ``"/ <name>"``;
    * the title or any alternate title starts with the name.

    If no borough matches that way, the first title containing a borough name
    as a whole word (per ``borough_re``) decides, and the matched text is
    returned.
    """
    all_titles = [item.title] + item.alt_title

    for name in BOROUGHS:
        if (
            f"{name} (New York, N.Y.)" in item.subject.geographic
            or item.source.endswith(f"/ {name}")
            or any(title.startswith(name) for title in all_titles)
        ):
            return name

    # Fallback: a borough name mentioned anywhere in a title.
    hits = (borough_re.search(title) for title in all_titles)
    first_hit = next((hit for hit in hits if hit), None)
    return first_hit.group(1) if first_hit else None
104 changes: 70 additions & 34 deletions oldnyc/geocode/coders/gpt.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,51 +3,48 @@
import json
import sys

from oldnyc.geocode import grid
from oldnyc.geocode.boroughs import guess_borough, point_to_borough
from oldnyc.geocode.coders.coder_utils import get_lat_lng_from_geocode
from oldnyc.geocode.coders.extended_grid import parse_street_ave
from oldnyc.geocode.geocode_types import Coder, Locatable
from oldnyc.geocode.geogpt.generate_batch import GptResponse, guess_borough
from oldnyc.geocode.geogpt.generate_batch import GptResponse
from oldnyc.item import Item


class GptCoder(Coder):
queries: dict[str, GptResponse]

def __init__(self):
# with open("geogpt/geocodes.json") as f:
# with open("/tmp/extracted-structure.json") as f:
with open("/tmp/extracted-structure+text.json") as f:
with open("data/gpt-geocodes.json") as f:
self.queries = json.load(f)
self.num_intersection = 0
self.num_address = 0
self.num_poi = 0
# TODO: can this be moved up top now?
from oldnyc.geocode.coders.milstein import MilsteinCoder

# Could also use extended-grid coder
self.milstein = MilsteinCoder()
self.n_grid = 0
self.n_google_location = 0
self.n_geocode_fail = 0
self.n_boro_mismatch = 0

def codeRecord(self, r: Item):
# GPT location extractions are always based on record ID, not photo ID.
id = r.id.split("-")[0]
q = self.queries.get(id)
if not q:
return None
sys.stderr.write(f"GPT location: {r.id} {q}\n")
# sys.stderr.write(f"GPT location: {r.id} {q}\n")

if q["type"] == "not in NYC" or q["type"] == "no location information":
if q["type"] in ("no location information", "not in NYC"):
return None

loc: Locatable | None = None
boro = guess_borough(r)
if boro is None:
sys.stderr.write(f"Failed to guess borough for {r.id}\n")
boro = "New York"
if q["type"] == "place_name":
self.num_poi += 1
return None
# place = q["place_name"]
# loc = {
# "address": f"{place}, {boro}, NY",
# "source": place,
# "type": ["point_of_interest", "premise"],
# }
elif q["type"] == "address":
self.num_address += 1
num = q["number"]
Expand All @@ -57,36 +54,75 @@ def codeRecord(self, r: Item):
address=f"{address}, {boro}, NY",
source=address,
type=["street_address", "premise"],
data=q,
data={**q, "boro": boro},
)
elif q["type"] == "intersection":
self.num_intersection += 1
street1 = q["street1"]
street2 = q["street2"]
str1 = q["street1"]
str2 = q["street2"]
(str1, str2) = sorted((str1, str2)) # try to increase cache coherence
loc = Locatable(
address=f"{street1} & {street2}, {boro}, NY",
source=f"{street1} & {street2}",
address=f"{str1} and {str2}, {boro}, NY",
source=f"{str1} and {str2}",
type="intersection",
data=q,
data=(str1, str2, boro),
)
sys.stderr.write(f"GPT location: {r.id} {loc}\n")
# sys.stderr.write(f"GPT location: {r.id} {loc}\n")
return loc

def getLatLonFromGeocode(self, geocode, data, record: Item):
result = self.milstein.getLatLonFromGeocode(geocode, data, record)
if not result:
sys.stderr.write(f"gpt geocode failed: {record.id}\n")
sys.stderr.write(json.dumps(data) + "\n")
sys.stderr.write(json.dumps(geocode) + "\n")
else:
tll = get_lat_lng_from_geocode(geocode, data)
sys.stderr.write(f"gpt geocode success: {record.id} {tll}: {data}\n")
return result
# TODO: next two methods are nearly identical to those in title_pattern.py
def getLatLonFromLocatable(self, r, data):
    """Try to place a Manhattan intersection on the street grid.

    Only locatables whose ``type`` is ``"intersection"`` and whose ``data``
    tuple ``(street, street, borough)`` names Manhattan are attempted. The two
    streets go through ``parse_street_ave`` and the result is passed to
    ``grid.code``. On success ``self.n_grid`` is incremented and a
    ``(lat, lng)`` pair of plain floats rounded to 7 decimals is returned.
    Any ``ValueError`` raised along the way yields ``None``.
    """
    if data["type"] != "intersection":
        return None
    assert "data" in data
    first, second, borough = data["data"]
    if borough != "Manhattan":
        return None
    try:
        avenue, street = parse_street_ave(first, second)
        coords = grid.code(avenue, street)
        if not coords:
            return None
        self.n_grid += 1
        lat, lng = coords
        # The original comment notes these are numpy floats; convert to built-in float.
        return round(float(lat), 7), round(float(lng), 7)
    except ValueError:
        return None

def getLatLonFromGeocode(self, geocode, data, record):
    """Accept a geocoder result only if it lands in the expected borough.

    The expected borough is read from ``data["data"]``: the third tuple
    element for intersections, the ``"boro"`` key for street addresses, and
    ``None`` for anything else. The lat/lng is extracted with
    ``get_lat_lng_from_geocode``; if that yields nothing,
    ``self.n_geocode_fail`` is incremented and ``None`` is returned. If
    ``point_to_borough`` disagrees with the expected borough, the mismatch is
    counted in ``self.n_boro_mismatch``, logged to stderr, and ``None`` is
    returned. An expected borough of ``"New York"`` is allowed to resolve to
    ``"Manhattan"``. Otherwise ``self.n_google_location`` is incremented and
    ``(lat, lng)`` is returned.
    """
    assert "data" in data
    kind = data["type"]
    if kind == "intersection":
        _, _, boro = data["data"]
    elif "street_address" in kind:
        boro = data["data"]["boro"]
    else:
        boro = None

    located = get_lat_lng_from_geocode(geocode, data)
    if not located:
        self.n_geocode_fail += 1
        return None

    _, lat, lng = located
    geocode_boro = point_to_borough(lat, lng)
    boroughs_agree = geocode_boro == boro or (
        boro == "New York" and geocode_boro == "Manhattan"
    )
    if not boroughs_agree:
        self.n_boro_mismatch += 1
        sys.stderr.write(
            f'gpt Borough mismatch: {record.id}: {data["source"]} geocoded to {geocode_boro} not {boro}\n'
        )
        return None

    self.n_google_location += 1
    return (lat, lng)

def finalize(self):
    """Write the extraction-type counts and geocoding outcome counts to stderr."""
    report = (
        f"GPT POI: {self.num_poi}\n",
        f"GPT address: {self.num_address}\n",
        f"GPT intersection: {self.num_intersection}\n",
        "GPT geocoding results:\n",
        f" grid: {self.n_grid}\n",
        f" google: {self.n_google_location}\n",
        f" boro mismatch: {self.n_boro_mismatch}\n",
        f" failures: {self.n_geocode_fail}\n",
    )
    for line in report:
        sys.stderr.write(line)

def name(self):
    """Return the short identifier of this coder."""
    coder_name = "gpt"
    return coder_name
73 changes: 73 additions & 0 deletions oldnyc/geocode/coders/special_cases.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
import sys
from collections import Counter

from oldnyc.geocode.geocode_types import Coder, Locatable
from oldnyc.item import Item


class SpecialCasesCoder(Coder):
    """Coder that pins a few hand-picked records to hard-coded coordinates.

    ``codeRecord`` recognises a record by its title or subject name and
    returns a ``Locatable`` whose ``data`` already holds the ``(lat, lon)``
    pair. ``getLatLonFromLocatable`` hands that pair straight back, and
    ``getLatLonFromGeocode`` always returns ``None``.
    """

    def __init__(self):
        # How many records each special case matched; reported by finalize().
        self.counts = Counter[str]()

    def _located(self, label, address, source, latlon):
        """Count a hit under ``label`` and build its fixed-position Locatable."""
        self.counts[label] += 1
        return Locatable(
            address=address,
            source=source,
            type="address",
            data=latlon,
        )

    def codeRecord(self, r: Item):
        """Return the hard-coded location for ``r``, or None if no rule applies.

        Rules are tried in a fixed order and the first match wins: two title
        prefixes, two subject names, then a title/alternate-title prefix.
        """
        if r.title.startswith("Newspapers - China Daily News"):
            # The 2013 Milstein CSV file has a bunch of addresses along Mott Street for these.
            return self._located(
                "China Daily News",
                "105 Mott Street, Manhattan, NY",
                "China Daily News",
                (40.7173856, -73.9975334),
            )

        if r.title.startswith("Squatters Colony - Camp Thomas Paine."):
            # 733208f mentions 75th Street, but the 2013 CSV file has this address.
            return self._located(
                "Squatters: Camp Thomas Paine",
                "West 70th Street and Riverside, Manhattan, N.Y.",
                "Camp Thomas Paine",
                (40.779554, -73.988017),
            )

        if "Cathedral of St. John the Divine (New York, N.Y.)" in r.subject.name:
            return self._located(
                "St. John the Divine",
                "Cathedral of St. John the Divine, New York, N.Y.",
                "Cathedral of St. John the Divine",
                (40.8038356, -73.9618754),
            )

        if "Mount Sinai Hospital (New York, N.Y.)" in r.subject.name:
            # The address previously ended in a stray ")"; it now matches the
            # "..., New York, N.Y." form used by the other entries.
            return self._located(
                "Mt. Sinai",
                "Mount Sinai Hospital, New York, N.Y.",
                "Mount Sinai Hospital",
                (40.789196, -73.954817),
            )

        titles = [r.title] + r.alt_title
        if any(title.startswith("Manhattan: Columbus Circle") for title in titles):
            return self._located(
                "Columbus Circle",
                "Columbus Circle, Manhattan, N.Y.",
                "Columbus Circle",
                (40.76808, -73.981896),
            )

        return None

    def getLatLonFromLocatable(self, r, data):
        """Return the coordinate pair stored in ``data["data"]`` by codeRecord."""
        assert "data" in data
        return data["data"]

    def getLatLonFromGeocode(self, geocode, data, record):
        """Always return None; this coder takes nothing from geocoder responses."""
        return None

    def finalize(self):
        """Write the per-rule match counts to stderr, most frequent first."""
        sys.stderr.write(f"Special cases: {self.counts.most_common()}\n")

    def name(self):
        """Return the short identifier of this coder."""
        return "special"
Loading

0 comments on commit 35a419c

Please sign in to comment.