-
Notifications
You must be signed in to change notification settings - Fork 130
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* look at patterns * clean/normalize alt_title * clean/normalize alt_title data update * skip boro:A-B pattern for GPT * types * unit test for title_pattern * plug in title-pattern geocoder * fix src, stub for extended-grid * update logs * drop boro_int filter * rv irrelevant bits * drop one more * update GPT geocodes * avoid feeding address into gpt * update gpt geocodes * try a new prompt * old/bad geocodes analysis * do not print geocodes * Output added/dropped geometries * match an address * more tests * update test data; no more milstein or extended-grid! * adjust log format, add --print_geocodes * rewrite directional streets for better geocoding * cleanup * try to match milstein punctuation; add special cases coder * with careful punctuation, down to 48 / 87 * more special cases; 29 / 69 * slot in special cases coder * update test data * update geocache * restore the prompt I actually used * use subjects location for Columbus Circle * update site data * add sizes of new images
- Loading branch information
Showing
19 changed files
with
80,916 additions
and
11,934 deletions.
There are no files selected for viewing
Large diffs are not rendered by default.
Oops, something went wrong.
Large diffs are not rendered by default.
Oops, something went wrong.
Large diffs are not rendered by default.
Oops, something went wrong.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
"""How many old geocodes are invalid?""" | ||
|
||
import base64 | ||
import binascii | ||
import json | ||
import re | ||
from pathlib import Path | ||
|
||
|
||
def softdecode(s: str) -> str:
    """Base64-decode ``s``, tolerating up to three trailing junk characters.

    The string is decoded as-is first. Each time ``base64.b64decode``
    rejects it, the last character is dropped and decoding is retried,
    for four attempts in total.

    Args:
        s: The (possibly damaged) base64 text.

    Returns:
        The decoded bytes interpreted as latin-1 text.

    Raises:
        ValueError: If none of the four attempts decodes. The message
            carries the original, untrimmed input.
    """
    original = s  # keep the untrimmed input for diagnostics
    try:
        for _ in range(4):
            try:
                return base64.b64decode(s).decode("latin-1")
            except binascii.Error:
                # Drop the last character and try again.
                s = s[:-1]
    except UnicodeDecodeError:
        print(f"Cannot decode: {original}")
        raise
    raise ValueError(f"Cannot decode: {original}")
|
||
|
||
def main():
    """Tally how cached geocodes for intersection-style queries turned out.

    Walks every entry in the ``geocache`` directory, decodes its file name
    into the original query with ``softdecode``, and keeps only queries
    containing ``&`` or the word ``and``. Each kept entry is counted as a
    zero-result response, a hit (first result has the ``intersection``
    type) or a miss. Misses are further split by file modification time
    and appended to ``/tmp/failed-geocodes.ndjson``, one JSON object per
    line. Summary counts are printed at the end.
    """
    geocache = Path("geocache")
    n_cache = 0
    n_int_q = 0
    n_hit, n_zero, n_miss = 0, 0, 0
    n_old_miss, n_new_miss = 0, 0
    # The context manager guarantees the report is flushed and closed even
    # if one of the assertions below aborts the run.
    with open("/tmp/failed-geocodes.ndjson", "w") as out:
        for path in geocache.iterdir():
            n_cache += 1
            query = softdecode(path.name)
            if not re.search(r"&|\band\b", query):
                continue

            n_int_q += 1
            with open(path) as f:
                data = json.load(f)
            assert "results" in data, path
            if data["status"] == "ZERO_RESULTS":
                n_zero += 1
                continue
            assert len(data["results"]) > 0, path
            if "intersection" in data["results"][0]["types"]:
                n_hit += 1
            else:
                n_miss += 1
                is_old = path.stat().st_mtime < 1447339825  # 2015-11-12
                if is_old:
                    n_old_miss += 1
                else:
                    n_new_miss += 1
                # NOTE(review): indentation was lost in the reviewed copy;
                # the writes are assumed to belong to the miss branch, per
                # the output file name -- confirm against the original.
                out.write(json.dumps({"query": query, "data": data}))
                out.write("\n")

    print(f"Cache: {n_cache}")
    print(f"Intersection queries: {n_int_q}")
    print(f"Hit: {n_hit}")
    print(f"Zero results: {n_zero}")
    print(f"Miss: {n_miss}")
    print(f"{n_new_miss=}, {n_old_miss=}")
|
||
|
||
# Run the analysis only when this file is executed as a script.
if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,73 @@ | ||
import sys | ||
from collections import Counter | ||
|
||
from oldnyc.geocode.geocode_types import Coder, Locatable | ||
from oldnyc.item import Item | ||
|
||
|
||
class SpecialCasesCoder(Coder):
    """Coder that pins a handful of known records to hard-coded locations.

    Each special case is recognised from the record's title, alternate
    titles or subject names, and yields a ``Locatable`` that already
    carries its latitude/longitude in ``data``.
    """

    def __init__(self):
        # Number of records matched per special case; reported in finalize().
        self.counts = Counter[str]()

    def codeRecord(self, r: Item):
        """Return a fixed ``Locatable`` if ``r`` matches a special case.

        Cases are checked in order and the first match wins. Falls through
        (returning ``None``) when no case applies.
        """
        if r.title.startswith("Newspapers - China Daily News"):
            # The 2013 Milstein CSV file has a bunch of addresses along Mott Street for these.
            self.counts["China Daily News"] += 1
            return Locatable(
                address="105 Mott Street, Manhattan, NY",
                source="China Daily News",
                type="address",
                data=(40.7173856, -73.9975334),
            )

        if r.title.startswith("Squatters Colony - Camp Thomas Paine."):
            # 733208f mentions 75th Street, but the 2013 CSV file has this address.
            self.counts["Squatters: Camp Thomas Paine"] += 1
            return Locatable(
                address="West 70th Street and Riverside, Manhattan, N.Y.",
                source="Camp Thomas Paine",
                type="address",
                data=(40.779554, -73.988017),
            )

        if "Cathedral of St. John the Divine (New York, N.Y.)" in r.subject.name:
            self.counts["St. John the Divine"] += 1
            return Locatable(
                address="Cathedral of St. John the Divine, New York, N.Y.",
                source="Cathedral of St. John the Divine",
                type="address",
                data=(40.8038356, -73.9618754),
            )

        if "Mount Sinai Hospital (New York, N.Y.)" in r.subject.name:
            self.counts["Mt. Sinai"] += 1
            return Locatable(
                # Stray trailing ")" removed from the address text.
                address="Mount Sinai Hospital, New York, N.Y.",
                source="Mount Sinai Hospital",
                type="address",
                data=(40.789196, -73.954817),
            )

        # Columbus Circle may appear in the main title or any alternate title.
        titles = [r.title] + r.alt_title
        for title in titles:
            if title.startswith("Manhattan: Columbus Circle"):
                self.counts["Columbus Circle"] += 1
                return Locatable(
                    address="Columbus Circle, Manhattan, N.Y.",
                    source="Columbus Circle",
                    type="address",
                    data=(40.76808, -73.981896),
                )

    def getLatLonFromLocatable(self, r, data):
        """Return the coordinates stored under ``data["data"]``."""
        assert "data" in data
        return data["data"]

    def getLatLonFromGeocode(self, geocode, data, record):
        """Always return ``None``; this coder does not read geocode results."""
        return None

    def finalize(self):
        """Write the per-case match counts to stderr, most common first."""
        sys.stderr.write(f"Special cases: {self.counts.most_common()}\n")

    def name(self):
        """Return this coder's identifier, ``"special"``."""
        return "special"
Oops, something went wrong.