diff --git a/oldnyc/geocode/coders/nyc_parks.py b/oldnyc/geocode/coders/subjects.py similarity index 94% rename from oldnyc/geocode/coders/nyc_parks.py rename to oldnyc/geocode/coders/subjects.py index 51d56b4b..e7c0ab9c 100755 --- a/oldnyc/geocode/coders/nyc_parks.py +++ b/oldnyc/geocode/coders/subjects.py @@ -1,9 +1,16 @@ -#!/usr/bin/python -# -# Look for well-known NYC parks. +"""Geocode using the "subjects" field and places of interest in the title. +The idea is that if an item has a subjects/geographic field like this: + +[ + "Prospect Park", + "Grand Army Plaza", +] + +then we can look these two up in subjects.geojson, see that "Grand Army Plaza" +is more specific, and use that for geocoding. +""" -import fileinput import re import sys from collections import Counter, defaultdict @@ -12,9 +19,8 @@ from oldnyc.geocode.geocode_types import Coder, Locatable from oldnyc.geojson_utils import assert_point -from oldnyc.item import blank_item -# TODO: move these into a data file, maybe GeoJSON +# TODO: use subjects.geojson instead of these lists parks = { "Bronx Park": (40.856389, -73.876667), "Claremont Park": (40.840546, -73.907469), @@ -271,7 +277,7 @@ def is_address_close(a: str, b: str) -> bool: return abs(a_lat - b_lat) < 0.0001 and abs(a_lon - b_lon) < 0.0001 # ~11m -class NycParkCoder(Coder): +class SubjectsCoder(Coder): geo_to_location: dict[str, tuple[int, pygeojson.Point]] counters: Counter[str] @@ -283,11 +289,6 @@ def __init__(self): for f in features if f.geometry } - # for f in features: - # if f.properties.get("result") == "pier": - # self.geo_to_location[f.properties["geo"]] = pygeojson.Point( - # (-73.9737, 40.74421) - # ) self.counters = Counter() @@ -463,30 +464,4 @@ def finalize(self): # sys.stderr.write("%4d\t%s\n" % (v, k)) def name(self): - return "nyc-parks" - - -# For fast iteration -if __name__ == "__main__": - coder = NycParkCoder() - r = blank_item() - num_ok, num_bad = 0, 0 - for line in fileinput.input(): - addr = line.strip() - if not addr: - continue - r.address = addr - result = coder.codeRecord(r) - - if result: - num_ok += 1 - print('"%s" -> %s' % (addr, result)) - else: - num_bad += 1 - - coder.finalize() - - sys.stderr.write( - "Parsed %d / %d = %.4f records\n" - % (num_ok, num_ok + num_bad, 1.0 * num_ok / (num_ok + num_bad)) - ) + return "subjects" diff --git a/oldnyc/geocode/collect_subjects.py b/oldnyc/geocode/collect_subjects.py index 932eb3fa..f37d6039 100755 --- a/oldnyc/geocode/collect_subjects.py +++ b/oldnyc/geocode/collect_subjects.py @@ -3,7 +3,7 @@ import sys from collections import Counter -from oldnyc.geocode.coders import nyc_parks +from oldnyc.geocode.coders import subjects from oldnyc.item import Item, load_items @@ -26,8 +26,8 @@ def print_geographics(items: list[Item]): counts[g] += 1 if g.endswith(" (New York, N.Y.)"): g = g.replace(" (New York, N.Y.)", "") - if g in nyc_parks.parks: - geo_to_loc[raw_g] = nyc_parks.parks[g] + if g in subjects.parks: + geo_to_loc[raw_g] = subjects.parks[g] for name, count in counts.most_common(): loc = geo_to_loc.get(name) or "" diff --git a/oldnyc/geocode/geocode.py b/oldnyc/geocode/geocode.py index fe4304dc..94e57b7b 100755 --- a/oldnyc/geocode/geocode.py +++ b/oldnyc/geocode/geocode.py @@ -15,14 +15,14 @@ from dotenv import load_dotenv from oldnyc.geocode import generate_js, geocoder -from oldnyc.geocode.coders import extended_grid, gpt, milstein, nyc_parks +from oldnyc.geocode.coders import extended_grid, gpt, milstein, subjects from oldnyc.geocode.geocode_types import Coder, Locatable, Location from oldnyc.item import Item, load_items CODERS: dict[str, Callable[[], Coder]] = { "extended-grid": extended_grid.ExtendedGridCoder, "milstein": milstein.MilsteinCoder, - "nyc-parks": nyc_parks.NycParkCoder, + "subjects": subjects.SubjectsCoder, "gpt": gpt.GptCoder, } @@ -42,7 +42,7 @@ parser.add_argument( "-c", "--coders", - default="extended-grid,milstein,nyc-parks", + default="extended-grid,milstein,subjects", help="Set to a comma-separated list of coders. Coders run in the specified order.", ) diff --git a/oldnyc/geocode/subjects/csv_to_geojson.py b/oldnyc/geocode/subjects/csv_to_geojson.py index 31a39ca1..4eb46023 100644 --- a/oldnyc/geocode/subjects/csv_to_geojson.py +++ b/oldnyc/geocode/subjects/csv_to_geojson.py @@ -7,7 +7,7 @@ import pygeojson -from oldnyc.geocode.coders.nyc_parks import IGNORE_SUBJECTS +from oldnyc.geocode.coders.subjects import IGNORE_SUBJECTS def main(): diff --git a/oldnyc/geocode/subjects/make_localturk_csv.py b/oldnyc/geocode/subjects/make_localturk_csv.py index f0f30eff..7920dfaf 100644 --- a/oldnyc/geocode/subjects/make_localturk_csv.py +++ b/oldnyc/geocode/subjects/make_localturk_csv.py @@ -11,7 +11,7 @@ import pygeojson from haversine import haversine -from oldnyc.geocode.coders.nyc_parks import IGNORE_SUBJECTS, NycParkCoder +from oldnyc.geocode.coders.subjects import IGNORE_SUBJECTS, SubjectsCoder from oldnyc.geojson_utils import assert_point from oldnyc.item import Item, load_items from oldnyc.util import encode_json_base64, pick @@ -35,7 +35,7 @@ def maybe_coords(p: pygeojson.Point | None): def main(): items = load_items("data/images.ndjson") - coder = NycParkCoder() + coder = SubjectsCoder() other_geocodes = pygeojson.load_feature_collection(open("/tmp/images.geojson")).features id_to_location = {str(f.id): assert_point(f.geometry) for f in other_geocodes if f.geometry} diff --git a/test/random200-geocoded.txt b/test/random200-geocoded.txt index 67ed64bb..05bf4577 100644 --- a/test/random200-geocoded.txt +++ b/test/random200-geocoded.txt @@ -78,7 +78,7 @@ 703041f milstein (40.6823228, -73.9706163) Atlantic Avenue and Carlton Avenue, Brooklyn, NY 707148f milstein (40.6205804, -73.9019884) Ave. V and Bergen Ave., Brooklyn, NY 716421f extended-grid (40.725081, -73.981232) @40.725081,-73.981232 -731946f nyc-parks (40.574926, -73.985941) @40.574926,-73.985941 +731946f subjects (40.574926, -73.985941) @40.574926,-73.985941 716604f failed n/a n/a 726128f milstein (40.7704092, -73.8246714) Bayside Avenue and Parsons Boulevard, Queens, NY 728569f milstein (40.6289529, -74.0797226) Beach Street and Van Duzer Street, Staten Island, NY @@ -96,9 +96,9 @@ 703479f failed n/a n/a 718754f milstein (40.7121586, -73.9806996) Cherry Street and Jackson Street, Manhattan, NY 703755f milstein (40.6739616, -74.0080168) Columbia Street and Creamer Street, Brooklyn, NY -719118f nyc-parks (40.76808, -73.981896) @40.768080,-73.981896 +719118f subjects (40.76808, -73.981896) @40.768080,-73.981896 733706f failed n/a n/a -732005f nyc-parks (40.574926, -73.985941) @40.574926,-73.985941 +732005f subjects (40.574926, -73.985941) @40.574926,-73.985941 1509541 failed n/a n/a 726456f milstein (40.6967542, -73.9005186) Cypress Avenue and Summerfield Avenue, Queens, NY 719259f milstein (40.7200098, -73.992885) Chrystie Street and Delancey Street, Manhattan, NY @@ -118,8 +118,8 @@ 726840f failed n/a n/a 720408f milstein (40.7155152, -73.9897764) Essex Street and Hester Street, Manhattan, NY 704920f milstein (40.697888, -73.99460499999999) Clark Street and Hicks Street, Brooklyn, NY -720438f nyc-parks (40.842551, -73.932621) @40.842551,-73.932621 -730836f nyc-parks (40.842308, -73.930277) @40.842308,-73.930277 +720438f subjects (40.842551, -73.932621) @40.842551,-73.932621 +730836f subjects (40.842308, -73.930277) @40.842308,-73.930277 704967f milstein (40.705439, -73.95633699999999) Hooper Street and Marcy Avenue, Brooklyn, NY 701306f failed n/a n/a 726942f milstein (40.7416676, -73.9542183) Jackson Avenue and Vernon Boulevard, Queens, NY @@ -142,19 +142,19 @@ 722402f milstein (40.8058042, -73.9386896) 126th Street and Park Avenue, Manhattan, NY 722101f extended-grid (40.749644, -73.979609) @40.749644,-73.979609 722430f milstein (40.71200320000001, -74.0081046) Broadway and Park Row, Manhattan, NY -734005f nyc-parks (40.705573, -74.001457) @40.705573,-74.001457 +734005f subjects (40.705573, -74.001457) @40.705573,-74.001457 701674f milstein (40.8376796, -73.85350729999999) Purdy Street and St. Raymond Avenue, Bronx, NY 104780 failed n/a n/a 729420f milstein (40.5732734, -74.1469118) Mill Road and Richmond Hill Road, Staten Island, NY 722881f milstein (40.7849851, -73.9826672) 79th Street and Riverside Drive, Manhattan, NY 722717f milstein (40.78691060000001, -73.9812484) 82nd Street and Riverside Drive, Manhattan, NY -719805f nyc-parks (40.861619, -73.933622) @40.861619,-73.933622 +719805f subjects (40.861619, -73.933622) @40.861619,-73.933622 723027f milstein (40.7584384, -73.9789121) 49th Street (West) and Rockefeller Plaza, Manhattan, NY 723036f failed n/a n/a 727922f failed n/a n/a 104536 failed n/a n/a 723323f milstein (40.7094957, -73.994007) Market Slip (West). and South Street, Manhattan, NY -734332f nyc-parks (40.873694, -73.911064) @40.873694,-73.911064 +734332f subjects (40.873694, -73.911064) @40.873694,-73.911064 706565f failed n/a n/a 723111f milstein (40.7275748, -73.9853065) 1st Avenue and St. Marks Place, Manhattan, NY 723134f milstein (40.8253111, -73.9437817) 147th Street and St. Nicholas Avenue, Manhattan, NY @@ -164,7 +164,7 @@ 702016f milstein (40.8608772, -73.8418068) Waring Avenue and Woodhull Avenue, Bronx, NY 707267f milstein (40.686453, -73.99392999999999) Court St. and Warren St., Brooklyn, NY 707268f milstein (40.686453, -73.99392999999999) Court St. and Warren St., Brooklyn, NY -731060f nyc-parks (40.846944, -73.928056) @40.846944,-73.928056 +731060f subjects (40.846944, -73.928056) @40.846944,-73.928056 724252f milstein (40.7087712, -74.00091499999999) Dover Street and Water Street, Manhattan, NY 702047f milstein (40.8659309, -73.8858516) 198th Street (East) and Webster Avenue, Bronx, NY 1558186 extended-grid (40.752191, -73.993472) @40.752191,-73.993472 @@ -182,9 +182,9 @@ 1113271 failed n/a n/a 730594f failed n/a n/a 731805f failed n/a n/a -734193f nyc-parks (40.540383, -74.135698) @40.540383,-74.135698 +734193f subjects (40.540383, -74.135698) @40.540383,-74.135698 1635861 failed n/a n/a -1635949 nyc-parks (40.790882, -73.775732) @40.790882,-73.775732 +1635949 subjects (40.790882, -73.775732) @40.790882,-73.775732 1635983 failed n/a n/a 1636238 failed n/a n/a 1663931 failed n/a n/a diff --git a/test/random200.logs.txt b/test/random200.logs.txt index 203c4c3f..bc7a9ef5 100644 --- a/test/random200.logs.txt +++ b/test/random200.logs.txt @@ -27,5 +27,5 @@ POI/subject geocoding: 1 n_title_park 53 extended-grid 81 milstein - 11 nyc-parks + 11 subjects 145 (total)