Skip to content

Commit

Permalink
More title patterns (#163)
Browse files Browse the repository at this point in the history
* allow "1st St", tighten types

* clean titles before matching

* Match "A between B and C"

* another test

* natsort

* bypass Google Maps geocoder for Manhattan grid

* only use natsort for between

* Add Manhattan prefix; down to 533 extended-grid

* strip leading ??

* data update

* look in source for Manhattan; 365 / 2038

* bugfix, split on ;

* strip trivia (340 / 2011)

* match "A at B, boro" (297 / 2010)

* add some notes; with --use_network, at 296 / 1914

* get more precise about "and" detection; 296 / 1168

* more patterns

* allow boro - str1 - str2; 296 / 1140, some bad ones here

* allow dash in num pat; 296 / 1135

* match braced clauses; 245 / 1114

* fix space before colon: 237 / 1098

* more tests

* update geocoding test data; stop pinning random500.csv

* update lat-lon-to-ids.json

* add test for 464848

* check for boundary on both sides of St

* strip potentially-distracting addresses (238 / 1085)

* update geocache

* data update

* update self-hosted-sizes

* update tests
  • Loading branch information
danvk authored Nov 10, 2024
1 parent 7d70122 commit 1a15189
Show file tree
Hide file tree
Showing 22 changed files with 804 additions and 273 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/e2etest.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ jobs:
export PYTHONPATH=.
poetry run oldnyc/geocode/geocode.py --images_ndjson data/images.ndjson --output_format geojson --ids_filter data/geocode/random500-ids.txt --geocode > /tmp/images.geojson
poetry run oldnyc/geocode/truth/make_localturk_csv.py data/geocode/random500-ids.txt /tmp/images.geojson data/geocode/random500.csv
# We don't actually care about the diff on this file, just that make_localturk_csv.py doesn't error out.
git checkout data/geocode/random500.csv
poetry run oldnyc/geocode/truth/generate_truth_gtjson.py > data/geocode/truth.geojson
jq -r '.features[].id' data/geocode/truth.geojson > data/geocode/truth-ids.txt
poetry run python oldnyc/geocode/subjects/csv_to_geojson.py data/subjects/out.csv > data/subjects.geojson
Expand Down
34 changes: 17 additions & 17 deletions data/images.ndjson

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion data/lat-lon-to-ids.json

Large diffs are not rendered by default.

143 changes: 143 additions & 0 deletions data/self-hosted-sizes.txt
Original file line number Diff line number Diff line change
Expand Up @@ -184,22 +184,152 @@
485728,477,760
485729,479,760
485730,476,760
485735,760,509
485742,481,760
485764,483,760
485767,500,760
485768,508,760
485770,480,760
485771,760,482
485774,760,516
485775,507,760
485776,484,760
485780,760,514
485781,760,513
485783,760,506
485784,760,510
485785,760,512
485786,760,513
485787,504,760
485793,760,510
485794,760,509
485796,760,510
485797,760,504
485798,760,504
485800,760,510
485802,760,511
485803,760,510
485804,760,509
485805,760,513
485806,507,760
485807,760,510
485808,760,507
485809,760,506
485810,760,505
485811,760,508
485813,760,513
485814,760,503
485815,760,506
485816,507,760
485817,514,760
485818,760,506
485820,508,760
485821,760,514
485822,506,760
485824,760,508
485825,760,507
485826,760,504
485827,760,507
485828,508,760
485830,760,507
485831,760,511
485832,760,509
485833,760,511
485836,507,760
485839,760,509
485840,760,508
485841,508,760
485843,760,508
485845,502,760
485846,760,511
485848,760,499
485850,508,760
485851,760,509
485852,760,511
485853,760,512
485854,760,508
485856,760,506
485859,509,760
485860,760,513
485862,760,510
485864,760,512
485865,760,515
485866,760,514
485867,760,509
485871,509,760
485873,504,760
485874,760,512
485876,760,512
485883,760,507
485884,760,511
485885,760,509
485886,760,512
485887,760,511
485888,760,508
485890,760,505
485891,506,760
485892,760,504
485893,505,760
485894,760,504
485895,760,509
485896,760,513
485898,506,760
485899,760,514
485900,760,511
485901,760,506
485902,760,504
485903,760,508
485904,760,513
485905,760,506
485906,760,513
485907,760,512
485908,503,760
485909,760,511
485913,760,505
485914,760,515
485917,760,516
485919,760,511
485920,760,508
485921,760,507
485922,760,510
485924,760,508
485928,760,512
485929,760,512
485930,760,510
485938,760,507
485940,760,513
485941,760,510
485942,760,521
485943,760,505
485944,760,512
485945,760,510
485946,760,508
485947,760,506
485950,760,510
485951,760,508
485953,760,514
485959,760,513
485960,760,510
485961,760,508
485965,760,514
485970,760,511
485971,760,513
485973,760,509
485979,760,512
485980,760,510
485981,760,512
485983,512,760
485985,760,509
485987,760,513
485988,760,510
485989,760,510
485990,513,760
485991,760,510
485992,760,509
485995,760,487
485997,509,760
485998,760,507
485999,508,760
486000,510,760
486001,508,760
486002,760,514
Expand All @@ -217,12 +347,25 @@
486017,760,510
486018,760,510
486019,760,513
486021,760,513
486023,760,512
486024,760,512
486025,760,510
486026,508,760
486028,760,511
486029,507,760
486030,760,505
486031,760,510
486032,760,511
486033,514,760
486035,760,511
486038,760,506
486041,760,507
486044,760,511
486045,760,506
486046,760,514
486048,508,760
486049,760,514
486281,760,501
701317f,760,608
701318f,760,608
Expand Down
Binary file modified geocache.tgz
Binary file not shown.
8 changes: 6 additions & 2 deletions oldnyc/geocode/coders/extended_grid.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,13 +43,13 @@

def parse_street_ave(street1: str, street2: str) -> tuple[str, str]:
# try to get the avenue in street1
if re.search(r"str|st\.", street1, flags=re.I):
if re.search(r"str|st\.|\bst\b", street1, flags=re.I):
street2, street1 = street1, street2

if not re.search(r"ave", street1, flags=re.I):
raise ValueError("%s is not an avenue" % street1)

if not re.search(r"str|st\.", street2, flags=re.I):
if not re.search(r"str|st\.|\bst\b", street2, flags=re.I):
raise ValueError("%s is not a street" % street2)

street1 = remove_parens(street1)
Expand Down Expand Up @@ -159,6 +159,10 @@ def codeRecord(self, r: Item):
}
return out

def getLatLonFromLocatable(self, r, data):
    """Placeholder hook: performs no work and returns None.

    Both ``r`` and ``data`` are accepted but currently unused.
    NOTE(review): presumably a sibling of ``getLatLonFromGeocode`` on the
    coder interface -- confirm against the base class.
    """
    # TODO: do the location here, not in codeRecord
    return None

def getLatLonFromGeocode(self, geocode, data, record):
for result in geocode["results"]:
# data['type'] is something like 'address' or 'intersection'.
Expand Down
18 changes: 18 additions & 0 deletions oldnyc/geocode/coders/extended_grid_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
from oldnyc.geocode.coders.extended_grid import parse_street_ave


def test_parse_street_ave():
    """Check parse_street_ave against known street/avenue pairs.

    Each case maps the two input strings to the expected result tuple; the
    first element identifies the avenue and the second the street number,
    whichever order the inputs are given in.
    """
    cases = [
        (("122nd St", "1st Ave"), ("1", "122")),
        (("1st Ave", "122nd St"), ("1", "122")),
        (("18th Street", "Avenue A"), ("A", "18")),
        (("18th Street (West)", "4th Avenue"), ("4", "18")),
        # 711722f
        (("9th Avenue", "23rd Street (West)"), ("9", "23")),
        # 464848
        (("West End Avenue", "106th Street"), ("11", "106")),
        (("Stanley Court, corner West End Avenue", "106th Street"), ("11", "106")),
    ]
    for (first, second), expected in cases:
        assert parse_street_ave(first, second) == expected
3 changes: 3 additions & 0 deletions oldnyc/geocode/coders/milstein.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,9 @@ def _getBoroughFromAddress(self, address):
record_boro = "Manhattan"
return record_boro

def getLatLonFromLocatable(self, r, data):
    """No-op stub: ignores ``r`` and ``data`` and returns None.

    NOTE(review): looks like a required hook that this coder does not
    implement yet -- confirm against the base class.
    """
    return None

def getLatLonFromGeocode(self, geocode, data, record):
"""Extract (lat, lon) from a Google Maps API response. None = failure.
Expand Down
33 changes: 18 additions & 15 deletions oldnyc/geocode/coders/subjects.py
Original file line number Diff line number Diff line change
Expand Up @@ -399,9 +399,9 @@ def codeRecord(self, r):
title_spec = title_locatable[0]
title_src = title_locatable[1]["source"]
if is_address_close(subject_locatable[1]["address"], title_locatable[1]["address"]):
sys.stderr.write(
"\t".join(["clash!", "subject/title close", r.id, subj_src, title_src]) + "\n"
)
# sys.stderr.write(
# "\t".join(["clash!", "subject/title close", r.id, subj_src, title_src]) + "\n"
# )

self.counters["n_out_title"] += 1
self.counters["n_out_both_close"] += 1
Expand All @@ -423,18 +423,18 @@ def codeRecord(self, r):
self.counters["n_out_both_title"] += 1
return title_locatable[1]
else:
sys.stderr.write(
"\t".join(
[
"clash!",
"subject/title same",
r.id,
subj_src,
title_src,
]
)
+ "\n"
)
# sys.stderr.write(
# "\t".join(
# [
# "clash!",
# "subject/title same",
# r.id,
# subj_src,
# title_src,
# ]
# )
# + "\n"
# )
self.counters["n_out_title"] += 1
self.counters["n_out_both_fallback_title"] += 1
return title_locatable[1]
Expand All @@ -446,6 +446,9 @@ def codeRecord(self, r):
self.counters["n_out_title"] += 1
return title_locatable[1]

def getLatLonFromLocatable(self, r, data):
    """Unimplemented hook: does nothing with ``r`` or ``data``; returns None.

    NOTE(review): presumably exists to satisfy a shared coder interface --
    verify against the base class before relying on it.
    """
    return None

def getLatLonFromGeocode(self, geocode, data, record):
for result in geocode["results"]:
# data['type'] is something like 'address' or 'intersection'.
Expand Down
Loading

0 comments on commit 1a15189

Please sign in to comment.