Skip to content

Commit

Permalink
start clearing out nyc dir
Browse files Browse the repository at this point in the history
  • Loading branch information
danvk committed Oct 20, 2024
1 parent 74bd910 commit ce363ad
Show file tree
Hide file tree
Showing 7 changed files with 17 additions and 21 deletions.
4 changes: 2 additions & 2 deletions coders/milstein.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@
import re
import sys

import nyc.boroughs
from coders.types import Coder, Locatable
from oldnyc.geocode.boroughs import point_to_borough
from oldnyc.item import Item

boros = r"(?:New York|Manhattan|Brooklyn|Bronx|Queens|Staten Island), (?:NY|N\.Y\.)"
Expand Down Expand Up @@ -161,7 +161,7 @@ def getLatLonFromGeocode(self, geocode, data, record):
return None
_, lat, lon = tlatlon

geocode_boro = nyc.boroughs.PointToBorough(lat, lon)
geocode_boro = point_to_borough(lat, lon)
record_boro = self._getBoroughFromAddress(data["address"])

if geocode_boro != record_boro:
Expand Down
4 changes: 3 additions & 1 deletion data/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,4 +23,6 @@ Inputs:
- `data.json`: contains OCR text from 2015 (Ocropy) plus manual fixes
- `gpt-text.json`: contains OCR text from 2024 via OpenAI

...
TODO:

- Document provenance for all files.
File renamed without changes.
4 changes: 2 additions & 2 deletions grid/gold.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@

from dotenv import load_dotenv

from nyc.boroughs import PointToBorough
from oldnyc.geocode import geocoder
from oldnyc.geocode.boroughs import point_to_borough


# See http://stackoverflow.com/a/20007730/388951
Expand Down Expand Up @@ -101,7 +101,7 @@ def locate(avenue, street, verbose=False):

loc = r["geometry"]["location"]
lat_lon = loc["lat"], loc["lng"]
if PointToBorough(*lat_lon) != "Manhattan":
if point_to_borough(*lat_lon) != "Manhattan":
if verbose:
sys.stderr.write("Discarding non-Manhattan location\n")
return None
Expand Down
4 changes: 2 additions & 2 deletions nyc/coverage-by-borough.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import sys
from collections import defaultdict

from nyc.boroughs import PointToBorough
from oldnyc.geocode.boroughs import point_to_borough

records = json.load(open(sys.argv[1]))

Expand Down Expand Up @@ -39,7 +39,7 @@
if "located_str" not in e:
continue
lat, lon = e["latlon"]
geocode_boro = PointToBorough(lat, lon)
geocode_boro = point_to_borough(lat, lon)

if boro == geocode_boro:
correct_by_borough[boro] += 1
Expand Down
8 changes: 4 additions & 4 deletions ocr/cleaner.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
import editdistance


def swap_chars(txt):
def swap_chars(txt: str) -> str:
    r"""Normalize Ocropus quirks: turn \& into & and a doubled '' into a double quote."""
    # Un-escape ampersands first, then collapse paired single quotes.
    ampersands_fixed = re.sub(r"\\&", "&", txt)
    return re.sub(r"''", '"', ampersands_fixed)

Expand All @@ -39,7 +39,7 @@ def is_warning(line):
return False


def remove_warnings(txt):
def remove_warnings(txt: str) -> str:
"""Remove lines like "NO REPRODUCTIONS"."""
# remove full warning lines
txt = "\n".join(line for line in txt.split("\n") if not is_warning(line))
Expand All @@ -60,7 +60,7 @@ def remove_warnings(txt):
return txt


def merge_lines(txt):
def merge_lines(txt: str) -> str:
"""Merge sequential lines in a paragraph into a single line.
This can't be done reliably from just the OCR'd text -- it would be better
Expand Down Expand Up @@ -99,7 +99,7 @@ def merge_lines(txt):
return txt


def clean(txt):
def clean(txt: str) -> str:
    """Clean raw OCR text by chaining the module's three passes.

    The passes run in this order, each receiving the previous pass's
    output: swap_chars, then remove_warnings, then merge_lines.
    """
    return merge_lines(remove_warnings(swap_chars(txt)))


Expand Down
14 changes: 4 additions & 10 deletions nyc/boroughs.py → oldnyc/geocode/boroughs.py
Original file line number Diff line number Diff line change
@@ -1,29 +1,23 @@
"""Utility for mapping a lat/lon to a borough."""

import json
import os

import nyc.shape_utils as shape_utils

boroughs = None

BOROUGHS_JSON_FILE = "data/originals/borough-polygons.json"

def _getBoroughJsonPath():
for path in ["borough-polygons.json", "nyc/borough-polygons.json"]:
if os.path.exists(path):
return path
raise Exception("Couldn't find borough-polygons.json file.")
boroughs = None


def PointToBorough(lat, lon):
def point_to_borough(lat: float, lon: float) -> str | None:
"""Returns the name of a borough, or None if the point is not in NYC.
Possible return values are:
'Bronx', 'Brooklyn', 'Staten Island', 'Manhattan', 'Queens', None
"""
global boroughs
if not boroughs:
boroughs = json.load(open(_getBoroughJsonPath()))
boroughs = json.load(open(BOROUGHS_JSON_FILE))

pt = (lon, lat)
for k, v in boroughs.items():
Expand Down

0 comments on commit ce363ad

Please sign in to comment.