Skip to content

Commit

Permalink
Pull in more fields from mods, drop milstein.csv (#171)
Browse files Browse the repository at this point in the history
* delete some more dead code

* delete record.py

* remove more dead code

* working towards eliminating milstein.csv

* collect creator from mods

* pull in sources from mods

* pull in new fields from MODS

* pull in artists as well as photographers

* pull in lithographers, too

* pull in dates from mods

* add mods date to images.ndjson

* delete milstein.csv!

* drop more generic dates

* geocode update

* cleanup
  • Loading branch information
danvk authored Nov 14, 2024
1 parent 35a29e1 commit 39bd841
Show file tree
Hide file tree
Showing 11 changed files with 251,608 additions and 66,904 deletions.
7,324 changes: 3,662 additions & 3,662 deletions data/images.ndjson

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion data/lat-lon-to-ids.json

Large diffs are not rendered by default.

247,826 changes: 247,826 additions & 0 deletions data/mods-details.json

Large diffs are not rendered by default.

3 changes: 1 addition & 2 deletions data/originals/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,7 @@

Files in this folder are "originals"—not derived from some other data source.

- `data/originals/milstein.csv`: the vintage 2013 CSV file from the NYPL that started it all
- `data/originals/Milstein_data_for_DV.csv`: the 2024 update to the CSV
- `data/originals/Milstein_data_for_DV.csv`: CSV from the NYPL (2024)
- Street listings
- `manhattan-streets.txt`: https://geographic.org/streetview/usa/ny/new_york/new_york.html
- `brooklyn-streets.txt`: https://geographic.org/streetview/usa/ny/kings/brooklyn.html
Expand Down
63,092 changes: 0 additions & 63,092 deletions data/originals/milstein.csv

This file was deleted.

3 changes: 1 addition & 2 deletions oldnyc/geocode/coders/title_pattern.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
from oldnyc.geocode.boroughs import boroughs_pat, guess_borough, point_to_borough
from oldnyc.geocode.coders.coder_utils import get_lat_lng_from_geocode
from oldnyc.geocode.geocode_types import Coder, Locatable
from oldnyc.geocode.record import clean_title
from oldnyc.item import Item

# Borough: str1 - str2
Expand Down Expand Up @@ -53,7 +52,7 @@ def strip_trivia(txt: str) -> str:


def clean_and_strip_title(title: str) -> str:
title = clean_title(title)
title = title.replace("[", "").replace("]", "")
title = re.sub(r" +:", ":", title)
# east side
# west corner
Expand Down
17 changes: 13 additions & 4 deletions oldnyc/geocode/generate_js.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,12 @@
import json
import sys
from collections import defaultdict
from datetime import date
from json import encoder
from typing import Sequence

from oldnyc.geocode import record
from oldnyc.geocode.geocode_types import Locatable
from oldnyc.ingest.dates import extract_years
from oldnyc.item import Item

encoder.FLOAT_REPR = lambda o: format(o, ".6f") # type: ignore
Expand All @@ -18,6 +19,15 @@
LocatedRecord = tuple[Item, str | None, Locatable | None]


def get_date_range(date_str: str) -> tuple[date, date]:
    """Return the (earliest, latest) dates mentioned in date_str.

    Falls back to a broad catch-all range (1850-01-01 .. 1999-12-31) when
    no usable year can be extracted.
    TODO: this is a bit wonky; could use clean_date more directly.
    """
    years = extract_years(date_str)
    if not years or years == [""]:
        # No usable year: assume the full plausible range for the archive.
        return date(1850, 1, 1), date(1999, 12, 31)
    jan_firsts = sorted(date(int(year), 1, 1) for year in years)
    return jan_firsts[0], jan_firsts[-1]


def _generateJson(located_recs: Sequence[LocatedRecord], lat_lon_map: dict[str, str]):
out: dict[str, list[str]] = {}
# "lat,lon" -> list of items
Expand Down Expand Up @@ -45,10 +55,9 @@ def _generateJson(located_recs: Sequence[LocatedRecord], lat_lon_map: dict[str,
points = 0
photos = 0
for lat_lon, recs in ll_to_id.items():
rec_dates = [(r, record.get_date_range(r.date or "")) for r in recs]
# XXX the "if" filter here probably doesn't do anything
rec_dates = [(r, get_date_range(r.date or "")) for r in recs]
sorted_recs = sorted(
[rdr for rdr in rec_dates if rdr[1] and rdr[1][1]],
rec_dates,
key=lambda rdr: rdr[1][1],
)
no_date += len(recs) - len(sorted_recs)
Expand Down
6 changes: 0 additions & 6 deletions oldnyc/geocode/geocode_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,6 @@
from oldnyc.item import Item


class Location(TypedDict):
address: str
lat: float
lon: float


class Locatable(TypedDict):
address: str
"""Can be either a geolocatable address or @lat,lng"""
Expand Down
43 changes: 0 additions & 43 deletions oldnyc/geocode/record.py

This file was deleted.

33 changes: 31 additions & 2 deletions oldnyc/ingest/collect_mods.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,20 @@ def as_list(dict_or_list: dict | list) -> list:
return [dict_or_list] if isinstance(dict_or_list, dict) else dict_or_list


def is_photographer(name: dict) -> bool:
    """Return True if a MODS name entry carries a creator-like role.

    Accepted MARC relator codes: pht (photographer), art (artist),
    ltg (lithographer), cre (creator).
    """
    creator_roles = {"pht", "art", "ltg", "cre"}
    for role_term in name["role"]["roleTerm"]:
        if role_term["$"] in creator_roles:
            return True
    return False


def extract_date(origin_info: dict | list) -> str | None:
    """Pull a date string out of a MODS originInfo element.

    Within each origin entry, dateIssued is preferred over dateCreated;
    multiple date values are joined with ", ".  Returns None when no
    origin entry carries either field.
    """
    for origin in as_list(origin_info):
        for key in ("dateIssued", "dateCreated"):
            value = origin.get(key)
            if value:
                return ", ".join(entry["$"] for entry in as_list(value))
    return None


if __name__ == "__main__":
mods_dir, item_details_dir = sys.argv[1:]

Expand All @@ -30,9 +44,24 @@ def as_list(dict_or_list: dict | list) -> list:
mods = resp["mods"]
titles = as_list(mods["titleInfo"])
assert titles[0]["usage"] == "primary"

title_strs = [t["title"]["$"] for t in titles]
mapping[uuid] = {"titles": title_strs}

# TODO: output as a list
names = as_list(mods.get("name", []))
creators = [name["namePart"]["$"] for name in names if is_photographer(name)]
creator = ";".join(creators) if creators else None

origin = mods.get("originInfo")
date = extract_date(origin) if origin else None

sources = []
relatedItem = mods["relatedItem"]
while relatedItem:
assert relatedItem["type"] == "host"
sources = [relatedItem["titleInfo"]["title"]["$"]] + sources
relatedItem = relatedItem.get("relatedItem")

mapping[uuid] = {"titles": title_strs, "creator": creator, "sources": sources, "date": date}

# 104425.json: no back image
# 1552839.json: has back image
Expand Down
163 changes: 73 additions & 90 deletions oldnyc/ingest/ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,47 @@ def sort_uniq(xs: list[str]) -> list[str]:
}


# Normalize MODS creator strings to the exact names used elsewhere in the
# dataset (presumably the values from the retired milstein.csv — TODO
# confirm), so creator values stay stable after the data-source switch.
# Keys are the raw MODS forms; values are the canonical replacements.
CREATOR_PATCHES = {
    "Welles, Burton F. (Burton Frederick), 1872-": "Welles & Co.--Publisher",
    "Sperr, Percy Loomis, 1890-1964": "Sperr, Percy Loomis",
    "Wurts Bros. (New York, N.Y.)": "Wurts Brothers",
    "Ewing Galloway (Agency)": "Galloway, Ewing",
    "Underhill, Irving, -1960": "Underhill, Irving,d. 1960",
    "Tiemann, Hermann Newell (1863-1957)": "Tiemann, Hermann Newell",
    "Fass, John S. (John Stroble), 1890-1973": "Fass, John S. (John Stroble),b. 1890",
    "Van der Weyde, William M. (William Manley), 1870-1928": "Van der Weyde, William M. (William Manley)",
    "Abbott, Berenice, 1898-1991": "Abbott, Berenice",
    "Fairchild Aerial Surveys, inc.": "Fairchild Aerial Surveys, Inc.",
    "Armbruster, Eugene L., 1865-1943": "Armbruster, Eugene L.",
}

# Map MODS-derived source/collection titles to the exact titles used
# elsewhere in the dataset (presumably matching the retired milstein.csv
# values — TODO confirm).  Applied as the last step of patch_source(),
# after its mechanical cleanups.
SOURCE_PATCHES = {
    "Fifth Avenue, New York, from start to finish": "Fifth Avenue, New York, from start to finish.",
    "Itineraire pittoresque du fleuve Hudson et des parties laterales de l'Amerique du Nord, d'apres les dessins originaux pris sur les lieux. Atlas": "Itineraire pittoresque du fleuve Hudson et des parties laterales de l'Amerique du Nord, d'apres les dessins originaux pris sur les lieux. Atlas.",
    "Apartment houses of the metropolis": "Apartment houses of the metropolis.",
    "Amerique septentrionale : vues des chutes du Niagara": "Amerique septentrionale : vues des chutes du Niagara.",
    "Photographic views of New York City, 1870's-1970's. Supplement. / Manhattan": "Photographic views of New York City, 1870's-1970's, from the collections of the New York Public Library. Supplement. / Manhattan",
    "Photographic views of New York City, 1870's-1970's. Supplement. / Brooklyn": "Photographic views of New York City, 1870's-1970's, from the collections of the New York Public Library. Supplement. / Brooklyn",
    "Photographic views of New York City, 1870's-1970's. Supplement. / Queens": "Photographic views of New York City, 1870's-1970's, from the collections of the New York Public Library. Supplement. / Queens",
    "Photographic views of New York City, 1870's-1970's. Supplement. / Bronx": "Photographic views of New York City, 1870's-1970's, from the collections of the New York Public Library. Supplement. / Bronx",
    "Photographic views of New York City, 1870's-1970's. Supplement. / Topics": "Photographic views of New York City, 1870's-1970's, from the collections of the New York Public Library. Supplement. / Topics",
    "Collection of photographs of New York City / Manhattan": "[Collection of photographs of New York City.] / [Wurts Brothers, photographer] / Manhattan",
    "Collection of photographs of New York City / Brooklyn": "[Collection of photographs of New York City.] / [Wurts Brothers, photographer] / Brooklyn",
    "Collection of photographs of New York City / Bronx": "[Collection of photographs of New York City.] / [Wurts Brothers, photographer] / Bronx",
    "Collection of photographs of New York City / Queens": "[Collection of photographs of New York City.] / [Wurts Brothers, photographer] / Queens",
    "Collection of photographs of New York City / Subjects": "[Collection of photographs of New York City.] / [Wurts Brothers, photographer] / Subjects",
    "Collection of photographs of New York City, 1931-1942": "[Collection of photographs of New York City, 1931-1942.]",
    "Photographic negatives of the New York City Tenement House Department": "Photographic negatives of the New York City Tenement House Department, 1902-1914",
    "A Pictorial description of Broadway": "A Pictorial description of Broadway / by the Mail & Express.",
    "The World's loose leaf album of apartment houses: containing views and ground plans of the principal high class apartment houses in New York City, together with a map showing the situation of these houses, transportation facilities, etc.": "The World's loose leaf album of apartment houses, containing views and ground plans of the principal high class apartment houses in New York City, together with a map showing the situation of these houses, transportation facilities, etc.",
    "[Collection of photographs of New York City, 1931-1942]": "[Collection of photographs of New York City, 1931-1942.]",
    "Photographs of Madison Square Garden": "[Photographs of Madison Square Garden. New York, 1925]",
    "Forty etchings, from sketches made with the camera lucida, in North America, in 1827 and 1828": "Forty etchings, from sketches made with the camera lucida, in North America, in 1827 and 1828.",
    "Photographic views of the construction of the New York City subway system, 1901-1905": "Photographic views of the construction of the New York City subway system, 1901-1905.",
    "Supplement to Apartment houses of the metropolis": "Supplement to Apartment houses of the metropolis.",
}


def outside_nyc(geographics: list[str]) -> bool:
for g in geographics:
if (g in STATES and g not in TRISTATE) or g in OTHER_OUTSIDE:
Expand All @@ -55,11 +96,21 @@ def strip_punctuation(s: str) -> str:
return re.sub(r"[^\w]", "", s)


def patch_source(source: str) -> str:
    """Normalize a MODS-derived source title.

    Strips the boilerplate "from the collections..." suffix, applies a
    bracket fixup for the Daniel B. Austin collection, then falls back
    to the explicit SOURCE_PATCHES mapping for known mismatches.
    """
    if source == "":
        return ""
    cleaned = source.replace(
        ", from the collections of the New York Public Library", ""
    )
    if cleaned.startswith("Collection of photographs taken by Daniel B. Austin"):
        # This collection's canonical title is wrapped in square brackets.
        cleaned = "[" + cleaned.replace("1914", "1914]")
    return SOURCE_PATCHES.get(cleaned, cleaned)


# These are sometimes used as placeholders for unknown dates.
# run() clears a record's date to "" when it matches one of these exactly.
GENERIC_DATES = {"1887, 1986", "1870, 1970", "1887, 1964", "1900, 1999", "1960, 1990"}


def run():
csv2013 = {
row["DIGITAL_ID"]: row
for row in csv.DictReader(open("data/originals/milstein.csv", encoding="latin-1"))
}
csv2024 = {
row["image_id"].lower(): row
for row in csv.DictReader(open("data/originals/Milstein_data_for_DV_2.csv"))
Expand All @@ -76,108 +127,34 @@ def run():

counters = Counter[str]()
out = open("data/images.ndjson", "w")
ids = [*sorted(csv2013.keys())]
ids = [*sorted(csv2024.keys())]
for id in tqdm(ids):
counters["num records"] += 1
row = csv2013[id]

date_str = row["CREATED_DATE"]

title = row["IMAGE_TITLE"].strip()
assert title

alt_title = row["ALTERNATE_TITLE"].strip()
if not alt_title:
alt_title = None
source = row["SOURCE"].strip()

creator = row["CREATOR"].strip()

row2 = csv2024[id]

uuid = row2["item_uuid"]
url = row2["digital_collections_url"]
title2 = row2["title"].strip()
date2 = row2["date"]
if date2 == "1887, 1986" or date2 == "1870, 1970":
date2 = "" # 1887-1986 is used as "unknown"
counters["date2: generic"] += 1

topics = sort_uniq(json.loads(row2["subject/topic"]))
geographics = sort_uniq(json.loads(row2["subject/geographic"]))
names = sort_uniq(json.loads(row2["subject/name"]))
temporals = sort_uniq(json.loads(row2["subject/temporal"]))
mods_detail = mods_details.get(uuid)

dates = [date_str, date2]
dates = [clean_date(normalize_whitespace(d.strip())) for d in dates]
date_str, date2 = dates
if date_str != date2:
counters["mismatch: date"] += 1
if date2 and not date_str:
counters["date: added"] += 1
elif date_str and not date2:
counters["date: dropped"] += 1
else:
counters["date: changed"] += 1

# print("---")
# print(id)
# print(date_str)
# print(date2)

titles = [title, title2]
titles = [clean_title(normalize_whitespace(t)) for t in titles]
title, title2 = titles

if title != title2:
counters["mismatch: title mismatch"] += 1
if title == title2 + ".":
counters["title mismatch: drop dot"] += 1
elif title2 == title + "]":
counters["title mismatch: add bracket"] += 1
elif strip_punctuation(title).lower() == strip_punctuation(title2).lower():
counters["title mismatch: other punctuation"] += 1
elif not title2.isascii():
counters["title mismatch: non-ascii"] += 1
elif title == "No Title":
counters["title mismatch: add title"] += 1
# print("---", id, "---")
# print(title)
# print(title2)
elif "Directories" in topics or "directory" in title2.lower():
counters["title mismatch: directory"] += 1
elif outside_nyc(geographics):
counters["title mismatch: outside nyc"] += 1
elif title2.replace(" and ", "") == title:
counters["title mismatch: add and"] += 1
else:
counters["title mismatch: other"] += 1
# print("---", id, "---")
# print(title)
# print(title2)
date2 = row2["date"] or (mods_detail["date"] if mods_detail else None) or ""
if date2 in GENERIC_DATES:
date2 = ""
counters["date2: generic"] += 1
date2 = clean_date(normalize_whitespace(date2.strip()))

title2 = clean_title(normalize_whitespace(title2))

mods_detail = mods_details.get(uuid)
# TODO: store as array
alt_title2 = (
"\n".join(mods_detail.get("titles"))
if mods_detail
else (row2["alternative_title"].strip() if row2["alternative_title"] else None)
)
alt_titles = [alt_title, alt_title2]
alt_titles = [clean_title(normalize_whitespace(t)) if t else None for t in alt_titles]
alt_title, alt_title2 = alt_titles

if alt_title != alt_title2:
counters["mismatch: alt_title mismatch"] += 1
# print("---")
# print(alt_title)
# print(alt_title2)
alt_title2 = mods_detail.get("titles")[1:] if mods_detail else None
if not alt_title2:
alt_title2 = [row2["alternative_title"].strip()] if row2["alternative_title"] else []
alt_title2 = [clean_title(normalize_whitespace(t)) for t in alt_title2]

if alt_title:
counters["alt_title"] += 1
if alt_title2:
counters["alt_title2"] += 1

Expand Down Expand Up @@ -228,16 +205,22 @@ def run():
counters["filtered: directory"] += 1
continue

creator = (clean_creator(mods_detail["creator"] or "") or None) if mods_detail else None
creator = CREATOR_PATCHES.get(creator, creator) if creator else None

source = " / ".join(mods_detail["sources"]) if mods_detail else ""
source = patch_source(source)

r = Item(
id=id,
uuid=uuid,
url=url,
photo_url=f"https://images.nypl.org/?id={id}&t=w",
date=date2 or date_str or None,
date=date2 or None,
title=title2,
alt_title=alt_title2 or [],
back_id=back_id,
creator=clean_creator(creator) or None,
creator=creator,
source=source,
back_text=back_text,
back_text_source=back_text_source,
Expand Down

0 comments on commit 39bd841

Please sign in to comment.