Skip to content

Commit

Permalink
Pull in more fields from mods, drop milstein.csv (#171)
Browse files Browse the repository at this point in the history
* delete some more dead code

* delete record.py

* remove more dead code

* working towards eliminating milstein.csv

* collect creator from mods

* pull in sources from mods

* pull in new fields from MODS

* pull in artists as well as photographers

* pull in lithographers, too

* pull in dates from mods

* add mods date to images.ndjson

* delete milstein.csv!

* drop more generic dates

* geocode update

* cleanup
  • Loading branch information
danvk authored Nov 14, 2024
1 parent 35a29e1 commit 39bd841
Show file tree
Hide file tree
Showing 11 changed files with 251,608 additions and 66,904 deletions.
7,324 changes: 3,662 additions & 3,662 deletions data/images.ndjson

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion data/lat-lon-to-ids.json

Large diffs are not rendered by default.

247,826 changes: 247,826 additions & 0 deletions data/mods-details.json

Large diffs are not rendered by default.

3 changes: 1 addition & 2 deletions data/originals/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,7 @@

Files in this folder are "originals"—not derived from some other data source.

- `data/originals/milstein.csv`: the vintage 2013 CSV file from the NYPL that started it all
- `data/originals/Milstein_data_for_DV.csv`: the 2024 update to the CSV
- `data/originals/Milstein_data_for_DV.csv`: CSV from the NYPL (2024)
- Street listings
- `manhattan-streets.txt`: https://geographic.org/streetview/usa/ny/new_york/new_york.html
- `brooklyn-streets.txt`: https://geographic.org/streetview/usa/ny/kings/brooklyn.html
Expand Down
63,092 changes: 0 additions & 63,092 deletions data/originals/milstein.csv

This file was deleted.

3 changes: 1 addition & 2 deletions oldnyc/geocode/coders/title_pattern.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
from oldnyc.geocode.boroughs import boroughs_pat, guess_borough, point_to_borough
from oldnyc.geocode.coders.coder_utils import get_lat_lng_from_geocode
from oldnyc.geocode.geocode_types import Coder, Locatable
from oldnyc.geocode.record import clean_title
from oldnyc.item import Item

# Borough: str1 - str2
Expand Down Expand Up @@ -53,7 +52,7 @@ def strip_trivia(txt: str) -> str:


def clean_and_strip_title(title: str) -> str:
title = clean_title(title)
title = title.replace("[", "").replace("]", "")
title = re.sub(r" +:", ":", title)
# east side
# west corner
Expand Down
17 changes: 13 additions & 4 deletions oldnyc/geocode/generate_js.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,12 @@
import json
import sys
from collections import defaultdict
from datetime import date
from json import encoder
from typing import Sequence

from oldnyc.geocode import record
from oldnyc.geocode.geocode_types import Locatable
from oldnyc.ingest.dates import extract_years
from oldnyc.item import Item

encoder.FLOAT_REPR = lambda o: format(o, ".6f") # type: ignore
Expand All @@ -18,6 +19,15 @@
LocatedRecord = tuple[Item, str | None, Locatable | None]


def get_date_range(date_str: str) -> tuple[date, date]:
    """Return the (earliest, latest) dates mentioned in date_str.

    Falls back to a broad catch-all range (1850-01-01 .. 1999-12-31) when
    no usable year can be extracted.
    TODO: this is a bit wonky; could use clean_date more directly.
    """
    years = extract_years(date_str)
    if not years or years == [""]:
        # No usable year: assume the full plausible range for the archive.
        return date(1850, 1, 1), date(1999, 12, 31)
    jan_firsts = sorted(date(int(year), 1, 1) for year in years)
    return jan_firsts[0], jan_firsts[-1]


def _generateJson(located_recs: Sequence[LocatedRecord], lat_lon_map: dict[str, str]):
out: dict[str, list[str]] = {}
# "lat,lon" -> list of items
Expand Down Expand Up @@ -45,10 +55,9 @@ def _generateJson(located_recs: Sequence[LocatedRecord], lat_lon_map: dict[str,
points = 0
photos = 0
for lat_lon, recs in ll_to_id.items():
rec_dates = [(r, record.get_date_range(r.date or "")) for r in recs]
# XXX the "if" filter here probably doesn't do anything
rec_dates = [(r, get_date_range(r.date or "")) for r in recs]
sorted_recs = sorted(
[rdr for rdr in rec_dates if rdr[1] and rdr[1][1]],
rec_dates,
key=lambda rdr: rdr[1][1],
)
no_date += len(recs) - len(sorted_recs)
Expand Down
6 changes: 0 additions & 6 deletions oldnyc/geocode/geocode_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,6 @@
from oldnyc.item import Item


class Location(TypedDict):
address: str
lat: float
lon: float


class Locatable(TypedDict):
address: str
"""Can be either a geolocatable address or @lat,lng"""
Expand Down
43 changes: 0 additions & 43 deletions oldnyc/geocode/record.py

This file was deleted.

33 changes: 31 additions & 2 deletions oldnyc/ingest/collect_mods.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,20 @@ def as_list(dict_or_list: dict | list) -> list:
return [dict_or_list] if isinstance(dict_or_list, dict) else dict_or_list


def is_photographer(name: dict) -> bool:
    """Return True if a MODS name entry carries a creator-like role.

    Accepted MARC relator codes: pht (photographer), art (artist),
    ltg (lithographer), cre (creator).
    """
    creator_roles = {"pht", "art", "ltg", "cre"}
    for role_term in name["role"]["roleTerm"]:
        if role_term["$"] in creator_roles:
            return True
    return False


def extract_date(origin_info: dict | list) -> str | None:
    """Pull a date string out of a MODS originInfo element.

    Within each origin entry, dateIssued is preferred over dateCreated;
    multiple date values are joined with ", ".  Returns None when no
    origin entry carries either field.
    """
    for origin in as_list(origin_info):
        for key in ("dateIssued", "dateCreated"):
            value = origin.get(key)
            if value:
                return ", ".join(entry["$"] for entry in as_list(value))
    return None


if __name__ == "__main__":
mods_dir, item_details_dir = sys.argv[1:]

Expand All @@ -30,9 +44,24 @@ def as_list(dict_or_list: dict | list) -> list:
mods = resp["mods"]
titles = as_list(mods["titleInfo"])
assert titles[0]["usage"] == "primary"

title_strs = [t["title"]["$"] for t in titles]
mapping[uuid] = {"titles": title_strs}

# TODO: output as a list
names = as_list(mods.get("name", []))
creators = [name["namePart"]["$"] for name in names if is_photographer(name)]
creator = ";".join(creators) if creators else None

origin = mods.get("originInfo")
date = extract_date(origin) if origin else None

sources = []
relatedItem = mods["relatedItem"]
while relatedItem:
assert relatedItem["type"] == "host"
sources = [relatedItem["titleInfo"]["title"]["$"]] + sources
relatedItem = relatedItem.get("relatedItem")

mapping[uuid] = {"titles": title_strs, "creator": creator, "sources": sources, "date": date}

# 104425.json: no back image
# 1552839.json: has back image
Expand Down
163 changes: 73 additions & 90 deletions oldnyc/ingest/ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,47 @@ def sort_uniq(xs: list[str]) -> list[str]:
}


# Normalize MODS creator strings to the exact names used elsewhere in the
# dataset (presumably the values from the retired milstein.csv — TODO
# confirm), so creator values stay stable after the data-source switch.
# Keys are the raw MODS forms; values are the canonical replacements.
CREATOR_PATCHES = {
    "Welles, Burton F. (Burton Frederick), 1872-": "Welles & Co.--Publisher",
    "Sperr, Percy Loomis, 1890-1964": "Sperr, Percy Loomis",
    "Wurts Bros. (New York, N.Y.)": "Wurts Brothers",
    "Ewing Galloway (Agency)": "Galloway, Ewing",
    "Underhill, Irving, -1960": "Underhill, Irving,d. 1960",
    "Tiemann, Hermann Newell (1863-1957)": "Tiemann, Hermann Newell",
    "Fass, John S. (John Stroble), 1890-1973": "Fass, John S. (John Stroble),b. 1890",
    "Van der Weyde, William M. (William Manley), 1870-1928": "Van der Weyde, William M. (William Manley)",
    "Abbott, Berenice, 1898-1991": "Abbott, Berenice",
    "Fairchild Aerial Surveys, inc.": "Fairchild Aerial Surveys, Inc.",
    "Armbruster, Eugene L., 1865-1943": "Armbruster, Eugene L.",
}

# Map MODS-derived source/collection titles to the exact titles used
# elsewhere in the dataset (presumably matching the retired milstein.csv
# values — TODO confirm).  Applied as the last step of patch_source(),
# after its mechanical cleanups.
SOURCE_PATCHES = {
    "Fifth Avenue, New York, from start to finish": "Fifth Avenue, New York, from start to finish.",
    "Itineraire pittoresque du fleuve Hudson et des parties laterales de l'Amerique du Nord, d'apres les dessins originaux pris sur les lieux. Atlas": "Itineraire pittoresque du fleuve Hudson et des parties laterales de l'Amerique du Nord, d'apres les dessins originaux pris sur les lieux. Atlas.",
    "Apartment houses of the metropolis": "Apartment houses of the metropolis.",
    "Amerique septentrionale : vues des chutes du Niagara": "Amerique septentrionale : vues des chutes du Niagara.",
    "Photographic views of New York City, 1870's-1970's. Supplement. / Manhattan": "Photographic views of New York City, 1870's-1970's, from the collections of the New York Public Library. Supplement. / Manhattan",
    "Photographic views of New York City, 1870's-1970's. Supplement. / Brooklyn": "Photographic views of New York City, 1870's-1970's, from the collections of the New York Public Library. Supplement. / Brooklyn",
    "Photographic views of New York City, 1870's-1970's. Supplement. / Queens": "Photographic views of New York City, 1870's-1970's, from the collections of the New York Public Library. Supplement. / Queens",
    "Photographic views of New York City, 1870's-1970's. Supplement. / Bronx": "Photographic views of New York City, 1870's-1970's, from the collections of the New York Public Library. Supplement. / Bronx",
    "Photographic views of New York City, 1870's-1970's. Supplement. / Topics": "Photographic views of New York City, 1870's-1970's, from the collections of the New York Public Library. Supplement. / Topics",
    "Collection of photographs of New York City / Manhattan": "[Collection of photographs of New York City.] / [Wurts Brothers, photographer] / Manhattan",
    "Collection of photographs of New York City / Brooklyn": "[Collection of photographs of New York City.] / [Wurts Brothers, photographer] / Brooklyn",
    "Collection of photographs of New York City / Bronx": "[Collection of photographs of New York City.] / [Wurts Brothers, photographer] / Bronx",
    "Collection of photographs of New York City / Queens": "[Collection of photographs of New York City.] / [Wurts Brothers, photographer] / Queens",
    "Collection of photographs of New York City / Subjects": "[Collection of photographs of New York City.] / [Wurts Brothers, photographer] / Subjects",
    "Collection of photographs of New York City, 1931-1942": "[Collection of photographs of New York City, 1931-1942.]",
    "Photographic negatives of the New York City Tenement House Department": "Photographic negatives of the New York City Tenement House Department, 1902-1914",
    "A Pictorial description of Broadway": "A Pictorial description of Broadway / by the Mail & Express.",
    "The World's loose leaf album of apartment houses: containing views and ground plans of the principal high class apartment houses in New York City, together with a map showing the situation of these houses, transportation facilities, etc.": "The World's loose leaf album of apartment houses, containing views and ground plans of the principal high class apartment houses in New York City, together with a map showing the situation of these houses, transportation facilities, etc.",
    "[Collection of photographs of New York City, 1931-1942]": "[Collection of photographs of New York City, 1931-1942.]",
    "Photographs of Madison Square Garden": "[Photographs of Madison Square Garden. New York, 1925]",
    "Forty etchings, from sketches made with the camera lucida, in North America, in 1827 and 1828": "Forty etchings, from sketches made with the camera lucida, in North America, in 1827 and 1828.",
    "Photographic views of the construction of the New York City subway system, 1901-1905": "Photographic views of the construction of the New York City subway system, 1901-1905.",
    "Supplement to Apartment houses of the metropolis": "Supplement to Apartment houses of the metropolis.",
}


def outside_nyc(geographics: list[str]) -> bool:
for g in geographics:
if (g in STATES and g not in TRISTATE) or g in OTHER_OUTSIDE:
Expand All @@ -55,11 +96,21 @@ def strip_punctuation(s: str) -> str:
return re.sub(r"[^\w]", "", s)


def patch_source(source: str) -> str:
    """Normalize a MODS-derived source title.

    Strips the boilerplate "from the collections..." suffix, applies a
    bracket fixup for the Daniel B. Austin collection, then falls back
    to the explicit SOURCE_PATCHES mapping for known mismatches.
    """
    if source == "":
        return ""
    cleaned = source.replace(
        ", from the collections of the New York Public Library", ""
    )
    if cleaned.startswith("Collection of photographs taken by Daniel B. Austin"):
        # This collection's canonical title is wrapped in square brackets.
        cleaned = "[" + cleaned.replace("1914", "1914]")
    return SOURCE_PATCHES.get(cleaned, cleaned)


# These are sometimes used as placeholders for unknown dates.
# run() clears a record's date to "" when it matches one of these exactly.
GENERIC_DATES = {"1887, 1986", "1870, 1970", "1887, 1964", "1900, 1999", "1960, 1990"}


def run():
csv2013 = {
row["DIGITAL_ID"]: row
for row in csv.DictReader(open("data/originals/milstein.csv", encoding="latin-1"))
}
csv2024 = {
row["image_id"].lower(): row
for row in csv.DictReader(open("data/originals/Milstein_data_for_DV_2.csv"))
Expand All @@ -76,108 +127,34 @@ def run():

counters = Counter[str]()
out = open("data/images.ndjson", "w")
ids = [*sorted(csv2013.keys())]
ids = [*sorted(csv2024.keys())]
for id in tqdm(ids):
counters["num records"] += 1
row = csv2013[id]

date_str = row["CREATED_DATE"]

title = row["IMAGE_TITLE"].strip()
assert title

alt_title = row["ALTERNATE_TITLE"].strip()
if not alt_title:
alt_title = None
source = row["SOURCE"].strip()

creator = row["CREATOR"].strip()

row2 = csv2024[id]

uuid = row2["item_uuid"]
url = row2["digital_collections_url"]
title2 = row2["title"].strip()
date2 = row2["date"]
if date2 == "1887, 1986" or date2 == "1870, 1970":
date2 = "" # 1887-1986 is used as "unknown"
counters["date2: generic"] += 1

topics = sort_uniq(json.loads(row2["subject/topic"]))
geographics = sort_uniq(json.loads(row2["subject/geographic"]))
names = sort_uniq(json.loads(row2["subject/name"]))
temporals = sort_uniq(json.loads(row2["subject/temporal"]))
mods_detail = mods_details.get(uuid)

dates = [date_str, date2]
dates = [clean_date(normalize_whitespace(d.strip())) for d in dates]
date_str, date2 = dates
if date_str != date2:
counters["mismatch: date"] += 1
if date2 and not date_str:
counters["date: added"] += 1
elif date_str and not date2:
counters["date: dropped"] += 1
else:
counters["date: changed"] += 1

# print("---")
# print(id)
# print(date_str)
# print(date2)

titles = [title, title2]
titles = [clean_title(normalize_whitespace(t)) for t in titles]
title, title2 = titles

if title != title2:
counters["mismatch: title mismatch"] += 1
if title == title2 + ".":
counters["title mismatch: drop dot"] += 1
elif title2 == title + "]":
counters["title mismatch: add bracket"] += 1
elif strip_punctuation(title).lower() == strip_punctuation(title2).lower():
counters["title mismatch: other punctuation"] += 1
elif not title2.isascii():
counters["title mismatch: non-ascii"] += 1
elif title == "No Title":
counters["title mismatch: add title"] += 1
# print("---", id, "---")
# print(title)
# print(title2)
elif "Directories" in topics or "directory" in title2.lower():
counters["title mismatch: directory"] += 1
elif outside_nyc(geographics):
counters["title mismatch: outside nyc"] += 1
elif title2.replace(" and ", "") == title:
counters["title mismatch: add and"] += 1
else:
counters["title mismatch: other"] += 1
# print("---", id, "---")
# print(title)
# print(title2)
date2 = row2["date"] or (mods_detail["date"] if mods_detail else None) or ""
if date2 in GENERIC_DATES:
date2 = ""
counters["date2: generic"] += 1
date2 = clean_date(normalize_whitespace(date2.strip()))

title2 = clean_title(normalize_whitespace(title2))

mods_detail = mods_details.get(uuid)
# TODO: store as array
alt_title2 = (
"\n".join(mods_detail.get("titles"))
if mods_detail
else (row2["alternative_title"].strip() if row2["alternative_title"] else None)
)
alt_titles = [alt_title, alt_title2]
alt_titles = [clean_title(normalize_whitespace(t)) if t else None for t in alt_titles]
alt_title, alt_title2 = alt_titles

if alt_title != alt_title2:
counters["mismatch: alt_title mismatch"] += 1
# print("---")
# print(alt_title)
# print(alt_title2)
alt_title2 = mods_detail.get("titles")[1:] if mods_detail else None
if not alt_title2:
alt_title2 = [row2["alternative_title"].strip()] if row2["alternative_title"] else []
alt_title2 = [clean_title(normalize_whitespace(t)) for t in alt_title2]

if alt_title:
counters["alt_title"] += 1
if alt_title2:
counters["alt_title2"] += 1

Expand Down Expand Up @@ -228,16 +205,22 @@ def run():
counters["filtered: directory"] += 1
continue

creator = (clean_creator(mods_detail["creator"] or "") or None) if mods_detail else None
creator = CREATOR_PATCHES.get(creator, creator) if creator else None

source = " / ".join(mods_detail["sources"]) if mods_detail else ""
source = patch_source(source)

r = Item(
id=id,
uuid=uuid,
url=url,
photo_url=f"https://images.nypl.org/?id={id}&t=w",
date=date2 or date_str or None,
date=date2 or None,
title=title2,
alt_title=alt_title2 or [],
back_id=back_id,
creator=clean_creator(creator) or None,
creator=creator,
source=source,
back_text=back_text,
back_text_source=back_text_source,
Expand Down

0 comments on commit 39bd841

Please sign in to comment.