Add scripts to download and process CourtListener Opinion data #59

Merged 4 commits on May 11, 2024
16 changes: 16 additions & 0 deletions README.md
@@ -21,6 +21,22 @@ Look at the text for item with the id of 12 (note that position in file is not c

Note: You can also use `gunzip -c ${file}.jsonl.gz | jq -s ${command}`, which is slightly faster (it reduces the amount of data flowing through pipes), but if you forget the `-c` flag you end up decompressing the file in place and deleting the compressed version, i.e. you need to run `gzip ${file}.jsonl` to fix it.
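
For illustration, a minimal sketch of the safe pattern and the recovery step (the `example.jsonl.gz` filename and the `id == "12"` filter are placeholders, not files in this repo):

````
# Stream the decompressed data to jq; the .gz file on disk is left untouched
gunzip -c example.jsonl.gz | jq -s '.[] | select(.id == "12") | .text'

# If -c was forgotten, example.jsonl.gz has been replaced on disk by example.jsonl;
# recompress it to get the original file back
gzip example.jsonl
````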

### Capped parallelism in bash scripts
Sometimes we want to download or process multiple files in parallel while capping the number of concurrent jobs in a bash script.
Below is an example code snippet (used in [courtlistener/get_data.sh](courtlistener/get_data.sh)).
Note that `jobs -r` lists the jobs currently running in this shell, and `wait -n` blocks until the next one finishes.

````
max_jobs=8
for file in "${files[@]}"; do
    download_and_process "$file" &

    # Limit the number of parallel jobs
    if (( $(jobs -r | wc -l) >= max_jobs )); then
        wait -n
    fi
done

# Wait for the remaining jobs to finish
wait
````
## Development

We use git pre-commit hooks to format code and keep style consistent.
10 changes: 10 additions & 0 deletions courtlistener/README.md
@@ -0,0 +1,10 @@
# Court Listener Data
Opinion data from CourtListener [bulk data list](https://com-courtlistener-storage.s3-us-west-2.amazonaws.com/list.html?prefix=bulk-data/)

## Data download and processing
Run the full processing, including downloading the raw zipped data, unzipping it to CSV files, and parsing them into dolma format, with
``bash get_data.sh``.

To test with only one zip file, run ``bash get_data.sh --test_run 1``.

To change the maximum number of parallel jobs (8 by default), pass ``--max_jobs``.
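
For example, a test run that only fetches the first zip file and caps parallelism at 4 jobs (the job count here is just an illustration) would be:

````
bash get_data.sh --test_run 1 --max_jobs 4
````
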
73 changes: 73 additions & 0 deletions courtlistener/csv_to_dolma.py
@@ -0,0 +1,73 @@
#!/usr/bin/env python
"""
Created by zhenlinx on 01/19/2024
"""
import argparse
import csv
import logging
import os
import sys
from datetime import datetime

from licensed_pile.licenses import PermissiveLicenses
from licensed_pile.logs import configure_logging
from licensed_pile.write import to_dolma

SOURCE_NAME = "CourtListenerOpinion"

csv.field_size_limit(sys.maxsize)

logger = configure_logging("court-listener-opinion")


def make_record_generator(file_path):
    with open(file_path, "r") as csvfile:
        # Create a CSV reader object
        reader = csv.DictReader(csvfile)

        # Yield a dictionary for each row
        for row in reader:
            # 'row' is a dictionary with column headers as keys

            if not row["plain_text"]:
                pass  # TODO load from row["download_url"] if not null
            else:
                yield {
                    "id": row["id"],
                    "text": row["plain_text"],
                    "source": SOURCE_NAME,
                    "added": datetime.utcnow().isoformat(),
                    "created": row["data_created"],
                    "metadata": {
                        "license": str(PermissiveLicenses.PD),
                        "url": row["download_url"],
                    },
                }


def main(args):
    example_generator = make_record_generator(args.input_file)
    output_file_base_name = os.path.basename(args.input_file).replace(
        ".csv", ".jsonl.gz"
    )
    to_dolma(example_generator, args.output_dir, output_file_base_name, args.shard_size)
    logger.info(f"Saved {args.input_file} as dolma sharded files at {args.output_dir}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Convert csv data to dolma.")
    parser.add_argument(
        "--output_dir",
        default="data/courtlistener/v0",
        help="Where the dolma formatted data goes.",
    )
    parser.add_argument(
        "--input_file",
        default="./data/courtlistener/raw/opinions-2022-08-02.csv",
        help="The input CSV file to convert.",
    )
    parser.add_argument(
        "--shard_size", type=int, default=1, help="Size, in GB, for each shard."
    )
    args = parser.parse_args()
    main(args)
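
To convert a single CSV file without going through `get_data.sh`, an invocation using the script's defaults (the paths shown below are simply those defaults) might look like:

````
python courtlistener/csv_to_dolma.py \
    --input_file ./data/courtlistener/raw/opinions-2022-08-02.csv \
    --output_dir data/courtlistener/v0 \
    --shard_size 1
````
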
106 changes: 106 additions & 0 deletions courtlistener/get_data.sh
@@ -0,0 +1,106 @@
#!/bin/bash
set -e

# URL of the directory
base_url="https://storage.courtlistener.com/bulk-data/"

# Define the download directory
download_dir="./data/courtlistener/raw"

# Create the directory if it does not exist
mkdir -p "$download_dir"

dates=(
"2022-08-02"
"2022-08-31"
"2022-09-30"
"2022-10-31"
"2022-11-30"
"2022-12-31"
"2023-01-31"
"2023-02-28"
"2023-03-31"
"2023-04-30"
"2023-05-31"
"2023-07-31"
"2023-08-31"
"2023-12-04"
"2024-03-11"
)

max_jobs=8

# Parse command-line options
while [[ $# -gt 0 ]]; do
    key="$1"
    case $key in
        --test_run)
            # Use the first N dates for testing
            shift
            test_run_count=$1
            dates=("${dates[@]:0:$test_run_count}")
            shift
            ;;
        --max_jobs)
            # Set the maximum number of parallel jobs
            shift
            max_jobs=$1
            shift
            ;;
        *)
            echo "Unknown option: $key"
            exit 1
            ;;
    esac
done

# Display the dates of the files to be fetched
echo "Dates of files to be fetched:"
for date in "${dates[@]}"; do
    echo "$date"
done

# Function to download and decompress a file
download_and_decompress() {
    local file_name="opinions-${1}.csv"
    # local file_name="financial-disclosure-investments-${1}.csv"
    local file_url="${base_url}${file_name}.bz2"
    local decompressed_file="${download_dir}/${file_name}"
    local compressed_file="${download_dir}/${file_name}.bz2"

    # Check if the decompressed file already exists
    if [[ -f "$decompressed_file" ]]; then
        echo "Decompressed file ${decompressed_file} already exists, skipping..."
    else
        # Check if the compressed file already exists
        if [[ -f "$compressed_file" ]]; then
            echo "Compressed file ${compressed_file} already exists, skipping download..."
        else
            # Download the file
            wget -P "$download_dir" "$file_url"
        fi
        # Decompress the file
        bunzip2 "$compressed_file"
        echo "Decompressed ${compressed_file} ..."
    fi

    # Transform the CSV file into sharded dolma data
    echo "Saving records in ${decompressed_file} as dolma data"
    python ./courtlistener/csv_to_dolma.py --input_file "${decompressed_file}"
}


# Download each file
for date in "${dates[@]}"; do
    download_and_decompress "$date" &

    # Limit the number of parallel jobs
    if (( $(jobs -r | wc -l) >= max_jobs )); then
        wait -n
    fi
done

# Wait for all background jobs to finish
wait

echo "Download and decompression completed."
3 changes: 3 additions & 0 deletions courtlistener/process_csv_file.sh
@@ -0,0 +1,3 @@
#!/usr/bin/env sh
set -e
python courtlistener/process_csv.py
15 changes: 8 additions & 7 deletions requirements.txt
@@ -1,11 +1,12 @@
tqdm
rdflib
pre-commit
google-cloud-storage
dolma
smart_open
markdown-it-py
charset_normalizer
dolma
google-cloud-storage
logging_json
markdown-it-py
pandas
pre-commit
rdflib
requests>=2.13
smart_open
tenacity
tqdm