Add scripts to download and process CourtListener Opinion data #59

Merged
merged 4 commits on May 11, 2024
Changes from 1 commit
77 changes: 77 additions & 0 deletions courtlistener/csv_to_dolma.py
@@ -0,0 +1,77 @@
#!/usr/bin/env python
"""
Created by zhenlinx on 01/19/2024
"""
import argparse
import pandas as pd
import glob
import os
import sys
import csv
from datetime import datetime
import logging

from licensed_pile.licenses import PermissiveLicenses
from licensed_pile.write import to_dolma

SOURCE_NAME = "CourtListenerOpinion"

# Court opinion texts can exceed csv's default field size limit (~128 KB),
# so raise it to the platform maximum.
csv.field_size_limit(sys.maxsize)

logging.basicConfig(level=logging.INFO, format="to-dolma: [%(asctime)s] [%(funcName)s] %(levelname)s - %(message)s")


def make_record_generator(file_path):
with open(file_path, 'r') as csvfile:
# Create a CSV reader object
reader = csv.DictReader(csvfile)

# Yield a dictionary for each row
for row in reader:
# 'row' is a dictionary with column headers as keys

            if not row["plain_text"]:
                pass  # TODO: load the text from row["download_url"] when it is not null
else:
yield {
"id": row["id"],
"text": row["plain_text"],
"source": SOURCE_NAME,
"added": datetime.utcnow().isoformat(),
"metadata": {
"license": str(PermissiveLicenses.PD),
"url": row["download_url"],
},
}


def main(args):
# Path to your large CSV file
# for csv_file_path in glob.iglob(os.path.join(args.data, "*.txt")):
# file_path = './data/courtlistener/raw/opinions-2022-08-02.csv'
example_generator = make_record_generator(args.input_file)
output_file_base_name = os.path.basename(args.input_file).replace('.csv', '.jsonl.gz')
to_dolma(example_generator, args.output_dir, output_file_base_name, args.shard_size)
    logging.info(f"Saved {args.input_file} as dolma sharded files at {args.output_dir}")
@blester125 (Collaborator) commented on May 6, 2024:
With the new logger you need to do this:

logger = logs.get_logger("court-listener-opinion")
logger.info(...)

instead of using the root logger (logging.info(...))

Let's fix this and then I think it's good to merge!
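
For context, a minimal sketch of the requested change (the import path is an assumption based on this file's other licensed_pile imports; get_logger is taken verbatim from the comment above):

from licensed_pile import logs  # assumed module, alongside licensed_pile.licenses and licensed_pile.write

# One named logger for this module, instead of the root logger.
logger = logs.get_logger("court-listener-opinion")

# Then, inside main():
logger.info(f"Saved {args.input_file} as dolma sharded files at {args.output_dir}")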

The author (Collaborator) replied:

Can I do the following?

logger = configure_logging("court-listener-opinion")

...

logger.info(...)





if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Convert csv data to dolma.")
# parser.add_argument(
# "--data", default=f"data/courtlistener/raw", help="Path to the directory containing raw data."
# )
    parser.add_argument(
        "--output_dir",
        default="data/courtlistener/v0",
        help="Where the dolma formatted data goes.",
    )
    parser.add_argument(
        "--input_file",
        default="./data/courtlistener/raw/opinions-2022-08-02.csv",
        help="Path to the input CSV file.",
    )
parser.add_argument(
"--shard_size", type=int, default=1, help="Size, in GB, for each shard."
)
args = parser.parse_args()
main(args)
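
For readers new to the dolma format, the shape of a single record yielded by make_record_generator looks roughly like this (the id, text, and url values are illustrative placeholders, not real CourtListener data):

example_record = {
    "id": "1234567",                     # row["id"] from the CSV
    "text": "Opinion of the court ...",  # row["plain_text"]
    "source": "CourtListenerOpinion",
    "added": "2024-05-06T00:00:00",      # datetime.utcnow().isoformat()
    "metadata": {
        "license": str(PermissiveLicenses.PD),
        "url": "https://www.courtlistener.com/...",  # row["download_url"]
    },
}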
75 changes: 75 additions & 0 deletions courtlistener/get_data.sh
@@ -0,0 +1,75 @@
#!/bin/bash
set -e

# Base URL of the CourtListener bulk-data directory
base_url="https://storage.courtlistener.com/bulk-data/"

# Define the download directory
download_dir="./data/courtlistener/raw"

# Create the directory if it does not exist
mkdir -p "$download_dir"

dates=(
"2022-08-02"
# "2022-08-31"
# "2022-09-30"
# "2022-10-31"
# "2022-11-30"
# "2022-12-31"
# "2023-01-31"
# "2023-02-28"
# "2023-03-31"
# "2023-04-30"
# "2023-05-31"
# "2023-07-31"
# "2023-08-31"
# "2023-12-04"
)

max_jobs=4

# Download, decompress, and convert one opinions dump to dolma
download_and_decompress() {
local file_name="opinions-${1}.csv"
# local file_name="financial-disclosure-investments-${1}.csv"
local file_url="${base_url}${file_name}.bz2"
local decompressed_file="${download_dir}/${file_name}"
local compressed_file="${download_dir}/${file_name}.bz2"

# Check if the decompressed file already exists
if [[ -f "$decompressed_file" ]]; then
echo "Decompressed file ${decompressed_file} already exists, skipping..."
else
# Check if the compressed file already exists
if [[ -f "$compressed_file" ]]; then
echo "Compressed file ${compressed_file} already exists, skipping download..."
else
# Download the file
wget -P "$download_dir" "$file_url"
fi
        # Decompress the file
        bunzip2 "$compressed_file"
        echo "Decompressed ${compressed_file} to ${decompressed_file}"
fi

    # Transform the CSV file into sharded dolma data
    echo "Saving records in ${decompressed_file} as dolma data"
    python ./courtlistener/csv_to_dolma.py --input_file "${decompressed_file}"
}


# Download and process each dump, limiting parallelism to max_jobs
for date in "${dates[@]}"; do
download_and_decompress "$date" &

# Limit the number of parallel jobs
if (( $(jobs -r | wc -l) >= max_jobs )); then
wait -n
fi
done

# Wait for all background jobs to finish
wait

echo "Download and decompression completed."
3 changes: 3 additions & 0 deletions courtlistener/process_csv_file.sh
@@ -0,0 +1,3 @@
#!/usr/bin/env sh
set -e
python courtlistener/process_csv.py
1 change: 1 addition & 0 deletions requirements.txt
@@ -7,3 +7,4 @@ smart_open
markdown-it-py
charset_normalizer
logging_json
pandas
A collaborator commented:
Pandas doesn't seem to be used? Can we remove it for now?