Skip to content

Commit

Permalink
improve downloader
Browse files Browse the repository at this point in the history
  • Loading branch information
weekendfish committed Jan 21, 2025
1 parent 97a506d commit cc43960
Show file tree
Hide file tree
Showing 5 changed files with 147 additions and 2 deletions.
5 changes: 5 additions & 0 deletions download/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -32,3 +32,8 @@ all:
# Download webpages from two random-ordered passes into separate archives.
# (The commit accidentally duplicated the all_webpage1 pass; run each once.)
random:
	python .github/downloader/download/download.py --output-dir .github/downloader/webpage_archive/raw/all_webpage --download-type webpage --pattern ".*" --order random
	python .github/downloader/download/download.py --output-dir .github/downloader/webpage_archive/raw/all_webpage1 --download-type webpage --pattern ".*" --order random

jina:
python .github/downloader/download/download.py --download-type jina --output-dir .github/downloader/webpage_archive/new_all_results/jinabatch1 --pattern ".*" --order random
7 changes: 6 additions & 1 deletion download/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,11 @@ def process_links_file(yaml_path, output_dir, related_filter='true', file_patter
success, result = download_webpage(url, output_dir, title)
elif download_type == 'jina':
success, result = download_jina(url, output_dir, title)
elif download_type == 'both':
# Try PDF first, if it fails try webpage
success, result = download_pdf(url, output_dir, title)
if not success:
success, result = download_webpage(url, output_dir, title)
else:
print(f"✗ Invalid download type: {download_type}")
continue
Expand Down Expand Up @@ -257,7 +262,7 @@ def main():
'--download-type',
choices=['pdf', 'webpage', 'jina', 'both'],
default='pdf',
help='Type of download to perform (default: both)'
help='Type of download to perform (pdf, webpage, jina, or both)'
)

parser.add_argument(
Expand Down
54 changes: 54 additions & 0 deletions download/jinadown.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
import os
import hashlib
import subprocess
from pathlib import Path
import json
import time
def get_file_md5(filepath):
    """Return the hexadecimal MD5 digest of the file at *filepath*.

    The file is read in 4 KiB chunks so arbitrarily large files can be
    hashed without loading them fully into memory.
    """
    digest = hashlib.md5()
    with open(filepath, "rb") as stream:
        while chunk := stream.read(4096):
            digest.update(chunk)
    return digest.hexdigest()

def download_jina(url, output_dir, title):
    """Download a webpage as Markdown via the Jina Reader API (r.jina.ai).

    Args:
        url: Page URL to fetch.
        output_dir: Directory the .md file is written into (created if missing).
        title: Human-readable title used to derive the output filename.

    Returns:
        (True, output_path) on success, (False, error_message) on failure.
    """
    try:
        # Create output directory if it doesn't exist
        os.makedirs(output_dir, exist_ok=True)

        # Generate a filesystem-safe base filename from the title.
        safe_title = "".join(c for c in title if c.isalnum() or c in (' ', '-', '_')).rstrip()
        base_name = safe_title.replace(' ', '_')[:100]
        if not base_name:
            # A title with no safe characters would otherwise produce the
            # filename ".md" and every such page would overwrite the last;
            # fall back to a hash of the URL instead.
            base_name = hashlib.md5(url.encode('utf-8')).hexdigest()
        filename = f"{base_name}.md"
        output_path = os.path.join(output_dir, filename)

        # Sleep 10 seconds to stay under the Jina Reader rate limit.
        time.sleep(10)

        # Prepare curl command with Jina Reader headers.
        jina_url = f"https://r.jina.ai/{url}"
        command = [
            'curl',
            '--location',
            jina_url,
            '-H', 'X-Engine: readerlm-v2',
            '-H', 'X-With-Iframe: true',
            '-H', 'X-With-Shadow-Dom: true',
            '--no-progress-meter',
        ]

        # Execute curl; the '-v' debug flag was dropped so stderr (used in
        # the error message below) only carries real errors.
        result = subprocess.run(command, capture_output=True, text=True)
        if result.returncode != 0:
            raise Exception(f"curl failed with error: {result.stderr}")

        # Save the content
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(result.stdout)

        return True, output_path

    except Exception as e:
        print(f"✗ Unexpected error: {e}")
        return False, str(e)
81 changes: 81 additions & 0 deletions file_processor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
import os
import shutil
import argparse
import yaml
from pathlib import Path

def is_valid_cleaned_file(file_path):
    """Return True when *file_path* contains real cleaned content.

    A file is rejected when it cannot be read, is empty or whitespace-only,
    or consists solely of one of the known error markers ('太长', '爬取错误').
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            text = handle.read().strip()
    except Exception as e:
        print(f"Error reading file {file_path}: {e}")
        return False
    return bool(text) and text not in ('太长', '爬取错误')

def get_original_links(page_yml_path):
    """Load the '<name>.html' -> record mapping from page.yml.

    Returns an empty dict when the file is missing, unparsable, or an empty
    YAML document, so callers can always use `in` / indexing safely.
    """
    try:
        with open(page_yml_path, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
        # yaml.safe_load returns None for an empty document; a None here
        # would make the caller's `key in original_links` raise TypeError.
        return data or {}
    except Exception as e:
        print(f"Error reading page.yml: {e}")
        return {}

def append_original_link(file_path, original_link):
    """Append the page's source URL to *file_path* as an HTML comment marker."""
    marker = f"\n<!-- tcd_original_link {original_link} -->\n"
    try:
        with open(file_path, 'a', encoding='utf-8') as handle:
            handle.write(marker)
    except Exception as e:
        print(f"Error appending original link to {file_path}: {e}")

def process_files(source_dir, target_dir):
    """Copy valid cleaned .md files from <source_dir>/ready into target_dir.

    Each successfully copied file gets its original URL (looked up by its
    '.html' name in <source_dir>/downloads/page.yml) appended as an HTML
    comment via append_original_link.
    """
    source_dir = Path(source_dir)
    target_dir = Path(target_dir)
    # Create target_dir (and parents) if it does not exist.
    target_dir.mkdir(parents=True, exist_ok=True)

    # Process ready directory
    ready_dir = source_dir / 'ready'
    if not ready_dir.exists():
        print(f"Ready directory not found: {ready_dir}")
        return

    # Get original links from page.yml
    downloads_dir = source_dir / 'downloads'
    page_yml = downloads_dir / 'page.yml'
    original_links = get_original_links(page_yml)

    # Copy valid files from ready directory
    for file_path in ready_dir.glob('*.md'):
        if is_valid_cleaned_file(file_path):
            target_file = target_dir / file_path.name
            try:
                shutil.copy2(file_path, target_file)
                print(f"Copied: {file_path.name}")
                # page.yml is keyed by the original '.html' filename. Swap
                # only the suffix — str.replace('.md', '.html') would also
                # rewrite a '.md' occurring mid-name.
                file_name_html = file_path.with_suffix('.html').name
                if file_name_html in original_links:
                    original_link = original_links[file_name_html]['link']
                    append_original_link(target_file, original_link)
                else:
                    print(f"not found: {file_name_html}")
            except Exception as e:
                print(f"Error copying {file_path}: {e}")

def main():
    """CLI entry point: copy cleaned files from one directory to another."""
    parser = argparse.ArgumentParser(description='Process and copy cleaned files')
    parser.add_argument('source_dir', help='Source directory path')
    parser.add_argument('target_dir', help='Target directory path')
    options = parser.parse_args()
    process_files(options.source_dir, options.target_dir)


if __name__ == '__main__':
    main()
2 changes: 1 addition & 1 deletion webpage_archive

0 comments on commit cc43960

Please sign in to comment.