Skip to content

Commit

Permalink
improve downloader
Browse files Browse the repository at this point in the history
  • Loading branch information
weekendfish committed Jan 21, 2025
1 parent 97a506d commit cc43960
Show file tree
Hide file tree
Showing 5 changed files with 147 additions and 2 deletions.
5 changes: 5 additions & 0 deletions download/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -32,3 +32,8 @@ all:
# Download webpages from two random-ordered passes into separate archives.
# (The commit accidentally duplicated the all_webpage1 pass; run each once.)
random:
	python .github/downloader/download/download.py --output-dir .github/downloader/webpage_archive/raw/all_webpage --download-type webpage --pattern ".*" --order random
	python .github/downloader/download/download.py --output-dir .github/downloader/webpage_archive/raw/all_webpage1 --download-type webpage --pattern ".*" --order random

jina:
python .github/downloader/download/download.py --download-type jina --output-dir .github/downloader/webpage_archive/new_all_results/jinabatch1 --pattern ".*" --order random
7 changes: 6 additions & 1 deletion download/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,11 @@ def process_links_file(yaml_path, output_dir, related_filter='true', file_patter
success, result = download_webpage(url, output_dir, title)
elif download_type == 'jina':
success, result = download_jina(url, output_dir, title)
elif download_type == 'both':
# Try PDF first, if it fails try webpage
success, result = download_pdf(url, output_dir, title)
if not success:
success, result = download_webpage(url, output_dir, title)
else:
print(f"✗ Invalid download type: {download_type}")
continue
Expand Down Expand Up @@ -257,7 +262,7 @@ def main():
'--download-type',
choices=['pdf', 'webpage', 'jina', 'both'],
default='pdf',
help='Type of download to perform (default: both)'
help='Type of download to perform (pdf, webpage, jina, or both)'
)

parser.add_argument(
Expand Down
54 changes: 54 additions & 0 deletions download/jinadown.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
import os
import hashlib
import subprocess
from pathlib import Path
import json
import time
def get_file_md5(filepath):
    """Return the hexadecimal MD5 digest of the file at *filepath*.

    The file is read in 4 KiB chunks so arbitrarily large files can be
    hashed without loading them fully into memory.
    """
    digest = hashlib.md5()
    with open(filepath, "rb") as stream:
        while chunk := stream.read(4096):
            digest.update(chunk)
    return digest.hexdigest()

def download_jina(url, output_dir, title):
    """Download a webpage as Markdown via the Jina Reader API (r.jina.ai).

    Args:
        url: Page URL to fetch.
        output_dir: Directory the .md file is written into (created if missing).
        title: Human-readable title used to derive the output filename.

    Returns:
        (True, output_path) on success, (False, error_message) on failure.
    """
    try:
        # Create output directory if it doesn't exist
        os.makedirs(output_dir, exist_ok=True)

        # Generate a filesystem-safe base filename from the title.
        safe_title = "".join(c for c in title if c.isalnum() or c in (' ', '-', '_')).rstrip()
        base_name = safe_title.replace(' ', '_')[:100]
        if not base_name:
            # A title with no safe characters would otherwise produce the
            # filename ".md" and every such page would overwrite the last;
            # fall back to a hash of the URL instead.
            base_name = hashlib.md5(url.encode('utf-8')).hexdigest()
        filename = f"{base_name}.md"
        output_path = os.path.join(output_dir, filename)

        # Sleep 10 seconds to stay under the Jina Reader rate limit.
        time.sleep(10)

        # Prepare curl command with Jina Reader headers.
        jina_url = f"https://r.jina.ai/{url}"
        command = [
            'curl',
            '--location',
            jina_url,
            '-H', 'X-Engine: readerlm-v2',
            '-H', 'X-With-Iframe: true',
            '-H', 'X-With-Shadow-Dom: true',
            '--no-progress-meter',
        ]

        # Execute curl; the '-v' debug flag was dropped so stderr (used in
        # the error message below) only carries real errors.
        result = subprocess.run(command, capture_output=True, text=True)
        if result.returncode != 0:
            raise Exception(f"curl failed with error: {result.stderr}")

        # Save the content
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(result.stdout)

        return True, output_path

    except Exception as e:
        print(f"✗ Unexpected error: {e}")
        return False, str(e)
81 changes: 81 additions & 0 deletions file_processor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
import os
import shutil
import argparse
import yaml
from pathlib import Path

def is_valid_cleaned_file(file_path):
    """Return True when *file_path* contains real cleaned content.

    A file is rejected when it cannot be read, is empty or whitespace-only,
    or consists solely of one of the known error markers ('太长', '爬取错误').
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            text = handle.read().strip()
    except Exception as e:
        print(f"Error reading file {file_path}: {e}")
        return False
    return bool(text) and text not in ('太长', '爬取错误')

def get_original_links(page_yml_path):
    """Load the '<name>.html' -> record mapping from page.yml.

    Returns an empty dict when the file is missing, unparsable, or an empty
    YAML document, so callers can always use `in` / indexing safely.
    """
    try:
        with open(page_yml_path, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
        # yaml.safe_load returns None for an empty document; a None here
        # would make the caller's `key in original_links` raise TypeError.
        return data or {}
    except Exception as e:
        print(f"Error reading page.yml: {e}")
        return {}

def append_original_link(file_path, original_link):
    """Append the page's source URL to *file_path* as an HTML comment marker."""
    marker = f"\n<!-- tcd_original_link {original_link} -->\n"
    try:
        with open(file_path, 'a', encoding='utf-8') as handle:
            handle.write(marker)
    except Exception as e:
        print(f"Error appending original link to {file_path}: {e}")

def process_files(source_dir, target_dir):
    """Copy valid cleaned .md files from <source_dir>/ready into target_dir.

    Each successfully copied file gets its original URL (looked up by its
    '.html' name in <source_dir>/downloads/page.yml) appended as an HTML
    comment via append_original_link.
    """
    source_dir = Path(source_dir)
    target_dir = Path(target_dir)
    # Create target_dir (and parents) if it does not exist.
    target_dir.mkdir(parents=True, exist_ok=True)

    # Process ready directory
    ready_dir = source_dir / 'ready'
    if not ready_dir.exists():
        print(f"Ready directory not found: {ready_dir}")
        return

    # Get original links from page.yml
    downloads_dir = source_dir / 'downloads'
    page_yml = downloads_dir / 'page.yml'
    original_links = get_original_links(page_yml)

    # Copy valid files from ready directory
    for file_path in ready_dir.glob('*.md'):
        if is_valid_cleaned_file(file_path):
            target_file = target_dir / file_path.name
            try:
                shutil.copy2(file_path, target_file)
                print(f"Copied: {file_path.name}")
                # page.yml is keyed by the original '.html' filename. Swap
                # only the suffix — str.replace('.md', '.html') would also
                # rewrite a '.md' occurring mid-name.
                file_name_html = file_path.with_suffix('.html').name
                if file_name_html in original_links:
                    original_link = original_links[file_name_html]['link']
                    append_original_link(target_file, original_link)
                else:
                    print(f"not found: {file_name_html}")
            except Exception as e:
                print(f"Error copying {file_path}: {e}")

def main():
    """CLI entry point: copy cleaned files from one directory to another."""
    parser = argparse.ArgumentParser(description='Process and copy cleaned files')
    parser.add_argument('source_dir', help='Source directory path')
    parser.add_argument('target_dir', help='Target directory path')
    options = parser.parse_args()
    process_files(options.source_dir, options.target_dir)


if __name__ == '__main__':
    main()
2 changes: 1 addition & 1 deletion webpage_archive

0 comments on commit cc43960

Please sign in to comment.