Commit c3239e0: init

yunwei37 committed Nov 11, 2024 (0 parents)
Showing 346 changed files with 695,042 additions and 0 deletions.
7 changes: 7 additions & 0 deletions .gitignore
@@ -0,0 +1,7 @@
node_modules
res.md
res.html
output.md
monolithic.html
original.html
original_clean.html
21 changes: 21 additions & 0 deletions LICENSE
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2022 云微

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
24 changes: 24 additions & 0 deletions README.md
@@ -0,0 +1,24 @@
# downloader

This is part of the Chinese transgender digital archive project.

Scripts and results for searching and downloading webpages.

## Search

- puppeteer: search for webpages using Puppeteer.
- serper: search for webpages using the Serper API.
- googlecustom: search for webpages using the Google Custom Search JSON API.
- google: search for webpages using the `google` Python library.

Run `./gen_links` to summarize all links into a YAML file.
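
Each entry in the resulting `links.yml` maps a URL to its search metadata. A minimal sketch of one entry, inferred from the fields that `ai/check_related.py` reads (the exact layout may differ):

```yaml
https://example.com/article:   # hypothetical URL
  title: Example article title
  snippet: Short excerpt returned by the search engine
  is_related: unknown          # later set to true/false/notsure by ai/check_related.py
```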

## Download

See the `download` directory.

Currently, webpages and PDFs are supported.
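
For example, `download/Makefile` in this commit invokes the downloader as follows (the path assumes this repository is vendored under `.github/downloader`):

```sh
python .github/downloader/download/download.py \
    --output-dir workspace --download-type pdf --pattern ".*pdf.*"
```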

## LICENSE

MIT
144 changes: 144 additions & 0 deletions ai/check_related.py
@@ -0,0 +1,144 @@
import yaml
import json
import tempfile
import subprocess
import os
from pathlib import Path
import multiprocessing
from functools import partial

def load_template(template_path):
"""Load the template file"""
with open(template_path, 'r', encoding='utf-8') as f:
return f.read()

def get_ai_classification(title, link, snippet, gen_struct_path, template):
"""Ask AI to classify if the content is related"""
# Define the JSON schema for classification
schema = {
"type": "object",
"properties": {
"is_related": {
"type": "string",
"enum": ["True", "False", "NotSure"],
"description": "Whether the content is related to transgender/LGBTQ+ topics"
}
},
"required": ["is_related"],
"additionalProperties": False
}

# Create temporary files
with tempfile.NamedTemporaryFile(mode='w+', delete=False, suffix='.txt') as temp_input:
# Fill in the template
prompt = template.format(
title=title or "Untitled",
link=link,
snippet=snippet or ""
)
temp_input.write(prompt)
print(f"Prompt: {prompt}")
temp_input_path = temp_input.name

with tempfile.NamedTemporaryFile(mode='w+', delete=False, suffix='.json') as temp_schema:
json.dump(schema, temp_schema)
schema_file = temp_schema.name

with tempfile.NamedTemporaryFile(mode='w+', delete=False, suffix='.json') as temp_output:
temp_output_path = temp_output.name

try:
# Run gen_struct.py
subprocess.run([
'python', gen_struct_path,
temp_input_path, temp_output_path, schema_file
], check=True)

# Read the result
with open(temp_output_path, 'r', encoding='utf-8') as f:
result = json.load(f)
print(f"Result: {result}")
return result["is_related"].lower() # Convert to lowercase to match YAML
except Exception as e:
print(f"Error during AI classification: {e}")
return "unknown"
finally:
# Cleanup temporary files
os.unlink(temp_input_path)
os.unlink(temp_output_path)
os.unlink(schema_file)

def process_url(template, gen_struct_path, url_data):
"""Process a single URL (to be run in parallel)"""
url, data = url_data
print(f"Processing: {url}")
result = get_ai_classification(
data.get('title'),
url,
data.get('snippet'),
gen_struct_path,
template
)
if result != 'unknown':
return url, result
return None

def main():
# File paths
links_path = Path('.github/links.yml')
template_path = Path('.github/prompts/check_related.md.template')
gen_struct_path = Path('.github/scripts/ai/gen_struct.py')

# Load files
with open(links_path, 'r', encoding='utf-8') as f:
links_data = yaml.safe_load(f)

template = load_template(template_path)

# Process each unknown entry
modified = False
batch_count = 0

# Get items needing processing
to_process = [(url, data) for url, data in links_data.items()
if not data.get('is_related') or data.get('is_related') == 'unknown']

# Create a pool with 5 processes
with multiprocessing.Pool(5) as pool:
# Create a partial function with template and gen_struct_path
process_func = partial(process_url, template, gen_struct_path)

# Process items in chunks of 5
for i in range(0, len(to_process), 5):
chunk = to_process[i:i + 5]
print(f"Processing batch {i//5 + 1}/{(len(to_process) + 4)//5}")

# Process chunk in parallel
results = pool.map(process_func, chunk)

# Update results
modified_in_batch = False
for result in results:
if result:
url, is_related = result
links_data[url]['is_related'] = is_related
modified = True
modified_in_batch = True
batch_count += 1
print(f"Updated {url} to {is_related}")

# Write changes after every 6 batches
if modified_in_batch and (i//5 + 1) % 6 == 0:
with open(links_path, 'w', encoding='utf-8') as f:
yaml.dump(links_data, f, allow_unicode=True)
f.flush()
print(f"Batch of {batch_count} changes saved to links.yml")
batch_count = 0

    # Final write, in case the last batches were not yet flushed
    if modified:
        with open(links_path, 'w', encoding='utf-8') as f:
            yaml.dump(links_data, f, allow_unicode=True)
        print(f"All changes saved to {links_path}")
    else:
        print("No changes were necessary")

if __name__ == "__main__":
main()
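
Note: `ai/check_related.py` fills `.github/prompts/check_related.md.template` via `str.format`, so the template must expose `{title}`, `{link}`, and `{snippet}` placeholders. The actual prompt text is not part of this commit; a minimal sketch of such a template:

```
Decide whether the following search result is related to transgender/LGBTQ+ topics.

Title: {title}
Link: {link}
Snippet: {snippet}

Answer with one of: True, False, NotSure.
```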
66 changes: 66 additions & 0 deletions ai/gen.py
@@ -0,0 +1,66 @@
import os
import openai
import argparse
from openai import OpenAI
from dotenv import load_dotenv

load_dotenv()
openai.api_key = os.getenv('OPENAI_API_KEY')
model_name = os.getenv('OPENAI_MODEL_NAME') or "gpt-4o"
# The env var arrives as a string; convert it so it can be passed to the API.
temperature = float(os.getenv('OPENAI_TEMPERATURE', '0.7'))
client = OpenAI()

def read_file(file_path):
"""Read the content of the input file."""
with open(file_path, 'r', encoding='utf-8') as file:
return file.read()

def write_file(file_path, content):
"""Write the content to the output file."""
with open(file_path, 'w', encoding='utf-8') as file:
file.write(content)

def generate_cleanup_content(content):
"""Send the prompt and content to OpenAI's API and get the cleaned content."""

    completion = client.chat.completions.create(
        model=model_name,
        temperature=temperature,
        messages=[
            {"role": "user", "content": content}
        ]
    )

return str(completion.choices[0].message.content)

def main():
# Set up command-line argument parsing
parser = argparse.ArgumentParser(
description="Generate a cleaned-up version of a text file using OpenAI's GPT-4."
)
parser.add_argument('input_file', help='Path to the input .txt file')
parser.add_argument('output_file', help='Path to save the cleaned output file')

args = parser.parse_args()

try:

# Read input file
input_content = read_file(args.input_file)

# Generate cleaned content
cleaned_content = generate_cleanup_content(input_content)

# Write to output file
write_file(args.output_file, cleaned_content)

print(f"Successfully processed '{args.input_file}' and saved to '{args.output_file}'.")

except Exception as e:
print(f"An error occurred: {e}")

if __name__ == "__main__":
main()
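
A possible invocation (file names are illustrative; the script reads `OPENAI_API_KEY`, and optionally `OPENAI_MODEL_NAME` and `OPENAI_TEMPERATURE`, from the environment or a `.env` file):

```sh
python ai/gen.py raw_page.txt cleaned_page.txt
```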
110 changes: 110 additions & 0 deletions ai/gen_struct.py
@@ -0,0 +1,110 @@
import os
import json
import openai
import argparse
from openai import OpenAI
from dotenv import load_dotenv
import base64

load_dotenv()
openai.api_key = os.getenv('OPENAI_API_KEY')
model_name = os.getenv('OPENAI_MODEL_NAME') or "gpt-4o"
print(f"Using model: {model_name}")
# The env var arrives as a string; convert it so it can be passed to the API.
temperature = float(os.getenv('OPENAI_TEMPERATURE', '0.7'))
print(f"Using temperature: {temperature}")
client = OpenAI()

def read_file(file_path):
"""Read the content of the input file."""
with open(file_path, 'r', encoding='utf-8') as file:
return file.read()

def write_file(file_path, content):
"""Write the content to the output file."""
with open(file_path, 'w', encoding='utf-8') as file:
file.write(content)

def encode_image(image_path):
"""Encode image to base64 string."""
with open(image_path, "rb") as image_file:
return base64.b64encode(image_file.read()).decode('utf-8')

def generate_cleanup_content(content, schema, image_path=None):
"""Send the prompt and content to OpenAI's API and get the structured content."""

messages = [
{"role": "system", "content": f"You are a helpful assistant that generates structured output based on the following JSON schema: {json.dumps(schema)}"}
]

# Prepare user message with optional image
if image_path:
base64_image = encode_image(image_path)
messages.append({
"role": "user",
"content": [
{
"type": "text",
"text": content
},
{
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{base64_image}"
}
}
]
})
else:
messages.append({"role": "user", "content": content})

    completion = client.chat.completions.create(
        model=model_name,
        temperature=temperature,
        messages=messages,
        response_format={
            "type": "json_schema",
            "json_schema": {
                "name": "response",
                "schema": schema,
                "strict": True
            }
        }
    )

return json.loads(completion.choices[0].message.content)

def main():
# Set up command-line argument parsing
parser = argparse.ArgumentParser(
description="Generate a structured version of a text file using OpenAI's GPT-4."
)
parser.add_argument('input_file', help='Path to the input .txt file')
parser.add_argument('output_file', help='Path to save the structured output file')
parser.add_argument('schema_file', help='Path to the JSON schema file')
parser.add_argument('--image', help='Optional path to an image file', default=None)

args = parser.parse_args()

try:
# Read input file
input_content = read_file(args.input_file)

# Read schema file
schema = json.loads(read_file(args.schema_file))

# Generate structured content with optional image
structured_content = generate_cleanup_content(input_content, schema, args.image)

# Write to output file
write_file(args.output_file, json.dumps(structured_content, indent=2))

print(f"Successfully processed '{args.input_file}' and saved structured output to '{args.output_file}'.")

except Exception as e:
print(f"An error occurred: {e}")

if __name__ == "__main__":
main()
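
Usage mirrors `ai/gen.py` but adds a required JSON-schema file and an optional image. A sketch with illustrative file names (`schema.json` could be, for instance, the `is_related` schema that `ai/check_related.py` builds):

```sh
python ai/gen_struct.py prompt.txt result.json schema.json
# Optional multimodal variant: the image is sent base64-encoded alongside the prompt
python ai/gen_struct.py prompt.txt result.json schema.json --image page.jpg
```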
17 changes: 17 additions & 0 deletions download/Makefile
@@ -0,0 +1,17 @@
news:
python .github/downloader/download/download.py --output-dir webpage_archive/raw/sina1 --download-type webpage --pattern ".*sina.*"
python .github/downloader/download/download.py --output-dir workspace_news --download-type webpage --pattern ".*news.ifeng.*"
python .github/downloader/download/download.py --output-dir webpage_archive/raw/ifeng.com1 --download-type webpage --pattern ".*ifeng.com.*"
python .github/downloader/download/download.py --output-dir workspace_news --download-type webpage --pattern ".*news.*"
python .github/downloader/download/download.py --output-dir workspace_news --download-type webpage --pattern ".*thepaper.cn.*"
python .github/downloader/download/download.py --output-dir webpage_archive/raw/thepaper.cn1 --download-type webpage --pattern ".*thepaper.cn.*"
python .github/downloader/download/download.py --output-dir webpage_archive/raw/sohu.com --download-type webpage --pattern ".*sohu.com.*"
python .github/downloader/download/download.py --output-dir webpage_archive/raw/163.com --download-type webpage --pattern ".*163.com/dy.*"
python .github/downloader/download/download.py --output-dir webpage_archive/raw/chinanews.com --download-type webpage --pattern ".*chinanews.com.*"
python .github/downloader/download/download.py --output-dir webpage_archive/raw/news.qq.com --download-type webpage --pattern ".*news.qq.com.*"
python .github/downloader/download/download.py --output-dir webpage_archive/raw/unclassify_news1 --download-type webpage --pattern ".*news.*"



pdf:
python .github/downloader/download/download.py --output-dir workspace --download-type pdf --pattern ".*pdf.*"
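
These targets appear to assume this repository is vendored under `.github/downloader` of the archive repository and that make runs from that repository's root, e.g.:

```sh
make -f .github/downloader/download/Makefile news   # archive matching news sites
make -f .github/downloader/download/Makefile pdf    # download PDF links
```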