Commit c3239e0 (0 parents): 346 changed files, 695,042 additions, 0 deletions.
@@ -0,0 +1,7 @@
```
node_modules
res.md
res.html
output.md
monolithic.html
original.html
original_clean.html
```
@@ -0,0 +1,21 @@

MIT License

Copyright (c) 2022 云微

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
@@ -0,0 +1,24 @@

# downloader

This is part of the Chinese transgender digital archive project.

Scripts and results for searching and downloading webpages.

## Search

- puppeteer: search for webpages using Puppeteer.
- serper: search for webpages using Serper.
- googlecustom: search for webpages using the Google Custom Search JSON API.
- google: search for webpages using the google Python library.

Run ./gen_links to summarize all links into a YAML file (a sketch of the resulting format follows this file).

## Download

See the download directory.

Currently, webpages and PDFs are supported.

## LICENSE

MIT
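The gen_links script itself is not included in this excerpt, but the classification script further below reads `.github/links.yml` as a mapping from URL to `title`, `snippet`, and `is_related`. A minimal sketch of that format, with placeholder values, might look like:

```yaml
# Hypothetical entry in .github/links.yml; the URL, title, and snippet
# are placeholders, not data from the repository.
"https://example.com/some-article":
  title: "Example article title"
  snippet: "Search-result snippet for the page."
  is_related: unknown   # later rewritten to true / false / notsure by the classifier
```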
@@ -0,0 +1,144 @@
```python
import yaml
import json
import tempfile
import subprocess
import os
from pathlib import Path
import multiprocessing
from functools import partial

def load_template(template_path):
    """Load the template file"""
    with open(template_path, 'r', encoding='utf-8') as f:
        return f.read()

def get_ai_classification(title, link, snippet, gen_struct_path, template):
    """Ask AI to classify if the content is related"""
    # Define the JSON schema for classification
    schema = {
        "type": "object",
        "properties": {
            "is_related": {
                "type": "string",
                "enum": ["True", "False", "NotSure"],
                "description": "Whether the content is related to transgender/LGBTQ+ topics"
            }
        },
        "required": ["is_related"],
        "additionalProperties": False
    }

    # Create temporary files
    with tempfile.NamedTemporaryFile(mode='w+', delete=False, suffix='.txt') as temp_input:
        # Fill in the template
        prompt = template.format(
            title=title or "Untitled",
            link=link,
            snippet=snippet or ""
        )
        temp_input.write(prompt)
        print(f"Prompt: {prompt}")
        temp_input_path = temp_input.name

    with tempfile.NamedTemporaryFile(mode='w+', delete=False, suffix='.json') as temp_schema:
        json.dump(schema, temp_schema)
        schema_file = temp_schema.name

    with tempfile.NamedTemporaryFile(mode='w+', delete=False, suffix='.json') as temp_output:
        temp_output_path = temp_output.name

    try:
        # Run gen_struct.py
        subprocess.run([
            'python', gen_struct_path,
            temp_input_path, temp_output_path, schema_file
        ], check=True)

        # Read the result
        with open(temp_output_path, 'r', encoding='utf-8') as f:
            result = json.load(f)
        print(f"Result: {result}")
        return result["is_related"].lower()  # Convert to lowercase to match YAML
    except Exception as e:
        print(f"Error during AI classification: {e}")
        return "unknown"
    finally:
        # Cleanup temporary files
        os.unlink(temp_input_path)
        os.unlink(temp_output_path)
        os.unlink(schema_file)

def process_url(template, gen_struct_path, url_data):
    """Process a single URL (to be run in parallel)"""
    url, data = url_data
    print(f"Processing: {url}")
    result = get_ai_classification(
        data.get('title'),
        url,
        data.get('snippet'),
        gen_struct_path,
        template
    )
    if result != 'unknown':
        return url, result
    return None

def main():
    # File paths
    links_path = Path('.github/links.yml')
    template_path = Path('.github/prompts/check_related.md.template')
    gen_struct_path = Path('.github/scripts/ai/gen_struct.py')

    # Load files
    with open(links_path, 'r', encoding='utf-8') as f:
        links_data = yaml.safe_load(f)

    template = load_template(template_path)

    # Process each unknown entry
    modified = False
    batch_count = 0

    # Get items needing processing
    to_process = [(url, data) for url, data in links_data.items()
                  if not data.get('is_related') or data.get('is_related') == 'unknown']

    # Create a pool with 5 processes
    with multiprocessing.Pool(5) as pool:
        # Create a partial function with template and gen_struct_path
        process_func = partial(process_url, template, gen_struct_path)

        # Process items in chunks of 5
        for i in range(0, len(to_process), 5):
            chunk = to_process[i:i + 5]
            print(f"Processing batch {i//5 + 1}/{(len(to_process) + 4)//5}")

            # Process chunk in parallel
            results = pool.map(process_func, chunk)

            # Update results
            modified_in_batch = False
            for result in results:
                if result:
                    url, is_related = result
                    links_data[url]['is_related'] = is_related
                    modified = True
                    modified_in_batch = True
                    batch_count += 1
                    print(f"Updated {url} to {is_related}")

            # Write changes after every 6 batches
            if modified_in_batch and (i//5 + 1) % 6 == 0:
                with open(links_path, 'w', encoding='utf-8') as f:
                    yaml.dump(links_data, f, allow_unicode=True)
                    f.flush()
                print(f"Batch of {batch_count} changes saved to links.yml")
                batch_count = 0

    with open(links_path, 'w', encoding='utf-8') as f:
        yaml.dump(links_data, f, allow_unicode=True)
    if not modified:
        print("No changes were necessary")

if __name__ == "__main__":
    main()
```
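The script above fills `.github/prompts/check_related.md.template` through `template.format(title=..., link=..., snippet=...)` and expects the structured-output helper to return one of `True`, `False`, or `NotSure`. The template file is not part of this excerpt; a minimal placeholder consistent with those format fields and with the schema's description might read:

```
Decide whether the following search result is related to transgender/LGBTQ+ topics.

Title: {title}
Link: {link}
Snippet: {snippet}

Classify it as True, False, or NotSure.
```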
@@ -0,0 +1,66 @@
```python
import os
import openai
import argparse
from openai import OpenAI
from dotenv import load_dotenv

load_dotenv()
openai.api_key = os.getenv('OPENAI_API_KEY')
model_name = os.getenv('OPENAI_MODEL_NAME')
if not model_name:
    model_name = "gpt-4o"
temperature = os.getenv('OPENAI_TEMPERATURE')
if not temperature:
    temperature = 0.7
client = OpenAI()

def read_file(file_path):
    """Read the content of the input file."""
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()

def write_file(file_path, content):
    """Write the content to the output file."""
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(content)

def generate_cleanup_content(content):
    """Send the prompt and content to OpenAI's API and get the cleaned content."""
    completion = client.chat.completions.create(
        model=model_name,
        messages=[
            {"role": "user", "content": content}
        ]
    )

    return str(completion.choices[0].message.content)

def main():
    # Set up command-line argument parsing
    parser = argparse.ArgumentParser(
        description="Generate a cleaned-up version of a text file using OpenAI's GPT-4."
    )
    parser.add_argument('input_file', help='Path to the input .txt file')
    parser.add_argument('output_file', help='Path to save the cleaned output file')

    args = parser.parse_args()

    try:
        # Read input file
        input_content = read_file(args.input_file)

        # Generate cleaned content
        cleaned_content = generate_cleanup_content(input_content)

        # Write to output file
        write_file(args.output_file, cleaned_content)

        print(f"Successfully processed '{args.input_file}' and saved to '{args.output_file}'.")

    except Exception as e:
        print(f"An error occurred: {e}")

if __name__ == "__main__":
    main()
```
@@ -0,0 +1,110 @@
```python
import os
import json
import openai
import argparse
from openai import OpenAI
from dotenv import load_dotenv
import base64

load_dotenv()
openai.api_key = os.getenv('OPENAI_API_KEY')
model_name = os.getenv('OPENAI_MODEL_NAME')
if not model_name:
    model_name = "gpt-4o"
print(f"Using model: {model_name}")
temperature = os.getenv('OPENAI_TEMPERATURE')
if not temperature:
    temperature = 0.7
print(f"Using temperature: {temperature}")
client = OpenAI()

def read_file(file_path):
    """Read the content of the input file."""
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()

def write_file(file_path, content):
    """Write the content to the output file."""
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(content)

def encode_image(image_path):
    """Encode image to base64 string."""
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode('utf-8')

def generate_cleanup_content(content, schema, image_path=None):
    """Send the prompt and content to OpenAI's API and get the structured content."""
    messages = [
        {"role": "system", "content": f"You are a helpful assistant that generates structured output based on the following JSON schema: {json.dumps(schema)}"}
    ]

    # Prepare user message with optional image
    if image_path:
        base64_image = encode_image(image_path)
        messages.append({
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": content
                },
                {
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:image/jpeg;base64,{base64_image}"
                    }
                }
            ]
        })
    else:
        messages.append({"role": "user", "content": content})

    completion = client.chat.completions.create(
        model=model_name,
        messages=messages,
        response_format={
            "type": "json_schema",
            "json_schema": {
                "name": "response",
                "schema": schema,
                "strict": True
            }
        }
    )

    return json.loads(completion.choices[0].message.content)

def main():
    # Set up command-line argument parsing
    parser = argparse.ArgumentParser(
        description="Generate a structured version of a text file using OpenAI's GPT-4."
    )
    parser.add_argument('input_file', help='Path to the input .txt file')
    parser.add_argument('output_file', help='Path to save the structured output file')
    parser.add_argument('schema_file', help='Path to the JSON schema file')
    parser.add_argument('--image', help='Optional path to an image file', default=None)

    args = parser.parse_args()

    try:
        # Read input file
        input_content = read_file(args.input_file)

        # Read schema file
        schema = json.loads(read_file(args.schema_file))

        # Generate structured content with optional image
        structured_content = generate_cleanup_content(input_content, schema, args.image)

        # Write to output file
        write_file(args.output_file, json.dumps(structured_content, indent=2))

        print(f"Successfully processed '{args.input_file}' and saved structured output to '{args.output_file}'.")

    except Exception as e:
        print(f"An error occurred: {e}")

if __name__ == "__main__":
    main()
```
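The classification script above drives this file through `subprocess.run(['python', gen_struct_path, temp_input_path, temp_output_path, schema_file], check=True)`. A standalone invocation matching the argparse definition, with placeholder file names, would look roughly like:

```bash
# prompt.txt, result.json, schema.json, and page.jpg are placeholder names
python .github/scripts/ai/gen_struct.py prompt.txt result.json schema.json
python .github/scripts/ai/gen_struct.py prompt.txt result.json schema.json --image page.jpg
```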
@@ -0,0 +1,17 @@
```
news:
python .github/downloader/download/download.py --output-dir webpage_archive/raw/sina1 --download-type webpage --pattern ".*sina.*"
python .github/downloader/download/download.py --output-dir workspace_news --download-type webpage --pattern ".*news.ifeng.*"
python .github/downloader/download/download.py --output-dir webpage_archive/raw/ifeng.com1 --download-type webpage --pattern ".*ifeng.com.*"
python .github/downloader/download/download.py --output-dir workspace_news --download-type webpage --pattern ".*news.*"
python .github/downloader/download/download.py --output-dir workspace_news --download-type webpage --pattern ".*thepaper.cn.*"
python .github/downloader/download/download.py --output-dir webpage_archive/raw/thepaper.cn1 --download-type webpage --pattern ".*thepaper.cn.*"
python .github/downloader/download/download.py --output-dir webpage_archive/raw/sohu.com --download-type webpage --pattern ".*sohu.com.*"
python .github/downloader/download/download.py --output-dir webpage_archive/raw/163.com --download-type webpage --pattern ".*163.com/dy.*"
python .github/downloader/download/download.py --output-dir webpage_archive/raw/chinanews.com --download-type webpage --pattern ".*chinanews.com.*"
python .github/downloader/download/download.py --output-dir webpage_archive/raw/news.qq.com --download-type webpage --pattern ".*news.qq.com.*"
python .github/downloader/download/download.py --output-dir webpage_archive/raw/unclassify_news1 --download-type webpage --pattern ".*news.*"



pdf:
python .github/downloader/download/download.py --output-dir workspace --download-type pdf --pattern ".*pdf.*"
```
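download.py itself is not shown in this excerpt. Judging from the `--pattern` flags above, it presumably selects entries from the links file whose URL matches a regular expression before downloading them; a minimal sketch of that selection step, under that assumption, could be:

```python
import re
import yaml

# Hypothetical sketch of the URL filtering implied by --pattern;
# the real logic of download.py is not part of this excerpt.
def select_links(links_path, pattern):
    with open(links_path, 'r', encoding='utf-8') as f:
        links = yaml.safe_load(f)
    regex = re.compile(pattern)
    # Keep only entries whose URL matches, e.g. ".*sohu.com.*"
    return {url: data for url, data in links.items() if regex.match(url)}

# Example (hypothetical): select the sohu.com links before downloading
# matched = select_links('.github/links.yml', r".*sohu.com.*")
```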