PHP to Python plus tests and stuff #51

Merged
merged 5 commits on Oct 23, 2024
Changes from all commits

34 changes: 34 additions & 0 deletions .github/workflows/ai_robots_update.yml
@@ -0,0 +1,34 @@
name: Updates for AI robots files
on:
  push:
    branches:
      - "main"
  schedule:
    - cron: "0 0 * * *"

jobs:
  dark-visitors:
    runs-on: ubuntu-latest
    name: dark-visitors
    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 2
      - run: |
          pip install beautifulsoup4 requests
          git config --global user.name "dark-visitors"
          git config --global user.email "[email protected]"
Comment on lines +19 to +20

Contributor (Author) commented:
Why are these configs made here? And should they be changed for the conversion step?

Contributor commented:
I don't know (x2).

Contributor commented:
Actually, the purpose of these configs is to make the changes produced by the dark visitors website look as if they were produced by a dark-visitors user. I'm not sure this is entirely necessary. The commit message should be sufficient on its own.

          echo "Updating robots.json with data from darkvisitors.com ..."
          python code/dark_visitors.py --update
          echo "... done."
          git --no-pager diff
          git add -A
          git diff --quiet && git diff --staged --quiet || (git commit -m "Update from Dark Visitors" && git push)

          echo "Updating robots.txt and table-of-bot-metrics.md if necessary ..."
          python code/dark_visitors.py --convert
          echo "... done."
          git --no-pager diff
          git add -A
          git diff --quiet && git diff --staged --quiet || (git commit -m "Updated from new robots.json" && git push)
        shell: bash
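To make the review thread above concrete: a minimal sketch (not part of the PR) of what the two git config lines change, namely the author identity recorded on the workflow's automated commits; the --allow-empty commit and the log format string are only for illustration.

    # Hypothetical illustration of commit attribution inside the runner.
    git config --global user.name "dark-visitors"
    git config --global user.email "[email protected]"
    git commit --allow-empty -m "Update from Dark Visitors"
    git log -1 --format='%an <%ae>'   # prints: dark-visitors <[email protected]>

The git diff --quiet && git diff --staged --quiet || (git commit ... && git push) chain in the run step commits and pushes only when the working tree or the index actually changed, so the scheduled nightly run is a no-op when darkvisitors.com has nothing new.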
30 changes: 0 additions & 30 deletions .github/workflows/daily_update.yml

This file was deleted.

36 changes: 0 additions & 36 deletions .github/workflows/main.yml

This file was deleted.

5 changes: 4 additions & 1 deletion .gitignore
@@ -1 +1,4 @@
.DS_Store
.DS_Store
.venv
venv
__pycache__
33 changes: 0 additions & 33 deletions code/action.php

This file was deleted.

246 changes: 177 additions & 69 deletions code/dark_visitors.py
@@ -4,72 +4,180 @@
import requests
from bs4 import BeautifulSoup

session = requests.Session()
response = session.get("https://darkvisitors.com/agents")
soup = BeautifulSoup(response.text, "html.parser")

existing_content = json.loads(Path("./robots.json").read_text())
to_include = [
    "AI Assistants",
    "AI Data Scrapers",
    "AI Search Crawlers",
    # "Archivers",
    # "Developer Helpers",
    # "Fetchers",
    # "Intelligence Gatherers",
    # "Scrapers",
    # "Search Engine Crawlers",
    # "SEO Crawlers",
    # "Uncategorized",
    "Undocumented AI Agents"
]

for section in soup.find_all("div", {"class": "agent-links-section"}):
    category = section.find("h2").get_text()
    if category not in to_include:
        continue
    for agent in section.find_all("a", href=True):
        name = agent.find("div", {"class": "agent-name"}).get_text().strip()
        desc = agent.find("p").get_text().strip()

        default_values = {
            "Unclear at this time.",
            "No information provided.",
            "No information.",
            "No explicit frequency provided."
        }
        default_value = "Unclear at this time."

        # Parse the operator information from the description if possible
        operator = default_value
        if "operated by " in desc:
            try:
                operator = desc.split("operated by ", 1)[1].split(".", 1)[0].strip()
            except Exception as e:
                print(f"Error: {e}")

        def consolidate(field: str, value: str) -> str:
            # New entry
            if name not in existing_content:
                return value
            # New field
            if field not in existing_content[name]:
                return value
            # Unclear value
            if existing_content[name][field] in default_values and value not in default_values:
                return value
            # Existing value
            return existing_content[name][field]

        existing_content[name] = {
            "operator": consolidate("operator", operator),
            "respect": consolidate("respect", default_value),
            "function": consolidate("function", f"{category}"),
            "frequency": consolidate("frequency", default_value),
            "description": consolidate("description", f"{desc} More info can be found at https://darkvisitors.com/agents{agent['href']}")
        }

print(f"Total: {len(existing_content)}")
sorted_keys = sorted(existing_content, key=lambda k: k.lower())
existing_content = {k: existing_content[k] for k in sorted_keys}
Path("./robots.json").write_text(json.dumps(existing_content, indent=4))

def load_robots_json():
    """Load the robots.json contents into a dictionary."""
    return json.loads(Path("./robots.json").read_text(encoding="utf-8"))


def get_agent_soup():
    """Retrieve current known agents from darkvisitors.com"""
    session = requests.Session()
    try:
        response = session.get("https://darkvisitors.com/agents")
    except requests.exceptions.ConnectionError:
        print(
            "ERROR: Could not gather the current agents from https://darkvisitors.com/agents"
        )
        return
    return BeautifulSoup(response.text, "html.parser")


def updated_robots_json(soup):
    """Update AI scraper information with data from darkvisitors."""
    existing_content = load_robots_json()
    to_include = [
        "AI Assistants",
        "AI Data Scrapers",
        "AI Search Crawlers",
        # "Archivers",
        # "Developer Helpers",
        # "Fetchers",
        # "Intelligence Gatherers",
        # "Scrapers",
        # "Search Engine Crawlers",
        # "SEO Crawlers",
        # "Uncategorized",
        "Undocumented AI Agents",
    ]

    for section in soup.find_all("div", {"class": "agent-links-section"}):
        category = section.find("h2").get_text()
        if category not in to_include:
            continue
        for agent in section.find_all("a", href=True):
            name = agent.find("div", {"class": "agent-name"}).get_text().strip()
            desc = agent.find("p").get_text().strip()

            default_values = {
                "Unclear at this time.",
                "No information provided.",
                "No information.",
                "No explicit frequency provided.",
            }
            default_value = "Unclear at this time."

            # Parse the operator information from the description if possible
            operator = default_value
            if "operated by " in desc:
                try:
                    operator = desc.split("operated by ", 1)[1].split(".", 1)[0].strip()
                except Exception as e:
                    print(f"Error: {e}")

            def consolidate(field: str, value: str) -> str:
                # New entry
                if name not in existing_content:
                    return value
                # New field
                if field not in existing_content[name]:
                    return value
                # Unclear value
                if (
                    existing_content[name][field] in default_values
                    and value not in default_values
                ):
                    return value
                # Existing value
                return existing_content[name][field]

            existing_content[name] = {
                "operator": consolidate("operator", operator),
                "respect": consolidate("respect", default_value),
                "function": consolidate("function", f"{category}"),
                "frequency": consolidate("frequency", default_value),
                "description": consolidate(
                    "description",
                    f"{desc} More info can be found at https://darkvisitors.com/agents{agent['href']}",
                ),
            }

    print(f"Total: {len(existing_content)}")
    sorted_keys = sorted(existing_content, key=lambda k: k.lower())
    sorted_robots = {k: existing_content[k] for k in sorted_keys}
    return sorted_robots


def ingest_darkvisitors():
    old_robots_json = load_robots_json()
    soup = get_agent_soup()
    if soup:
        robots_json = updated_robots_json(soup)
        print(
            "robots.json is unchanged."
            if robots_json == old_robots_json
            else "robots.json got updates."
        )
        Path("./robots.json").write_text(
            json.dumps(robots_json, indent=4), encoding="utf-8"
        )


def json_to_txt(robots_json):
    """Compose the robots.txt from the robots.json file."""
    robots_txt = "\n".join(f"User-agent: {k}" for k in robots_json.keys())
    robots_txt += "\nDisallow: /\n"
    return robots_txt


def json_to_table(robots_json):
    """Compose a markdown table with the information in robots.json"""
    table = "| Name | Operator | Respects `robots.txt` | Data use | Visit regularity | Description |\n"
    table += "|-----|----------|-----------------------|----------|------------------|-------------|\n"

    for name, robot in robots_json.items():
        table += f'| {name} | {robot["operator"]} | {robot["respect"]} | {robot["function"]} | {robot["frequency"]} | {robot["description"]} |\n'

    return table


def update_file_if_changed(file_name, converter):
    """Update files if newer content is available and log the (in)actions."""
    new_content = converter(load_robots_json())
    old_content = Path(file_name).read_text(encoding="utf-8")
    if old_content == new_content:
        print(f"{file_name} is already up to date.")
    else:
        Path(file_name).write_text(new_content, encoding="utf-8")
        print(f"{file_name} has been updated.")


def conversions():
    """Triggers the conversions from the json file."""
    update_file_if_changed(file_name="./robots.txt", converter=json_to_txt)
    update_file_if_changed(
        file_name="./table-of-bot-metrics.md",
        converter=json_to_table,
    )


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(
        prog="ai-robots",
        description="Collects and updates information about web scrapers of AI companies.",
        epilog="One of the flags must be set.\n",
    )
    parser.add_argument(
        "--update",
        action="store_true",
        help="Update the robots.json file with data from darkvisitors.com/agents",
    )
    parser.add_argument(
        "--convert",
        action="store_true",
        help="Create the robots.txt and markdown table from robots.json",
    )
    args = parser.parse_args()

    if not (args.update or args.convert):
        print("ERROR: please provide one of the possible flags.")
        parser.print_help()

    if args.update:
        ingest_darkvisitors()
    if args.convert:
        conversions()
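
For reference, a usage sketch of the converted script as the workflow above invokes it; paths are relative to the repository root, and the agent names in the sample output are only illustrative.

    pip install beautifulsoup4 requests
    python code/dark_visitors.py --update    # refresh robots.json from darkvisitors.com/agents
    python code/dark_visitors.py --convert   # regenerate robots.txt and table-of-bot-metrics.md

    # json_to_txt writes one User-agent line per robots.json entry and a single
    # Disallow rule, e.g. (names illustrative):
    # User-agent: ExampleAI-Bot
    # User-agent: Another-AI-Crawler
    # Disallow: /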