Merge pull request #51 from fabianegli/php-to-python-plus-tests
PHP to Python plus tests and stuff
newbold authored Oct 23, 2024
2 parents d62cab6 + 3ab22bc commit a66b168
Showing 12 changed files with 642 additions and 210 deletions.
34 changes: 34 additions & 0 deletions .github/workflows/ai_robots_update.yml
@@ -0,0 +1,34 @@
name: Updates for AI robots files
on:
  push:
    branches:
      - "main"
  schedule:
    - cron: "0 0 * * *"

jobs:
  dark-visitors:
    runs-on: ubuntu-latest
    name: dark-visitors
    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 2
      - run: |
          pip install beautifulsoup4 requests
          git config --global user.name "dark-visitors"
          git config --global user.email "[email protected]"
          echo "Updating robots.json with data from darkvisitor.com ..."
          python code/dark_visitors.py --update
          echo "... done."
          git --no-pager diff
          git add -A
          git diff --quiet && git diff --staged --quiet || (git commit -m "Update from Dark Visitors" && git push)
          echo "Updating robots.txt and table-of-bot-metrics.md if necessary ..."
          python code/dark_visitors.py --convert
          echo "... done."
          git --no-pager diff
          git add -A
          git diff --quiet && git diff --staged --quiet || (git commit -m "Updated from new robots.json" && git push)
        shell: bash
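
The run step above only commits and pushes when the update actually changed something: "git diff --quiet && git diff --staged --quiet" succeeds only if both the working tree and the index are clean, so the commit-and-push branch after "||" runs only when there is something to record. The sketch below restates that idiom in Python purely for illustration; it is not part of this commit, the helper name commit_if_changed is hypothetical, and it assumes git is on the PATH and the current directory is the repository root.

    # Illustrative sketch only (not part of this commit): the "commit only when
    # something changed" idiom from the workflow, rewritten in Python.
    import subprocess

    def commit_if_changed(message: str) -> None:
        subprocess.run(["git", "add", "-A"], check=True)
        # "git diff --quiet" exits non-zero when the working tree has changes;
        # "git diff --staged --quiet" does the same for staged changes.
        tree_clean = subprocess.run(["git", "diff", "--quiet"]).returncode == 0
        index_clean = subprocess.run(["git", "diff", "--staged", "--quiet"]).returncode == 0
        if not (tree_clean and index_clean):
            subprocess.run(["git", "commit", "-m", message], check=True)
            subprocess.run(["git", "push"], check=True)

    commit_if_changed("Update from Dark Visitors")
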
30 changes: 0 additions & 30 deletions .github/workflows/daily_update.yml

This file was deleted.

36 changes: 0 additions & 36 deletions .github/workflows/main.yml

This file was deleted.

5 changes: 4 additions & 1 deletion .gitignore
@@ -1 +1,4 @@
.DS_Store
.DS_Store
.venv
venv
__pycache__
33 changes: 0 additions & 33 deletions code/action.php

This file was deleted.

246 changes: 177 additions & 69 deletions code/dark_visitors.py
@@ -4,72 +4,180 @@
import requests
from bs4 import BeautifulSoup

session = requests.Session()
response = session.get("https://darkvisitors.com/agents")
soup = BeautifulSoup(response.text, "html.parser")

existing_content = json.loads(Path("./robots.json").read_text())
to_include = [
    "AI Assistants",
    "AI Data Scrapers",
    "AI Search Crawlers",
    # "Archivers",
    # "Developer Helpers",
    # "Fetchers",
    # "Intelligence Gatherers",
    # "Scrapers",
    # "Search Engine Crawlers",
    # "SEO Crawlers",
    # "Uncategorized",
    "Undocumented AI Agents"
]

for section in soup.find_all("div", {"class": "agent-links-section"}):
    category = section.find("h2").get_text()
    if category not in to_include:
        continue
    for agent in section.find_all("a", href=True):
        name = agent.find("div", {"class": "agent-name"}).get_text().strip()
        desc = agent.find("p").get_text().strip()

        default_values = {
            "Unclear at this time.",
            "No information provided.",
            "No information.",
            "No explicit frequency provided."
        }
        default_value = "Unclear at this time."

        # Parse the operator information from the description if possible
        operator = default_value
        if "operated by " in desc:
            try:
                operator = desc.split("operated by ", 1)[1].split(".", 1)[0].strip()
            except Exception as e:
                print(f"Error: {e}")

        def consolidate(field: str, value: str) -> str:
            # New entry
            if name not in existing_content:
                return value
            # New field
            if field not in existing_content[name]:
                return value
            # Unclear value
            if existing_content[name][field] in default_values and value not in default_values:
                return value
            # Existing value
            return existing_content[name][field]

        existing_content[name] = {
            "operator": consolidate("operator", operator),
            "respect": consolidate("respect", default_value),
            "function": consolidate("function", f"{category}"),
            "frequency": consolidate("frequency", default_value),
            "description": consolidate("description", f"{desc} More info can be found at https://darkvisitors.com/agents{agent['href']}")
        }

print(f"Total: {len(existing_content)}")
sorted_keys = sorted(existing_content, key=lambda k: k.lower())
existing_content = {k: existing_content[k] for k in sorted_keys}
Path("./robots.json").write_text(json.dumps(existing_content, indent=4))

def load_robots_json():
    """Load the robots.json contents into a dictionary."""
    return json.loads(Path("./robots.json").read_text(encoding="utf-8"))


def get_agent_soup():
    """Retrieve current known agents from darkvisitors.com"""
    session = requests.Session()
    try:
        response = session.get("https://darkvisitors.com/agents")
    except requests.exceptions.ConnectionError:
        print(
            "ERROR: Could not gather the current agents from https://darkvisitors.com/agents"
        )
        return
    return BeautifulSoup(response.text, "html.parser")


def updated_robots_json(soup):
    """Update AI scraper information with data from darkvisitors."""
    existing_content = load_robots_json()
    to_include = [
        "AI Assistants",
        "AI Data Scrapers",
        "AI Search Crawlers",
        # "Archivers",
        # "Developer Helpers",
        # "Fetchers",
        # "Intelligence Gatherers",
        # "Scrapers",
        # "Search Engine Crawlers",
        # "SEO Crawlers",
        # "Uncategorized",
        "Undocumented AI Agents",
    ]

    for section in soup.find_all("div", {"class": "agent-links-section"}):
        category = section.find("h2").get_text()
        if category not in to_include:
            continue
        for agent in section.find_all("a", href=True):
            name = agent.find("div", {"class": "agent-name"}).get_text().strip()
            desc = agent.find("p").get_text().strip()

            default_values = {
                "Unclear at this time.",
                "No information provided.",
                "No information.",
                "No explicit frequency provided.",
            }
            default_value = "Unclear at this time."

            # Parse the operator information from the description if possible
            operator = default_value
            if "operated by " in desc:
                try:
                    operator = desc.split("operated by ", 1)[1].split(".", 1)[0].strip()
                except Exception as e:
                    print(f"Error: {e}")

            def consolidate(field: str, value: str) -> str:
                # New entry
                if name not in existing_content:
                    return value
                # New field
                if field not in existing_content[name]:
                    return value
                # Unclear value
                if (
                    existing_content[name][field] in default_values
                    and value not in default_values
                ):
                    return value
                # Existing value
                return existing_content[name][field]

            existing_content[name] = {
                "operator": consolidate("operator", operator),
                "respect": consolidate("respect", default_value),
                "function": consolidate("function", f"{category}"),
                "frequency": consolidate("frequency", default_value),
                "description": consolidate(
                    "description",
                    f"{desc} More info can be found at https://darkvisitors.com/agents{agent['href']}",
                ),
            }

    print(f"Total: {len(existing_content)}")
    sorted_keys = sorted(existing_content, key=lambda k: k.lower())
    sorted_robots = {k: existing_content[k] for k in sorted_keys}
    return sorted_robots


def ingest_darkvisitors():

    old_robots_json = load_robots_json()
    soup = get_agent_soup()
    if soup:
        robots_json = updated_robots_json(soup)
        print(
            "robots.json is unchanged."
            if robots_json == old_robots_json
            else "robots.json got updates."
        )
        Path("./robots.json").write_text(
            json.dumps(robots_json, indent=4), encoding="utf-8"
        )


def json_to_txt(robots_json):
    """Compose the robots.txt from the robots.json file."""
    robots_txt = "\n".join(f"User-agent: {k}" for k in robots_json.keys())
    robots_txt += "\nDisallow: /\n"
    return robots_txt


def json_to_table(robots_json):
    """Compose a markdown table with the information in robots.json"""
    table = "| Name | Operator | Respects `robots.txt` | Data use | Visit regularity | Description |\n"
    table += "|-----|----------|-----------------------|----------|------------------|-------------|\n"

    for name, robot in robots_json.items():
        table += f'| {name} | {robot["operator"]} | {robot["respect"]} | {robot["function"]} | {robot["frequency"]} | {robot["description"]} |\n'

    return table


def update_file_if_changed(file_name, converter):
    """Update files if newer content is available and log the (in)actions."""
    new_content = converter(load_robots_json())
    old_content = Path(file_name).read_text(encoding="utf-8")
    if old_content == new_content:
        print(f"{file_name} is already up to date.")
    else:
        Path(file_name).write_text(new_content, encoding="utf-8")
        print(f"{file_name} has been updated.")


def conversions():
    """Triggers the conversions from the json file."""
    update_file_if_changed(file_name="./robots.txt", converter=json_to_txt)
    update_file_if_changed(
        file_name="./table-of-bot-metrics.md",
        converter=json_to_table,
    )


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser = argparse.ArgumentParser(
        prog="ai-robots",
        description="Collects and updates information about web scrapers of AI companies.",
        epilog="One of the flags must be set.\n",
    )
    parser.add_argument(
        "--update",
        action="store_true",
        help="Update the robots.json file with data from darkvisitors.com/agents",
    )
    parser.add_argument(
        "--convert",
        action="store_true",
        help="Create the robots.txt and markdown table from robots.json",
    )
    args = parser.parse_args()

    if not (args.update or args.convert):
        print("ERROR: please provide one of the possible flags.")
        parser.print_help()

    if args.update:
        ingest_darkvisitors()
    if args.convert:
        conversions()
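
For readers skimming the diff, a small usage sketch of the two converters defined above follows. The ExampleBot entry and all of its field values are invented for illustration and do not come from the repository's robots.json.

    # Hypothetical input, for illustration only.
    sample = {
        "ExampleBot": {
            "operator": "Example Corp",
            "respect": "Unclear at this time.",
            "function": "AI Data Scrapers",
            "frequency": "Unclear at this time.",
            "description": "Scrapes data to train LLMs.",
        }
    }

    print(json_to_txt(sample))
    # User-agent: ExampleBot
    # Disallow: /

    print(json_to_table(sample))
    # | Name | Operator | Respects `robots.txt` | Data use | Visit regularity | Description |
    # |-----|----------|-----------------------|----------|------------------|-------------|
    # | ExampleBot | Example Corp | Unclear at this time. | AI Data Scrapers | Unclear at this time. | Scrapes data to train LLMs. |
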
