PHP to Python plus tests and stuff #51

Merged
merged 5 commits on Oct 23, 2024
Changes from all commits

34 changes: 34 additions & 0 deletions .github/workflows/ai_robots_update.yml
@@ -0,0 +1,34 @@
name: Updates for AI robots files
on:
  push:
    branches:
      - "main"
  schedule:
    - cron: "0 0 * * *"

jobs:
  dark-visitors:
    runs-on: ubuntu-latest
    name: dark-visitors
    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 2
      - run: |
          pip install beautifulsoup4 requests
          git config --global user.name "dark-visitors"
          git config --global user.email "[email protected]"
Comment on lines +19 to +20

Contributor (Author) commented:
Why are these configs made here? And should they be changed for the conversion step?

Contributor commented:
I don't know (x2).

Contributor commented:
Actually, the purpose of these configs is to make the changes produced by the dark visitors website look as if they were produced by a dark-visitors user. I'm not sure this is entirely necessary. The commit message should be sufficient on its own.

          echo "Updating robots.json with data from darkvisitors.com ..."
          python code/dark_visitors.py --update
          echo "... done."
          git --no-pager diff
          git add -A
          git diff --quiet && git diff --staged --quiet || (git commit -m "Update from Dark Visitors" && git push)

          echo "Updating robots.txt and table-of-bot-metrics.md if necessary ..."
          python code/dark_visitors.py --convert
          echo "... done."
          git --no-pager diff
          git add -A
          git diff --quiet && git diff --staged --quiet || (git commit -m "Updated from new robots.json" && git push)
        shell: bash
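To make the review thread above concrete: a minimal sketch (not part of the PR) of what the two git config lines change, namely the author identity recorded on the workflow's automated commits; the --allow-empty commit and the log format string are only for illustration.

    # Hypothetical illustration of commit attribution inside the runner.
    git config --global user.name "dark-visitors"
    git config --global user.email "[email protected]"
    git commit --allow-empty -m "Update from Dark Visitors"
    git log -1 --format='%an <%ae>'   # prints: dark-visitors <[email protected]>

The git diff --quiet && git diff --staged --quiet || (git commit ... && git push) chain in the run step commits and pushes only when the working tree or the index actually changed, so the scheduled nightly run is a no-op when darkvisitors.com has nothing new.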
30 changes: 0 additions & 30 deletions .github/workflows/daily_update.yml

This file was deleted.

36 changes: 0 additions & 36 deletions .github/workflows/main.yml

This file was deleted.

5 changes: 4 additions & 1 deletion .gitignore
@@ -1 +1,4 @@
.DS_Store
.DS_Store
.venv
venv
__pycache__
33 changes: 0 additions & 33 deletions code/action.php

This file was deleted.

246 changes: 177 additions & 69 deletions code/dark_visitors.py
@@ -4,72 +4,180 @@
import requests
from bs4 import BeautifulSoup

session = requests.Session()
response = session.get("https://darkvisitors.com/agents")
soup = BeautifulSoup(response.text, "html.parser")

existing_content = json.loads(Path("./robots.json").read_text())
to_include = [
    "AI Assistants",
    "AI Data Scrapers",
    "AI Search Crawlers",
    # "Archivers",
    # "Developer Helpers",
    # "Fetchers",
    # "Intelligence Gatherers",
    # "Scrapers",
    # "Search Engine Crawlers",
    # "SEO Crawlers",
    # "Uncategorized",
    "Undocumented AI Agents"
]

for section in soup.find_all("div", {"class": "agent-links-section"}):
    category = section.find("h2").get_text()
    if category not in to_include:
        continue
    for agent in section.find_all("a", href=True):
        name = agent.find("div", {"class": "agent-name"}).get_text().strip()
        desc = agent.find("p").get_text().strip()

        default_values = {
            "Unclear at this time.",
            "No information provided.",
            "No information.",
            "No explicit frequency provided."
        }
        default_value = "Unclear at this time."

        # Parse the operator information from the description if possible
        operator = default_value
        if "operated by " in desc:
            try:
                operator = desc.split("operated by ", 1)[1].split(".", 1)[0].strip()
            except Exception as e:
                print(f"Error: {e}")

        def consolidate(field: str, value: str) -> str:
            # New entry
            if name not in existing_content:
                return value
            # New field
            if field not in existing_content[name]:
                return value
            # Unclear value
            if existing_content[name][field] in default_values and value not in default_values:
                return value
            # Existing value
            return existing_content[name][field]

        existing_content[name] = {
            "operator": consolidate("operator", operator),
            "respect": consolidate("respect", default_value),
            "function": consolidate("function", f"{category}"),
            "frequency": consolidate("frequency", default_value),
            "description": consolidate("description", f"{desc} More info can be found at https://darkvisitors.com/agents{agent['href']}")
        }

print(f"Total: {len(existing_content)}")
sorted_keys = sorted(existing_content, key=lambda k: k.lower())
existing_content = {k: existing_content[k] for k in sorted_keys}
Path("./robots.json").write_text(json.dumps(existing_content, indent=4))

def load_robots_json():
    """Load the robots.json contents into a dictionary."""
    return json.loads(Path("./robots.json").read_text(encoding="utf-8"))


def get_agent_soup():
    """Retrieve current known agents from darkvisitors.com"""
    session = requests.Session()
    try:
        response = session.get("https://darkvisitors.com/agents")
    except requests.exceptions.ConnectionError:
        print(
            "ERROR: Could not gather the current agents from https://darkvisitors.com/agents"
        )
        return
    return BeautifulSoup(response.text, "html.parser")


def updated_robots_json(soup):
    """Update AI scraper information with data from darkvisitors."""
    existing_content = load_robots_json()
    to_include = [
        "AI Assistants",
        "AI Data Scrapers",
        "AI Search Crawlers",
        # "Archivers",
        # "Developer Helpers",
        # "Fetchers",
        # "Intelligence Gatherers",
        # "Scrapers",
        # "Search Engine Crawlers",
        # "SEO Crawlers",
        # "Uncategorized",
        "Undocumented AI Agents",
    ]

    for section in soup.find_all("div", {"class": "agent-links-section"}):
        category = section.find("h2").get_text()
        if category not in to_include:
            continue
        for agent in section.find_all("a", href=True):
            name = agent.find("div", {"class": "agent-name"}).get_text().strip()
            desc = agent.find("p").get_text().strip()

            default_values = {
                "Unclear at this time.",
                "No information provided.",
                "No information.",
                "No explicit frequency provided.",
            }
            default_value = "Unclear at this time."

            # Parse the operator information from the description if possible
            operator = default_value
            if "operated by " in desc:
                try:
                    operator = desc.split("operated by ", 1)[1].split(".", 1)[0].strip()
                except Exception as e:
                    print(f"Error: {e}")

            def consolidate(field: str, value: str) -> str:
                # New entry
                if name not in existing_content:
                    return value
                # New field
                if field not in existing_content[name]:
                    return value
                # Unclear value
                if (
                    existing_content[name][field] in default_values
                    and value not in default_values
                ):
                    return value
                # Existing value
                return existing_content[name][field]

            existing_content[name] = {
                "operator": consolidate("operator", operator),
                "respect": consolidate("respect", default_value),
                "function": consolidate("function", f"{category}"),
                "frequency": consolidate("frequency", default_value),
                "description": consolidate(
                    "description",
                    f"{desc} More info can be found at https://darkvisitors.com/agents{agent['href']}",
                ),
            }

    print(f"Total: {len(existing_content)}")
    sorted_keys = sorted(existing_content, key=lambda k: k.lower())
    sorted_robots = {k: existing_content[k] for k in sorted_keys}
    return sorted_robots


def ingest_darkvisitors():
    old_robots_json = load_robots_json()
    soup = get_agent_soup()
    if soup:
        robots_json = updated_robots_json(soup)
        print(
            "robots.json is unchanged."
            if robots_json == old_robots_json
            else "robots.json got updates."
        )
        Path("./robots.json").write_text(
            json.dumps(robots_json, indent=4), encoding="utf-8"
        )


def json_to_txt(robots_json):
    """Compose the robots.txt from the robots.json file."""
    robots_txt = "\n".join(f"User-agent: {k}" for k in robots_json.keys())
    robots_txt += "\nDisallow: /\n"
    return robots_txt


def json_to_table(robots_json):
    """Compose a markdown table with the information in robots.json"""
    table = "| Name | Operator | Respects `robots.txt` | Data use | Visit regularity | Description |\n"
    table += "|-----|----------|-----------------------|----------|------------------|-------------|\n"

    for name, robot in robots_json.items():
        table += f'| {name} | {robot["operator"]} | {robot["respect"]} | {robot["function"]} | {robot["frequency"]} | {robot["description"]} |\n'

    return table


def update_file_if_changed(file_name, converter):
    """Update files if newer content is available and log the (in)actions."""
    new_content = converter(load_robots_json())
    old_content = Path(file_name).read_text(encoding="utf-8")
    if old_content == new_content:
        print(f"{file_name} is already up to date.")
    else:
        Path(file_name).write_text(new_content, encoding="utf-8")
        print(f"{file_name} has been updated.")


def conversions():
    """Triggers the conversions from the json file."""
    update_file_if_changed(file_name="./robots.txt", converter=json_to_txt)
    update_file_if_changed(
        file_name="./table-of-bot-metrics.md",
        converter=json_to_table,
    )


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(
        prog="ai-robots",
        description="Collects and updates information about web scrapers of AI companies.",
        epilog="One of the flags must be set.\n",
    )
    parser.add_argument(
        "--update",
        action="store_true",
        help="Update the robots.json file with data from darkvisitors.com/agents",
    )
    parser.add_argument(
        "--convert",
        action="store_true",
        help="Create the robots.txt and markdown table from robots.json",
    )
    args = parser.parse_args()

    if not (args.update or args.convert):
        print("ERROR: please provide one of the possible flags.")
        parser.print_help()

    if args.update:
        ingest_darkvisitors()
    if args.convert:
        conversions()
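
For reference, a usage sketch of the converted script as the workflow above invokes it; paths are relative to the repository root, and the agent names in the sample output are only illustrative.

    pip install beautifulsoup4 requests
    python code/dark_visitors.py --update    # refresh robots.json from darkvisitors.com/agents
    python code/dark_visitors.py --convert   # regenerate robots.txt and table-of-bot-metrics.md

    # json_to_txt writes one User-agent line per robots.json entry and a single
    # Disallow rule, e.g. (names illustrative):
    # User-agent: ExampleAI-Bot
    # User-agent: Another-AI-Crawler
    # Disallow: /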