Skip to content

Commit

Permalink
Merge pull request #10 from 4ARMED/ignore-regex
Browse files Browse the repository at this point in the history
Overhaul of various areas
  • Loading branch information
marcwickenden authored Oct 5, 2023
2 parents f730ab4 + 5bc0323 commit 2d97e99
Show file tree
Hide file tree
Showing 10 changed files with 241 additions and 77 deletions.
8 changes: 5 additions & 3 deletions .github/workflows/pypi.yaml
Original file line number Diff line number Diff line change
@@ -1,18 +1,20 @@
name: Publish sri-check to PyPI
name: publish

on: push

jobs:
tests:
uses: ./.github/workflows/tests.yaml
build:
name: Build distribution
needs: [tests]
runs-on: ubuntu-latest

steps:
- uses: actions/checkout@v4
- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: "3.x"
python-version: "3.x"
- name: Install pypa/build
run: >-
python3 -m
Expand Down
21 changes: 21 additions & 0 deletions .github/workflows/tests.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# GitHub Actions workflow: run the project's unit test suite.
# Reusable only: triggered via `workflow_call` from other workflows
# (e.g. the publish pipeline, which declares `needs: [tests]`).
name: tests

on: [workflow_call]

jobs:
  test:
    name: Test
    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v4
      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          # "3.x" = latest stable CPython 3 available on the runner
          python-version: "3.x"
      - name: Install dependencies
        run: |
          python3 -m pip install --upgrade pip
          pip3 install -r requirements.txt
      - name: Run unittests
        run: python3 -m unittest discover -s tests
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,5 @@ venv
sri_check.egg-info/**
build/**
dist/**
.vscode/**
**/__pycache__/**
4 changes: 3 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,12 @@ def open_local(paths, mode="r", encoding="utf8"):
author_email="[email protected]",
description="Subresource Integrity Checker",
long_description=long_description,
long_description_content_type="text/markdown",
url="https://github.com/4armed/sri-check",
version=version,
packages=setuptools.find_packages(),
install_requires=install_requires,
python_requires=">=3.6",
entry_points={"console_scripts": ["sri-check=sricheck.sricheck:cli"]}
entry_points={"console_scripts": ["sri-check=sricheck.sricheck:cli"]},
test_suite="tests"
)
2 changes: 1 addition & 1 deletion sricheck/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "1.5.0"
# Package version string — presumably read by setup.py at build time (verify).
__version__ = "1.6.0"
198 changes: 126 additions & 72 deletions sricheck/sricheck.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,20 +4,11 @@
import base64
import hashlib
import re
import sys
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse

whitelisted_hosts = []

def is_whitelisted(netloc):
try:
whitelisted_hosts.index(netloc)
except ValueError:
return False
else:
return True

def generate_sha(remote_resource_tag):
tag = remote_resource_tag['tag']
resource_data = requests.get(remote_resource_tag['tag'][remote_resource_tag['attr']]).content
Expand All @@ -27,101 +18,163 @@ def generate_sha(remote_resource_tag):

return tag

def get_html(url="", browser=False, headers={}):
if browser:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.experimental_options["prefs"] = {
"profile.default_content_settings": {
"images": 2
class SRICheck:
def __init__(self, url):
    """Validate *url* and initialise default scan configuration.

    Raises:
        ValueError: if *url* is empty, is not http/https, or lacks a hostname.
    """
    if url == "":
        raise ValueError("URL cannot be empty")
    else:
        parsed_url = urlparse(url)
        if parsed_url.scheme not in {'http', 'https'}:
            raise ValueError("URL must be http or https")
        elif parsed_url.netloc == "":
            raise ValueError("URL must include a hostname")

    self.url = url
    self.browser = False       # set via set_browser(): fetch with headless Chrome when truthy
    self.headers = {}          # extra HTTP request headers, set via set_headers()
    self.skip_checks = False   # when True, report all script/link tags regardless of SRI

    # hosts we will ignore (in netloc format), in addition to the target URL
    # NOTE(review): these entries are regex patterns (matched with re.search
    # in is_whitelisted); the non-raw strings containing \. emit a
    # SyntaxWarning on Python 3.12+ — prefer raw strings r"...".
    # NOTE(review): this attribute shadows the whitelisted_hosts() method
    # declared later on the class, making that method unreachable on instances.
    self.whitelisted_hosts = [
        "fonts\.googleapis\.com", # does not use versioning so can't realistically use SRI
        "js\.hs-scripts\.com", # does not use versioning so can't realistically use SRI
        re.escape(urlparse(self.url).netloc)
    ]

def set_browser(self, browser):
    # Truthy => get_html() fetches the page via a headless Chrome (selenium).
    self.browser = browser

def set_headers(self, headers):
    # Extra HTTP request headers ({name: value}) sent when fetching the page.
    self.headers = headers

def add_whitelisted_host(self, pattern):
    # Add a regex pattern; resource hosts whose netloc matches it
    # (via re.search in is_whitelisted) are skipped.
    self.whitelisted_hosts.append(pattern)

def whitelisted_hosts(self):
    # NOTE(review): dead code — __init__ assigns an instance attribute of the
    # same name, which shadows this method on every instance; calling
    # instance.whitelisted_hosts() raises TypeError ('list' object is not
    # callable). Consider removing it or renaming to get_whitelisted_hosts().
    return self.whitelisted_hosts

def set_skip_checks(self, skip_checks):
    # When True, get_remote_resource_tags() returns every script/link tag
    # rather than only those missing an integrity attribute.
    self.skip_checks = skip_checks

def is_whitelisted(self, netloc):
    """Return True if *netloc* matches any whitelisted host pattern."""
    # file deepcode ignore reDOS: Intended functionality
    return any(re.search(host_pattern, netloc) for host_pattern in self.whitelisted_hosts)

def get_html(self):
if self.browser:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.experimental_options["prefs"] = {
"profile.default_content_settings": {
"images": 2
}
}
}

browser = webdriver.Chrome("chromedriver", options=chrome_options)
browser = webdriver.Chrome("chromedriver", options=chrome_options)

def interceptor(request):
request.headers.update(headers)
def interceptor(request):
request.headers.update(self.headers)

browser.request_interceptor = interceptor
browser.request_interceptor = interceptor

browser.get(url)
return browser.page_source
else:
# file deepcode ignore Ssrf: The purpose of the script is to parse remote URLs from the CLI
return requests.get(url, headers=headers).content
browser.get(self.url)
return browser.page_source
else:
# file deepcode ignore Ssrf: The purpose of the script is to parse remote URLs from the CLI
return requests.get(self.url, headers=self.headers).content


def get_remote_resource_tags(url="", browser=False, headers={}, all=False):
html = get_html(url=url, browser=browser, headers=headers)
soup = BeautifulSoup(html, 'lxml')
def get_remote_resource_tags(self, html):
soup = BeautifulSoup(html, 'lxml')

resource_tags = []
remote_resource_tags = []
resource_tags = []
remote_resource_tags = []

if all:
script_tags = [tag for tag in soup.find_all(['script'], attrs={'src':True})]
link_tags = [tag for tag in soup.find_all(['link'], attrs={'href':True})]
resource_tags.extend(script_tags)
resource_tags.extend(link_tags)
else:
script_tags = [tag for tag in soup.find_all(['script'], attrs={'src':True, 'integrity':None})]
link_tags = [tag for tag in soup.find_all(['link'], attrs={'href':True, 'integrity':None})]
resource_tags.extend(script_tags)
resource_tags.extend(link_tags)
if self.skip_checks is True:
script_tags = [tag for tag in soup.find_all(['script'], attrs={'src':True})]
link_tags = [tag for tag in soup.find_all(['link'], attrs={'href':True})]
resource_tags.extend(script_tags)
resource_tags.extend(link_tags)
else:
script_tags = [tag for tag in soup.find_all(['script'], attrs={'src':True, 'integrity':None})]
link_tags = [tag for tag in soup.find_all(['link'], attrs={'href':True, 'integrity':None})]
resource_tags.extend(script_tags)
resource_tags.extend(link_tags)

if len(resource_tags) > 0:
parsed_source_url = urlparse(self.url)

for resource_tag in resource_tags:
attribute = ""
for potential_attribute in ['src', 'href']:
if potential_attribute in resource_tag.attrs:
attribute = potential_attribute

if len(resource_tags) > 0:
parsed_source_url = urlparse(url)
if re.search('^//', resource_tag[attribute]):
resource_tag[attribute] = parsed_source_url.scheme + ':' + resource_tag[attribute]

for resource_tag in resource_tags:
attribute = ""
for potential_attribute in ['src', 'href']:
if potential_attribute in resource_tag.attrs:
attribute = potential_attribute
parsed_tag = urlparse(resource_tag[attribute])
if parsed_tag.scheme in {'http', 'https'}:
if self.is_whitelisted(parsed_tag.netloc) is False:
remote_resource_tags.append({'tag': resource_tag, 'attr': attribute})

if re.search('^//', resource_tag[attribute]):
resource_tag[attribute] = parsed_source_url.scheme + ':' + resource_tag[attribute]
return remote_resource_tags

parsed_tag = urlparse(resource_tag[attribute])
if parsed_tag.scheme in {'http', 'https'}:
if is_whitelisted(parsed_tag.netloc) is False:
remote_resource_tags.append({'tag': resource_tag, 'attr': attribute})
def run(self):
html = self.get_html()
remote_resource_tags = self.get_remote_resource_tags(html)

return remote_resource_tags
return remote_resource_tags

def cli():
parser = argparse.ArgumentParser()
parser.add_argument("-g", "--generate", help="Generate sha384 hashes for resources", action="store_true")
parser.add_argument("-a", "--all", help="Output detected script/link tags regardless of SRI status", action="store_true")
parser.add_argument("-b", "--browser", help="Use headless browser to retrieve page and run client side rendering", action="store_true")
parser.add_argument("-H", "--header", help="HTTP header value to send with the request. Specify multiple times if needed", action="append")
parser.add_argument("-i", "--ignore", help="Ignore a host (in netloc format - e.g. www.4armed.com) when checking for SRI. Specify multiple times if needed", action="append")
parser.add_argument("-i", "--ignore", help="host to ignore when checking for SRI. e.g. cdn.4armed.com. Specify multiple times if needed", action="append")
parser.add_argument("-I", "--ignore-regex", help="regex host to ignore when checking for SRI. e.g. .*\.4armed\.com. Specify multiple times if needed", action="append")
parser.add_argument("-q", "--quiet", help="Suppress output if all tags have SRI", action="store_true")
parser.add_argument("url", help="Target URL to check for SRI")
args = parser.parse_args()

try:
s = SRICheck(url=args.url)
except ValueError as error:
print(f"[-] {error}")
sys.exit(1)

headers = {}
if args.header:
for header in args.header:
k, v = header.split(": ")
headers[k] = v

# hosts we will ignore (in netloc format), in addition to the target URL
global whitelisted_hosts
whitelisted_hosts = [
"fonts.googleapis.com", # does not use versioning so can't realistically use SRI
"js.hs-scripts.com", # does not use versioning so can't realistically use SRI
urlparse(args.url).netloc
]
if len(headers) > 0:
s.set_headers(headers)

s.set_browser(args.browser)

if args.ignore:
for host in args.ignore:
whitelisted_hosts.append(host)
s.add_whitelisted_host(re.escape(host))

if args.ignore_regex:
for pattern in args.ignore_regex:
s.add_whitelisted_host(pattern)

remote_resource_tags = get_remote_resource_tags(url=args.url, browser=args.browser, headers=headers, all=args.all)
s.set_skip_checks(args.all)
remote_resource_tags = s.run()

if len(remote_resource_tags) > 0:
for remote_resource_tag in remote_resource_tags:
Expand All @@ -130,7 +183,8 @@ def cli():
else:
print(remote_resource_tag['tag'])
else:
print("[*] No resource tags found without integrity attribute")
if args.quiet is False:
print("[*] No resource tags found without integrity attribute")

if __name__== "__main__":
cli()
Empty file added tests/unit/__init__.py
Empty file.
24 changes: 24 additions & 0 deletions tests/unit/test_init.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
import unittest

from sricheck.sricheck import SRICheck

class TestInit(unittest.TestCase):
    """Unit tests for SRICheck construction and URL validation."""

    def test_init_with_url(self):
        # A valid http(s) URL is stored verbatim on the instance.
        check = SRICheck("https://www.4armed.com")
        self.assertEqual(check.url, "https://www.4armed.com")

    def test_init_without_args(self):
        with self.assertRaises(TypeError) as error:
            SRICheck()
        # The message prefix ("SRICheck.__init__() ..." vs "__init__() ...")
        # varies across Python versions (qualified name added in 3.10), so
        # match only the stable part of the message.
        self.assertIn("missing 1 required positional argument: 'url'",
                      str(error.exception))

    def test_init_with_empty_url(self):
        with self.assertRaises(ValueError) as error:
            SRICheck("")
        self.assertEqual(str(error.exception), "URL cannot be empty")

    def test_init_with_invalid_url(self):
        # Non-http(s) schemes are rejected.
        with self.assertRaises(ValueError) as error:
            SRICheck("ftp://www.4armed.com")
        self.assertEqual(str(error.exception), "URL must be http or https")
24 changes: 24 additions & 0 deletions tests/unit/test_parsing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
import unittest

from sricheck.sricheck import SRICheck

class TestParsing(unittest.TestCase):
    """HTML-parsing behaviour of SRICheck.get_remote_resource_tags()."""

    @staticmethod
    def _tags_for(html):
        # Run the parser for a fixed target site against the given markup.
        return SRICheck("https://www.4armed.com").get_remote_resource_tags(html)

    def test_script_tag_on_third_party_with_no_sri_returns_result(self):
        # Third-party script with no integrity attribute must be reported.
        tags = self._tags_for(
            '<html><head><script src="https://cdn.cloudflare.com/script.js"></script></head></html>'
        )
        self.assertEqual(len(tags), 1)
        self.assertEqual(tags[0]['tag']['src'], "https://cdn.cloudflare.com/script.js")

    def test_script_tag_on_own_host_with_no_sri_returns_no_results(self):
        # Same-host scripts are whitelisted automatically.
        tags = self._tags_for(
            '<html><head><script src="https://www.4armed.com/script.js"></script></head></html>'
        )
        self.assertEqual(len(tags), 0)

    def test_script_tag_on_third_party_with_sri_returns_no_results(self):
        # A tag that already carries integrity= is not flagged.
        tags = self._tags_for(
            '<html><head><script crossorigin="anonymous" integrity="sha384-qkIfm9UUNrOzzGFh3YtL/KOHBwDNjW00Iwd0LK/DAsdmiOWRUfXBRl/s1Rtn9h8/" src="https://cdn.cloudflare.com/script.js"></script></head></html>'
        )
        self.assertEqual(len(tags), 0)
Loading

0 comments on commit 2d97e99

Please sign in to comment.