Skip to content

Commit

Permalink
Merge pull request #10 from 4ARMED/ignore-regex
Browse files Browse the repository at this point in the history
Overhaul of various areas
  • Loading branch information
marcwickenden authored Oct 5, 2023
2 parents f730ab4 + 5bc0323 commit 2d97e99
Show file tree
Hide file tree
Showing 10 changed files with 241 additions and 77 deletions.
8 changes: 5 additions & 3 deletions .github/workflows/pypi.yaml
Original file line number Diff line number Diff line change
@@ -1,18 +1,20 @@
name: Publish sri-check to PyPI
name: publish

on: push

jobs:
tests:
uses: ./.github/workflows/tests.yaml
build:
name: Build distribution
needs: [tests]
runs-on: ubuntu-latest

steps:
- uses: actions/checkout@v4
- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: "3.x"
python-version: "3.x"
- name: Install pypa/build
run: >-
python3 -m
Expand Down
21 changes: 21 additions & 0 deletions .github/workflows/tests.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# GitHub Actions workflow: run the project's unit test suite.
# Reusable only: triggered via `workflow_call` from other workflows
# (e.g. the publish pipeline, which declares `needs: [tests]`).
name: tests

on: [workflow_call]

jobs:
  test:
    name: Test
    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v4
      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          # "3.x" = latest stable CPython 3 available on the runner
          python-version: "3.x"
      - name: Install dependencies
        run: |
          python3 -m pip install --upgrade pip
          pip3 install -r requirements.txt
      - name: Run unittests
        run: python3 -m unittest discover -s tests
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,5 @@ venv
sri_check.egg-info/**
build/**
dist/**
.vscode/**
**/__pycache__/**
4 changes: 3 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,12 @@ def open_local(paths, mode="r", encoding="utf8"):
author_email="[email protected]",
description="Subresource Integrity Checker",
long_description=long_description,
long_description_content_type="text/markdown",
url="https://github.com/4armed/sri-check",
version=version,
packages=setuptools.find_packages(),
install_requires=install_requires,
python_requires=">=3.6",
entry_points={"console_scripts": ["sri-check=sricheck.sricheck:cli"]}
entry_points={"console_scripts": ["sri-check=sricheck.sricheck:cli"]},
test_suite="tests"
)
2 changes: 1 addition & 1 deletion sricheck/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "1.5.0"
# Package version string — presumably read by setup.py at build time (verify).
__version__ = "1.6.0"
198 changes: 126 additions & 72 deletions sricheck/sricheck.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,20 +4,11 @@
import base64
import hashlib
import re
import sys
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse

whitelisted_hosts = []

def is_whitelisted(netloc):
try:
whitelisted_hosts.index(netloc)
except ValueError:
return False
else:
return True

def generate_sha(remote_resource_tag):
tag = remote_resource_tag['tag']
resource_data = requests.get(remote_resource_tag['tag'][remote_resource_tag['attr']]).content
Expand All @@ -27,101 +18,163 @@ def generate_sha(remote_resource_tag):

return tag

def get_html(url="", browser=False, headers={}):
if browser:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.experimental_options["prefs"] = {
"profile.default_content_settings": {
"images": 2
class SRICheck:
def __init__(self, url):
    """Validate *url* and initialise default scan configuration.

    Raises:
        ValueError: if *url* is empty, is not http/https, or lacks a hostname.
    """
    if url == "":
        raise ValueError("URL cannot be empty")
    else:
        parsed_url = urlparse(url)
        if parsed_url.scheme not in {'http', 'https'}:
            raise ValueError("URL must be http or https")
        elif parsed_url.netloc == "":
            raise ValueError("URL must include a hostname")

    self.url = url
    self.browser = False       # set via set_browser(): fetch with headless Chrome when truthy
    self.headers = {}          # extra HTTP request headers, set via set_headers()
    self.skip_checks = False   # when True, report all script/link tags regardless of SRI

    # hosts we will ignore (in netloc format), in addition to the target URL
    # NOTE(review): these entries are regex patterns (matched with re.search
    # in is_whitelisted); the non-raw strings containing \. emit a
    # SyntaxWarning on Python 3.12+ — prefer raw strings r"...".
    # NOTE(review): this attribute shadows the whitelisted_hosts() method
    # declared later on the class, making that method unreachable on instances.
    self.whitelisted_hosts = [
        "fonts\.googleapis\.com", # does not use versioning so can't realistically use SRI
        "js\.hs-scripts\.com", # does not use versioning so can't realistically use SRI
        re.escape(urlparse(self.url).netloc)
    ]

def set_browser(self, browser):
    # Truthy => get_html() fetches the page via a headless Chrome (selenium).
    self.browser = browser

def set_headers(self, headers):
    # Extra HTTP request headers ({name: value}) sent when fetching the page.
    self.headers = headers

def add_whitelisted_host(self, pattern):
    # Add a regex pattern; resource hosts whose netloc matches it
    # (via re.search in is_whitelisted) are skipped.
    self.whitelisted_hosts.append(pattern)

def whitelisted_hosts(self):
    # NOTE(review): dead code — __init__ assigns an instance attribute of the
    # same name, which shadows this method on every instance; calling
    # instance.whitelisted_hosts() raises TypeError ('list' object is not
    # callable). Consider removing it or renaming to get_whitelisted_hosts().
    return self.whitelisted_hosts

def set_skip_checks(self, skip_checks):
    # When True, get_remote_resource_tags() returns every script/link tag
    # rather than only those missing an integrity attribute.
    self.skip_checks = skip_checks

def is_whitelisted(self, netloc):
    """Return True if *netloc* matches any whitelisted host pattern."""
    # file deepcode ignore reDOS: Intended functionality
    return any(re.search(host_pattern, netloc) for host_pattern in self.whitelisted_hosts)

def get_html(self):
if self.browser:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.experimental_options["prefs"] = {
"profile.default_content_settings": {
"images": 2
}
}
}

browser = webdriver.Chrome("chromedriver", options=chrome_options)
browser = webdriver.Chrome("chromedriver", options=chrome_options)

def interceptor(request):
request.headers.update(headers)
def interceptor(request):
request.headers.update(self.headers)

browser.request_interceptor = interceptor
browser.request_interceptor = interceptor

browser.get(url)
return browser.page_source
else:
# file deepcode ignore Ssrf: The purpose of the script is to parse remote URLs from the CLI
return requests.get(url, headers=headers).content
browser.get(self.url)
return browser.page_source
else:
# file deepcode ignore Ssrf: The purpose of the script is to parse remote URLs from the CLI
return requests.get(self.url, headers=self.headers).content


def get_remote_resource_tags(url="", browser=False, headers={}, all=False):
html = get_html(url=url, browser=browser, headers=headers)
soup = BeautifulSoup(html, 'lxml')
def get_remote_resource_tags(self, html):
soup = BeautifulSoup(html, 'lxml')

resource_tags = []
remote_resource_tags = []
resource_tags = []
remote_resource_tags = []

if all:
script_tags = [tag for tag in soup.find_all(['script'], attrs={'src':True})]
link_tags = [tag for tag in soup.find_all(['link'], attrs={'href':True})]
resource_tags.extend(script_tags)
resource_tags.extend(link_tags)
else:
script_tags = [tag for tag in soup.find_all(['script'], attrs={'src':True, 'integrity':None})]
link_tags = [tag for tag in soup.find_all(['link'], attrs={'href':True, 'integrity':None})]
resource_tags.extend(script_tags)
resource_tags.extend(link_tags)
if self.skip_checks is True:
script_tags = [tag for tag in soup.find_all(['script'], attrs={'src':True})]
link_tags = [tag for tag in soup.find_all(['link'], attrs={'href':True})]
resource_tags.extend(script_tags)
resource_tags.extend(link_tags)
else:
script_tags = [tag for tag in soup.find_all(['script'], attrs={'src':True, 'integrity':None})]
link_tags = [tag for tag in soup.find_all(['link'], attrs={'href':True, 'integrity':None})]
resource_tags.extend(script_tags)
resource_tags.extend(link_tags)

if len(resource_tags) > 0:
parsed_source_url = urlparse(self.url)

for resource_tag in resource_tags:
attribute = ""
for potential_attribute in ['src', 'href']:
if potential_attribute in resource_tag.attrs:
attribute = potential_attribute

if len(resource_tags) > 0:
parsed_source_url = urlparse(url)
if re.search('^//', resource_tag[attribute]):
resource_tag[attribute] = parsed_source_url.scheme + ':' + resource_tag[attribute]

for resource_tag in resource_tags:
attribute = ""
for potential_attribute in ['src', 'href']:
if potential_attribute in resource_tag.attrs:
attribute = potential_attribute
parsed_tag = urlparse(resource_tag[attribute])
if parsed_tag.scheme in {'http', 'https'}:
if self.is_whitelisted(parsed_tag.netloc) is False:
remote_resource_tags.append({'tag': resource_tag, 'attr': attribute})

if re.search('^//', resource_tag[attribute]):
resource_tag[attribute] = parsed_source_url.scheme + ':' + resource_tag[attribute]
return remote_resource_tags

parsed_tag = urlparse(resource_tag[attribute])
if parsed_tag.scheme in {'http', 'https'}:
if is_whitelisted(parsed_tag.netloc) is False:
remote_resource_tags.append({'tag': resource_tag, 'attr': attribute})
def run(self):
html = self.get_html()
remote_resource_tags = self.get_remote_resource_tags(html)

return remote_resource_tags
return remote_resource_tags

def cli():
parser = argparse.ArgumentParser()
parser.add_argument("-g", "--generate", help="Generate sha384 hashes for resources", action="store_true")
parser.add_argument("-a", "--all", help="Output detected script/link tags regardless of SRI status", action="store_true")
parser.add_argument("-b", "--browser", help="Use headless browser to retrieve page and run client side rendering", action="store_true")
parser.add_argument("-H", "--header", help="HTTP header value to send with the request. Specify multiple times if needed", action="append")
parser.add_argument("-i", "--ignore", help="Ignore a host (in netloc format - e.g. www.4armed.com) when checking for SRI. Specify multiple times if needed", action="append")
parser.add_argument("-i", "--ignore", help="host to ignore when checking for SRI. e.g. cdn.4armed.com. Specify multiple times if needed", action="append")
parser.add_argument("-I", "--ignore-regex", help="regex host to ignore when checking for SRI. e.g. .*\.4armed\.com. Specify multiple times if needed", action="append")
parser.add_argument("-q", "--quiet", help="Suppress output if all tags have SRI", action="store_true")
parser.add_argument("url", help="Target URL to check for SRI")
args = parser.parse_args()

try:
s = SRICheck(url=args.url)
except ValueError as error:
print(f"[-] {error}")
sys.exit(1)

headers = {}
if args.header:
for header in args.header:
k, v = header.split(": ")
headers[k] = v

# hosts we will ignore (in netloc format), in addition to the target URL
global whitelisted_hosts
whitelisted_hosts = [
"fonts.googleapis.com", # does not use versioning so can't realistically use SRI
"js.hs-scripts.com", # does not use versioning so can't realistically use SRI
urlparse(args.url).netloc
]
if len(headers) > 0:
s.set_headers(headers)

s.set_browser(args.browser)

if args.ignore:
for host in args.ignore:
whitelisted_hosts.append(host)
s.add_whitelisted_host(re.escape(host))

if args.ignore_regex:
for pattern in args.ignore_regex:
s.add_whitelisted_host(pattern)

remote_resource_tags = get_remote_resource_tags(url=args.url, browser=args.browser, headers=headers, all=args.all)
s.set_skip_checks(args.all)
remote_resource_tags = s.run()

if len(remote_resource_tags) > 0:
for remote_resource_tag in remote_resource_tags:
Expand All @@ -130,7 +183,8 @@ def cli():
else:
print(remote_resource_tag['tag'])
else:
print("[*] No resource tags found without integrity attribute")
if args.quiet is False:
print("[*] No resource tags found without integrity attribute")

if __name__== "__main__":
cli()
Empty file added tests/unit/__init__.py
Empty file.
24 changes: 24 additions & 0 deletions tests/unit/test_init.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
import unittest

from sricheck.sricheck import SRICheck

class TestInit(unittest.TestCase):
    """Unit tests for SRICheck construction and URL validation."""

    def test_init_with_url(self):
        # A valid http(s) URL is stored verbatim on the instance.
        check = SRICheck("https://www.4armed.com")
        self.assertEqual(check.url, "https://www.4armed.com")

    def test_init_without_args(self):
        with self.assertRaises(TypeError) as error:
            SRICheck()
        # The message prefix ("SRICheck.__init__() ..." vs "__init__() ...")
        # varies across Python versions (qualified name added in 3.10), so
        # match only the stable part of the message.
        self.assertIn("missing 1 required positional argument: 'url'",
                      str(error.exception))

    def test_init_with_empty_url(self):
        with self.assertRaises(ValueError) as error:
            SRICheck("")
        self.assertEqual(str(error.exception), "URL cannot be empty")

    def test_init_with_invalid_url(self):
        # Non-http(s) schemes are rejected.
        with self.assertRaises(ValueError) as error:
            SRICheck("ftp://www.4armed.com")
        self.assertEqual(str(error.exception), "URL must be http or https")
24 changes: 24 additions & 0 deletions tests/unit/test_parsing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
import unittest

from sricheck.sricheck import SRICheck

class TestParsing(unittest.TestCase):
    """HTML-parsing behaviour of SRICheck.get_remote_resource_tags()."""

    @staticmethod
    def _tags_for(html):
        # Run the parser for a fixed target site against the given markup.
        return SRICheck("https://www.4armed.com").get_remote_resource_tags(html)

    def test_script_tag_on_third_party_with_no_sri_returns_result(self):
        # Third-party script with no integrity attribute must be reported.
        tags = self._tags_for(
            '<html><head><script src="https://cdn.cloudflare.com/script.js"></script></head></html>'
        )
        self.assertEqual(len(tags), 1)
        self.assertEqual(tags[0]['tag']['src'], "https://cdn.cloudflare.com/script.js")

    def test_script_tag_on_own_host_with_no_sri_returns_no_results(self):
        # Same-host scripts are whitelisted automatically.
        tags = self._tags_for(
            '<html><head><script src="https://www.4armed.com/script.js"></script></head></html>'
        )
        self.assertEqual(len(tags), 0)

    def test_script_tag_on_third_party_with_sri_returns_no_results(self):
        # A tag that already carries integrity= is not flagged.
        tags = self._tags_for(
            '<html><head><script crossorigin="anonymous" integrity="sha384-qkIfm9UUNrOzzGFh3YtL/KOHBwDNjW00Iwd0LK/DAsdmiOWRUfXBRl/s1Rtn9h8/" src="https://cdn.cloudflare.com/script.js"></script></head></html>'
        )
        self.assertEqual(len(tags), 0)
Loading

0 comments on commit 2d97e99

Please sign in to comment.