Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Entropy scanning fixups #319

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,13 @@
v3.0.x - TBD
------------

Bug fixes:

* [315](https://github.com/godaddy/tartufo/issues/315) - Rework entropy scanning
to reduce number of new issues generated for text that passed 2.x scans without
issues. Some corner cases may remain that will require exclusions to silence.
See [319](https://github.com/godaddy/tartufo/pull/319) for details.

v3.0.0 - 5 January 2022
-----------------------

Expand Down
38 changes: 27 additions & 11 deletions tartufo/scanner.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,9 @@
Scope,
)

BASE64_REGEX = re.compile(r"[A-Z0-9=+/_-]+", re.IGNORECASE)
HEX_REGEX = re.compile(r"[0-9A-F]+", re.IGNORECASE)
BASE64_REGEX = re.compile(r"[A-Z0-9=+/]{20,}", re.IGNORECASE)
BASE64URL_REGEX = re.compile(r"[A-Z0-9=_-]{20,}", re.IGNORECASE)
HEX_REGEX = re.compile(r"[0-9A-F]{20,}", re.IGNORECASE)


class Issue:
Expand Down Expand Up @@ -561,38 +562,53 @@ def scan_entropy(
"""

for line in chunk.contents.split("\n"):
for word in line.split():
for string in util.find_strings_by_regex(word, BASE64_REGEX):
yield from self.evaluate_entropy_string(
# Using a set eliminates exact duplicates, which are common
base64_strings = set(util.find_strings_by_regex(line, BASE64_REGEX))
base64_strings.update(util.find_strings_by_regex(line, BASE64URL_REGEX))
# Don't report a string which is a substring of a string we already reported
reported_strings: List[str] = []
for string in sorted(base64_strings, key=len, reverse=True):
if all((string not in already for already in reported_strings)):
report = self.evaluate_entropy_string(
chunk, line, string, self.b64_entropy_limit
)
for string in util.find_strings_by_regex(word, HEX_REGEX):
yield from self.evaluate_entropy_string(
if report is not None:
reported_strings.append(string)
yield report
# Hex alphabet is subset of base64, so check duplicates here too;
# but it's easier because hex string will never be substring of another
# hex string.
for string in util.find_strings_by_regex(line, HEX_REGEX):
if all((string not in already for already in reported_strings)):
report = self.evaluate_entropy_string(
chunk, line, string, self.hex_entropy_limit
)
if report is not None:
yield report

def evaluate_entropy_string(
self,
chunk: types.Chunk,
line: str,
string: str,
min_entropy_score: float,
) -> Generator[Issue, None, None]:
) -> Optional[Issue]:
"""Check entropy string using entropy characters and score.

:param chunk: The chunk of data to check
:param line: Source line containing string of interest
:param string: String to check
:param min_entropy_score: Minimum entropy score to flag
:return: Generator of issues flagged
:return: Issue, if one is detected; otherwise None
"""
if not self.signature_is_excluded(string, chunk.file_path):
entropy_score = self.calculate_entropy(string)
if entropy_score > min_entropy_score:
if self.entropy_string_is_excluded(string, line, chunk.file_path):
self.logger.debug("line containing entropy was excluded: %s", line)
else:
yield Issue(types.IssueType.Entropy, string, chunk)
return None
return Issue(types.IssueType.Entropy, string, chunk)
return None

def scan_regex(self, chunk: types.Chunk) -> Generator[Issue, None, None]:
"""Scan a chunk of data for matches against the configured regexes.
Expand Down
6 changes: 6 additions & 0 deletions tartufo/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,6 +209,12 @@ def find_strings_by_regex(
:param text: The text string to be analyzed
:param regex: A pattern which matches all character sequences of interest
:param threshold: The minimum acceptable length of a matching string

WARNING: If you are passing one of the scanner.py regex variables, note
they are compiled with a minimum length of 20, and using a lower threshold
here will not have the intended effect. Nothing in this codebase passes a
threshold parameter, but external users might need to adjust or use their
own expressions.
"""

for match in regex.finditer(text):
Expand Down
83 changes: 69 additions & 14 deletions tests/test_base_scanner.py
Original file line number Diff line number Diff line change
Expand Up @@ -444,7 +444,20 @@ def test_issues_are_not_created_for_b64_string_excluded_signatures(
mock_signature: mock.MagicMock,
mock_calculate: mock.MagicMock,
):
mock_strings.side_effect = (["foo"], [], [], [], [], [])
mock_strings.side_effect = (
["foo"],
["foo"],
[],
[],
[],
[],
[],
[],
[],
[],
[],
[],
)
mock_signature.return_value = True
issues = list(self.scanner.scan_entropy(self.chunk))
mock_calculate.assert_not_called()
Expand All @@ -459,7 +472,7 @@ def test_issues_are_not_created_for_hex_string_excluded_signatures(
mock_signature: mock.MagicMock,
mock_calculate: mock.MagicMock,
):
mock_strings.side_effect = ([], ["foo"], [], [], [], [])
mock_strings.side_effect = ([], [], ["foo"], [], [], [], [], [], [], [], [], [])
mock_signature.return_value = True
issues = list(self.scanner.scan_entropy(self.chunk))
mock_calculate.assert_not_called()
Expand All @@ -474,7 +487,20 @@ def test_issues_are_created_for_high_entropy_b64_strings(
mock_signature: mock.MagicMock,
mock_calculate: mock.MagicMock,
):
mock_strings.side_effect = (["foo"], [], [], [], [], [])
mock_strings.side_effect = (
["foo"],
["foo"],
[],
[],
[],
[],
[],
[],
[],
[],
[],
[],
)
mock_signature.return_value = False
mock_calculate.return_value = 9.0
issues = list(self.scanner.scan_entropy(self.chunk))
Expand All @@ -491,7 +517,7 @@ def test_issues_are_created_for_high_entropy_hex_strings(
mock_signature: mock.MagicMock,
mock_calculate: mock.MagicMock,
):
mock_strings.side_effect = ([], ["foo"], [], [], [], [])
mock_strings.side_effect = ([], [], ["foo"], [], [], [], [], [], [], [], [], [])
mock_signature.return_value = False
mock_calculate.return_value = 9.0
issues = list(self.scanner.scan_entropy(self.chunk))
Expand All @@ -510,7 +536,7 @@ def test_issues_are_not_created_for_high_entropy_hex_strings_given_entropy_is_ex
mock_signature: mock.MagicMock,
mock_calculate: mock.MagicMock,
):
mock_strings.side_effect = ([], ["foo"], [], [], [], [])
mock_strings.side_effect = ([], [], ["foo"], [], [], [], [], [], [], [], [], [])
mock_entropy.return_value = True
mock_signature.return_value = False
mock_calculate.return_value = 9.0
Expand All @@ -528,7 +554,20 @@ def test_issues_are_not_created_for_low_entropy_b64_strings_given_entropy_is_exc
mock_signature: mock.MagicMock,
mock_calculate: mock.MagicMock,
):
mock_strings.side_effect = (["foo"], [], [], [], [], [])
mock_strings.side_effect = (
["foo"],
["foo"],
[],
[],
[],
[],
[],
[],
[],
[],
[],
[],
)
mock_entropy.return_value = True
mock_signature.return_value = False
mock_calculate.return_value = 9.0
Expand All @@ -544,7 +583,20 @@ def test_issues_are_not_created_for_low_entropy_b64_strings(
mock_signature: mock.MagicMock,
mock_calculate: mock.MagicMock,
):
mock_strings.side_effect = (["foo"], [], [], [], [], [])
mock_strings.side_effect = (
["foo"],
["foo"],
[],
[],
[],
[],
[],
[],
[],
[],
[],
[],
)
mock_signature.return_value = False
mock_calculate.return_value = 1.0
issues = list(self.scanner.scan_entropy(self.chunk))
Expand All @@ -559,7 +611,7 @@ def test_issues_are_not_created_for_low_entropy_hex_strings(
mock_signature: mock.MagicMock,
mock_calculate: mock.MagicMock,
):
mock_strings.side_effect = ([], ["foo"], [], [], [], [])
mock_strings.side_effect = ([], [], ["foo"], [], [], [], [], [], [], [], [], [])
mock_signature.return_value = False
mock_calculate.return_value = 1.0
issues = list(self.scanner.scan_entropy(self.chunk))
Expand Down Expand Up @@ -604,12 +656,15 @@ def test_scan_entropy_find_b64_strings_for_every_word_in_diff(
list(self.scanner.scan_entropy(self.chunk))
mock_strings.assert_has_calls(
(
mock.call("foo", scanner.BASE64_REGEX),
mock.call("foo", scanner.HEX_REGEX),
mock.call("bar", scanner.BASE64_REGEX),
mock.call("bar", scanner.HEX_REGEX),
mock.call("asdfqwer", scanner.BASE64_REGEX),
mock.call("asdfqwer", scanner.HEX_REGEX),
mock.call(" foo bar", scanner.BASE64_REGEX),
mock.call(" foo bar", scanner.BASE64URL_REGEX),
mock.call(" foo bar", scanner.HEX_REGEX),
mock.call(" asdfqwer", scanner.BASE64_REGEX),
mock.call(" asdfqwer", scanner.BASE64URL_REGEX),
mock.call(" asdfqwer", scanner.HEX_REGEX),
mock.call(" ", scanner.BASE64_REGEX),
mock.call(" ", scanner.BASE64URL_REGEX),
mock.call(" ", scanner.HEX_REGEX),
)
)

Expand Down
6 changes: 3 additions & 3 deletions tests/test_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -427,14 +427,14 @@ def test_find_strings_by_regex_recognizes_base64url(self):
"""

strings = list(
util.find_strings_by_regex(sample_input, scanner.BASE64_REGEX, 20)
util.find_strings_by_regex(sample_input, scanner.BASE64URL_REGEX, 20)
)
self.assertEqual(strings, ["111111111-ffffCCCC=="])

def test_find_strings_by_regex_recognizes_mutant_base64(self):

sample_input = """
+111111111-ffffCCCC= Can't mix + and - but both are in regex
+111111111-ffffCCCC= Can't mix + and - and substrings are too short
111111111111111111111== Not a valid length but we don't care
==111111111111111111 = Is supposed to be end only but we don't care
"""
Expand All @@ -444,5 +444,5 @@ def test_find_strings_by_regex_recognizes_mutant_base64(self):
)
self.assertEqual(
strings,
["+111111111-ffffCCCC=", "111111111111111111111==", "==111111111111111111"],
["111111111111111111111==", "==111111111111111111"],
)