diff --git a/CHANGELOG.md b/CHANGELOG.md index d59874f0..f1bf0244 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,13 @@ +v3.0.x - TBD +------------ + +Bug fixes: + +* [315](https://github.com/godaddy/tartufo/issues/315) - Rework entropy scanning + to reduce number of new issues generated for text that passed 2.x scans without + issues. Some corner cases may remain that will require exclusions to silence. + See [319](https://github.com/godaddy/tartufo/pull/319) for details. + v3.0.0 - 5 January 2022 ----------------------- diff --git a/tartufo/scanner.py b/tartufo/scanner.py index dfb7df36..02676bff 100755 --- a/tartufo/scanner.py +++ b/tartufo/scanner.py @@ -36,8 +36,9 @@ Scope, ) -BASE64_REGEX = re.compile(r"[A-Z0-9=+/_-]+", re.IGNORECASE) -HEX_REGEX = re.compile(r"[0-9A-F]+", re.IGNORECASE) +BASE64_REGEX = re.compile(r"[A-Z0-9=+/]{20,}", re.IGNORECASE) +BASE64URL_REGEX = re.compile(r"[A-Z0-9=_-]{20,}", re.IGNORECASE) +HEX_REGEX = re.compile(r"[0-9A-F]{20,}", re.IGNORECASE) class Issue: @@ -561,15 +562,29 @@ def scan_entropy( """ for line in chunk.contents.split("\n"): - for word in line.split(): - for string in util.find_strings_by_regex(word, BASE64_REGEX): - yield from self.evaluate_entropy_string( + # Using a set eliminates exact duplicates, which are common + base64_strings = set(util.find_strings_by_regex(line, BASE64_REGEX)) + base64_strings.update(util.find_strings_by_regex(line, BASE64URL_REGEX)) + # Don't report a string which is a substring of a string we already reported + reported_strings: List[str] = [] + for string in sorted(base64_strings, key=len, reverse=True): + if all((string not in already for already in reported_strings)): + report = self.evaluate_entropy_string( chunk, line, string, self.b64_entropy_limit ) - for string in util.find_strings_by_regex(word, HEX_REGEX): - yield from self.evaluate_entropy_string( + if report is not None: + reported_strings.append(string) + yield report + # Hex alphabet is subset of base64, so check duplicates here too; + # but it's easier because hex string will never be substring of another + # hex string. + for string in util.find_strings_by_regex(line, HEX_REGEX): + if all((string not in already for already in reported_strings)): + report = self.evaluate_entropy_string( chunk, line, string, self.hex_entropy_limit ) + if report is not None: + yield report def evaluate_entropy_string( self, @@ -577,22 +592,23 @@ def evaluate_entropy_string( line: str, string: str, min_entropy_score: float, - ) -> Generator[Issue, None, None]: + ) -> Optional[Issue]: """Check entropy string using entropy characters and score. :param chunk: The chunk of data to check :param line: Source line containing string of interest :param string: String to check :param min_entropy_score: Minimum entropy score to flag - :return: Generator of issues flagged + :return: Issue, if one is detected; otherwise None """ if not self.signature_is_excluded(string, chunk.file_path): entropy_score = self.calculate_entropy(string) if entropy_score > min_entropy_score: if self.entropy_string_is_excluded(string, line, chunk.file_path): self.logger.debug("line containing entropy was excluded: %s", line) - else: - yield Issue(types.IssueType.Entropy, string, chunk) + return None + return Issue(types.IssueType.Entropy, string, chunk) + return None def scan_regex(self, chunk: types.Chunk) -> Generator[Issue, None, None]: """Scan a chunk of data for matches against the configured regexes. diff --git a/tartufo/util.py b/tartufo/util.py index 2237c5cb..8bb19ae5 100644 --- a/tartufo/util.py +++ b/tartufo/util.py @@ -209,6 +209,12 @@ def find_strings_by_regex( :param text: The text string to be analyzed :param regex: A pattern which matches all character sequences of interest :param threshold: The minimum acceptable length of a matching string + + WARNING: If you are passing one of the scanner.py regex variables, note + they are compiled with a minimum length of 20, and using a lower threshold + here will not have the intended effect. Nothing in this codebase passes a + threshold parameter, but external users might need to adjust or use their + own expressions. """ for match in regex.finditer(text): diff --git a/tests/test_base_scanner.py b/tests/test_base_scanner.py index 2fe2fd2a..2f370319 100644 --- a/tests/test_base_scanner.py +++ b/tests/test_base_scanner.py @@ -444,7 +444,20 @@ def test_issues_are_not_created_for_b64_string_excluded_signatures( mock_signature: mock.MagicMock, mock_calculate: mock.MagicMock, ): - mock_strings.side_effect = (["foo"], [], [], [], [], []) + mock_strings.side_effect = ( + ["foo"], + ["foo"], + [], + [], + [], + [], + [], + [], + [], + [], + [], + [], + ) mock_signature.return_value = True issues = list(self.scanner.scan_entropy(self.chunk)) mock_calculate.assert_not_called() @@ -459,7 +472,7 @@ def test_issues_are_not_created_for_hex_string_excluded_signatures( mock_signature: mock.MagicMock, mock_calculate: mock.MagicMock, ): - mock_strings.side_effect = ([], ["foo"], [], [], [], []) + mock_strings.side_effect = ([], [], ["foo"], [], [], [], [], [], [], [], [], []) mock_signature.return_value = True issues = list(self.scanner.scan_entropy(self.chunk)) mock_calculate.assert_not_called() @@ -474,7 +487,20 @@ def test_issues_are_created_for_high_entropy_b64_strings( mock_signature: mock.MagicMock, mock_calculate: mock.MagicMock, ): - mock_strings.side_effect = (["foo"], [], [], [], [], []) + mock_strings.side_effect = ( + ["foo"], + ["foo"], + [], + [], + [], + [], + [], + [], + [], + [], + [], + [], + ) mock_signature.return_value = False mock_calculate.return_value = 9.0 issues = list(self.scanner.scan_entropy(self.chunk)) @@ -491,7 +517,7 @@ def test_issues_are_created_for_high_entropy_hex_strings( mock_signature: mock.MagicMock, mock_calculate: mock.MagicMock, ): - mock_strings.side_effect = ([], ["foo"], [], [], [], []) + mock_strings.side_effect = ([], [], ["foo"], [], [], [], [], [], [], [], [], []) mock_signature.return_value = False mock_calculate.return_value = 9.0 issues = list(self.scanner.scan_entropy(self.chunk)) @@ -510,7 +536,7 @@ def test_issues_are_not_created_for_high_entropy_hex_strings_given_entropy_is_ex mock_signature: mock.MagicMock, mock_calculate: mock.MagicMock, ): - mock_strings.side_effect = ([], ["foo"], [], [], [], []) + mock_strings.side_effect = ([], [], ["foo"], [], [], [], [], [], [], [], [], []) mock_entropy.return_value = True mock_signature.return_value = False mock_calculate.return_value = 9.0 @@ -528,7 +554,20 @@ def test_issues_are_not_created_for_low_entropy_b64_strings_given_entropy_is_exc mock_signature: mock.MagicMock, mock_calculate: mock.MagicMock, ): - mock_strings.side_effect = (["foo"], [], [], [], [], []) + mock_strings.side_effect = ( + ["foo"], + ["foo"], + [], + [], + [], + [], + [], + [], + [], + [], + [], + [], + ) mock_entropy.return_value = True mock_signature.return_value = False mock_calculate.return_value = 9.0 @@ -544,7 +583,20 @@ def test_issues_are_not_created_for_low_entropy_b64_strings( mock_signature: mock.MagicMock, mock_calculate: mock.MagicMock, ): - mock_strings.side_effect = (["foo"], [], [], [], [], []) + mock_strings.side_effect = ( + ["foo"], + ["foo"], + [], + [], + [], + [], + [], + [], + [], + [], + [], + [], + ) mock_signature.return_value = False mock_calculate.return_value = 1.0 issues = list(self.scanner.scan_entropy(self.chunk)) @@ -559,7 +611,7 @@ def test_issues_are_not_created_for_low_entropy_hex_strings( mock_signature: mock.MagicMock, mock_calculate: mock.MagicMock, ): - mock_strings.side_effect = ([], ["foo"], [], [], [], []) + mock_strings.side_effect = ([], [], ["foo"], [], [], [], [], [], [], [], [], []) mock_signature.return_value = False mock_calculate.return_value = 1.0 issues = list(self.scanner.scan_entropy(self.chunk)) @@ -604,12 +656,15 @@ def test_scan_entropy_find_b64_strings_for_every_word_in_diff( list(self.scanner.scan_entropy(self.chunk)) mock_strings.assert_has_calls( ( - mock.call("foo", scanner.BASE64_REGEX), - mock.call("foo", scanner.HEX_REGEX), - mock.call("bar", scanner.BASE64_REGEX), - mock.call("bar", scanner.HEX_REGEX), - mock.call("asdfqwer", scanner.BASE64_REGEX), - mock.call("asdfqwer", scanner.HEX_REGEX), + mock.call(" foo bar", scanner.BASE64_REGEX), + mock.call(" foo bar", scanner.BASE64URL_REGEX), + mock.call(" foo bar", scanner.HEX_REGEX), + mock.call(" asdfqwer", scanner.BASE64_REGEX), + mock.call(" asdfqwer", scanner.BASE64URL_REGEX), + mock.call(" asdfqwer", scanner.HEX_REGEX), + mock.call(" ", scanner.BASE64_REGEX), + mock.call(" ", scanner.BASE64URL_REGEX), + mock.call(" ", scanner.HEX_REGEX), ) ) diff --git a/tests/test_util.py b/tests/test_util.py index 59aee19d..f9577fa5 100644 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -427,14 +427,14 @@ def test_find_strings_by_regex_recognizes_base64url(self): """ strings = list( - util.find_strings_by_regex(sample_input, scanner.BASE64_REGEX, 20) + util.find_strings_by_regex(sample_input, scanner.BASE64URL_REGEX, 20) ) self.assertEqual(strings, ["111111111-ffffCCCC=="]) def test_find_strings_by_regex_recognizes_mutant_base64(self): sample_input = """ - +111111111-ffffCCCC= Can't mix + and - but both are in regex + +111111111-ffffCCCC= Can't mix + and - and substrings are too short 111111111111111111111== Not a valid length but we don't care ==111111111111111111 = Is supposed to be end only but we don't care """ @@ -444,5 +444,5 @@ def test_find_strings_by_regex_recognizes_mutant_base64(self): ) self.assertEqual( strings, - ["+111111111-ffffCCCC=", "111111111111111111111==", "==111111111111111111"], + ["111111111111111111111==", "==111111111111111111"], )