From 6e2f56c4c08bd5e29e5c376b38682ee3557090af Mon Sep 17 00:00:00 2001 From: howff Date: Tue, 22 Aug 2023 09:54:03 +0100 Subject: [PATCH] StringUtils.py - improve HTML parser for tag redaction --- .../SmiServices/StringUtils.py | 210 ++++++++++++++---- 1 file changed, 170 insertions(+), 40 deletions(-) diff --git a/src/common/Smi_Common_Python/SmiServices/StringUtils.py b/src/common/Smi_Common_Python/SmiServices/StringUtils.py index 0bd5ffc57..f4d108057 100755 --- a/src/common/Smi_Common_Python/SmiServices/StringUtils.py +++ b/src/common/Smi_Common_Python/SmiServices/StringUtils.py @@ -4,72 +4,199 @@ import re from html.parser import HTMLParser -#from html.entities import name2codepoint - -class MyHTMLParser(HTMLParser): - """ A HTML parser which simply extracts text and throws away tags, - so you can get the text out. - Character entities will be replaced with the actual character. - No attempt to preserve original document length. - Not sure how robust it is to badly formatted HTML. - """ +from html.entities import name2codepoint + +# --------------------------------------------------------------------- + +class RedactingHTMLParser(HTMLParser): def __init__(self): - super().__init__() - self._style_active = False - self._script_active = False - self._extracted_text = '' + super().__init__(convert_charrefs = False) + self.style_active = False + self.script_active = False + self.data_active = False + self.newline_active = False + self.ref_char = 0 + self.html_str = None + self.rc = [] + self.curpos = (0,0) + + def feed(self, html_str): + # Build a list of line numbers and their character positions + idx = 0 + eol_is_next = 0 + linenum = 1 # first line is 1, not zero + self.linepos = [0] + self.linepos.insert(1, 0) # first line is 1, not zero + for ch in html_str: + if ch == '\n' or ch == '\r': + # If prev newline char was same then is a new line + # otherwise CR,LF is a single newline + if eol_is_next and (eol_is_next == ch): + linenum += 1 + self.linepos.insert(linenum, idx) + eol_is_next = ch + idx += 1 + continue + if eol_is_next: + linenum += 1 + self.linepos.insert(linenum, idx) + eol_is_next = 0 + idx += 1 + # At EOF add a fake final line because getpos() will return it + self.linepos.insert(linenum+1, idx) + self.html_str = html_str + super().feed(html_str) - def extracted_text(self): - return self._extracted_text + def result(self): + """ Return the redacted HTML string. + Could also return the array of character offsets to be redacted. + """ + # Ensure the final item is processed + self.handle_prev() + return self.html_str + + def handle_prev(self): + """ Output or redact the previous item which is between the + character offsets self.curpos through to self.getpos(). + """ + add_lf = self.newline_active + self.newline_active = False + if self.getpos() == (1,0): + # Called at very start of document, no previous elements to redact + return + if not self.data_active or (self.style_active or self.script_active or self.ref_char): + startline = self.curpos[0] + startoffset = self.linepos[startline] + self.curpos[1] + endline, endchar = self.getpos() + endoffset = self.linepos[endline] + endchar + redact_char = ' ' + redact_length = endoffset - startoffset + if add_lf: + redact_str = '\n' + else: + redact_str = redact_char + redact_str = redact_char.rjust(redact_length, redact_char) if redact_char else '' + if add_lf and redact_char: + redact_str = '\n' + redact_str[1:] + redact_str = redact_str[:-1] + '\n' + if self.ref_char: + redact_str = self.ref_char + redact_str[1:] + self.ref_char = 0 + self.html_str = self.html_str[:startoffset] + redact_str + self.html_str[endoffset:] + self.rc.append( (startoffset, endoffset) ) + + def prepare_next(self, data_active): + self.data_active = data_active + self.curpos = self.getpos() def handle_starttag(self, tag, attrs): - #print("Start tag:", tag) + self.handle_prev() + # Ensure that any text is not output whilst in +

Hello World<> +
new line +

+""" + expected=""" + + + + + + + + + +Hello  World< > + + +new line + + """ + parser.feed(html_str) + result = parser.result() + assert(len(html_str) == len(result)) + assert(result == expected) + # --------------------------------------------------------------------- @@ -87,6 +214,7 @@ def test_string_match_ignore_linebreak(): assert(string_match_ignore_linebreak('hello\r\nworld', 'hello world')) assert(string_match_ignore_linebreak('hello\r\rworld', 'hello world')) + # --------------------------------------------------------------------- def redact_html_tags_in_string(html_str, replace_char='.', replace_newline='\n'): @@ -143,17 +271,19 @@ def test_redact_html_tags_in_string(): assert(string_match_ignore_linebreak(dest, expected)) +# --------------------------------------------------------------------- + def remove_html_tags_in_string(html_str): """ Remove all HTML tags from the string and return the new string. Does not try to preserve the original string length.""" - parser = MyHTMLParser() + parser = RedactingHTMLParser() parser.feed(html_str) - text_str = parser.extracted_text() + text_str = parser.result() return(text_str) def test_remove_html_tags_in_string(): dest = remove_html_tags_in_string(' hello

world') - expected = 'hello world ' + expected = ' hello \n \nworld' assert(dest == expected)