Skip to content

Commit

Permalink
StringUtils.py - make new HTML parser the default, and
Browse files Browse the repository at this point in the history
improve HTML removal (as distinct from redaction), and
fix tests in DicomText.py for slight differences in newlines compared to previous version
  • Loading branch information
howff committed Aug 23, 2023
1 parent 6e2f56c commit 9d42aa6
Show file tree
Hide file tree
Showing 2 changed files with 60 additions and 25 deletions.
25 changes: 23 additions & 2 deletions src/common/Smi_Common_Python/SmiServices/DicomText.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,15 @@ class DicomText:
dicomtext.write(redacted_dcmname) # Writes out the redacted DICOM file
OR
write_redacted_text_into_dicom_file # to rewrite a second file with redacted text
After parse() you call text() to get the extracted text without HTML tags
that can be fed into the anonymiser. That will output annotations,
typically in Knowtator XML format, indicating the character positions
of every word that needs to be redacted. To perform the redaction in
a DICOM file you need to parse() then redact(xml), then call redacted_text()
to see the text, or write*() to save a redacted DICOM file.
The redacted text will still include the HTML tags, which is why it is
important to preserve exact character offsets between the text(), anonymise
and redact() steps.
Class variables determine whether unknown tags are included in the output
(ideally this would be True but in practice we are only interested in known tags
Expand Down Expand Up @@ -348,7 +357,13 @@ def test_DicomText():
# Finding
......
..................................
..........
.
..
.
There is bruising of the medial femoral condyle with some intrasubstance injury to the medial collateral ligament. The lateral collateral ligament in intact. The Baker's cruciate ligament is irregular and slightly lax suggesting a partial tear. It does not appear to be completely torn. The posterior cruciate ligament is intact. The suprapatellar tendons are normal.
# Finding
There is a tear of the posterior limb of the medial meniscus which communicates with the superior articular surface. The lateral meniscus is intact. There is a Baker's cyst and moderate joint effusion.
Expand Down Expand Up @@ -395,7 +410,13 @@ def test_DicomText():
# Finding
......
..................................
..........
.
..
.
There is bruising of the medial femoral condyle with some intrasubstance injury to the medial collateral ligament. The lateral collateral ligament in intact. The Baker's cruciate ligament is irregular and slightly lax suggesting a partial tear. It does not appear to be completely torn. The posterior cruciate ligament is intact. The suprapatellar tendons are normal.
# Finding
There is a tear of the posterior limb of the medial meniscus which communicates with the superior articular surface. The lateral meniscus is intact. There is a Baker's cyst and moderate joint effusion.
Expand Down
60 changes: 37 additions & 23 deletions src/common/Smi_Common_Python/SmiServices/StringUtils.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,16 +9,21 @@
# ---------------------------------------------------------------------

class RedactingHTMLParser(HTMLParser):
def __init__(self):

def __init__(self, replace_char = ' '):
# Set up the parser state used while redacting a document.
# replace_char is the character written over HTML markup; an empty
# string requests removal of the markup instead (see the end of this method).
# convert_charrefs=False stops HTMLParser from folding entity and character
# references into ordinary text data, so they reach their own handlers.
super().__init__(convert_charrefs = False)
self.style_active = False
self.script_active = False
self.data_active = False
self.newline_active = False
self.ref_char = 0
self.html_str = None
self.rc = []
self.curpos = (0,0)
self.replace_char = replace_char # character used to overwrite HTML tags
self.style_active = False # True while inside a <style> element
self.script_active = False # True while inside a <script> element
self.data_active = False # True while processing real text content
self.newline_active = False # True while processing newlines
self.ref_char = 0 # replacement for a pending entity/char reference; handle_prev() puts it first in the redaction string, then resets it to 0
self.html_str = None # document text; rewritten span by span in handle_prev() and returned by result()
self.rc = [] # list of (startoffset, endoffset) tuples, one appended per redacted span
self.curpos = (0,0) # position of the pending item; handle_prev() uses curpos[1] as its column offset
# An empty replace_char means "remove the tags": substitute a NUL character
# as a placeholder here; result() strips every NUL from the final string.
if not self.replace_char:
self.replace_char = '\0'

def feed(self, html_str):
# Build a list of line numbers and their character positions
Expand Down Expand Up @@ -53,6 +58,8 @@ def result(self):
"""
# Ensure the final item is processed
self.handle_prev()
# If user wants to remove HTML tags (not redact) do this now
self.html_str = self.html_str.replace('\0', '')
return self.html_str

def handle_prev(self):
Expand All @@ -69,20 +76,27 @@ def handle_prev(self):
startoffset = self.linepos[startline] + self.curpos[1]
endline, endchar = self.getpos()
endoffset = self.linepos[endline] + endchar
redact_char = ' '
redact_char = self.replace_char
redact_length = endoffset - startoffset
# First char needs to be replaced with newline?
if add_lf:
redact_str = '\n'
else:
redact_str = redact_char
redact_str = redact_char.rjust(redact_length, redact_char) if redact_char else ''
if add_lf and redact_char:
# Rest of replacement string is repeated chars
redact_str = redact_char.rjust(redact_length, redact_char)
# First char needs to be replaced with newline?
if add_lf:
redact_str = '\n' + redact_str[1:]
redact_str = redact_str[:-1] + '\n'
# Entity/char reference replaces first character in the replacement string
if self.ref_char:
redact_str = self.ref_char + redact_str[1:]
# Reset flag for next time
self.ref_char = 0
# Rebuild string by cutting out redacted section and replacing it
self.html_str = self.html_str[:startoffset] + redact_str + self.html_str[endoffset:]
# Create array of offsets which might be returned
self.rc.append( (startoffset, endoffset) )

def prepare_next(self, data_active):
Expand Down Expand Up @@ -217,7 +231,7 @@ def test_string_match_ignore_linebreak():

# ---------------------------------------------------------------------

def redact_html_tags_in_string(html_str, replace_char='.', replace_newline='\n'):
def redact_html_tags_in_string_simple(html_str, replace_char='.', replace_newline='\n'):
""" Replace the HTML tags in a string with equal length of a
repeating character (space or dot for example).
The character (or string!) is given in replace_char, default dot.
Expand Down Expand Up @@ -251,38 +265,38 @@ def replfunc(s):
html_str = re.sub('</{0,1}(.DOCTYPE|a|abbr|acronym|address|applet|area|article|aside|audio|b|base|basefont|bdi|bdo|big|blockquote|body|br|button|canvas|caption|center|cite|code|col|colgroup|data|datalist|dd|del|details|dfn|dialog|dir|div|dl|dt|em|embed|fieldset|figcaption|figure|font|footer|form|frame|frameset|h1|h2|h3|h4|h5|h6|head|header|hr|html|i|iframe|img|input|ins|kbd|label|legend|li|link|main|map|mark|meta|meter|nav|noframes|noscript|object|ol|optgroup|option|output|p|param|picture|pre|progress|q|rp|rt|ruby|s|samp|script|section|select|small|source|span|strike|strong|style|sub|summary|sup|svg|table|tbody|td|template|textarea|tfoot|th|thead|time|title|tr|track|tt|u|ul|var|video|wbr)( [^<>]*){0,1}>', replfunc, html_str, flags=re.IGNORECASE)
return(html_str)

def test_redact_html_tags_in_string():
src = '<script src="s.js"/> <SCRIPT lang="js"> script1\n </script> text1 <1 month\r\n<BR>text2 <script> script2 </script> text3&nbsp;</p>'
def test_redact_html_tags_in_string_simple():
# Exercises the regex-based redaction with dot, space and empty replacement
# characters, and with the newline replaced by a space or by nothing.
# Results are compared via string_match_ignore_linebreak rather than ==.
src = '<script src="s.js"/> <SCRIPT lang="js"> script1\n </script> text1 <1 month\r\n<BR>text2 <script> script2 </script> text3&nbsp;</p>'
# changing the \r to a space in the expected string also tests the string_match_ignore_linebreak function
dest = redact_html_tags_in_string(src)
dest = redact_html_tags_in_string_simple(src, replace_char='.')
expected = '.................... ..................................... text1 <1 month \n....text2 .......................... text3 ....'
assert(string_match_ignore_linebreak(dest, expected))
# Test replacing HTML with spaces
dest = redact_html_tags_in_string(src, replace_char=' ')
dest = redact_html_tags_in_string_simple(src, replace_char=' ')
expected = ' text1 <1 month \n text2 text3 '
assert(string_match_ignore_linebreak(dest, expected))
# Test the newline replacement
dest = redact_html_tags_in_string(src, replace_char=' ', replace_newline=' ')
dest = redact_html_tags_in_string_simple(src, replace_char=' ', replace_newline=' ')
expected = ' text1 <1 month text2 text3 '
assert(string_match_ignore_linebreak(dest, expected))
# Test squashing HTML and newlines
dest = redact_html_tags_in_string(src, replace_char='', replace_newline='')
dest = redact_html_tags_in_string_simple(src, replace_char='', replace_newline='')
expected = ' text1 <1 monthtext2 text3 '
assert(string_match_ignore_linebreak(dest, expected))


# ---------------------------------------------------------------------

def remove_html_tags_in_string(html_str):
def redact_html_tags_in_string(html_str, replace_char=' ', replace_newline='\n'):
""" Redact the HTML tags in html_str using RedactingHTMLParser and
return the resulting string.
replace_char is handed to the parser as the character that overwrites
the tags (default is a space). If it is the empty string the parser
removes the tags instead of overwriting them, so the result is shorter
than the input.
replace_newline is accepted but is not used in this function body.
NOTE(review): with a non-empty single-character replace_char the parser
fills each redacted span to its original length, so character offsets
should be preserved - confirm against handle_prev()."""
parser = RedactingHTMLParser()
parser = RedactingHTMLParser(replace_char = replace_char)
parser.feed(html_str)
text_str = parser.result()
return(text_str)

def test_remove_html_tags_in_string():
dest = remove_html_tags_in_string('<!DOCTYPE fake> <style>stuff </style>hello <p>world')
def test_redact_html_tags_in_string():
# Uses the default replace_char. The DOCTYPE, the whole <style> element and
# the <p> tag are expected to come back as whitespace/newlines only, leaving
# 'hello' and 'world' as the sole visible text.
dest = redact_html_tags_in_string('<!DOCTYPE fake> <style>stuff </style>hello <p>world')
expected = ' hello \n \nworld'
assert(dest == expected)

Expand Down

0 comments on commit 9d42aa6

Please sign in to comment.