Skip to content

Commit

Permalink
Merge pull request #1611 from SMI/bugfix/srhtml
Browse files Browse the repository at this point in the history
Better HTML handling for StructuredReports
  • Loading branch information
rkm authored Sep 22, 2023
2 parents 0da9ffb + ae5ea55 commit 9203cc4
Show file tree
Hide file tree
Showing 6 changed files with 304 additions and 29 deletions.
1 change: 1 addition & 0 deletions news/1611-bugfix.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
CTP_SRAnonTool - implement a full HTML parser (with other sanity checks) for HTML in TextValue in SRs
27 changes: 17 additions & 10 deletions src/applications/SRAnonTool/test/CTP_SRAnonTool_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
# allow us to test the text extraction, redaction and DICOM recreation all work.

import argparse
import logging
import os
from os.path import join, abspath, dirname
import pydicom
Expand Down Expand Up @@ -40,6 +41,7 @@
fake_pattern = args.pattern
if args.yaml:
yaml1 = args.yaml
logging.basicConfig(level = logging.INFO)

# Intermediate files:
anon_dcm_file = 'report10html.anon.dcm'
Expand All @@ -57,16 +59,20 @@
fdout.write(fdin.read())

# Extract the text which would be input to SemEHR
logging.info('Extracting text from %s using CTP_DicomToText.py' % source_dcm)
os.system(f"{binpath}/CTP_DicomToText.py -y {yaml1} -i {source_dcm} -o {source_txt_file}")

# Fake the output from SemEHR, both txt and xml.
logging.info('Running a fake anonymiser to get XML %s' % xml_file)
os.system(join(abspath(dirname(__file__)), f"clinical_doc_wrapper.py -s {semehr_dir}"))

# Now convert the txt,xml back into a redacted DICOM file:
logging.info('Redacting using XML into %s' % anon_dcm_file)
os.system(f"{binpath}/CTP_XMLToDicom.py -y {yaml1} -i {source_dcm} -x {xml_file} -o {anon_dcm_file}")

# Extract the text from the DICOM file to the screen if possible
if shutil.which('jq') and shutil.which('dcm2json'):
logging.info('Text in new DICOM file:')
os.system("dcm2json %s | jq '..|select(.vr==\"UT\")?|.Value[0]'" % anon_dcm_file)

# Finally compare the two text strings
Expand All @@ -87,7 +93,7 @@
manually_redacted = re.sub(fake_pattern, 'X'.rjust(len(fake_pattern), 'X'), before_text)
with open(test_before, 'w') as fd: fd.write(manually_redacted)
with open(test_after, 'w') as fd: fd.write(after_text)
rc = os.system('diff -wB %s %s' % (test_before, test_after))
rc = os.system('diff -wB --ignore-matching="\\[\\[" %s %s' % (test_before, test_after))
if rc == 0:
print('SUCCESS')
rc = 0
Expand All @@ -96,14 +102,15 @@
rc = 1

# Tidy up (ignore errors)
try:
os.remove(source_txt_file)
os.remove(txt_file)
os.remove(xml_file)
os.remove(anon_dcm_file)
os.remove(test_before)
os.remove(test_after)
except:
pass
if rc == 0:
try:
os.remove(source_txt_file)
os.remove(txt_file)
os.remove(xml_file)
os.remove(anon_dcm_file)
os.remove(test_before)
os.remove(test_after)
except:
pass

exit(rc)
2 changes: 1 addition & 1 deletion src/applications/SRAnonTool/test/clinical_doc_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ def fake_anonymise(doc_filename):
tmp_file = os.path.join(input_dir, doc_filename)
txt_file = os.path.join(output_dir, doc_filename)
xml_file = os.path.join(output_dir, doc_filename+'.knowtator.xml')
logging.debug('Fake-anonymising %s -> %s' % (doc_filename, txt_file))
logging.debug('Fake-anonymising the name %s in %s -> %s' % (fake_pattern, doc_filename, txt_file))
fdin = open(tmp_file, 'r')
fdout = open(txt_file, 'w')
in_text = False
Expand Down
29 changes: 25 additions & 4 deletions src/common/Smi_Common_Python/SmiServices/DicomText.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,15 @@ class DicomText:
dicomtext.write(redacted_dcmname) # Writes out the redacted DICOM file
OR
write_redacted_text_into_dicom_file # to rewrite a second file with redacted text
After parse(), call text() to get the extracted text, without HTML tags,
which can be fed into the anonymiser. The anonymiser will output annotations,
typically in Knowtator XML format, indicating the character positions
of every word that needs to be redacted. To perform the redaction in
a DICOM file you need to call parse(), then redact(xml), and then either
redacted_text() to see the text or write*() to save a redacted DICOM file.
The redacted text will still include the HTML tags, which is why it is
important to preserve exact character offsets between the text(), anonymise
and redact() steps.
Class variables determine whether unknown tags are included in the output
(ideally this would be True but in practice we are only interested in known tags
Expand Down Expand Up @@ -129,7 +138,7 @@ def _dataset_read_callback(self, dataset, data_element):
else:
rc = rc + ('%s' % (str(data_element.value)))
# Replace HTML tags with spaces
if self._replace_HTML_entities:
if self._replace_HTML_entities and '<' in rc:
rc = redact_html_tags_in_string(rc,
replace_char = self._replace_HTML_char,
replace_newline = self._replace_newline_char)
Expand Down Expand Up @@ -171,7 +180,7 @@ def parse(self):
if 'TextValue' in self._dicom_raw:
textval = str(self._dicom_raw['TextValue'].value)
self._p_text = self._p_text + '[[Text]]\n'
if self._replace_HTML_entities:
if self._replace_HTML_entities and '<' in textval:
self._p_text = self._p_text + redact_html_tags_in_string(textval,
replace_char = self._replace_HTML_char,
replace_newline = self._replace_newline_char)
Expand Down Expand Up @@ -348,7 +357,13 @@ def test_DicomText():
# Finding
......
..................................
..........
.
..
.
There is bruising of the medial femoral condyle with some intrasubstance injury to the medial collateral ligament. The lateral collateral ligament in intact. The Baker's cruciate ligament is irregular and slightly lax suggesting a partial tear. It does not appear to be completely torn. The posterior cruciate ligament is intact. The suprapatellar tendons are normal.
# Finding
There is a tear of the posterior limb of the medial meniscus which communicates with the superior articular surface. The lateral meniscus is intact. There is a Baker's cyst and moderate joint effusion.
Expand Down Expand Up @@ -395,7 +410,13 @@ def test_DicomText():
# Finding
......
..................................
..........
.
..
.
There is bruising of the medial femoral condyle with some intrasubstance injury to the medial collateral ligament. The lateral collateral ligament in intact. The Baker's cruciate ligament is irregular and slightly lax suggesting a partial tear. It does not appear to be completely torn. The posterior cruciate ligament is intact. The suprapatellar tendons are normal.
# Finding
There is a tear of the posterior limb of the medial meniscus which communicates with the superior articular surface. The lateral meniscus is intact. There is a Baker's cyst and moderate joint effusion.
Expand Down
Loading

0 comments on commit 9203cc4

Please sign in to comment.