Add IsAnonymized as a config for Suvpar

Emory-HITI · Jul 29, 2022 · 07cd42b · 07cd42b
1 parent 300b3d3
commit 07cd42b
Show file tree

Hide file tree

Showing 3 changed files with 27 additions and 22 deletions.
diff --git a/modules/suvpar/README.md b/modules/suvpar/README.md
@@ -24,6 +24,8 @@ Find the config.json file in the folder and modify accordingly.
 
 * *IsFinalCSV*: Whether to drop the intermediate fields and produce the final CSV. By default, true. If false, the data is only pre-processed to anonymize it and prepare an intermediate file that is ready for Suvpar processing.
 
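+* *IsAnonymized*: Whether to anonymize certain sensitive PHI headers (AccessionNumber, InstitutionAddress, PatientID, and SeriesInstanceUID) by replacing their values with SHA-256 hashes. By default, true.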
+
 # Running Niffler SUVPaR
 
 First, run the script to trim the file.

diff --git a/modules/suvpar/Suvpar.py b/modules/suvpar/Suvpar.py
@@ -12,7 +12,7 @@
 
 
 def initialize():
-    global output_csv, df, device_SN, scanner_filter, statistics_csv, isStatistics, final_csv
+    global output_csv, df, device_SN, scanner_filter, statistics_csv, isStatistics, final_csv, isAnonymized
     with open('config.json', 'r') as f:
         config = json.load(f)
 
@@ -24,6 +24,7 @@ def initialize():
     statistics_csv = config['Statistics_File']
     isStatistics = bool(config['IsStatistics'])
     final_csv = bool(config['IsFinalCSV'])
+    isAnonymized = bool(config['IsAnonymized'])
     text_file = open(feature_file, "r")
     feature_list = text_file.read().split('\n')
     # Consider some Device Serial Number and remove other.
@@ -61,26 +62,27 @@ def suvpar():
     # Check for the AcquisitionTime > SeriesTime case, currently observed in Philips and FONAR scanners.
     df['AltCase'] = numpy.where(df['Manufacturer'].str.contains('Philips|FONAR'), True, False)
 
-    # Apply hashing function to the column.
-    df['AccessionNumber'] = df['AccessionNumber'].astype(str).apply(
-        lambda x:
-        hashlib.sha256(x.encode()).hexdigest()
-    )
-
-    df['InstitutionAddress'] = df['InstitutionAddress'].astype(str).apply(
-        lambda x:
-        hashlib.sha256(x.encode()).hexdigest()
-    )
-
-    df['PatientID'] = df['PatientID'].astype(str).apply(
-        lambda x:
-        hashlib.sha256(x.encode()).hexdigest()
-    )
-
-    df['SeriesInstanceUID'] = df['SeriesInstanceUID'].astype(str).apply(
-        lambda x:
-        hashlib.sha256(x.encode()).hexdigest()
-    )
+    if isAnonymized:
+        # Apply the SHA-256 hashing function to each PHI column.
+        df['AccessionNumber'] = df['AccessionNumber'].astype(str).apply(
+            lambda x:
+            hashlib.sha256(x.encode()).hexdigest()
+        )
+
+        df['InstitutionAddress'] = df['InstitutionAddress'].astype(str).apply(
+            lambda x:
+            hashlib.sha256(x.encode()).hexdigest()
+        )
+
+        df['PatientID'] = df['PatientID'].astype(str).apply(
+            lambda x:
+            hashlib.sha256(x.encode()).hexdigest()
+        )
+
+        df['SeriesInstanceUID'] = df['SeriesInstanceUID'].astype(str).apply(
+            lambda x:
+            hashlib.sha256(x.encode()).hexdigest()
+        )
 
     # Add computed non-DICOM fields and drop a few attributes, if we are producing a final_csv and not an intermediate.
     if final_csv:

diff --git a/modules/suvpar/config.json b/modules/suvpar/config.json
@@ -6,5 +6,6 @@
   "FeaturesetFile": "featureset1.txt",
   "IsStatistics": false,
   "Statistics_File": "statistic.csv",
-  "IsFinalCSV": true
+  "IsFinalCSV": true,
+  "IsAnonymized": true
 }