fixing output format

monarch-initiative · Aug 23, 2023 · 89a7099 · 89a7099
1 parent f9c44ba
commit 89a7099
Show file tree

Hide file tree

Showing 3 changed files with 59 additions and 18 deletions.
diff --git a/src/main/java/org/monarchinitiative/phenopacket2prompt/model/AdditionalConceptType.java b/src/main/java/org/monarchinitiative/phenopacket2prompt/model/AdditionalConceptType.java
@@ -19,6 +19,7 @@ public enum AdditionalConceptType {
     DIAGNOSTICS,
     TREATMENT,
     PMH,
+    FAMILY_HISTORY,
     VERBATIM;
 
 
@@ -30,6 +31,7 @@ public static AdditionalConceptType of(String s) {
             case "DIAGNOSTICS" ->  DIAGNOSTICS;
             case "TREATMENT" -> TREATMENT;
             case "PMH" -> PMH;
+            case "FAMILY_HISTORY" -> FAMILY_HISTORY;
             case "VERBATIM" -> VERBATIM;
             default -> throw new PhenolRuntimeException("Unrecognised concept \"" + concept + "\"");
         };

diff --git a/...va/org/monarchinitiative/phenopacket2prompt/querygen/qfactory/AbstractQueryGenerator.java b/...va/org/monarchinitiative/phenopacket2prompt/querygen/qfactory/AbstractQueryGenerator.java
@@ -89,6 +89,16 @@ protected String getPersonIntroduction() {
         return  String.format("%s presented with the following signs and symptoms:\n", person_string);
     }
 
+    /**
+     * Removes every sentence that mentions "family history" (case-insensitive) from a
+     * vignette segment and returns the remaining text otherwise unchanged.
+     * <p>
+     * A sentence boundary is a period that is followed by whitespace or by the end of the
+     * input. Periods inside tokens such as {@code 38.5} therefore do not split a sentence,
+     * and the sentences that are kept retain their original spacing and punctuation
+     * (including the final period). When the first sentence is removed the result may start
+     * with the whitespace that separated it from the next sentence.
+     *
+     * @param originalSeg segment of the vignette text to filter; must not be {@code null}
+     * @return the segment without the sentences that contain "family history"; the input
+     *         text itself when no sentence matches
+     */
+    private String stripFamilyHistory(String originalSeg) {
+        StringBuilder kept = new StringBuilder(originalSeg.length());
+        // Zero-width split: the period stays attached to its sentence and the separating
+        // whitespace stays at the start of the following one, so re-concatenating the
+        // pieces reproduces the input exactly when nothing is filtered out.
+        for (String sentence : originalSeg.split("(?<=\\.)(?=\\s|$)")) {
+            // Locale.ROOT keeps the case folding independent of the JVM default locale.
+            if (sentence.toLowerCase(java.util.Locale.ROOT).contains("family history")) {
+                continue;
+            }
+            kept.append(sentence);
+        }
+        return kept.toString();
+    }
+
     protected Map<String, String> timeSegments(String vignette, List<TimePoint> timePointList) {
         Map<String, String> timeSegments = new LinkedHashMap<>(); // ordered map
         String nextStart = "";
@@ -97,6 +107,7 @@ protected Map<String, String> timeSegments(String vignette, List<TimePoint> time
             int s = timePoint.start();
             int e = timePoint.end();
             String seg = nextStart + vignette.substring(lastEnd, s);
+            seg = stripFamilyHistory(seg);
             lastEnd = e + 1;
             timeSegments.put(nextStart, seg.strip());
             nextStart = timePoint.point();
@@ -113,6 +124,11 @@ protected Map<String, String> timeSegments(String vignette, List<TimePoint> time
      * @return A string formatted as X, Y, and Z.
      */
     protected String getOxfordCommaList(Set<String> items) {
+        if (items.size() == 2) {
+            // no comma if we just have two items.
+            // one item will work with the below code
+            return String.join(" and ", items) + ".";
+        }
         StringBuilder sb = new StringBuilder();
         String symList = String.join(", ", items);
         int jj = symList.lastIndexOf(", ");

diff --git a/...a/org/monarchinitiative/phenopacket2prompt/querygen/qfactory/TextPlusManualGenerator.java b/...a/org/monarchinitiative/phenopacket2prompt/querygen/qfactory/TextPlusManualGenerator.java
@@ -1,9 +1,11 @@
 package org.monarchinitiative.phenopacket2prompt.querygen.qfactory;
 
+import org.checkerframework.checker.units.qual.A;
 import org.monarchinitiative.fenominal.core.TermMiner;
 import org.monarchinitiative.phenol.base.PhenolRuntimeException;
 import org.monarchinitiative.phenol.ontology.data.Ontology;
 import org.monarchinitiative.phenopacket2prompt.model.AdditionalConceptI;
+import org.monarchinitiative.phenopacket2prompt.model.AdditionalConceptType;
 import org.monarchinitiative.phenopacket2prompt.nejm.NejmCaseReportFromPdfFilterer;
 import org.monarchinitiative.phenopacket2prompt.querygen.TimePoint;
 import org.monarchinitiative.phenopacket2prompt.querygen.TimePointParser;
@@ -12,10 +14,7 @@
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import java.util.HashSet;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
+import java.util.*;
 import java.util.function.Predicate;
 import java.util.stream.Collectors;
 
@@ -24,10 +23,20 @@ public class TextPlusManualGenerator extends AbstractQueryGenerator {
     private final String promptText;
 
     private final Set<AdditionalConceptI> additionalConcepts;
+    private final Set<String> pmh;
+    private final Set<String> familyHistory;
+
+    private final List<String> outputLines;
 
 
     public  TextPlusManualGenerator(NejmCaseReportFromPdfFilterer filterer, String id, TermMiner miner, Ontology hpo) {
         super(filterer, id, miner, hpo);
+        this.outputLines = new ArrayList<>();
+        this.pmh = new HashSet<>();
+        familyHistory = filterer.getAdditionalConcepts().stream()
+                        .filter(a -> a.conceptType() == AdditionalConceptType.FAMILY_HISTORY)
+                        .map(AdditionalConceptI::insertText)
+                        .collect(Collectors.toSet());
         this.additionalConcepts = filterer.getAdditionalConcepts();
         String phenotext = getPhenopacketTextWithAdditions();
         promptText = String.format("%s%s", QUERY_HEADER, phenotext);
@@ -47,8 +56,7 @@ protected String getPhenopacketTextWithAdditions() {
         vignette = vignette.substring(ii + 1);
         List<TimePoint> timePointList = timePointParser.getTimePoints(vignette);
 
-        StringBuilder sb = new StringBuilder();
-        sb.append(firstSentence).append("\n");
+
 
         try {
             //Map<String, String> timeSegments = timeSegments(starts, ends, vignette, start2pointMap);
@@ -62,13 +70,29 @@ protected String getPhenopacketTextWithAdditions() {
                 if (description.length() > MIN_DESCRIPTION_LENGTH) {
                     String output = getPhenopacketBasedQuerySegmentWithAdditions(timePoint, description);
                     if (output.isEmpty()) continue;
-                    sb.append(output).append("\n");
+                    //sb.append(output).append("\n");
+                    outputLines.add(output);
                 }
             }
         } catch (Exception eee) {
             System.out.printf("[ERROR(TextPlusManualGenerator.java] Could not parse time segments for because of %s",  eee.getMessage());
             System.exit(1);
         }
+        StringBuilder sb = new StringBuilder();
+        sb.append(firstSentence).append("\n");
+        if (pmh.size() > 0) {
+            sb.append("The past medical history was notable for ")
+                    .append(getOxfordCommaList(pmh))
+                    .append("\n");
+        }
+        if (familyHistory.size() > 0) {
+            for (String item: familyHistory) {
+                sb.append(item).append("\n");
+            }
+        }
+        for (var line : outputLines) {
+            sb.append(line);
+        }
         return sb.toString();
     }
 
@@ -82,7 +106,6 @@ protected String getPhenopacketBasedQuerySegmentWithAdditions(String presentatio
         Set<String> treatment = new HashSet<>();
         Set<String> verbatim = new HashSet<>();
         /* Past medical history */
-        Set<String> pmh = new HashSet<>();
 
         Set<String> observed_terms = pfeatures.stream()
                 .filter(Predicate.not(PhenotypicFeature::getExcluded))
@@ -95,25 +118,24 @@ protected String getPhenopacketBasedQuerySegmentWithAdditions(String presentatio
                 .map(OntologyClass::getLabel)
                 .collect(Collectors.toSet());
         for (var addcon : this.additionalConcepts ) {
-            LOGGER.error("TOP {}", addcon);
             if (input.contains(addcon.originalText())) {
-                LOGGER.error("FOUND INPUT {}", addcon);
                 switch (addcon.conceptType()) {
                     case PHENOTYPE -> observed_terms.add(addcon.insertText());
                     case EXCLUDE -> excluded_terms.add(addcon.insertText());
                     case DIAGNOSTICS -> diagnostics.add(addcon.insertText());
                     case TREATMENT -> treatment.add(addcon.insertText());
                     case VERBATIM -> verbatim.add(addcon.insertText());
-                    case PMH -> pmh.add(addcon.insertText());
+                    case PMH -> {
+                        // do not repeat the PMH even if the original text mentions it more than once
+                        if (!pmh.contains(addcon.originalText())) {
+                            pmh.add(addcon.insertText());
+                        }
+                    }
+                    case FAMILY_HISTORY -> familyHistory.add(addcon.insertText());
                 }
             }
         }
         StringBuilder sb = new StringBuilder();
-        if (! pmh.isEmpty()) {
-            sb.append("The past medical history was notable for ");
-            sb.append(getOxfordCommaList(pmh));
-            sb.append(".\n");
-        }
         String capitalizedTimepoint;
         if (presentationTimeDescription.equalsIgnoreCase("Examination was notable for")) {
             presentationTimeDescription = "On examination";
@@ -146,15 +168,16 @@ protected String getPhenopacketBasedQuerySegmentWithAdditions(String presentatio
             } else {
                 sb.append("The following signs and symptoms were excluded: ");
             }
-            sb.append(excludededSymptoms).append(" ");
+            sb.append(excludededSymptoms).append("\n");
         }
         if (! diagnostics.isEmpty()) {
             sb.append("The following diagnostic observations were made: \n");
             sb.append(getOxfordCommaList(diagnostics));
         }
         if (! treatment.isEmpty()) {
-            sb.append("The following treatments were administered: \n");
+            sb.append("The following treatments were administered: ");
             sb.append(getOxfordCommaList(treatment));
+            sb.append("\n");
         }
         if (! verbatim.isEmpty()) {
             for (var v : verbatim)