Support not removing matching bases and sticking with original input (#…

…233) * parent fe506e6 author as1000 <[email protected]> 1616179108 -0400 committer Xiang Li <[email protected]> 1667277999 -0400 parent fe506e6 author as1000 <[email protected]> 1616179108 -0400 committer Xiang Li <[email protected]> 1667277826 -0400 Support immutable variants * Support strip all, none or first matching bases by adding '-s' in the command, for example '-s all' * Add '-a' parameter to allow add original genomic location columns in MAF, name columns with prefix IGNORE_Genome_Nexus_Original_ * Add '-d' to stick with original input (columns with prefix IGNORE_Genome_Nexus_Original_) Co-authored-by: as1000 <[email protected]>
genome-nexus · Dec 12, 2022 · 90a1e9f · 90a1e9f
1 parent 5b4d537
commit 90a1e9f
Show file tree

Hide file tree

Showing 21 changed files with 475 additions and 56 deletions.
diff --git a/.circleci/config.yml b/.circleci/config.yml
@@ -44,11 +44,65 @@ jobs:
 #      - run:
 #          name: "check if maf file still the same when annotating with uniprot transcripts"
 #          command: 'java -jar annotationPipeline/target/annotationPipeline-*.jar --filename test/data/data_mutations_extended_100.in.txt  --output-filename test/data/data_mutations_extended_100.out.uniprot.txt --isoform-override uniprot && git diff -G "^[^#]" --exit-code test/data/data_mutations_extended_100.out.uniprot.txt || (echo MAF uniprot output changed test/data/data_mutations_extended_100.out.uniprot.txt && exit 1)'
+      # run test
+      - run:
+          name: "check if maf file still the same when annotating with uniprot transcripts"
+          command: 'java -jar annotationPipeline/target/annotationPipeline-*.jar --filename test/data/data_mutations_extended_100.in.txt  --output-filename test/data/data_mutations_extended_100.out.uniprot.txt --isoform-override uniprot && git diff -G "^[^#]" --exit-code test/data/data_mutations_extended_100.out.uniprot.txt || (echo MAF uniprot output changed test/data/data_mutations_extended_100.out.uniprot.txt && exit 1)'
+
+      - run:
+          name: "check if maf file still the same when annotating with mskcc transcripts"
+          command: 'java -jar annotationPipeline/target/annotationPipeline-*.jar --filename test/data/data_mutations_extended_100.in.txt  --output-filename test/data/data_mutations_extended_100.out.mskcc.txt --isoform-override mskcc && git diff -G "^[^#]" --exit-code test/data/data_mutations_extended_100.out.mskcc.txt || (echo MAF mskcc output changed test/data/data_mutations_extended_100.out.mskcc.txt && exit 1)'
+
+      # run test
+      - run:
+          name: "check if minimal example maf file still the same when annotating with uniprot transcripts"
+          command: 'java -jar annotationPipeline/target/annotationPipeline-*.jar -r --filename test/data/minimal_example.in.txt  --output-filename test/data/minimal_example.out.uniprot.txt --isoform-override uniprot && git diff -G "^[^#]" --exit-code test/data/minimal_example.out.uniprot.txt || (echo simple MAF uniprot output changed test/data/minimal_example.out.uniprot.txt && exit 1)'
+
+      - run:
+          name: "check if columns prefixed by IGNORE_Genome_Nexus_Original are immutable"
+          command: 'java -jar annotationPipeline/target/annotationPipeline-*.jar -r --filename test/data/immutable_columns_test.in.txt  --output-filename test/data/immutable_columns_test.out.uniprot.txt --isoform-override uniprot && git diff -G "^[^#]" --exit-code test/data/immutable_columns_test.out.uniprot.txt || (echo simple MAF uniprot output changed test/data/immutable_columns_test.out.uniprot.txt && exit 1)'
+
+      - run:
+          name: "check if corner cases example maf file still the same when annotating with uniprot transcripts"
+          command: 'java -jar annotationPipeline/target/annotationPipeline-*.jar -r --filename test/data/corner_cases.two_tumor_seq_allele.in.txt  --output-filename test/data/corner_cases.two_tumor_seq_allele.out.uniprot.txt --isoform-override uniprot && git diff -G "^[^#]" --exit-code test/data/corner_cases.two_tumor_seq_allele.out.uniprot.txt || (echo simple MAF uniprot output changed test/data/corner_cases.two_tumor_seq_allele.out.uniprot.txt && exit 1)'
+
+      - run:
+          name: "check if corner cases example maf file still the same when annotating with mskcc transcripts"
+          command: 'java -jar annotationPipeline/target/annotationPipeline-*.jar -r --filename test/data/corner_cases.two_tumor_seq_allele.in.txt  --output-filename test/data/corner_cases.two_tumor_seq_allele.out.mskcc.txt --isoform-override mskcc && git diff -G "^[^#]" --exit-code test/data/corner_cases.two_tumor_seq_allele.out.mskcc.txt || (echo simple MAF mskcc output changed test/data/corner_cases.two_tumor_seq_allele.out.mskcc.txt && exit 1)'
+
+      - run:
+          name: "check if corner cases example maf file still the same when annotating with uniprot transcripts"
+          command: 'java -jar annotationPipeline/target/annotationPipeline-*.jar -r --filename test/data/corner_cases.in.txt  --output-filename test/data/corner_cases.out.uniprot.txt --isoform-override uniprot && git diff -G "^[^#]" --exit-code test/data/corner_cases.out.uniprot.txt || (echo simple MAF uniprot output changed test/data/corner_cases.out.uniprot.txt && exit 1)'
+
+      - run:
+          name: "check if corner cases example maf file still the same when annotating with mskcc transcripts"
+          command: 'java -jar annotationPipeline/target/annotationPipeline-*.jar -r --filename test/data/corner_cases.in.txt  --output-filename test/data/corner_cases.out.mskcc.txt --isoform-override mskcc && git diff -G "^[^#]" --exit-code test/data/corner_cases.out.mskcc.txt || (echo simple MAF mskcc output changed test/data/corner_cases.out.mskcc.txt && exit 1)'
 
       - run:
           name: "Run vcf2maf test cases"
           command: 'sudo apt-get install make && ./test/scripts/vcf2maf_tests.sh'
 
+      # run test
+      - run:
+          name: "check if maf file still the same when annotating with uniprot transcripts and POSTs"
+          command: 'java -jar annotationPipeline/target/annotationPipeline-*.jar --filename test/data/data_mutations_extended_100.in.txt  --output-filename test/data/data_mutations_extended_100.out.post.uniprot.txt --isoform-override uniprot && git diff -G "^[^#]" --exit-code test/data/data_mutations_extended_100.out.uniprot.txt || (echo MAF uniprot output changed test/data/data_mutations_extended_100.out.uniprot.txt)'
+
+      - run:
+          name: "check if maf file still the same when annotating with mskcc transcripts and POSTs"
+          command: 'java -jar annotationPipeline/target/annotationPipeline-*.jar --filename test/data/data_mutations_extended_100.in.txt  --output-filename test/data/data_mutations_extended_100.out.post.mskcc.txt --isoform-override mskcc && git diff -G "^[^#]" --exit-code test/data/data_mutations_extended_100.out.mskcc.txt || (echo MAF mskcc output changed test/data/data_mutations_extended_100.out.mskcc.txt)'
+
+      - run:
+          name: "check if minimal example maf file still the same when annotating with uniprot transcripts and POSTs"
+          command: 'java -jar annotationPipeline/target/annotationPipeline-*.jar -r --filename test/data/minimal_example.in.txt  --output-filename test/data/minimal_example.out.post.uniprot.txt --isoform-override uniprot && git diff -G "^[^#]" --exit-code test/data/minimal_example.out.uniprot.txt || (echo simple MAF uniprot output changed test/data/minimal_example.out.uniprot.txt)'
+
+      - run:
+          name: "check if my_variant_info provides gnomad annotations"
+          command: 'java -Dgenomenexus.enrichment_fields=annotation_summary,my_variant_info -jar annotationPipeline/target/annotationPipeline-*.jar -r --filename test/data/my_variant_info_corner_cases.in.txt  --output-filename test/data/my_variant_info_corner_cases.out.uniprot.txt --isoform-override uniprot && git diff -G "^[^#]" --exit-code test/data/my_variant_info_corner_cases.out.uniprot.txt || (echo simple MAF uniprot output changed test/data/my_variant_info_corner_cases.out.uniprot.txt)'
+
+      - run:
+          name: "check if nucleotide_context provides Ref_Tri and Var_Tri columns"
+          command: 'java -Dgenomenexus.enrichment_fields=annotation_summary,nucleotide_context -jar annotationPipeline/target/annotationPipeline-*.jar --filename test/data/data_mutations_extended_100.in.txt  --output-filename test/data/data_mutations_extended_100.out.uniprot.nucleotide_context.txt --isoform-override uniprot && git diff -G "^[^#]" --exit-code  test/data/data_mutations_extended_100.out.uniprot.nucleotide_context.txt || (echo simple MAF uniprot output changed  test/data/data_mutations_extended_100.out.uniprot.nucleotide_context.txt)'
+
       - store_artifacts:
           path: test/data
           destination: /test-data-output

diff --git a/README.md b/README.md
@@ -40,6 +40,21 @@ reporting to a file, supply the `-e` option a location for the file to be
 saved. By running the jar without any arguments or by providing the optional
 parameter `-h` you can view the full usage statement. 
 
+### Optional parameters
+| Short | Long | Description | 
+| ------ | ------  | ------ |
+| `-h` | `--help` | shows this help document and quits|
+| `-f` | `--filename` |Mutation filename|
+| `-o` | `--output-filename` | Output filename (including path)|
+| `-t` | `--output-format`  | extended, minimal or a file path which includes output format (FORMAT EXAMPLE: Chromosome,Hugo_Symbol,Entrez_Gene_Id,Center,NCBI_Build)|
+| `-i` | `--isoform-override` | Isoform Overrides. Options: mskcc or uniprot|
+| `-e` | `--error-report-location` | Error report filename (including path)|
+| `-r` | `--replace-symbol-entrez` | Replace gene symbols and entrez id with what is provided by annotator|
+| `-p` | `--post-interval-size` | Number of records to make POST requests to Genome Nexus with at a time |
+| `-s` | `--strip-matching-bases` | Strip matching allele bases. Options: first, all, none. For example: AAC/AAT, strip-off first: AC/AT, strip-off all: C/T, strip-off none: AAC/AAT  |
+| `-a` | `--add-original-genomic-location` | Add the original genomic location data columns to the output, named with the prefix 'IGNORE_Genome_Nexus_Original_'. This is useful when you need to keep a reference to the original input that will not be changed under any condition.|
+| `-d` | `--ignore-original-genomic-location` | By default, genome-nexus-annotation-pipeline reads the original genomic location info columns (columns with prefix 'IGNORE_Genome_Nexus_Original_') as input and, if they do not exist, reads from the normal genomic location info columns. Adding `-d` ignores the original genomic location info columns and uses only what is in the normal genomic location info columns. This is helpful if you'd like to stick with the current genomic location info columns.|
+
 ### Minimal MAF Example
 
 For an example minimal input file see

diff --git a/annotationPipeline/src/main/java/org/cbioportal/annotation/AnnotationPipeline.java b/annotationPipeline/src/main/java/org/cbioportal/annotation/AnnotationPipeline.java
@@ -66,7 +66,7 @@ public class AnnotationPipeline {
     private static final Logger LOG = LoggerFactory.getLogger(AnnotationPipeline.class);
 
     private static void annotateJob(String[] args, String filename, String outputFilename, String outputFormat, String isoformOverride,
-                                    String errorReportLocation, boolean replace, String postIntervalSize) throws Exception {
+                                    String errorReportLocation, boolean replace, String postIntervalSize, String stripMatchingBases, Boolean ignoreOriginalGenomicLocation, Boolean addOriginalGenomicLocation) throws Exception {
         SpringApplication app = new SpringApplication(AnnotationPipeline.class);
         app.setWebApplicationType(WebApplicationType.NONE);
         app.setAllowBeanDefinitionOverriding(Boolean.TRUE);
@@ -82,6 +82,9 @@ private static void annotateJob(String[] args, String filename, String outputFil
             .addString("isoformOverride", isoformOverride)
             .addString("errorReportLocation", errorReportLocation)
             .addString("postIntervalSize", postIntervalSize)
+            .addString("stripMatchingBases", stripMatchingBases)
+            .addString("ignoreOriginalGenomicLocation", String.valueOf(ignoreOriginalGenomicLocation))
+            .addString("addOriginalGenomicLocation", String.valueOf(addOriginalGenomicLocation))
             .toJobParameters();
         JobExecution jobExecution = jobLauncher.run(annotationJob, jobParameters);
         if (!jobExecution.getExitStatus().equals(ExitStatus.COMPLETED)) {
@@ -217,7 +220,7 @@ private static void annotate(Subcommand subcommand, String[] args) throws Annota
         try {
             annotateJob(args, subcommand.getOptionValue("filename"), subcommand.getOptionValue("output-filename"), outputFormat, subcommand.getOptionValue("isoform-override"),
                     subcommand.getOptionValue("error-report-location", ""),
-                    subcommand.hasOption("replace-symbol-entrez"), subcommand.getOptionValue("post-interval-size", "100"));
+                    subcommand.hasOption("replace-symbol-entrez"), subcommand.getOptionValue("post-interval-size", "100"), subcommand.getOptionValue("strip-matching-bases", "all"), subcommand.hasOption("ignore-original-genomic-location"), subcommand.hasOption("add-original-genomic-location"));
             // When you change the default value of post-interval-size, do not forget to update MutationRecordReader.postIntervalSize accordingly
         } catch (Exception e) {
             throw new AnnotationFailedException(e);

diff --git a/annotationPipeline/src/main/java/org/cbioportal/annotation/cli/AnnotateSubcommand.java b/annotationPipeline/src/main/java/org/cbioportal/annotation/cli/AnnotateSubcommand.java
@@ -36,7 +36,11 @@ private static Options getOptions() {
                 .addOption("i", "isoform-override", true, "Isoform Overrides (mskcc or uniprot)")
                 .addOption("e", "error-report-location", true, "Error report filename (including path)")
                 .addOption("r", "replace-symbol-entrez", false, "Replace gene symbols and entrez id with what is provided by annotator")
-                .addOption("p", "post-interval-size", true, "Number of records to make POST requests to Genome Nexus with at a time");
+                .addOption("p", "post-interval-size", true, "Number of records to make POST requests to Genome Nexus with at a time")
+                .addOption("s", "strip-matching-bases", true, "Strip matching allele bases, options are: first,all,none")
+                .addOption("d", "ignore-original-genomic-location", false, "Ignore original genomic location in input file (columns with prefix 'IGNORE_Genome_Nexus_Original_').")
+                .addOption("a", "add-original-genomic-location", false, "Add original genomic location input columns in the output, name columns with prefix 'IGNORE_Genome_Nexus_Original_')");
+
         return gnuOptions;
     }
 

diff --git a/...tationPipeline/src/main/java/org/cbioportal/annotation/pipeline/MutationRecordReader.java b/...tationPipeline/src/main/java/org/cbioportal/annotation/pipeline/MutationRecordReader.java
@@ -74,12 +74,21 @@ public class MutationRecordReader implements ItemStreamReader<AnnotatedRecord> {
     @Value("#{jobParameters[errorReportLocation] ?: ''}")
     private String errorReportLocation;
 
+    @Value("#{jobParameters[stripMatchingBases] ?: 'all'}")
+    private String stripMatchingBases;
+
     @Value("#{jobParameters[postIntervalSize] ?: '100'}")
     private Integer postIntervalSize;
 
     @Value("#{jobParameters[outputFormat]}")
     private String outputFormat;
 
+    @Value("#{jobParameters[ignoreOriginalGenomicLocation] ?: 'false'}")
+    private Boolean ignoreOriginalGenomicLocation;
+
+    @Value("#{jobParameters[addOriginalGenomicLocation] ?: 'false'}")
+    private Boolean addOriginalGenomicLocation;
+
     private AnnotationSummaryStatistics summaryStatistics;
     private List<AnnotatedRecord> allAnnotatedRecords = new ArrayList<>();
     private Set<String> header = new LinkedHashSet<>();
@@ -98,9 +107,9 @@ public void open(ExecutionContext ec) throws ItemStreamException {
         List<MutationRecord> mutationRecords = loadMutationRecordsFromMaf();
         if (!mutationRecords.isEmpty()) {
             if (postIntervalSize > 1) {
-                allAnnotatedRecords = annotator.getAnnotatedRecordsUsingPOST(summaryStatistics, mutationRecords, isoformOverride, replace, postIntervalSize, true);
+                allAnnotatedRecords = annotator.getAnnotatedRecordsUsingPOST(summaryStatistics, mutationRecords, isoformOverride, replace, postIntervalSize, true, stripMatchingBases, ignoreOriginalGenomicLocation, addOriginalGenomicLocation);
             } else {
-                allAnnotatedRecords = annotator.annotateRecordsUsingGET(summaryStatistics, mutationRecords, isoformOverride, replace, true);
+                allAnnotatedRecords = annotator.annotateRecordsUsingGET(summaryStatistics, mutationRecords, isoformOverride, replace, true, stripMatchingBases, ignoreOriginalGenomicLocation, addOriginalGenomicLocation);
             }
             // if output-format option is supplied, we only need to convert its data into header
             if (outputFormat != null) {