Skip to content

Commit

Permalink
Support not removing matching bases and sticking with original input (#…
Browse files Browse the repository at this point in the history
…233)

* parent fe506e6
author as1000 <[email protected]> 1616179108 -0400
committer Xiang Li <[email protected]> 1667277999 -0400

parent fe506e6
author as1000 <[email protected]> 1616179108 -0400
committer Xiang Li <[email protected]> 1667277826 -0400

Support immutable variants

* Support strip all, none or first matching bases by adding '-s' in the command, for example '-s all'

* Add '-a' parameter to allow add original genomic location columns in MAF, name columns with prefix IGNORE_Genome_Nexus_Original_

* Add '-d' to stick with original input (columns with prefix IGNORE_Genome_Nexus_Original_)

Co-authored-by: as1000 <[email protected]>
  • Loading branch information
leexgh and as1000 authored Dec 12, 2022
1 parent 5b4d537 commit 90a1e9f
Show file tree
Hide file tree
Showing 21 changed files with 475 additions and 56 deletions.
54 changes: 54 additions & 0 deletions .circleci/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -44,11 +44,65 @@ jobs:
# - run:
# name: "check if maf file still the same when annotating with uniprot transcripts"
# command: 'java -jar annotationPipeline/target/annotationPipeline-*.jar --filename test/data/data_mutations_extended_100.in.txt --output-filename test/data/data_mutations_extended_100.out.uniprot.txt --isoform-override uniprot && git diff -G "^[^#]" --exit-code test/data/data_mutations_extended_100.out.uniprot.txt || (echo MAF uniprot output changed test/data/data_mutations_extended_100.out.uniprot.txt && exit 1)'
# run test
- run:
name: "check if maf file still the same when annotating with uniprot transcripts"
command: 'java -jar annotationPipeline/target/annotationPipeline-*.jar --filename test/data/data_mutations_extended_100.in.txt --output-filename test/data/data_mutations_extended_100.out.uniprot.txt --isoform-override uniprot && git diff -G "^[^#]" --exit-code test/data/data_mutations_extended_100.out.uniprot.txt || (echo MAF uniprot output changed test/data/data_mutations_extended_100.out.uniprot.txt && exit 1)'

- run:
name: "check if maf file still the same when annotating with mskcc transcripts"
command: 'java -jar annotationPipeline/target/annotationPipeline-*.jar --filename test/data/data_mutations_extended_100.in.txt --output-filename test/data/data_mutations_extended_100.out.mskcc.txt --isoform-override mskcc && git diff -G "^[^#]" --exit-code test/data/data_mutations_extended_100.out.mskcc.txt || (echo MAF mskcc output changed test/data/data_mutations_extended_100.out.mskcc.txt && exit 1)'

# run test
- run:
name: "check if minimal example maf file still the same when annotating with uniprot transcripts"
command: 'java -jar annotationPipeline/target/annotationPipeline-*.jar -r --filename test/data/minimal_example.in.txt --output-filename test/data/minimal_example.out.uniprot.txt --isoform-override uniprot && git diff -G "^[^#]" --exit-code test/data/minimal_example.out.uniprot.txt || (echo simple MAF uniprot output changed test/data/minimal_example.out.uniprot.txt && exit 1)'

- run:
name: "check if columns prefixed by IGNORE_Genome_Nexus_Original are immutable"
command: 'java -jar annotationPipeline/target/annotationPipeline-*.jar -r --filename test/data/immutable_columns_test.in.txt --output-filename test/data/immutable_columns_test.out.uniprot.txt --isoform-override uniprot && git diff -G "^[^#]" --exit-code test/data/immutable_columns_test.out.uniprot.txt || (echo simple MAF uniprot output changed test/data/immutable_columns_test.out.uniprot.txt && exit 1)'

- run:
name: "check if corner cases example maf file still the same when annotating with uniprot transcripts"
command: 'java -jar annotationPipeline/target/annotationPipeline-*.jar -r --filename test/data/corner_cases.two_tumor_seq_allele.in.txt --output-filename test/data/corner_cases.two_tumor_seq_allele.out.uniprot.txt --isoform-override uniprot && git diff -G "^[^#]" --exit-code test/data/corner_cases.two_tumor_seq_allele.out.uniprot.txt || (echo simple MAF uniprot output changed test/data/corner_cases.two_tumor_seq_allele.out.uniprot.txt && exit 1)'

- run:
name: "check if corner cases example maf file still the same when annotating with mskcc transcripts"
command: 'java -jar annotationPipeline/target/annotationPipeline-*.jar -r --filename test/data/corner_cases.two_tumor_seq_allele.in.txt --output-filename test/data/corner_cases.two_tumor_seq_allele.out.mskcc.txt --isoform-override mskcc && git diff -G "^[^#]" --exit-code test/data/corner_cases.two_tumor_seq_allele.out.mskcc.txt || (echo simple MAF mskcc output changed test/data/corner_cases.two_tumor_seq_allele.out.mskcc.txt && exit 1)'

- run:
name: "check if corner cases example maf file still the same when annotating with uniprot transcripts"
command: 'java -jar annotationPipeline/target/annotationPipeline-*.jar -r --filename test/data/corner_cases.in.txt --output-filename test/data/corner_cases.out.uniprot.txt --isoform-override uniprot && git diff -G "^[^#]" --exit-code test/data/corner_cases.out.uniprot.txt || (echo simple MAF uniprot output changed test/data/corner_cases.out.uniprot.txt && exit 1)'

- run:
name: "check if corner cases example maf file still the same when annotating with mskcc transcripts"
command: 'java -jar annotationPipeline/target/annotationPipeline-*.jar -r --filename test/data/corner_cases.in.txt --output-filename test/data/corner_cases.out.mskcc.txt --isoform-override mskcc && git diff -G "^[^#]" --exit-code test/data/corner_cases.out.mskcc.txt || (echo simple MAF mskcc output changed test/data/corner_cases.out.mskcc.txt && exit 1)'

- run:
name: "Run vcf2maf test cases"
command: 'sudo apt-get install make && ./test/scripts/vcf2maf_tests.sh'

# run test
- run:
name: "check if maf file still the same when annotating with uniprot transcripts and POSTs"
command: 'java -jar annotationPipeline/target/annotationPipeline-*.jar --filename test/data/data_mutations_extended_100.in.txt --output-filename test/data/data_mutations_extended_100.out.post.uniprot.txt --isoform-override uniprot && git diff -G "^[^#]" --exit-code test/data/data_mutations_extended_100.out.uniprot.txt || (echo MAF uniprot output changed test/data/data_mutations_extended_100.out.uniprot.txt)'

- run:
name: "check if maf file still the same when annotating with mskcc transcripts and POSTs"
command: 'java -jar annotationPipeline/target/annotationPipeline-*.jar --filename test/data/data_mutations_extended_100.in.txt --output-filename test/data/data_mutations_extended_100.out.post.mskcc.txt --isoform-override mskcc && git diff -G "^[^#]" --exit-code test/data/data_mutations_extended_100.out.mskcc.txt || (echo MAF mskcc output changed test/data/data_mutations_extended_100.out.mskcc.txt)'

- run:
name: "check if minimal example maf file still the same when annotating with uniprot transcripts and POSTs"
command: 'java -jar annotationPipeline/target/annotationPipeline-*.jar -r --filename test/data/minimal_example.in.txt --output-filename test/data/minimal_example.out.post.uniprot.txt --isoform-override uniprot && git diff -G "^[^#]" --exit-code test/data/minimal_example.out.uniprot.txt || (echo simple MAF uniprot output changed test/data/minimal_example.out.uniprot.txt)'

- run:
name: "check if my_variant_info provides gnomad annotations"
command: 'java -Dgenomenexus.enrichment_fields=annotation_summary,my_variant_info -jar annotationPipeline/target/annotationPipeline-*.jar -r --filename test/data/my_variant_info_corner_cases.in.txt --output-filename test/data/my_variant_info_corner_cases.out.uniprot.txt --isoform-override uniprot && git diff -G "^[^#]" --exit-code test/data/my_variant_info_corner_cases.out.uniprot.txt || (echo simple MAF uniprot output changed test/data/my_variant_info_corner_cases.out.uniprot.txt)'

- run:
name: "check if nucleotide_context provides Ref_Tri and Var_Tri columns"
command: 'java -Dgenomenexus.enrichment_fields=annotation_summary,nucleotide_context -jar annotationPipeline/target/annotationPipeline-*.jar --filename test/data/data_mutations_extended_100.in.txt --output-filename test/data/data_mutations_extended_100.out.uniprot.nucleotide_context.txt --isoform-override uniprot && git diff -G "^[^#]" --exit-code test/data/data_mutations_extended_100.out.uniprot.nucleotide_context.txt || (echo simple MAF uniprot output changed test/data/data_mutations_extended_100.out.uniprot.nucleotide_context.txt)'

- store_artifacts:
path: test/data
destination: /test-data-output
Expand Down
15 changes: 15 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,21 @@ reporting to a file, supply the `-e` option a location for the file to be
saved. By running the jar without any arguments or by providing the optional
parameter `-h` you can view the full usage statement.

### Optional parameters
| Short | Long | Description |
| ------ | ------ | ------ |
| `-h` | `--help` | shows this help document and quits|
| `-f` | `--filename` |Mutation filename|
| `-o` | `--output-filename` | Output filename (including path)|
| `-t` | `--output-format` | extended, minimal or a file path which includes output format (FORMAT EXAMPLE: Chromosome,Hugo_Symbol,Entrez_Gene_Id,Center,NCBI_Build)|
| `-i` | `--isoform-override` | Isoform Overrides. Options: mskcc or uniprot|
| `-e` | `--error-report-location` | Error report filename (including path)|
| `-r` | `--replace-symbol-entrez` | Replace gene symbols and entrez id with what is provided by annotator |
| `-p` | `--post-interval-size` | Number of records to make POST requests to Genome Nexus with at a time |
| `-s` | `--strip-matching-bases` | Strip matching allele bases. Options: first, all, none. For example: AAC/AAT, strip-off first: AC/AT, strip-off all: C/T, strip-off none: AAC/AAT |
| `-a` | `--add-original-genomic-location` | Add the original genomic location columns to the output, naming them with the prefix 'IGNORE_Genome_Nexus_Original_'. This is useful when a reference copy of the original input needs to be saved and must not be changed under any condition. |
| `-d` | `--ignore-original-genomic-location` | By default, genome-nexus-annotation-pipeline reads the original genomic location columns (columns with prefix 'IGNORE_Genome_Nexus_Original_') as input and, if they do not exist, reads from the normal genomic location columns. Adding `-d` ignores the original genomic location columns and uses only what is in the normal genomic location columns. This is helpful if you'd like to stick with the current genomic location columns. |

### Minimal MAF Example

For an example minimal input file see
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ public class AnnotationPipeline {
private static final Logger LOG = LoggerFactory.getLogger(AnnotationPipeline.class);

private static void annotateJob(String[] args, String filename, String outputFilename, String outputFormat, String isoformOverride,
String errorReportLocation, boolean replace, String postIntervalSize) throws Exception {
String errorReportLocation, boolean replace, String postIntervalSize, String stripMatchingBases, Boolean ignoreOriginalGenomicLocation, Boolean addOriginalGenomicLocation) throws Exception {
SpringApplication app = new SpringApplication(AnnotationPipeline.class);
app.setWebApplicationType(WebApplicationType.NONE);
app.setAllowBeanDefinitionOverriding(Boolean.TRUE);
Expand All @@ -82,6 +82,9 @@ private static void annotateJob(String[] args, String filename, String outputFil
.addString("isoformOverride", isoformOverride)
.addString("errorReportLocation", errorReportLocation)
.addString("postIntervalSize", postIntervalSize)
.addString("stripMatchingBases", stripMatchingBases)
.addString("ignoreOriginalGenomicLocation", String.valueOf(ignoreOriginalGenomicLocation))
.addString("addOriginalGenomicLocation", String.valueOf(addOriginalGenomicLocation))
.toJobParameters();
JobExecution jobExecution = jobLauncher.run(annotationJob, jobParameters);
if (!jobExecution.getExitStatus().equals(ExitStatus.COMPLETED)) {
Expand Down Expand Up @@ -217,7 +220,7 @@ private static void annotate(Subcommand subcommand, String[] args) throws Annota
try {
annotateJob(args, subcommand.getOptionValue("filename"), subcommand.getOptionValue("output-filename"), outputFormat, subcommand.getOptionValue("isoform-override"),
subcommand.getOptionValue("error-report-location", ""),
subcommand.hasOption("replace-symbol-entrez"), subcommand.getOptionValue("post-interval-size", "100"));
subcommand.hasOption("replace-symbol-entrez"), subcommand.getOptionValue("post-interval-size", "100"), subcommand.getOptionValue("strip-matching-bases", "all"), subcommand.hasOption("ignore-original-genomic-location"), subcommand.hasOption("add-original-genomic-location"));
// When you change the default value of post-interval-size, do not forget to update MutationRecordReader.postIntervalSize accordingly
} catch (Exception e) {
throw new AnnotationFailedException(e);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,11 @@ private static Options getOptions() {
.addOption("i", "isoform-override", true, "Isoform Overrides (mskcc or uniprot)")
.addOption("e", "error-report-location", true, "Error report filename (including path)")
.addOption("r", "replace-symbol-entrez", false, "Replace gene symbols and entrez id with what is provided by annotator")
.addOption("p", "post-interval-size", true, "Number of records to make POST requests to Genome Nexus with at a time");
.addOption("p", "post-interval-size", true, "Number of records to make POST requests to Genome Nexus with at a time")
.addOption("s", "strip-matching-bases", true, "Strip matching allele bases, options are: first,all,none")
.addOption("d", "ignore-original-genomic-location", false, "Ignore original genomic location in input file (columns with prefix 'IGNORE_Genome_Nexus_Original_').")
.addOption("a", "add-original-genomic-location", false, "Add original genomic location input columns in the output, name columns with prefix 'IGNORE_Genome_Nexus_Original_')");

return gnuOptions;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -74,12 +74,21 @@ public class MutationRecordReader implements ItemStreamReader<AnnotatedRecord> {
@Value("#{jobParameters[errorReportLocation] ?: ''}")
private String errorReportLocation;

@Value("#{jobParameters[stripMatchingBases] ?: 'all'}")
private String stripMatchingBases;

@Value("#{jobParameters[postIntervalSize] ?: '100'}")
private Integer postIntervalSize;

@Value("#{jobParameters[outputFormat]}")
private String outputFormat;

@Value("#{jobParameters[ignoreOriginalGenomicLocation] ?: 'false'}")
private Boolean ignoreOriginalGenomicLocation;

@Value("#{jobParameters[addOriginalGenomicLocation] ?: 'false'}")
private Boolean addOriginalGenomicLocation;

private AnnotationSummaryStatistics summaryStatistics;
private List<AnnotatedRecord> allAnnotatedRecords = new ArrayList<>();
private Set<String> header = new LinkedHashSet<>();
Expand All @@ -98,9 +107,9 @@ public void open(ExecutionContext ec) throws ItemStreamException {
List<MutationRecord> mutationRecords = loadMutationRecordsFromMaf();
if (!mutationRecords.isEmpty()) {
if (postIntervalSize > 1) {
allAnnotatedRecords = annotator.getAnnotatedRecordsUsingPOST(summaryStatistics, mutationRecords, isoformOverride, replace, postIntervalSize, true);
allAnnotatedRecords = annotator.getAnnotatedRecordsUsingPOST(summaryStatistics, mutationRecords, isoformOverride, replace, postIntervalSize, true, stripMatchingBases, ignoreOriginalGenomicLocation, addOriginalGenomicLocation);
} else {
allAnnotatedRecords = annotator.annotateRecordsUsingGET(summaryStatistics, mutationRecords, isoformOverride, replace, true);
allAnnotatedRecords = annotator.annotateRecordsUsingGET(summaryStatistics, mutationRecords, isoformOverride, replace, true, stripMatchingBases, ignoreOriginalGenomicLocation, addOriginalGenomicLocation);
}
// if output-format option is supplied, we only need to convert its data into header
if (outputFormat != null) {
Expand Down
Loading

0 comments on commit 90a1e9f

Please sign in to comment.