Skip to content

Commit

Permalink
Implement setup ref-genome command.
Browse files Browse the repository at this point in the history
Signed-off-by: Daniel Danis <[email protected]>
  • Loading branch information
ielis committed May 23, 2022
1 parent 4609ac6 commit 4396765
Show file tree
Hide file tree
Showing 17 changed files with 233 additions and 73 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,8 @@
import org.monarchinitiative.squirls.cli.cmd.annotate_pos.AnnotatePosCommand;
import org.monarchinitiative.squirls.cli.cmd.annotate_vcf.AnnotateVcfCommand;
import org.monarchinitiative.squirls.cli.cmd.precalculate.PrecalculateCommand;
import org.monarchinitiative.squirls.cli.cmd.setup.PreprocessReferenceGenomeCommand;
import org.monarchinitiative.squirls.cli.cmd.setup.SetupCommand;
import picocli.CommandLine;
import picocli.CommandLine.Help.ColorScheme.Builder;

Expand Down Expand Up @@ -117,6 +119,8 @@ public static void main(String[] args) {
Locale.setDefault(Locale.US);
CommandLine cline = new CommandLine(new Main())
.setColorScheme(COLOR_SCHEME)
.addSubcommand("setup", new CommandLine(new SetupCommand())
.addSubcommand("ref-genome", new PreprocessReferenceGenomeCommand()))
.addSubcommand("annotate-pos", new AnnotatePosCommand())
.addSubcommand("annotate-csv", new AnnotateCsvCommand())
.addSubcommand("annotate-vcf", new AnnotateVcfCommand())
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@
* Daniel Danis, Peter N Robinson, 2020
*/

package org.monarchinitiative.squirls.ingest.data;
package org.monarchinitiative.squirls.cli.cmd.setup;

import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.SAMSequenceDictionaryCodec;
Expand All @@ -85,6 +85,7 @@
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
import org.apache.commons.io.IOUtils;
import org.monarchinitiative.squirls.io.download.UrlResourceDownloader;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
package org.monarchinitiative.squirls.cli.cmd.setup;

import org.monarchinitiative.squirls.cli.Main;
import org.monarchinitiative.squirls.io.download.UrlResourceDownloader;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import picocli.CommandLine;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.IOException;
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.concurrent.Callable;
import java.util.function.UnaryOperator;

/**
 * Picocli sub-command {@code ref-genome}: downloads the reference genome and the assembly report
 * into the Squirls data directory.
 * <p>
 * The command writes {@code genome.fa} (via {@link GenomeAssemblyDownloader}) and
 * {@code assembly_report.txt} into the data directory. While copying the assembly report, the hg19
 * mitochondrial contig line is rewritten so that its length reads 16571 instead of 16569.
 * <p>
 * The option fields are public so that the command can also be configured programmatically
 * (e.g. from tests) without going through the command line parser.
 */
@CommandLine.Command(name = "ref-genome",
        header = "Download and preprocess reference genome",
        mixinStandardHelpOptions = true,
        version = Main.VERSION,
        usageHelpWidth = Main.WIDTH,
        footer = Main.FOOTER)
public class PreprocessReferenceGenomeCommand implements Callable<Integer> {

    private static final Logger LOGGER = LoggerFactory.getLogger(PreprocessReferenceGenomeCommand.class);

    /**
     * Mitochondrial contig line as it appears in the downloaded assembly report (length 16569).
     */
    private static final String REPORTED_MT_LINE =
            "MT\tassembled-molecule\tMT\tMitochondrion\tJ01415.2\t=\tNC_012920.1\tnon-nuclear\t16569\tchrM";

    /**
     * Replacement for {@link #REPORTED_MT_LINE} with the contig length set to 16571.
     * NOTE(review): presumably this matches the length of the chrM sequence in the hg19 FASTA - confirm.
     */
    private static final String FIXED_MT_LINE =
            "MT\tassembled-molecule\tMT\tMitochondrion\tJ01415.2\t=\tNC_012920.1\tnon-nuclear\t16571\tchrM";

    @CommandLine.Option(names = {"-d", "--data-directory"},
            paramLabel = "path/to/datadir",
            required = true,
            description = "Path to Squirls data directory")
    public Path dataDirectory;

    @CommandLine.Option(names = {"-g", "--genome-assembly"},
            required = true,
            description = "Genome assembly URL")
    public URL genomeUrl;

    @CommandLine.Option(names = {"-a", "--assembly-report"},
            required = true,
            description = "Assembly report URL")
    public URL assemblyReportUrl;

    @CommandLine.Option(names = {"--overwrite"},
            description = "Overwrite the genome files (default: ${DEFAULT-VALUE})")
    public boolean overwrite = false;

    /**
     * Download the reference genome and then the assembly report into {@link #dataDirectory}.
     *
     * @return {@code 0} on success, {@code 1} if the data directory is missing or any step throws
     */
    @Override
    public Integer call() {
        // The null check covers programmatic use where the parser did not enforce `required = true`.
        if (dataDirectory == null || !Files.isDirectory(dataDirectory)) {
            LOGGER.error("`-d | --data-directory` option must point to an existing directory");
            return 1;
        }

        try {
            // First, reference genome
            LOGGER.info("Downloading reference genome ZIP");
            downloadReferenceGenome(dataDirectory, genomeUrl, overwrite);

            // Then, assembly report.
            downloadAssemblyReport(dataDirectory, assemblyReportUrl, overwrite);
            return 0;
        } catch (Exception e) {
            // Command boundary: report any failure through the log and the exit code.
            LOGGER.error(e.getMessage(), e);
            return 1;
        }
    }

    /**
     * Download, decompress, and concatenate contigs into a single FASTA file. Then, index the FASTA file.
     *
     * @param buildDir  path to directory where Squirls data files will be created
     * @param genomeUrl url pointing to reference genome FASTA file to be downloaded
     * @param overwrite overwrite existing FASTA file if true
     */
    private static void downloadReferenceGenome(Path buildDir, URL genomeUrl, boolean overwrite) {
        Path genomeFastaPath = buildDir.resolve("genome.fa");
        GenomeAssemblyDownloader downloader = new GenomeAssemblyDownloader(genomeUrl, genomeFastaPath, overwrite);
        // The task is invoked directly, i.e. it runs on the calling thread and we wait for it to finish.
        downloader.run();
    }

    /**
     * Download the assembly report into a temporary file, copy it to {@code assembly_report.txt} while
     * fixing the mitochondrial contig line, and remove the temporary file.
     * <p>
     * Note that {@code overwrite} is only passed to the download of the temporary file; the final
     * {@code assembly_report.txt} is always (re)written.
     *
     * @param dataDirectory     path to directory where the assembly report will be stored
     * @param assemblyReportUrl url pointing to the assembly report to be downloaded
     * @param overwrite         passed to the downloader of the temporary file
     * @throws IOException if reading the temporary file, writing the destination, or deleting
     *                     the temporary file fails
     */
    private static void downloadAssemblyReport(Path dataDirectory, URL assemblyReportUrl, boolean overwrite) throws IOException {
        Path temporary = dataDirectory.resolve("assembly_report.tmp.txt");
        UrlResourceDownloader downloader = new UrlResourceDownloader(assemblyReportUrl, temporary, overwrite);
        // Invoked directly, hence the download is done when `run` returns.
        downloader.run();

        Path destination = dataDirectory.resolve("assembly_report.txt");
        try {
            fixHg19MitochondrialLine(temporary, destination);
        } finally {
            // Do not leave the temporary file behind, even if the rewrite fails.
            Files.deleteIfExists(temporary);
        }
    }

    /**
     * Copy {@code source} to {@code destination} line by line, replacing the hg19 mitochondrial
     * contig line on the way (see {@link #fixIfNecessary()}).
     *
     * @param source      path to the downloaded assembly report
     * @param destination path where the fixed assembly report will be written
     * @throws IOException if reading or writing fails
     */
    private static void fixHg19MitochondrialLine(Path source, Path destination) throws IOException {
        UnaryOperator<String> fixer = fixIfNecessary();
        try (BufferedReader reader = Files.newBufferedReader(source);
             BufferedWriter writer = Files.newBufferedWriter(destination)) {
            // A plain loop lets the IOException propagate as is, with no need for wrapping in a lambda.
            String line;
            while ((line = reader.readLine()) != null) {
                writer.write(fixer.apply(line));
                writer.newLine();
            }
        }
    }

    /**
     * @return function that maps the reported mitochondrial contig line to the fixed line
     * and leaves any other line unchanged
     */
    private static UnaryOperator<String> fixIfNecessary() {
        return line -> {
            if (REPORTED_MT_LINE.equals(line)) {
                LOGGER.info("Fixing MT contig length (16569 -> 16571)");
                return FIXED_MT_LINE;
            }
            return line;
        };
    }

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
package org.monarchinitiative.squirls.cli.cmd.setup;

import org.monarchinitiative.squirls.cli.Main;
import picocli.CommandLine;

import java.util.concurrent.Callable;

/**
 * Parent command that groups the sub-commands for setting up Squirls resources.
 * <p>
 * The command has no options of its own besides the standard help options, and its
 * {@link #call()} does no work.
 */
@CommandLine.Command(name = "setup",
        aliases = {"S"},
        header = "Setup Squirls resources",
        mixinStandardHelpOptions = true,
        version = Main.VERSION,
        usageHelpWidth = Main.WIDTH
)
public class SetupCommand implements Callable<Integer> {

    /**
     * Exit code reported by this command.
     */
    private static final int EXIT_OK = 0;

    /**
     * No-op, the actual work is delegated to the sub-commands.
     *
     * @return always {@code 0}
     */
    @Override
    public Integer call() {
        return EXIT_OK;
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -96,12 +96,14 @@
import org.springframework.context.annotation.Configuration;

import java.io.IOException;
import java.nio.file.Paths;
import java.nio.file.Path;
import java.util.Map;

@Configuration
public class TestDataSourceConfig {

public static final Path BASE_DIR = Path.of("src/test/resources/org/monarchinitiative/squirls/cli");


/**
* Small Jannovar cache containing RefSeq transcripts of several genes only:
Expand Down Expand Up @@ -137,7 +139,7 @@ public VariantsForTesting variantsForTesting(VmvtGenerator vmvtGenerator, Jannov

@Bean
public GenomicAssembly genomicAssembly() {
return GenomicAssemblyParser.parseAssembly(Paths.get("src/test/resources/org/monarchinitiative/squirls/cli/GCF_000001405.25_GRCh37.p13_assembly_report.txt"));
return GenomicAssemblyParser.parseAssembly(BASE_DIR.resolve("GCF_000001405.25_GRCh37.p13_assembly_report.txt"));
}

@Bean
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -74,45 +74,28 @@
* Daniel Danis, Peter N Robinson, 2020
*/

package org.monarchinitiative.squirls.ingest.data;
package org.monarchinitiative.squirls.cli.cmd.setup;

import htsjdk.samtools.reference.FastaSequenceIndex;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.monarchinitiative.squirls.ingest.TestDataSourceConfig;
import org.monarchinitiative.squirls.ingest.TestUtils;
import org.springframework.boot.test.context.SpringBootTest;
import org.junit.jupiter.api.io.TempDir;
import org.monarchinitiative.squirls.cli.TestDataSourceConfig;

import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;

import static org.hamcrest.CoreMatchers.is;
import static org.hamcrest.MatcherAssert.assertThat;

@SpringBootTest(classes = TestDataSourceConfig.class)
public class GenomeAssemblyDownloaderTest {

private Path buildDir;


@BeforeEach
public void setUp() throws Exception {
buildDir = Files.createDirectories(Paths.get(System.getProperty("java.io.tmpdir")).resolve("3S-TEST"));
}

@AfterEach
public void tearDown() throws Exception {
TestUtils.deleteFolderAndFiles(buildDir);
}
@TempDir
public Path buildDir;


@Test
public void download() {
URL fastaUrl = GenomeAssemblyDownloaderTest.class.getResource("shortHg19ChromFa.tar.gz");

public void download() throws Exception {
URL fastaUrl = TestDataSourceConfig.BASE_DIR.resolve("cmd").resolve("setup").resolve("shortHg19ChromFa.tar.gz").toUri().toURL();

Path whereToSave = buildDir.resolve("the-genome.fa");

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
package org.monarchinitiative.squirls.cli.cmd.setup;

import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.io.TempDir;
import org.monarchinitiative.squirls.cli.TestDataSourceConfig;

import java.nio.file.Files;
import java.nio.file.Path;

import static org.hamcrest.MatcherAssert.assertThat;
import static org.hamcrest.Matchers.equalTo;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertTrue;

/**
 * Runs {@link PreprocessReferenceGenomeCommand} against small local test resources and checks
 * that the expected files show up in a fresh temporary data directory.
 */
public class PreprocessReferenceGenomeCommandTest {

    private static final Path PARENT = TestDataSourceConfig.BASE_DIR.resolve("cmd").resolve("setup");

    /**
     * Names of the files the command is expected to create in the data directory.
     */
    private static final String[] PRODUCED_FILES = {
            "assembly_report.txt",
            "genome.fa",
            "genome.fa.fai",
            "genome.fa.dict"
    };

    @TempDir
    public Path tempDataDirectory;

    private final PreprocessReferenceGenomeCommand cmd = new PreprocessReferenceGenomeCommand();

    @Test
    public void call() throws Exception {
        // arrange - point the command to local resources instead of remote URLs
        cmd.dataDirectory = tempDataDirectory;
        cmd.genomeUrl = PARENT.resolve("shortHg19ChromFa.tar.gz").toUri().toURL();
        cmd.assemblyReportUrl = PARENT.resolve("GCF_000001405.25_GRCh37.p13_assembly_report.short.txt").toUri().toURL();

        // none of the files is present before running the command
        for (String fileName : PRODUCED_FILES) {
            Path candidate = tempDataDirectory.resolve(fileName);
            assertFalse(Files.isRegularFile(candidate));
        }

        // act
        Integer exitCode = cmd.call();

        // assert - success exit code and all the files are present
        assertThat(exitCode, equalTo(0));
        for (String fileName : PRODUCED_FILES) {
            Path candidate = tempDataDirectory.resolve(fileName);
            assertTrue(Files.isRegularFile(candidate));
        }
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -335,4 +335,4 @@ HSCHR6_MHC_QBL_CTG1 alt-scaffold 6 Chromosome GL000255.1 = NT_167248.1 ALT_REF_L
HSCHR6_MHC_SSTO_CTG1 alt-scaffold 6 Chromosome GL000256.1 = NT_167249.1 ALT_REF_LOCI_7 4928567 chr6_ssto_hap7
HSCHR4_1_CTG9 alt-scaffold 4 Chromosome GL000257.1 = NT_167250.1 ALT_REF_LOCI_8 590426 chr4_ctg9_hap1
HSCHR17_1_CTG5 alt-scaffold 17 Chromosome GL000258.1 = NT_167251.1 ALT_REF_LOCI_9 1680828 chr17_ctg5_hap1
MT assembled-molecule MT Mitochondrion J01415.2 = NC_012920.1 non-nuclear 16571 chrM
MT assembled-molecule MT Mitochondrion J01415.2 = NC_012920.1 non-nuclear 16569 chrM
Original file line number Diff line number Diff line change
Expand Up @@ -82,8 +82,7 @@
import org.monarchinitiative.squirls.core.SquirlsException;
import org.monarchinitiative.squirls.core.reference.SplicingParameters;
import org.monarchinitiative.squirls.core.reference.SplicingPwmData;
import org.monarchinitiative.squirls.ingest.data.GenomeAssemblyDownloader;
import org.monarchinitiative.squirls.ingest.data.UrlResourceDownloader;
import org.monarchinitiative.squirls.io.download.UrlResourceDownloader;
import org.monarchinitiative.squirls.ingest.parse.FileKMerParser;
import org.monarchinitiative.squirls.ingest.parse.InputStreamBasedPositionalWeightMatrixParser;
import org.monarchinitiative.squirls.io.SplicingPositionalWeightMatrixParser;
Expand Down Expand Up @@ -160,17 +159,6 @@ private static void processPwms(DataSource dataSource, SplicingPwmData splicingP
pwmIngestDao.insertDoubleMatrix(splicingPwmData.getAcceptor(), ACCEPTOR_NAME, parameters.getAcceptorExonic(), parameters.getAcceptorIntronic());
}

/**
* Download, decompress, and concatenate contigs into a single FASTA file. Then, index the FASTA file.
* @param genomeUrl url pointing to reference genome FASTA file to be downloaded
* @param buildDir path to directory where Squirls data files will be created
* @param overwrite overwrite existing FASTA file if true
*/
static Runnable downloadReferenceGenome(URL genomeUrl, Path buildDir, boolean overwrite) {
Path genomeFastaPath = buildDir.resolve("genome.fa");
return new GenomeAssemblyDownloader(genomeUrl, genomeFastaPath, overwrite);
}

/**
* Store data for hexamer and septamer-dependent methods.
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,6 @@
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.monarchinitiative.squirls.ingest.data.GenomeAssemblyDownloaderTest;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.test.context.SpringBootTest;

Expand All @@ -97,8 +96,6 @@ public class SquirlsDataBuilderTest {

private static final Path DATA_DIR = Paths.get("src/test/resources/org/monarchinitiative/squirls/ingest");

private static final URL FASTA_URL = GenomeAssemblyDownloaderTest.class.getResource("shortHg19ChromFa.tar.gz");

@Autowired
public DataSource dataSource;
private Path buildDir;
Expand All @@ -113,22 +110,6 @@ public void tearDown() throws Exception {
TestUtils.deleteFolderAndFiles(buildDir);
}

@Test
public void downloadReferenceGenome() {
// arrange - nothing to be done

// act - download a small reference genome
// TODO - move to CLI
Runnable rgTask = SquirlsDataBuilder.downloadReferenceGenome(FASTA_URL, buildDir, true);
rgTask.run();

// assert - there should be a FASTA file with index present in the `buildDir`

assertThat("FASTA file was not generated", buildDir.resolve("genome.fa").toFile().isFile(), is(true));
assertThat("FASTA index was not generated", buildDir.resolve("genome.fa.fai").toFile().isFile(), is(true));
assertThat("FASTA dictionary was not generated", buildDir.resolve("genome.fa.dict").toFile().isFile(), is(true));
}

@Test
public void buildDatabase() throws Exception {
Path hg19Dir = DATA_DIR.resolve("transcripts").resolve("hg19");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,6 @@ public void addResource() throws Exception {
assertThat(names, hasSize(2));
assertThat(names, hasItems("something.txt", "anything.txt"));
assertThat(sizes, hasSize(2));
assertThat(sizes, hasItems(55L, 55L));
assertThat(sizes, hasItems(51L, 51L));
}
}
Original file line number Diff line number Diff line change
@@ -1 +1 @@
This file is used to test download of an URL resource.
This file is used to test Zip compression wrapper.
Binary file not shown.
Loading

0 comments on commit 4396765

Please sign in to comment.