Skip to content

Commit

Permalink
Cobalt: apply gc ratio min and max in ratio mapper instead of GC prof…
Browse files Browse the repository at this point in the history
…ile loading
  • Loading branch information
charlesshale committed Dec 9, 2024
1 parent 413a66f commit 41b652e
Show file tree
Hide file tree
Showing 13 changed files with 135 additions and 111 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -56,18 +56,6 @@ public CobaltApplication(final ConfigBuilder configBuilder)
}
}

public static void main(final String... args) throws IOException, ExecutionException, InterruptedException
{
ConfigBuilder configBuilder = new ConfigBuilder(APP_NAME);

registerConfig(configBuilder);

configBuilder.checkAndParseCommandLine(args);

CobaltApplication application = new CobaltApplication(configBuilder);
application.run();
}

private void run()
{
long startTimeMs = System.currentTimeMillis();
Expand All @@ -90,17 +78,19 @@ private void run()
Table referenceReadDepths = bamReadCounter.getReferenceDepths();
Table tumorReadDepths = bamReadCounter.getTumorDepths();

final Table gcProfiles = loadGCContent(chromosomePosCodec);
Table gcProfiles = loadGCContent(chromosomePosCodec);

final RatioSupplier ratioSupplier = new RatioSupplier(mConfig.ReferenceId, mConfig.TumorId, mConfig.OutputDir,
gcProfiles, referenceReadDepths, tumorReadDepths,
RatioSupplier ratioSupplier = new RatioSupplier(
mConfig.ReferenceId, mConfig.TumorId, mConfig.OutputDir, gcProfiles, referenceReadDepths, tumorReadDepths,
chromosomePosCodec);

if(mConfig.TargetRegionPath != null)
{
CsvReadOptions options = CsvReadOptions.builder(mConfig.TargetRegionPath)
CsvReadOptions options = CsvReadOptions.builder(
mConfig.TargetRegionPath)
.separator(TSV_DELIM.charAt(0))
.columnTypesPartial(Map.of("chromosome", ColumnType.STRING)).build();

Table targetRegionEnrichment = Table.read().usingOptions(options);
chromosomePosCodec.addEncodedChrPosColumn(targetRegionEnrichment, true);
ratioSupplier.setTargetRegionEnrichment(targetRegionEnrichment);
Expand All @@ -111,12 +101,14 @@ private void run()
switch(mConfig.mode())
{
case TUMOR_ONLY:
final Table diploidRegions = new DiploidRegionLoader(mConfig.TumorOnlyDiploidBed, chromosomePosCodec).build();
Table diploidRegions = new DiploidRegionLoader(chromosomePosCodec, mConfig.TumorOnlyDiploidBed).build();
ratios = ratioSupplier.tumorOnly(diploidRegions);
break;

case GERMLIHE_ONLY:
ratios = ratioSupplier.germlineOnly();
break;

default:
ratios = ratioSupplier.tumorNormalPair();
}
Expand All @@ -126,10 +118,10 @@ private void run()

CB_LOGGER.info("persisting cobalt ratios to {}", outputFilename);


CobaltRatioFile.write(outputFilename, ratios.stream().map(r -> rowToCobaltRatio(r, chromosomePosCodec)).collect(Collectors.toList()));

applyRatioSegmentation(executorService, mConfig.OutputDir, outputFilename, mConfig.ReferenceId, mConfig.TumorId, mConfig.PcfGamma);
if(!mConfig.SkipPcfCalc)
applyRatioSegmentation(executorService, mConfig.OutputDir, outputFilename, mConfig.ReferenceId, mConfig.TumorId, mConfig.PcfGamma);

final VersionInfo version = fromAppName(APP_NAME);
version.write(mConfig.OutputDir);
Expand Down Expand Up @@ -174,14 +166,6 @@ public Table loadGCContent(ChromosomePositionCodec chromosomePosCodec) throws IO
{
Row row = gcProfileTable.appendRow();

double gcContent = gcProfile.gcContent();

if(gcContent > 0 && mConfig.GcRatioFileMin > 0 && gcContent < mConfig.GcRatioFileMin)
continue;

if(mConfig.GcRatioFileMax < 1 && gcContent > mConfig.GcRatioFileMax)
continue;

long chrPosIndex = chromosomePosCodec.encodeChromosomePosition(gcProfile.chromosome(), gcProfile.start());

if(chrPosIndex > 0)
Expand All @@ -193,11 +177,23 @@ public Table loadGCContent(ChromosomePositionCodec chromosomePosCodec) throws IO
throw new RuntimeException("Unknown chromosome: " + gcProfile.chromosome());
}

row.setDouble(CobaltColumns.GC_CONTENT, gcContent);
row.setDouble(CobaltColumns.GC_CONTENT, gcProfile.gcContent());
row.setBoolean(CobaltColumns.IS_MAPPABLE, gcProfile.isMappable());
row.setBoolean(CobaltColumns.IS_AUTOSOME, HumanChromosome.fromString(gcProfile.chromosome()).isAutosome());
}

return gcProfileTable;
}

public static void main(final String... args) throws IOException, ExecutionException, InterruptedException
{
ConfigBuilder configBuilder = new ConfigBuilder(APP_NAME);

registerConfig(configBuilder);

configBuilder.checkAndParseCommandLine(args);

CobaltApplication application = new CobaltApplication(configBuilder);
application.run();
}
}
29 changes: 14 additions & 15 deletions cobalt/src/main/java/com/hartwig/hmftools/cobalt/CobaltConfig.java
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
package com.hartwig.hmftools.cobalt;

import static com.hartwig.hmftools.cobalt.CobaltConstants.DEFAULT_GC_RATIO_FILE_MAX;
import static com.hartwig.hmftools.cobalt.CobaltConstants.DEFAULT_GC_RATIO_FILE_MIN;
import static com.hartwig.hmftools.cobalt.CobaltConstants.DEFAULT_GC_RATIO_MAX;
import static com.hartwig.hmftools.cobalt.CobaltConstants.DEFAULT_GC_RATIO_MIN;
import static com.hartwig.hmftools.cobalt.CobaltConstants.DEFAULT_MIN_MAPPING_QUALITY;
import static com.hartwig.hmftools.cobalt.CobaltConstants.DEFAULT_PCF_GAMMA;
import static com.hartwig.hmftools.common.genome.gc.GCProfileFactory.GC_PROFILE;
Expand Down Expand Up @@ -47,8 +47,9 @@ public enum Mode
private static final String TARGET_REGION_NORM_FILE = "target_region";
private static final String INCLUDE_DUPLICATES = "include_duplicates";

private static final String GC_RATIO_FILE_MIN = "gc_ratio_file_min";
private static final String GC_RATIO_FILE_MAX = "gc_ratio_file_max";
private static final String GC_RATIO_MIN = "gc_ratio_min";
private static final String GC_RATIO_MAX = "gc_ratio_max";
private static final String SKIP_PCF_CALC = "skip_pcf_calc";

public final String ReferenceId;
public final String ReferenceBamPath;
Expand All @@ -67,9 +68,7 @@ public enum Mode

public final ValidationStringency BamStringency;
public final boolean IncludeDuplicates;

public final double GcRatioFileMin;
public final double GcRatioFileMax;
public final boolean SkipPcfCalc;

public final String TumorOnlyDiploidBed;
public final String TargetRegionPath;
Expand All @@ -90,8 +89,9 @@ public CobaltConfig(final ConfigBuilder configBuilder)
TargetRegionPath = configBuilder.getValue(TARGET_REGION_NORM_FILE);
RefGenomePath = configBuilder.getValue(REF_GENOME);

GcRatioFileMin = configBuilder.getDecimal(GC_RATIO_FILE_MIN);
GcRatioFileMax = configBuilder.getDecimal(GC_RATIO_FILE_MAX);
// set global constants
CobaltConstants.GC_RATIO_MIN = configBuilder.getDecimal(GC_RATIO_MIN);
CobaltConstants.GC_RATIO_MAX = configBuilder.getDecimal(GC_RATIO_MAX);

MinMappingQuality = configBuilder.getInteger(MIN_MAPPING_QUALITY);
PcfGamma = configBuilder.getInteger(PCF_GAMMA);
Expand All @@ -100,6 +100,8 @@ public CobaltConfig(final ConfigBuilder configBuilder)
BamStringency = BamUtils.validationStringency(configBuilder);
OutputDir = parseOutputDir(configBuilder);
Threads = parseThreads(configBuilder);

SkipPcfCalc = configBuilder.hasFlag(SKIP_PCF_CALC);
}

public static void registerConfig(final ConfigBuilder configBuilder)
Expand All @@ -113,15 +115,16 @@ public static void registerConfig(final ConfigBuilder configBuilder)
configBuilder.addPath(GC_PROFILE, true, GC_PROFILE_DESC);
configBuilder.addPath(REF_GENOME, false, REF_GENOME_CFG_DESC + ", required when using CRAM files");

configBuilder.addDecimal(GC_RATIO_FILE_MIN, "Restrict GC profile entries to above minimum", DEFAULT_GC_RATIO_FILE_MIN);
configBuilder.addDecimal(GC_RATIO_FILE_MAX, "Restrict GC profile entries to below maximum", DEFAULT_GC_RATIO_FILE_MAX);
configBuilder.addDecimal(GC_RATIO_MIN, "Restrict GC ratios to above minimum", DEFAULT_GC_RATIO_MIN);
configBuilder.addDecimal(GC_RATIO_MAX, "Restrict GC ratios to below maximum", DEFAULT_GC_RATIO_MAX);

configBuilder.addPath(TUMOR_ONLY_DIPLOID_BED, false, "Diploid regions for tumor-only mode");
configBuilder.addPath(TARGET_REGION_NORM_FILE, false, "Targeted regions normalisation file");

configBuilder.addInteger(MIN_MAPPING_QUALITY, "Min map quality", DEFAULT_MIN_MAPPING_QUALITY);
configBuilder.addInteger(PCF_GAMMA, "Gamma value for copy number PCF", DEFAULT_PCF_GAMMA);
configBuilder.addFlag(INCLUDE_DUPLICATES, "Include duplicate reads in depth counts");
configBuilder.addFlag(SKIP_PCF_CALC, "Skip final PCF output");

addOutputDir(configBuilder);
addThreadOptions(configBuilder);
Expand All @@ -137,10 +140,6 @@ public void validate() throws Exception
{
throw new Exception(String.format("%s option not allowed in tumor only mode", REFERENCE_BAM));
}
if(TumorOnlyDiploidBed == null)
{
throw new Exception(String.format("missing required option %s in tumor only mode", TUMOR_ONLY_DIPLOID_BED));
}
}
else if(TumorOnlyDiploidBed != null)
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,14 @@ public class CobaltConstants

public static final int INVALID_VALUE_INDICATOR = -1;

public static final double DEFAULT_GC_RATIO_FILE_MIN = 0;
public static final double DEFAULT_GC_RATIO_FILE_MAX = 1;
public static final double DEFAULT_GC_RATIO_MIN = 0.2;
public static final double DEFAULT_GC_RATIO_MAX = 0.6;

public static double GC_RATIO_MIN = DEFAULT_GC_RATIO_MIN;
public static double GC_RATIO_MAX = DEFAULT_GC_RATIO_MAX;

public static final int WINDOW_SIZE = 1000;
public static final int PARTITION_SIZE = 100_000_000;
public static final int OFF_TARGET_WINDOW_SIZE = 1_000_000;
public static final double MIN_OFF_TARGET_WINDOW_RATIO = 0.5;

public static final int DEFAULT_MIN_MAPPING_QUALITY = 10;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ public void generateDepths(
samReader.close();
}

CB_LOGGER.info("read Depth Complete");
CB_LOGGER.info("read depth complete");
}

private List<Future<?>> createFutures(final String bamFilePath, final ReadDepthAccumulator readDepthCounter, List<SamReader> samReaderList)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,38 +26,37 @@

public class DiploidRegionLoader implements Consumer<Locatable>
{
private final Table mResult = Table.create(
StringColumn.create(CobaltColumns.CHROMOSOME), IntColumn.create(CobaltColumns.POSITION));
private final Table mContigResult = mResult.emptyCopy();
private final Table mResult;
private final Table mContigResult;

private final ChromosomePositionCodec mChromosomePosCodec;

private String mChromosome = null;
private int mStart = 0;

public DiploidRegionLoader(ChromosomePositionCodec chromosomePosCodec)
public DiploidRegionLoader(final ChromosomePositionCodec chromosomePosCodec, final String diploidBedPath) throws IOException
{
mChromosomePosCodec = chromosomePosCodec;
}

public DiploidRegionLoader(final String diploidBedPath,
ChromosomePositionCodec chromosomePosCodec) throws IOException
{
this(chromosomePosCodec);
List<BEDFeature> bedFeatures = new ArrayList<>();
mResult = Table.create(StringColumn.create(CobaltColumns.CHROMOSOME), IntColumn.create(CobaltColumns.POSITION));
mContigResult = mResult.emptyCopy();

CB_LOGGER.info("Reading diploid regions from {}", diploidBedPath);
try(final AbstractFeatureReader<BEDFeature, LineIterator> reader = getFeatureReader(diploidBedPath,
new BEDCodec(),
false))
if(diploidBedPath != null)
{
for(BEDFeature bedFeature : reader.iterator())
List<BEDFeature> bedFeatures = new ArrayList<>();

CB_LOGGER.info("Reading diploid regions from {}", diploidBedPath);
try(final AbstractFeatureReader<BEDFeature, LineIterator> reader = getFeatureReader(diploidBedPath,
new BEDCodec(),
false))
{
bedFeatures.add(bedFeature);
for(BEDFeature bedFeature : reader.iterator())
{
bedFeatures.add(bedFeature);
}
}
}

bedFeatures.forEach(this);
bedFeatures.forEach(this);
}
}

@Override
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
package com.hartwig.hmftools.cobalt.ratio;

import static java.lang.Math.round;

import static com.hartwig.hmftools.cobalt.CobaltConfig.CB_LOGGER;
import static com.hartwig.hmftools.cobalt.CobaltConstants.GC_RATIO_MAX;
import static com.hartwig.hmftools.cobalt.CobaltConstants.GC_RATIO_MIN;

import java.util.HashMap;
import java.util.Map;
Expand All @@ -16,22 +20,17 @@

public class GcNormalizedRatioMapper implements RatioMapper
{
private static final int MIN_BUCKET = 20;
private static final int MAX_BUCKET = 60;

private Table mGCMedianReadDepth;
private double mSampleMedianReadDepth;
private double mSampleMeanReadDepth;

// apply gc normalisation, the input ratios must have chromosome, position, ratio, gcBucket, isMappable
public GcNormalizedRatioMapper()
{
}
public GcNormalizedRatioMapper() {}

@Override
public Table mapRatios(final Table inputRatios)
{
CB_LOGGER.info("applying ratio GC normalization");
CB_LOGGER.info("applying ratio GC normalisation");

// add a gc bucket column if not already have one
if (!inputRatios.containsColumn(CobaltColumns.GC_BUCKET))
Expand All @@ -42,10 +41,13 @@ public Table mapRatios(final Table inputRatios)

// create a gc normalisation df

int gcRatioBucketMin = (int)round(GC_RATIO_MIN * 100);
int gcRatioBucketMax = (int)round(GC_RATIO_MAX * 100);

// skipped masked regions
Table gcMedianCalcDf = inputRatios.where(
inputRatios.doubleColumn(CobaltColumns.RATIO).isGreaterThan(0.0) // TODO: change to >= 0.0
.and(inputRatios.intColumn(CobaltColumns.GC_BUCKET).isBetweenInclusive(MIN_BUCKET, MAX_BUCKET))
.and(inputRatios.intColumn(CobaltColumns.GC_BUCKET).isBetweenInclusive(gcRatioBucketMin, gcRatioBucketMax))
.and(inputRatios.booleanColumn(CobaltColumns.IS_MAPPABLE).asSelection())
.and(inputRatios.booleanColumn(CobaltColumns.IS_AUTOSOME).asSelection()));

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -161,7 +161,7 @@ static class GermlineRatios extends SampleRatios
final String ratioMedianFilename = MedianRatioFile.generateFilename(outputDir, referenceId);
MedianRatioFile.write(ratioMedianFilename, medianRatios);

CB_LOGGER.info("applying ratio diploid normalization");
CB_LOGGER.info("applying ratio diploid normalisation");
gcDiploidRatios = calcDiploidRatioResults(getRatios(), medianRatios);
}
}
Expand All @@ -182,7 +182,7 @@ public RatioSupplier(final String reference, final String tumor,
mChromosomePosCodec = chromosomePosCodec;
}

public void setTargetRegionEnrichment(Table targetRegionEnrichment)
public void setTargetRegionEnrichment(final Table targetRegionEnrichment)
{
mTargetRegionEnrichment = targetRegionEnrichment;
}
Expand All @@ -195,8 +195,12 @@ public Table tumorOnly(final Table diploidRegions) throws IOException
CB_LOGGER.error("tumor count should not be null");
throw new RuntimeException("tumor count is null");
}
SparseBucketPolicy sparseBucketPolicy = mTargetRegionEnrichment == null ? SparseBucketPolicy.CALC_CONSOLIDATED_BUCKETS : SparseBucketPolicy.DO_NOT_CONSOLIDATE;
Table tumorRatios = new SampleRatios(mTumorId, mTumorDepths, mGcProfiles, mTargetRegionEnrichment, sparseBucketPolicy,

SparseBucketPolicy sparseBucketPolicy = mTargetRegionEnrichment == null ?
SparseBucketPolicy.CALC_CONSOLIDATED_BUCKETS : SparseBucketPolicy.DO_NOT_CONSOLIDATE;

Table tumorRatios = new SampleRatios(
mTumorId, mTumorDepths, mGcProfiles, mTargetRegionEnrichment, sparseBucketPolicy,
null, mOutputDir, mChromosomePosCodec).getRatios();

// filter tumor ratios by the diploid regions
Expand All @@ -217,9 +221,14 @@ public Table germlineOnly() throws IOException
CB_LOGGER.fatal("Reference count should not be null");
throw new RuntimeException("reference count is null");
}
SparseBucketPolicy sparseBucketPolicy = mTargetRegionEnrichment == null ? SparseBucketPolicy.CALC_CONSOLIDATED_BUCKETS : SparseBucketPolicy.DO_NOT_CONSOLIDATE;
var germlineRatios = new GermlineRatios(mReferenceId, mReferenceDepths, mGcProfiles, mTargetRegionEnrichment,

SparseBucketPolicy sparseBucketPolicy = mTargetRegionEnrichment == null ?
SparseBucketPolicy.CALC_CONSOLIDATED_BUCKETS : SparseBucketPolicy.DO_NOT_CONSOLIDATE;

GermlineRatios germlineRatios = new GermlineRatios(
mReferenceId, mReferenceDepths, mGcProfiles, mTargetRegionEnrichment,
sparseBucketPolicy, null, mOutputDir, mChromosomePosCodec);

return mergeRatios(
mReferenceDepths, null,
germlineRatios.getRatios(), null, germlineRatios.gcDiploidRatios);
Expand All @@ -241,13 +250,13 @@ public Table tumorNormalPair() throws IOException
SparseBucketPolicy tumorSparseBucketPolicy = mTargetRegionEnrichment == null ?
SparseBucketPolicy.CALC_CONSOLIDATED_BUCKETS : SparseBucketPolicy.DO_NOT_CONSOLIDATE;

var tumorRatios = new SampleRatios(mTumorId, mTumorDepths, mGcProfiles, mTargetRegionEnrichment,
SampleRatios tumorRatios = new SampleRatios(mTumorId, mTumorDepths, mGcProfiles, mTargetRegionEnrichment,
tumorSparseBucketPolicy, null, mOutputDir, mChromosomePosCodec);

SparseBucketPolicy germlineSparseBucketPolicy = tumorRatios.consolidatedBuckets == null ?
SparseBucketPolicy.DO_NOT_CONSOLIDATE : SparseBucketPolicy.USE_PROVIDED_BUCKETS;

var germlineRatios = new GermlineRatios(mReferenceId, mReferenceDepths, mGcProfiles, mTargetRegionEnrichment,
GermlineRatios germlineRatios = new GermlineRatios(mReferenceId, mReferenceDepths, mGcProfiles, mTargetRegionEnrichment,
germlineSparseBucketPolicy, tumorRatios.consolidatedBuckets, mOutputDir, mChromosomePosCodec);

return mergeRatios(
Expand Down
Loading

0 comments on commit 41b652e

Please sign in to comment.