From 73f457b95ded278e12321afacc76be1ca650dd46 Mon Sep 17 00:00:00 2001 From: Eric Talevich Date: Fri, 1 May 2015 16:41:03 -0700 Subject: [PATCH] doc: explain the new -x option in genome2access.py --- doc/scripts.rst | 37 ++++++++++++++++++++++++++++--------- scripts/genome2access.py | 2 +- 2 files changed, 29 insertions(+), 10 deletions(-) diff --git a/doc/scripts.rst b/doc/scripts.rst index 2e79962a..80644b4a 100644 --- a/doc/scripts.rst +++ b/doc/scripts.rst @@ -1,17 +1,36 @@ Additional scripts ================== -refFlat2bed.py - Generate a BED file of the genes or exons in the reference genome given in - UCSC refFlat.txt format. - This script can be used in case the original BED file of targeted intervals - is unavailable. Subsequent steps of the pipeline will remove probes that - did not receive sufficient coverage, including those exons or genes that - were not targeted by the sequencing library. However, better results are - expected from CNVkit if the true targeted intervals can be provided. - genome2access.py: Calculate the sequence-accessible coordinates in chromosomes from the given reference genome, treating long spans of 'N' characters as the inaccessible regions. + CNVkit will compute "antitarget" bins only within the accessible genomic + regions specified in the "access" file produced by this script. If there are + many small excluded/inaccessible regions in the genome, then small, + less-reliable antitarget bins would be squeezed into the remaining + accessible regions. The ``-s`` option tells the script to ignore short + regions that would otherwise be excluded as inaccessible, allowing larger + antitarget bins to overlap them. + + Additional regions to exclude can also be given with the ``-x`` option. This + option can be used more than once to exclude several BED files listing + different sets of regions. 
For example, "excludable" regions of poor + mappability have been precalculated by others and are available from the + `UCSC FTP Server <ftp://hgdownload.soe.ucsc.edu/goldenPath/>`_ + (see `here for hg19 + <ftp://hgdownload.soe.ucsc.edu/goldenPath/hg19/encodeDCC/wgEncodeMapability/>`_). + + +refFlat2bed.py + Generate a BED file of the genes or exons in the reference genome given in + UCSC refFlat.txt format. (Download the input file from `UCSC Genome + Bioinformatics <http://hgdownload.soe.ucsc.edu/downloads.html>`_). + + This script can be used in case the original BED file of targeted intervals + is unavailable. Subsequent steps of the pipeline will remove probes that + did not receive sufficient coverage, including those exons or genes that + were not targeted by the sequencing library. However, CNVkit will give much + better results if the true targeted intervals can be provided. + diff --git a/scripts/genome2access.py b/scripts/genome2access.py index e9c439d1..07f477e9 100755 --- a/scripts/genome2access.py +++ b/scripts/genome2access.py @@ -174,7 +174,7 @@ def next_or_inf(iterable): AP = argparse.ArgumentParser(description=__doc__) AP.add_argument("fa_fname", help="Genome FASTA file name") - AP.add_argument("-s", "--min-gap-size", type=int, default=100, + AP.add_argument("-s", "--min-gap-size", type=int, default=5000, help="""Minimum gap size between accessible sequence regions. Regions separated by less than this distance will be joined together. [Default: %(default)s]""")