diff --git a/Docker/build.sh b/Docker/build.sh index 6da6c76..c0015e3 100644 --- a/Docker/build.sh +++ b/Docker/build.sh @@ -1,7 +1,7 @@ #!/bin/bash -tag="V0.1.1" +tag="V0.1.2" docker build --platform linux/amd64 -t wenjin27/methylgrapher:latest -t wenjin27/methylgrapher:$tag ./ # docker push wenjin27/methylgrapher:latest diff --git a/docs/build/doctrees/environment.pickle b/docs/build/doctrees/environment.pickle index 8f2325c..b5dcb5e 100644 Binary files a/docs/build/doctrees/environment.pickle and b/docs/build/doctrees/environment.pickle differ diff --git a/docs/build/doctrees/filetype.doctree b/docs/build/doctrees/filetype.doctree index 2910788..c86d598 100644 Binary files a/docs/build/doctrees/filetype.doctree and b/docs/build/doctrees/filetype.doctree differ diff --git a/docs/build/doctrees/process.doctree b/docs/build/doctrees/process.doctree index 4a04420..382f1a0 100644 Binary files a/docs/build/doctrees/process.doctree and b/docs/build/doctrees/process.doctree differ diff --git a/docs/build/html/_sources/filetype.rst.txt b/docs/build/html/_sources/filetype.rst.txt index ae7878d..65999aa 100644 --- a/docs/build/html/_sources/filetype.rst.txt +++ b/docs/build/html/_sources/filetype.rst.txt @@ -5,17 +5,17 @@ File Types GFA: Graphical Fragment Assembly format ------------------------------- +------------------------------------------------------------ For detail explanation, refer to https://gfa-spec.github.io/GFA-spec/GFA1.html GAF: Graphical mApping Format ------------------------------- +------------------------------------------------------------ For detail explanation, refer to https://github.com/lh3/gfatools/blob/master/doc/rGFA.md methyl: Graph methylation file ------------------------------- +------------------------------------------------------------ The methylGrapher extraction output provides a list of each cytosine mapped to the genome graph, specified by its coordinates (segment ID, 0-based position on the segment, and strand relative to the segment). For each cytosine, the output includes its context, the number of read-pairs supporting whether the cytosine is methylated or unmethylated, and the coverage, which is the sum of methylated and unmethylated read-pairs. The methylation percentage is calculated as the ratio of methylated read-pairs to the total coverage. diff --git a/docs/build/html/_sources/process.rst.txt b/docs/build/html/_sources/process.rst.txt index f281181..51f995a 100644 --- a/docs/build/html/_sources/process.rst.txt +++ b/docs/build/html/_sources/process.rst.txt @@ -11,9 +11,6 @@ Description ~~~~~~~~~~~~~~~~~~~~~~ This process involves transforming the genome graph from GFA format into two fully converted genome graphs: one depleted of C bases and another depleted of G bases. Additionally, if desired, you may include a spike-in genome in FASTA format to estimate the conversion rate in a single step further. -.. note:: - It's important to note that in the C-to-T genome graph, if both C and T segments are positioned identically—meaning they share the same parent and child segments, and each has only one parent and one child—the T segments are removed. Additionally, any associated links and paths are redirected to the corresponding C segment. This principle also applies in the G-to-A genome graph. - Example Usage ~~~~~~~~~~~~~~~~~~~~~~ .. code-block:: shell diff --git a/docs/build/html/process.html b/docs/build/html/process.html index fa9cc60..4086610 100644 --- a/docs/build/html/process.html +++ b/docs/build/html/process.html @@ -118,10 +118,6 @@

Genome Indexing

Description

This process involves transforming the genome graph from GFA format into two fully converted genome graphs: one depleted of C bases and another depleted of G bases. Additionally, if desired, you may include a spike-in genome in FASTA format to estimate the conversion rate in a single step further.

-
-

Note

-

It’s important to note that in the C-to-T genome graph, if both C and T segments are positioned identically—meaning they share the same parent and child segments, and each has only one parent and one child—the T segments are removed. Additionally, any associated links and paths are redirected to the corresponding C segment. This principle also applies in the G-to-A genome graph.

-

Example Usage

diff --git a/docs/build/html/searchindex.js b/docs/build/html/searchindex.js index 6dbf2a8..4e9172d 100644 --- a/docs/build/html/searchindex.js +++ b/docs/build/html/searchindex.js @@ -1 +1 @@ -Search.setIndex({"alltitles": {"Align": [[5, "align"]], "Citation": [[0, "citation"]], "Contact us": [[0, null]], "ConversionRate": [[5, "conversionrate"]], "Deduplication": [[5, "deduplication"]], "Description": [[5, "description"], [5, "id1"], [5, "id5"], [5, "id9"], [5, "id13"]], "Example Usage": [[5, "example-usage"], [5, "id2"], [5, "id6"], [5, "id10"], [5, "id14"]], "File Types": [[2, null]], "GAF: Graphical mApping Format": [[2, "gaf-graphical-mapping-format"]], "GFA: Graphical Fragment Assembly format": [[2, "gfa-graphical-fragment-assembly-format"]], "Genome Indexing": [[5, "genome-indexing"]], "Install via CONDA": [[4, "install-via-conda"]], "Install via PIP": [[4, "install-via-pip"]], "Installation": [[4, null]], "Main": [[5, "main"]], "MethylCall": [[5, "methylcall"]], "Optional arguments": [[5, "optional-arguments"], [5, "id4"], [5, "id8"], [5, "id12"]], "Prerequisites": [[4, "prerequisites"]], "Process": [[5, null]], "Reference:": [[2, "reference"]], "Required arguments": [[5, "required-arguments"], [5, "id3"], [5, "id7"], [5, "id11"], [5, "id15"]], "Source code": [[4, "source-code"]], "Source code and issue tracker": [[0, "source-code-and-issue-tracker"]], "Toy Example": [[1, null]], "Use Docker": [[4, "use-docker"]], "Welcome to methylGrapher\u2019s documentation!": [[3, null]], "methyl: Graph methylation file": [[2, "methyl-graph-methylation-file"]], "methylGrapher: Genome-Graph-Based DNA Methylation Analysis Using Whole Genome Bisulfite Sequencing": [[3, "methylgrapher-genome-graph-based-dna-methylation-analysis-using-whole-genome-bisulfite-sequencing"]]}, "docnames": ["contact", "example", "filetype", "index", "install", "process"], "envversion": {"sphinx": 63, "sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2}, "filenames": ["contact.rst", "example.rst", "filetype.rst", "index.rst", "install.rst", "process.rst"], "indexentries": {}, "objects": {}, "objnames": {}, "objtypes": {}, "terms": {"": 5, "0": [2, 5], "1": 5, "10": 4, "11": 4, "12": 4, "20": 5, "3": 4, "4096": 5, "9": 4, "A": 5, "For": 2, "If": 0, "It": 5, "The": [1, 2, 4, 5], "activ": 4, "ad": 4, "adapt": 4, "addition": 5, "against": 5, "align": [3, 4], "all": 1, "along": 1, "also": [1, 5], "an": 0, "analyz": 1, "ani": [0, 5], "anoth": 5, "appli": 5, "ar": [1, 4, 5], "assembli": 3, "associ": 5, "base": [2, 5], "bash": 1, "batch_siz": 5, "below": 1, "best": 4, "bioconda": 4, "blob": [1, 2], "both": [4, 5], "c": 5, "calcul": 2, "call": 5, "can": [0, 1], "cd": 1, "child": 5, "citat": 3, "clone": 1, "code": 3, "com": [0, 1, 2], "compris": 5, "conda": 3, "construct": 2, "contact": 3, "context": 2, "convers": 5, "conversionr": 3, "convert": 5, "coordin": 2, "core": 5, "correspond": 5, "coverag": 2, "creat": 4, "cytosin": 2, "data": 1, "dedupl": [3, 4], "default": 5, "deplet": 5, "design": 2, "desir": 5, "detail": [2, 5], "direct": 5, "directori": 5, "discard_multimap": 5, "doc": 2, "docker": 3, "document": 5, "each": [2, 5], "ensur": 4, "essenti": 5, "estim": 5, "exampl": 3, "example_b": 1, "execut": 5, "explan": 2, "extract": [2, 5], "facilit": 5, "fasta": 5, "fastq": [1, 5], "fastq1_file_path": 5, "fastq2_file_path": 5, "fastuniq": [4, 5], "file": [1, 3, 5], "final": 1, "find": 0, "firstli": 5, "folder": 1, "format": [1, 3, 5], "four": 5, "fq1": 5, "fq2": 5, "fragment": 3, "from": 5, "fulli": 5, "further": 5, "g": 5, "gaf": [3, 5], "galor": 4, "gener": [1, 5], "genom": [1, 2], "gfa": [1, 3, 5], "gfa1": 2, "gfa_file_path": 5, "gfatool": 2, "giraff": [4, 5], "git": 1, "github": [0, 1, 2], "glossari": 2, "graph": [1, 5], "graphic": 3, "ha": [4, 5], "have": 0, "here": 0, "how": 1, "html": 2, "http": [0, 1, 2], "hub": 4, "i": [1, 2, 4, 5], "id": 2, "ident": 5, "import": 5, "includ": [2, 5], "incorpor": 1, "index": 3, "index_prefix": 5, "instal": [1, 3], "involv": 5, "io": 2, "issu": 3, "its": 2, "lambda_phage_genome_path": 5, "lambdaphagefastafilepath": 5, "lastli": 5, "latest": 4, "lh3": 2, "link": 5, "list": 2, "lp": 5, "mai": 5, "main": [1, 3], "make_exampl": 1, "map": 3, "master": 2, "md": 2, "mean": 5, "methyl": [1, 5], "methylcal": 3, "methylgraph": [0, 1, 2, 4, 5], "minigraph": 2, "minimum_ident": 5, "minimum_mapq": 5, "more": 5, "must": 4, "n": 4, "next": 5, "note": 5, "number": 2, "one": 5, "onli": 5, "oper": 5, "our": 0, "output": [1, 2], "output_prefix": 5, "outputprefix": 5, "packag": 1, "pair": 2, "pangenom": 2, "parent": 5, "path": [4, 5], "percentag": 2, "perform": 4, "pip": 3, "pipelin": 1, "pleas": [0, 5], "posit": [2, 5], "prefix": 5, "preparegenom": 5, "preparegenomeoutputprefix": 5, "prerequisit": 3, "principl": 5, "process": 3, "provid": [2, 5], "py": 1, "python": 4, "python3": 4, "question": 0, "rate": 5, "ratio": 2, "read": 2, "recommend": 4, "redirect": 5, "refer": [3, 5], "rel": 2, "relat": 2, "remov": 5, "repo": 0, "result": 5, "rgfa": 2, "same": 5, "script": 1, "segment": [2, 5], "sh": 1, "share": 5, "show": 1, "simul": 1, "singl": 5, "sort": 5, "sourc": 3, "spec": 2, "specifi": 2, "spike": 5, "step": 5, "strand": 2, "submit": 0, "subsequ": 5, "sum": 2, "support": 2, "t": 5, "test": 4, "thank": 0, "thei": 5, "theworkingdir": 5, "thi": [1, 5], "threads_us": 5, "threadsus": 5, "toi": 3, "toolkit": 4, "total": 2, "tracker": 3, "transform": 5, "trim": 4, "twlab": [0, 1], "two": 5, "type": 3, "u": 3, "undergo": 5, "unmethyl": 2, "us": 1, "verifi": 1, "version": 4, "vg": [2, 4, 5], "via": 3, "whether": 2, "which": [1, 2], "within": 1, "work": 5, "work_dir": 5, "workflow": 1, "working_directori": 5, "would": 4, "y": 5, "you": [0, 1, 4, 5], "your": 4, "yourgfafilepath": 5}, "titles": ["Contact us", "Toy Example", "File Types", "Welcome to methylGrapher\u2019s documentation!", "Installation", "Process"], "titleterms": {"": 3, "align": 5, "analysi": 3, "argument": 5, "assembli": 2, "base": 3, "bisulfit": 3, "citat": 0, "code": [0, 4], "conda": 4, "contact": 0, "conversionr": 5, "dedupl": 5, "descript": 5, "dna": 3, "docker": 4, "document": 3, "exampl": [1, 5], "file": 2, "format": 2, "fragment": 2, "gaf": 2, "genom": [3, 5], "gfa": 2, "graph": [2, 3], "graphic": 2, "index": 5, "instal": 4, "issu": 0, "main": 5, "map": 2, "methyl": [2, 3], "methylcal": 5, "methylgraph": 3, "option": 5, "pip": 4, "prerequisit": 4, "process": 5, "refer": 2, "requir": 5, "sequenc": 3, "sourc": [0, 4], "toi": 1, "tracker": 0, "type": 2, "u": 0, "us": [3, 4], "usag": 5, "via": 4, "welcom": 3, "whole": 3}}) \ No newline at end of file +Search.setIndex({"alltitles": {"Align": [[5, "align"]], "Citation": [[0, "citation"]], "Contact us": [[0, null]], "ConversionRate": [[5, "conversionrate"]], "Deduplication": [[5, "deduplication"]], "Description": [[5, "description"], [5, "id1"], [5, "id5"], [5, "id9"], [5, "id13"]], "Example Usage": [[5, "example-usage"], [5, "id2"], [5, "id6"], [5, "id10"], [5, "id14"]], "File Types": [[2, null]], "GAF: Graphical mApping Format": [[2, "gaf-graphical-mapping-format"]], "GFA: Graphical Fragment Assembly format": [[2, "gfa-graphical-fragment-assembly-format"]], "Genome Indexing": [[5, "genome-indexing"]], "Install via CONDA": [[4, "install-via-conda"]], "Install via PIP": [[4, "install-via-pip"]], "Installation": [[4, null]], "Main": [[5, "main"]], "MethylCall": [[5, "methylcall"]], "Optional arguments": [[5, "optional-arguments"], [5, "id4"], [5, "id8"], [5, "id12"]], "Prerequisites": [[4, "prerequisites"]], "Process": [[5, null]], "Reference:": [[2, "reference"]], "Required arguments": [[5, "required-arguments"], [5, "id3"], [5, "id7"], [5, "id11"], [5, "id15"]], "Source code": [[4, "source-code"]], "Source code and issue tracker": [[0, "source-code-and-issue-tracker"]], "Toy Example": [[1, null]], "Use Docker": [[4, "use-docker"]], "Welcome to methylGrapher\u2019s documentation!": [[3, null]], "methyl: Graph methylation file": [[2, "methyl-graph-methylation-file"]], "methylGrapher: Genome-Graph-Based DNA Methylation Analysis Using Whole Genome Bisulfite Sequencing": [[3, "methylgrapher-genome-graph-based-dna-methylation-analysis-using-whole-genome-bisulfite-sequencing"]]}, "docnames": ["contact", "example", "filetype", "index", "install", "process"], "envversion": {"sphinx": 63, "sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2}, "filenames": ["contact.rst", "example.rst", "filetype.rst", "index.rst", "install.rst", "process.rst"], "indexentries": {}, "objects": {}, "objnames": {}, "objtypes": {}, "terms": {"0": [2, 5], "1": 5, "10": 4, "11": 4, "12": 4, "20": 5, "3": 4, "4096": 5, "9": 4, "For": 2, "If": 0, "The": [1, 2, 4, 5], "activ": 4, "ad": 4, "adapt": 4, "addition": 5, "against": 5, "align": [3, 4], "all": 1, "along": 1, "also": 1, "an": 0, "analyz": 1, "ani": 0, "anoth": 5, "ar": [1, 4, 5], "assembli": 3, "base": [2, 5], "bash": 1, "batch_siz": 5, "below": 1, "best": 4, "bioconda": 4, "blob": [1, 2], "both": 4, "c": 5, "calcul": 2, "call": 5, "can": [0, 1], "cd": 1, "citat": 3, "clone": 1, "code": 3, "com": [0, 1, 2], "compris": 5, "conda": 3, "construct": 2, "contact": 3, "context": 2, "convers": 5, "conversionr": 3, "convert": 5, "coordin": 2, "core": 5, "coverag": 2, "creat": 4, "cytosin": 2, "data": 1, "dedupl": [3, 4], "default": 5, "deplet": 5, "design": 2, "desir": 5, "detail": [2, 5], "direct": 5, "directori": 5, "discard_multimap": 5, "doc": 2, "docker": 3, "document": 5, "each": 2, "ensur": 4, "essenti": 5, "estim": 5, "exampl": 3, "example_b": 1, "execut": 5, "explan": 2, "extract": [2, 5], "facilit": 5, "fasta": 5, "fastq": [1, 5], "fastq1_file_path": 5, "fastq2_file_path": 5, "fastuniq": [4, 5], "file": [1, 3, 5], "final": 1, "find": 0, "firstli": 5, "folder": 1, "format": [1, 3, 5], "four": 5, "fq1": 5, "fq2": 5, "fragment": 3, "from": 5, "fulli": 5, "further": 5, "g": 5, "gaf": [3, 5], "galor": 4, "gener": [1, 5], "genom": [1, 2], "gfa": [1, 3, 5], "gfa1": 2, "gfa_file_path": 5, "gfatool": 2, "giraff": [4, 5], "git": 1, "github": [0, 1, 2], "glossari": 2, "graph": [1, 5], "graphic": 3, "ha": 4, "have": 0, "here": 0, "how": 1, "html": 2, "http": [0, 1, 2], "hub": 4, "i": [1, 2, 4, 5], "id": 2, "includ": [2, 5], "incorpor": 1, "index": 3, "index_prefix": 5, "instal": [1, 3], "involv": 5, "io": 2, "issu": 3, "its": 2, "lambda_phage_genome_path": 5, "lambdaphagefastafilepath": 5, "lastli": 5, "latest": 4, "lh3": 2, "list": 2, "lp": 5, "mai": 5, "main": [1, 3], "make_exampl": 1, "map": 3, "master": 2, "md": 2, "methyl": [1, 5], "methylcal": 3, "methylgraph": [0, 1, 2, 4, 5], "minigraph": 2, "minimum_ident": 5, "minimum_mapq": 5, "more": 5, "must": 4, "n": 4, "next": 5, "number": 2, "one": 5, "oper": 5, "our": 0, "output": [1, 2], "output_prefix": 5, "outputprefix": 5, "packag": 1, "pair": 2, "pangenom": 2, "path": 4, "percentag": 2, "perform": 4, "pip": 3, "pipelin": 1, "pleas": [0, 5], "posit": 2, "prefix": 5, "preparegenom": 5, "preparegenomeoutputprefix": 5, "prerequisit": 3, "process": 3, "provid": [2, 5], "py": 1, "python": 4, "python3": 4, "question": 0, "rate": 5, "ratio": 2, "read": 2, "recommend": 4, "refer": [3, 5], "rel": 2, "relat": 2, "repo": 0, "result": 5, "rgfa": 2, "script": 1, "segment": 2, "sh": 1, "show": 1, "simul": 1, "singl": 5, "sort": 5, "sourc": 3, "spec": 2, "specifi": 2, "spike": 5, "step": 5, "strand": 2, "submit": 0, "subsequ": 5, "sum": 2, "support": 2, "t": 5, "test": 4, "thank": 0, "theworkingdir": 5, "thi": [1, 5], "threads_us": 5, "threadsus": 5, "toi": 3, "toolkit": 4, "total": 2, "tracker": 3, "transform": 5, "trim": 4, "twlab": [0, 1], "two": 5, "type": 3, "u": 3, "undergo": 5, "unmethyl": 2, "us": 1, "verifi": 1, "version": 4, "vg": [2, 4, 5], "via": 3, "whether": 2, "which": [1, 2], "within": 1, "work": 5, "work_dir": 5, "workflow": 1, "working_directori": 5, "would": 4, "y": 5, "you": [0, 1, 4, 5], "your": 4, "yourgfafilepath": 5}, "titles": ["Contact us", "Toy Example", "File Types", "Welcome to methylGrapher\u2019s documentation!", "Installation", "Process"], "titleterms": {"": 3, "align": 5, "analysi": 3, "argument": 5, "assembli": 2, "base": 3, "bisulfit": 3, "citat": 0, "code": [0, 4], "conda": 4, "contact": 0, "conversionr": 5, "dedupl": 5, "descript": 5, "dna": 3, "docker": 4, "document": 3, "exampl": [1, 5], "file": 2, "format": 2, "fragment": 2, "gaf": 2, "genom": [3, 5], "gfa": 2, "graph": [2, 3], "graphic": 2, "index": 5, "instal": 4, "issu": 0, "main": 5, "map": 2, "methyl": [2, 3], "methylcal": 5, "methylgraph": 3, "option": 5, "pip": 4, "prerequisit": 4, "process": 5, "refer": 2, "requir": 5, "sequenc": 3, "sourc": [0, 4], "toi": 1, "tracker": 0, "type": 2, "u": 0, "us": [3, 4], "usag": 5, "via": 4, "welcom": 3, "whole": 3}}) \ No newline at end of file diff --git a/docs/source/process.rst b/docs/source/process.rst index f281181..51f995a 100644 --- a/docs/source/process.rst +++ b/docs/source/process.rst @@ -11,9 +11,6 @@ Description ~~~~~~~~~~~~~~~~~~~~~~ This process involves transforming the genome graph from GFA format into two fully converted genome graphs: one depleted of C bases and another depleted of G bases. Additionally, if desired, you may include a spike-in genome in FASTA format to estimate the conversion rate in a single step further. -.. note:: - It's important to note that in the C-to-T genome graph, if both C and T segments are positioned identically—meaning they share the same parent and child segments, and each has only one parent and one child—the T segments are removed. Additionally, any associated links and paths are redirected to the corresponding C segment. This principle also applies in the G-to-A genome graph. - Example Usage ~~~~~~~~~~~~~~~~~~~~~~ .. code-block:: shell diff --git a/src/config.ini b/src/config.ini index f7b5e47..b1c56c1 100644 --- a/src/config.ini +++ b/src/config.ini @@ -1,6 +1,7 @@ [default] # Provide the path to the executable vg_path = vg +ga_path = GraphAligner fastuniq_path = fastuniq diff --git a/src/gfa.py b/src/gfa.py index cb44b7b..f5c2c7f 100644 --- a/src/gfa.py +++ b/src/gfa.py @@ -345,6 +345,32 @@ def get_sequences_by_segment_ID(self, segment_IDs): return res +# Store segment length in memory +class GraphicalFragmentAssemblySegmentLengthMemory(object): + + def __init__(self): + self.clear() + + def clear(self): + self._segment_length = {} + + def parse(self, gfa_file): + with open(gfa_file) as gfa_fh: + for l in gfa_fh: + if l[0] not in "S": + continue + + l = l.strip().split("\t") + + if l[0] == "S": + rt, segID, seq, *tags = l + # sequence, tag, links, parent_links_count + self._segment_length[segID] = len(seq) + + + def get_sequence_length_by_segment_ID(self, segment_ID): + return self._segment_length[segment_ID] + diff --git a/src/longread.py b/src/longread.py new file mode 100644 index 0000000..639bc7d --- /dev/null +++ b/src/longread.py @@ -0,0 +1,642 @@ + + +import os +import re +import sys +import time +import gzip +import argparse + +import gfa +import utility + + + + + + + + +utl = utility.Utility() + +cytosine_methylation_regex = re.compile(r"\+(\(.*\))\-(\(.*\))") +cigar_regex = re.compile(r"\d+[MID]") + + +def path_to_list(path_str): + # Example: <29983488>29983486<29983485<29983484 + res = [] + direction = True + segment_ID = "" + for c in path_str: + if c in "<>": + if c == "<": + direction = False + elif c == ">": + direction = True + else: + segment_ID += c + continue + + res.append((segment_ID, direction)) + segment_ID = "" + + res.append((segment_ID, direction)) + res.pop(0) + + return res + + +def debug_get_fasta_by_read_name(read_name, fasta_fp): + with open(fasta_fp) as f: + for l in f: + if l.startswith(">"): + if l[1:].strip() == read_name: + return f.readline().strip() + return None + +def sam_bam_line_reader(file_path): + lower_fp = file_path.lower() + if lower_fp.endswith(".sam"): + with open(file_path, 'r') as f: + for line in f: + yield line + elif lower_fp.endswith(".bam"): + samtool_cmd = utility.SystemExecute() + samtool_out, samtool_err = samtool_cmd.execute(f"samtools view -h {file_path}", ) + for line in samtool_out: + yield str(line, 'utf-8') + samtool_cmd.wait() + + + +def bam_sam_to_fasta(bsam_fp, fasta_fp): + # Explore sam file + + output_fasta_fh = None + if utl.isGzip(fasta_fp): + output_fasta_fh = gzip.open(fasta_fp, 'wt') + else: + output_fasta_fh = open(fasta_fp, 'w') + + + no_methylation_tag = 0 + read_line_count = 0 + + + durations = [0] * 10 + timestamps = [0] * 10 + next_report = 10 + + + for l in sam_bam_line_reader(bsam_fp): + # Skip header + if l.startswith('@'): + continue + + + if timestamps[0] == 0: + ts = time.time() + for i in range(10): + timestamps[i] = ts + + durations[0] += time.time() - timestamps[0] + + if sum(durations) > next_report: + print(durations) + next_report += 10 + + line = l.strip().split('\t') + basic_info = line[:11] + tags_list = line[11:] + tags = {} + for tag_str in tags_list: + tag = [tag_str[:4], tag_str[5:]] + if tag_str.startswith("Ml:B:C"): + tag = ["Ml:B:C", tag_str[7:]] + tags[tag[0]] = tag[1] + + read_name = basic_info[0] + read_seq = basic_info[9].upper() + + if 'Mm:Z' not in tags: + no_methylation_tag += 1 + continue + + if "Ml:B:C" not in tags: + no_methylation_tag += 1 + continue + + mm_tag = tags['Mm:Z'] + ml_tag = tags["Ml:B:C"] + + if mm_tag.endswith(";"): + mm_tag = mm_tag[:-1] + if ml_tag.endswith(";"): + ml_tag = ml_tag[:-1] + + mm_tag = mm_tag.split(",") + ml_tag = ml_tag.split(",") + + if mm_tag[0] != 'C+m': + # print(mm_tag[0], "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!") + continue + else: + mm_tag = mm_tag[1:] + + if len(ml_tag) in [0, 1]: + continue + + mm_tag = [int(x) for x in mm_tag] + ml_tag = [int(x) for x in ml_tag] + assert len(mm_tag) == len(ml_tag) + + cg_count = read_seq.count("C") + + cytosines_positions = [] + cpg_positions = [] + for i, nuc in enumerate(read_seq): + if nuc == "C": + cytosines_positions.append(i) + if i < len(read_seq) - 1 and read_seq[i + 1] == "G": + cpg_positions.append(i) + + methylated_cytosines = [] + unmethylated_cytosines = [] + nth_cytosine = 0 + for i in range(len(mm_tag)): + interval = mm_tag[i] + 1 + if nth_cytosine == 0: + nth_cytosine = interval - 1 + else: + nth_cytosine += interval + # print(nth_cytosine, interval) + + methylated = ml_tag[i] > 127 + if methylated: + methylated_cytosines.append(cytosines_positions[nth_cytosine]) + else: + unmethylated_cytosines.append(cytosines_positions[nth_cytosine]) + + cpg_covered_count = len( + set(cpg_positions).intersection(set(methylated_cytosines).union(set(unmethylated_cytosines)))) + if cpg_covered_count < len(cpg_positions) - 5: + # print(read_line_count, cpg_covered_count, len(cpg_positions)) + # continue + pass + + # print(read_name) + + # print(read_line_count) + # print(read_name) + # print(l) + # print(len(read_seq)) + # print(line) + # print(basic_info[:9]) + # print(tags) + # print(cg_count) + # print(len(mm_tag)) + # print(len(ml_tag)) + + # print(mm_tag[:40]) + # print(cytosines_positions[:80]) + # print() + # print(methylated_cytosines[:20]) + # print(cpg_positions[:20]) + + # print(len(methylated_cytosines), len(cpg_positions)) + # print(methylated_cytosines[-1], cpg_positions[-1]) + + # print('\n') + + methylated_cytosines = [str(x) for x in methylated_cytosines] + unmethylated_cytosines = [str(x) for x in unmethylated_cytosines] + met_str = f"+({','.join(methylated_cytosines)})-({','.join(unmethylated_cytosines)})" + fasta_entry = f">{read_name}{met_str}\n{read_seq}\n\n" + + # print(fasta_entry) + output_fasta_fh.write(fasta_entry) + + read_line_count += 1 + + + tnow = time.time() + durations[1] += tnow - timestamps[0] + timestamps[0] = tnow + + + + + output_fasta_fh.close() + return None + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +def generate_fp_within_wd(wd, file_name): + return os.path.join(wd, file_name) + +def generate_fasta_fp(wd): + return generate_fp_within_wd(wd, "input.fasta") + + +def extract_methylation_single_thread(gfa_fp, wd, threads=1, debug=False): + """ + Extract methylation information + :param wd: working directory + :param gfa: genome graph file path in GFA format + :return: methylation information + """ + + counter_pass = 0 + counter_issue_no_methylation = 0 + counter_issue = 0 + counter_total = 0 + + methylation_result = {} + + time_start = time.time() + + gfa_instance = gfa.GraphicalFragmentAssemblySegmentLengthMemory() + gfa_instance.parse(gfa_fp) + + if debug: + # Just in case the sequence is needed for debugging + gfa_seq_instance = gfa.GraphicalFragmentAssemblyMemorySegmentOptimized() + gfa_seq_instance.parse(gfa_fp) + + + + with open(generate_fp_within_wd(wd, "align.gaf")) as f: + for l in f: + + l = l.strip().split("\t") + # print(l) + + counter_total += 1 + if counter_total % 1000 == 0: + duration = time.time() - time_start + # TODO better logging + # print(counter_total, counter_pass, counter_issue_no_methylation, counter_issue, duration) + + # Read the graph alignment file (GAF) + read_name = l[0] + + query_len = int(l[1]) + query_start = int(l[2]) + query_end = int(l[3]) + + strand = l[4] + + path_str = l[5] + path_list = path_to_list(path_str) + + path_len = int(l[6]) + path_start = int(l[7]) + path_end = int(l[8]) + + matched_len = int(l[9]) + alignment_block_len = int(l[10]) + mapq = int(l[11]) + + tag_list = l[12:] + tags = {} + for tag_str in tag_list: + tname, tdt, tv = tag_str.split(":") + if tdt == "i": + tv = int(tv) + elif tdt == "f": + tv = float(tv) + tags[tname] = tv + # print(tname, tv) + + # Example: 42M1I5M1D23M1I1M1I1M1I13M1D41M1D35M1D4M1D + cigar = [] + cigar_str = tags["cg"] + for c in cigar_regex.findall(cigar_str): + cigar.append((int(c[:-1]), c[-1])) + + alignment_block_type = [] + block_start = 0 + block_end = 0 + for length, alignment_type in cigar: + block_end += length + + alignment_block_type.append((block_start, block_end, alignment_type)) + block_start = block_end + alignment_block_type.append((block_start, block_end, alignment_type)) + + + + # Sanity checks + + # make sure there is methylation information here, and there is no conflict + mc = cytosine_methylation_regex.findall(read_name) + if len(mc) != 1: + counter_issue_no_methylation += 1 + continue + + methylated_cytosines = mc[0][0][1:-1].split(",") + unmethylated_cytosines = mc[0][1][1:-1].split(",") + + if len(methylated_cytosines) + len(unmethylated_cytosines) == 0: + counter_issue_no_methylation += 1 + continue + + methylated = list(map(int, list(filter(lambda x: x != "", methylated_cytosines)))) + unmethylated = list(map(int, list(filter(lambda x: x != "", unmethylated_cytosines)))) + + + # Just to make sure the cigar interpretation is correct + cigar_freq = {} + for c in cigar: + if c[1] not in cigar_freq: + cigar_freq[c[1]] = 0 + cigar_freq[c[1]] += c[0] + assert path_end - path_start == cigar_freq["M"] + cigar_freq.get("D", 0) + assert query_end - query_start == cigar_freq["M"] + cigar_freq.get("I", 0) + + reconstructed_cigar = "" + for c in cigar: + reconstructed_cigar += str(c[0]) + c[1] + assert reconstructed_cigar == cigar_str + + + # if ">" in path_str: + # continue + + + # Just to make sure the path interpretation is correct + path_len_constructed = 0 + path_segment_length = {} + for segID, direction in path_list: + seq_length = gfa_instance.get_sequence_length_by_segment_ID(segID) + path_len_constructed += seq_length + path_segment_length[segID] = seq_length + assert path_len_constructed == path_len + + + # Debugging + # query_seq = debug_get_fasta_by_read_name(read_name, "./short.fasta") + query_seq = "" + path_seq = "" + + """ + for i, (segID, direction) in enumerate(path_list): + seq = gfa_instance.get_sequence_by_segment_ID(segID) + if not direction: + seq = seq_reverse_complement(seq) + path_seq += seq + """ + + + pos_in_read = -1 + pos_in_path = -1 + for alignment_block_len, alignment_block_type in cigar: + + for ai in range(alignment_block_len): + if alignment_block_type == "M": + pos_in_read += 1 + pos_in_path += 1 + + pos_in_abs_path = path_start + pos_in_path + pos_in_abs_read = query_start + pos_in_read + + met_flag = None + if pos_in_abs_read in methylated: + met_flag = "M" + elif pos_in_abs_read in unmethylated: + met_flag = "U" + + if met_flag is not None: + + corresponding_segment_ID = None + corresponding_segment_pos = pos_in_abs_path + for segID, direction in path_list: + if corresponding_segment_pos < path_segment_length[segID]: + corresponding_segment_ID = segID + if not direction: + # TODO need double check + corresponding_segment_pos = path_segment_length[segID] - corresponding_segment_pos - 1 + break + + corresponding_segment_pos -= path_segment_length[segID] + + if debug: + segment_seq = gfa_seq_instance.get_sequence_by_segment_ID(corresponding_segment_ID) + csp1 = corresponding_segment_pos + csp2 = corresponding_segment_pos + 2 + if csp2 > len(segment_seq): + csp2 = csp1 + 1 + + if not direction: + csp2 = corresponding_segment_pos + csp1 = corresponding_segment_pos - 2 + if csp1 < 0: + csp1 = 0 + + # This is to check whether I am aligning CG within the read to CG in the segment + print(path_seq[pos_in_abs_path: pos_in_abs_path+2], query_seq[pos_in_abs_read:pos_in_abs_read+2], segment_seq[csp1: csp2], met_flag) + + if corresponding_segment_ID not in methylation_result: + methylation_result[corresponding_segment_ID] = {} + if corresponding_segment_pos not in methylation_result[corresponding_segment_ID]: + dir_str = "-" if direction else "+" + methylation_result[corresponding_segment_ID][corresponding_segment_pos] = [dir_str, 0, 0] + methylation_result[corresponding_segment_ID][corresponding_segment_pos][2] += 1 + if met_flag == "M": + methylation_result[corresponding_segment_ID][corresponding_segment_pos][1] += 1 + + + elif alignment_block_type == "I": + pos_in_read += 1 + elif alignment_block_type == "D": + pos_in_path += 1 + else: + raise Exception("Unknown alignment block type") + + # print(path_len_constructed, path_len) + # print(cigar) + # print(alignment_block_type) + + # print(path_start, path_end, path_end - path_start) + # print(query_start, query_end, query_end-query_start) + # print(cigar_freq) + + # print(path_seq[433:533]) + # print(query_seq[:100]) + # print(path_str) + counter_pass += 1 + # break + + + + for segment_ID in sorted(methylation_result): + for pos in sorted(methylation_result[segment_ID]): + strand, met, cov = methylation_result[segment_ID][pos] + unmet = cov - met + l = f"{segment_ID}\t{pos}\t{strand}\t{met}\t{unmet}\t{cov}" + print(l) + return None + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +def prepare_fasta(basecall, wd, threads=1, debug=False): + """ + Prepare fasta file from long read base call BAM/SAM file + :param basecall: long read base call BAM/SAM file + :param wd: working directory + :return: fasta file + """ + + bam_sam_to_fasta(basecall, generate_fasta_fp(wd)) + return None + + +def align(gfa, wd, threads=1, debug=False): + """ + Align long reads to genome graph + :param wd: working directory + :param gfa: genome graph file path in GFA format + """ + + # TODO use non-default version of GraphAligner + cmd = f"GraphAligner -t {threads} --cigar-match-mismatch -g {gfa} -f {generate_fasta_fp(wd)} -a {generate_fp_within_wd(wd, 'align.gaf')} -x vg" + align_exe = utility.SystemExecute() + + # TODO + # align_exe.execute(cmd, stdout=generate_fp_within_wd(wd, 'align.log'), stderr=generate_fp_within_wd(wd, 'align.err')) + print(cmd) + + return None + +def extract_methylation(gfa_fp, wd, threads=1, debug=False): + """ + Extract methylation information + :param wd: working directory + :param gfa_fp: genome graph file path in GFA format + :return: methylation information + """ + + extract_methylation_single_thread(gfa_fp, wd, debug=debug) + + return None + + +def main(basecall, gfa_fp, wd, threads=1, debug=False): + """ + One step to run all + :param basecall: long read base call BAM/SAM file + :param gfa_fp: genome graph file path in GFA format + :param wd: working directory + :return: methylation information + """ + + prepare_fasta(basecall, wd, threads=threads, debug=debug) + align(gfa_fp, wd, threads=threads, debug=debug) + extract_methylation(gfa_fp, wd, threads=threads, debug=debug) + + return None + + + + + +if __name__ == "__main__": + + pass diff --git a/src/main.py b/src/main.py index 55eb3f9..2b48405 100644 --- a/src/main.py +++ b/src/main.py @@ -4,7 +4,7 @@ __copyright__ = "Copyright 2023-2024, Ting Wang Lab" __credits__ = ["Juan Macias"] __license__ = "MIT" -__version__ = "0.1.1" +__version__ = "0.1.2" __maintainer__ = "Wenjin Zhang" __email__ = "wenjin@wustl.edu" @@ -98,18 +98,23 @@ else: freport.write(f"Insert lambda phage genome into genome graph as segment: {lambda_segment_id}\n") - graph_trim_flag = kvargs.get("trim", "Y").lower() in "yestrue" - g = gfa.GraphicalFragmentAssemblyMemory() - g.parse(original_gfa_with_lambda, keep_link=True) - g.write_converted(gfa_c2t, "C", "T", SNV_trim=graph_trim_flag) + graph_trim_flag = kvargs.get("trim", "N").lower() in "yestrue" - del g - g = gfa.GraphicalFragmentAssemblyMemory() - g.parse(original_gfa_with_lambda, keep_link=True) - g.write_converted(gfa_g2a, "G", "A", SNV_trim=graph_trim_flag) + if graph_trim_flag: + g = gfa.GraphicalFragmentAssemblyMemory() + g.parse(original_gfa_with_lambda, keep_link=True) + g.write_converted(gfa_c2t, "C", "T", SNV_trim=graph_trim_flag) - del g + del g + + g = gfa.GraphicalFragmentAssemblyMemory() + g.parse(original_gfa_with_lambda, keep_link=True) + g.write_converted(gfa_g2a, "G", "A", SNV_trim=graph_trim_flag) + + del g + else: + utility.gfa_converter(original_gfa_with_lambda, prefix+".wl", compress=False) for gfa_file_path in [gfa_c2t, gfa_g2a]: index_prefix = gfa_file_path[:-4] diff --git a/src/mainL.py b/src/mainL.py new file mode 100644 index 0000000..2637bc1 --- /dev/null +++ b/src/mainL.py @@ -0,0 +1,167 @@ + + + +import os +import re +import sys +import argparse + + +import longread + + + + + + + + +if __name__ == "__main__": + + # Arugment parser + parser = argparse.ArgumentParser(description='methylGrapher for long reads') + + subparsers = parser.add_subparsers(dest='command', help='Sub-command help') + + + # main + parser_main = subparsers.add_parser('main', help='One step to run all') + parser_main.add_argument('-basecall', help='Long read base call BAM/SAM file', required=True) + parser_main.add_argument('-gfa', help='Genome graph file path in GFA format', required=True) + parser_main.add_argument('-t', help='number of threads', default=1, type=int) + parser_main.add_argument('-debug', help='debug mode') + parser_main.add_argument('-wd', help='working directory', default="./") + + + # 1 prepare fasta + parser_prepare_fasta = subparsers.add_parser('prepare_fasta', help='Prepare fasta file') + parser_prepare_fasta.add_argument('-basecall', help='Long read base call BAM/SAM file', required=True) + parser_prepare_fasta.add_argument('-wd', help='working directory', default="./") + parser_prepare_fasta.add_argument('-t', help='number of threads', default=1, type=int) + parser_prepare_fasta.add_argument('-debug', help='debug mode') + + + # 2 align + parser_align = subparsers.add_parser('align', help='Align long reads to genome graph') + parser_align.add_argument('-gfa', help='Genome graph file path in GFA format', required=True) + parser_align.add_argument('-wd', help='Long read fasta file', default="./") + parser_align.add_argument('-t', help='number of threads', default=1, type=int) + parser_align.add_argument('-debug', help='debug mode') + + # 3 methylation extraction + parser_extraction = subparsers.add_parser('extraction', help='Extract methylation information') + parser_extraction.add_argument('-gfa', help='Genome graph file path in GFA format', required=True) + parser_extraction.add_argument('-wd', help='working directory', default="./") + parser_extraction.add_argument('-t', help='number of threads', default=1, type=int) + parser_extraction.add_argument('-debug', help='debug mode') + + + # TODO add function execution for subcommands + + + + args = parser.parse_args() + + if args.command == 'main': + print("Running methylGrapherL with main function", file=sys.stderr) + + + wd = "./" + thread = 1 + debug = False + basecall_fp = args.basecall + gfa_fp = args.gfa + + if args.wd: + wd = args.wd + if args.t: + thread = int(args.t) + if args.debug: + debug = True + + + + # Log parameters + print("Long read base call file path: ", basecall_fp, file=sys.stderr) + print("Genome graph file path: ", gfa_fp, file=sys.stderr) + print("Working directory: ", wd, file=sys.stderr) + print("Number of threads: ", thread, file=sys.stderr) + print("Debug mode: ", debug, file=sys.stderr) + + # Run main function + longread.main(basecall_fp, gfa_fp, wd) + + + elif args.command == 'prepare_fasta': + print("Prepare fasta", file=sys.stderr) + + wd = "./" + thread = 1 + debug = False + basecall_fp = args.basecall + + if args.wd: + wd = args.wd + if args.t: + thread = int(args.t) + if args.debug: + debug = True + + # Log parameters + print("Long read base call file path: ", basecall_fp, file=sys.stderr) + print("Working directory: ", wd, file=sys.stderr) + print("Number of threads: ", thread, file=sys.stderr) + print("Debug mode: ", debug, file=sys.stderr) + + # Run prepare fasta function + longread.prepare_fasta(basecall_fp, wd) + + + + elif args.command == 'align': + print("Align") + + wd = "./" + thread = 1 + debug = False + gfa_fp = args.gfa + + if args.wd: + wd = args.wd + if args.t: + thread = int(args.t) + if args.debug: + debug = True + + + # Log parameters + print("Genome graph file path: ", gfa_fp, file=sys.stderr) + print("Working directory: ", wd, file=sys.stderr) + print("Number of threads: ", thread, file=sys.stderr) + print("Debug mode: ", debug, file=sys.stderr) + + + + elif args.command == 'extraction': + wd = "./" + thread = 1 + debug = False + gfa_fp = args.gfa + + if args.wd: + wd = args.wd + if args.t: + thread = int(args.t) + if args.debug: + debug = True + + # Log parameters + print("Genome graph file path: ", gfa_fp, file=sys.stderr) + print("Working directory: ", wd, file=sys.stderr) + print("Number of threads: ", thread, file=sys.stderr) + print("Debug mode: ", debug, file=sys.stderr) + + else: + print('No command provided') + + diff --git a/src/mainLOLD.py b/src/mainLOLD.py new file mode 100644 index 0000000..b8b1b4a --- /dev/null +++ b/src/mainLOLD.py @@ -0,0 +1,578 @@ + + + +import os +import re +import sys +import time + +import gfa +import mcall +import utility + + + + +class tmpHelp: + def __init__(self): + self._help_text = """ + Usage: main.py [options] + + Commands: + prepareGenome Prepare genome for methylation calling + callMethylation Call methylation from nanopore reads + + Options: + -h, --help Show this help message and exit + -t, --thread Number of threads to use + """ + + def help_text(self): + return self._help_text + + +help = tmpHelp() + + +utl = utility.Utility() + + + + + + +# Step1: SAM/BAM file to fasta file +# Step2: Alignment +# Step3: Methylation extraction + + + + + +cytosine_methylation_regex = re.compile(r"\+(\(.*\))\-(\(.*\))") + + +def path_to_list(path_str): + # Example: <29983488>29983486<29983485<29983484 + res = [] + direction = True + segment_ID = "" + for c in path_str: + if c in "<>": + if c == "<": + direction = False + elif c == ">": + direction = True + else: + segment_ID += c + continue + + res.append((segment_ID, direction)) + segment_ID = "" + + res.append((segment_ID, direction)) + res.pop(0) + + return res + + +def debug_get_fasta_by_read_name(read_name, fasta_fp): + with open(fasta_fp) as f: + for l in f: + if l.startswith(">"): + if l[1:].strip() == read_name: + return f.readline().strip() + return None + + +def seq_reverse_complement(seq): + return seq.translate(str.maketrans("ATCG", "TAGC"))[::-1] + + +gfa_fp = f"/scratch/wzhang/ref/graph/wgbs_bspg/bspg.wl.gfa" + + +# Store entire GFA in memory +class GraphicalFragmentAssemblyMemory(object): + + def __init__(self): + self.clear() + + def clear(self): + self._header = "" + self._segment = {} + self._walk = {} + + self._original_gfa_path = "" + + def parse(self, gfa_file): + self._original_gfa_path = gfa_file + # TODO when not keep_tag, keep_link, forbid to write GFA and access those two + for l in open(gfa_file): + if l[0] not in "HSL": + continue + + l = l.strip().split("\t") + + if l[0] == "H": + self._header = "\t".join(l[1:]) + + if l[0] == "S": + rt, segID, seq, *tags = l + # sequence, tag, links, parent_links_count + self._segment[segID] = seq + + else: + continue + + def get_sequence_by_segment_ID(self, segment_ID): + return self._segment[segment_ID] + + def get_sequence_length_by_segment_ID(self, segment_ID): + return len(self._segment[segment_ID]) + + def get_sequences_by_segment_ID(self, segment_IDs): + res = {} + for sID in segment_IDs: + res[sID] = self.get_sequence_by_segment_ID(sID) + return res + + def get_tag_by_segment_ID(self, segment_ID): + raise NotImplementedError + + def get_parent_link_count(self, segment_ID): + return self._segment[segment_ID][3] + + def get_parent_links(self, segment_ID): + raise NotImplementedError + + def get_child_links(self, segment_ID): + return self._segment[segment_ID][2] + + def all_segment_IDs(self): + return self._segment.keys() + + +gfa_instance = GraphicalFragmentAssemblyMemory() +gfa_instance.parse(gfa_fp) + + + + + + + + + + + + + +def bsam_to_fasta(bsam_fp, fasta_fp): + # Explore sam file + + output_fasta = open(fasta_fp, "w") + + no_methylation_tag = 0 + with open(bsam_fp) as f: + read_line_count = 0 + for l in f: + # Skip header + if l.startswith('@'): + continue + + line = l.strip().split('\t') + basic_info = line[:11] + tags_list = line[11:] + tags = {} + for tag_str in tags_list: + tag = [tag_str[:4], tag_str[5:]] + if tag_str.startswith("Ml:B:C"): + tag = ["Ml:B:C", tag_str[7:]] + tags[tag[0]] = tag[1] + + read_name = basic_info[0] + read_seq = basic_info[9].upper() + + if 'Mm:Z' not in tags: + no_methylation_tag += 1 + continue + + if "Ml:B:C" not in tags: + no_methylation_tag += 1 + continue + + mm_tag = tags['Mm:Z'] + ml_tag = tags["Ml:B:C"] + + if mm_tag.endswith(";"): + mm_tag = mm_tag[:-1] + if ml_tag.endswith(";"): + ml_tag = ml_tag[:-1] + + mm_tag = mm_tag.split(",") + ml_tag = ml_tag.split(",") + + if mm_tag[0] != 'C+m': + # print(mm_tag[0], "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!") + continue + else: + mm_tag = mm_tag[1:] + + if len(ml_tag) in [0, 1]: + continue + + mm_tag = [int(x) for x in mm_tag] + ml_tag = [int(x) for x in ml_tag] + assert len(mm_tag) == len(ml_tag) + + cg_count = read_seq.count("C") + + cytosines_positions = [] + cpg_positions = [] + for i, nuc in enumerate(read_seq): + if nuc == "C": + cytosines_positions.append(i) + if i < len(read_seq) - 1 and read_seq[i + 1] == "G": + cpg_positions.append(i) + + methylated_cytosines = [] + unmethylated_cytosines = [] + nth_cytosine = 0 + for i in range(len(mm_tag)): + interval = mm_tag[i] + 1 + if nth_cytosine == 0: + nth_cytosine = interval - 1 + else: + nth_cytosine += interval + # print(nth_cytosine, interval) + + methylated = ml_tag[i] > 127 + if methylated: + methylated_cytosines.append(cytosines_positions[nth_cytosine]) + else: + unmethylated_cytosines.append(cytosines_positions[nth_cytosine]) + + cpg_covered_count = len( + set(cpg_positions).intersection(set(methylated_cytosines).union(set(unmethylated_cytosines)))) + if cpg_covered_count < len(cpg_positions) - 5: + print(read_line_count, cpg_covered_count, len(cpg_positions)) + continue + # print(read_name) + + # print(read_line_count) + # print(read_name) + # print(l) + # print(len(read_seq)) + # print(line) + # print(basic_info[:9]) + # print(tags) + # print(cg_count) + # print(len(mm_tag)) + # print(len(ml_tag)) + + # print(mm_tag[:40]) + # print(cytosines_positions[:80]) + # print() + # print(methylated_cytosines[:20]) + # print(cpg_positions[:20]) + + # print(len(methylated_cytosines), len(cpg_positions)) + # print(methylated_cytosines[-1], cpg_positions[-1]) + + # print('\n') + + methylated_cytosines = [str(x) for x in methylated_cytosines] + unmethylated_cytosines = [str(x) for x in unmethylated_cytosines] + met_str = f"+({','.join(methylated_cytosines)})-({','.join(unmethylated_cytosines)})" + fasta_entry = f">{read_name}{met_str}\n{read_seq}\n\n" + + # print(fasta_entry) + output_fasta.write(fasta_entry) + + read_line_count += 1 + if read_line_count > 10: + pass + + output_fasta.close() + return None + + + + + +counter_pass = 0 +counter_issue_no_methylation = 0 +counter_issue = 0 +counter_total = 0 + +methylation_result = {} + +time_start = time.time() +with open(output_fp) as f: + for l in f: + + l = l.strip().split("\t") + # print(l) + + counter_total += 1 + if counter_total % 1000 == 0: + duration = time.time() - time_start + print(counter_total, counter_pass, counter_issue_no_methylation, counter_issue, duration) + + read_name = l[0] + + query_len = int(l[1]) + query_start = int(l[2]) + query_end = int(l[3]) + + strand = l[4] + + path_str = l[5] + # if ">" in path_str: + # continue + + path_len = int(l[6]) + path_start = int(l[7]) + path_end = int(l[8]) + + matched_len = int(l[9]) + alignment_block_len = int(l[10]) + mapq = int(l[11]) + + tag_list = l[12:] + tags = {} + + # print(tag_list) + + for tag_str in tag_list: + tname, tdt, tv = tag_str.split(":") + if tdt == "i": + tv = int(tv) + elif tdt == "f": + tv = float(tv) + tags[tname] = tv + # print(tname, tv) + + mc = cytosine_methylation_regex.findall(read_name) + if len(mc) != 1: + counter_issue_no_methylation += 1 + continue + + methylated_cytosines = mc[0][0][1:-1].split(",") + unmethylated_cytosines = mc[0][1][1:-1].split(",") + + if len(methylated_cytosines) + len(unmethylated_cytosines) == 0: + counter_issue_no_methylation += 1 + continue + + methylated = list(map(int, list(filter(lambda x: x != "", methylated_cytosines)))) + unmethylated = list(map(int, list(filter(lambda x: x != "", unmethylated_cytosines)))) + + # Example: 42M1I5M1D23M1I1M1I1M1I13M1D41M1D35M1D4M1D + cigar_str = tags["cg"] + cigar = [] + for c in re.findall(r"\d+[MID]", cigar_str): + cigar.append((int(c[:-1]), c[-1])) + + alignment_block_type = [] + block_start = 0 + block_end = 0 + for length, alignment_type in cigar: + block_end += length + + alignment_block_type.append((block_start, block_end, alignment_type)) + block_start = block_end + alignment_block_type.append((block_start, block_end, alignment_type)) + + """ + TODO uncomment + cigar_freq = {} + for c in cigar: + if c[1] not in cigar_freq: + cigar_freq[c[1]] = 0 + cigar_freq[c[1]] += c[0] + + assert path_end - path_start == cigar_freq["M"] + cigar_freq.get("D", 0) + assert query_end - query_start == cigar_freq["M"] + cigar_freq.get("I", 0) + + + # Just to make sure the cigar interpretation is correct + reconstructed_cigar = "" + for c in cigar: + reconstructed_cigar += str(c[0]) + c[1] + assert reconstructed_cigar == cigar_str + """ + + # Just to make sure the path interpretation is correct + path_len_constructed = 0 + path_list = path_to_list(path_str) + for segID, direction in path_list: + seq = gfa_instance.get_sequence_by_segment_ID(segID) + # print(segID, len(seq)) + path_len_constructed += len(seq) + # print(path_str) + assert path_len_constructed == path_len + + # query_seq = debug_get_fasta_by_read_name(read_name, "./short.fasta") + query_seq = "" + path_seq = "" + + """ + for i, (segID, direction) in enumerate(path_list): + seq = gfa_instance.get_sequence_by_segment_ID(segID) + if not direction: + seq = seq_reverse_complement(seq) + path_seq += seq + """ + + path_segment_length = {} + for i, (segID, direction) in enumerate(path_list): + seql = gfa_instance.get_sequence_length_by_segment_ID(segID) + path_segment_length[segID] = seql + + pos_in_read = -1 + pos_in_path = -1 + for alignment_block_len, alignment_block_type in cigar: + + for ai in range(alignment_block_len): + if alignment_block_type == "M": + pos_in_read += 1 + pos_in_path += 1 + + pos_in_abs_path = path_start + pos_in_path + pos_in_abs_read = query_start + pos_in_read + + met_flag = None + if pos_in_abs_read in methylated: + met_flag = "M" + elif pos_in_abs_read in unmethylated: + met_flag = "U" + + if met_flag is not None: + + corresponding_segment_ID = None + corresponding_segment_pos = pos_in_abs_path + for segID, direction in path_list: + if corresponding_segment_pos < path_segment_length[segID]: + corresponding_segment_ID = segID + if not direction: + # TODO need double check + corresponding_segment_pos = path_segment_length[ + segID] - corresponding_segment_pos - 1 + break + + corresponding_segment_pos -= path_segment_length[segID] + + segment_seq = gfa_instance.get_sequence_by_segment_ID(corresponding_segment_ID) + csp1 = corresponding_segment_pos + csp2 = corresponding_segment_pos + 2 + if csp2 > len(segment_seq): + csp2 = csp1 + 1 + + if not direction: + csp2 = corresponding_segment_pos + csp1 = corresponding_segment_pos - 2 + if csp1 < 0: + csp1 = 0 + # print(path_seq[pos_in_abs_path: pos_in_abs_path+2], query_seq[pos_in_abs_read:pos_in_abs_read+2], segment_seq[csp1: csp2], met_flag) + + if corresponding_segment_ID not in methylation_result: + methylation_result[corresponding_segment_ID] = {} + if corresponding_segment_pos not in methylation_result[corresponding_segment_ID]: + dir_str = "-" if direction else "+" + methylation_result[corresponding_segment_ID][corresponding_segment_pos] = [dir_str, 0, 0] + methylation_result[corresponding_segment_ID][corresponding_segment_pos][2] += 1 + if met_flag == "M": + methylation_result[corresponding_segment_ID][corresponding_segment_pos][1] += 1 + + + + + elif alignment_block_type == "I": + pos_in_read += 1 + elif alignment_block_type == "D": + pos_in_path += 1 + else: + raise Exception("Unknown alignment block type") + + # print(path_len_constructed, path_len) + # print(cigar) + # print(alignment_block_type) + + # print(path_start, path_end, path_end - path_start) + # print(query_start, query_end, query_end-query_start) + # print(cigar_freq) + + # print(path_seq[433:533]) + # print(query_seq[:100]) + # print(path_str) + counter_pass += 1 + # break + + + + +if __name__ == "__main__": + args = sys.argv + args.pop(0) + + ga_path = "GraphAligner" + thread = 1 + + # Find config file + try: + config = utility.ConfigParser("config.ini") + vg_path = config.get("default", "ga_path") + thread = config.get("default", "thread") + except: + pass + + try: + thread = int(thread) + except: + thread = 1 + + if len(args) == 0: + print("No command specified. Use 'help' for more information.") + print(help.help_text()) + sys.exit(0) + + command = args.pop(0) + command = command.lower() + if command not in ["help", "-h", "--help", "main"]: + print(f"Unknown command: {command}") + sys.exit(1) + + kvargs = {} + while len(args) > 0: + arg = args.pop(0) + if arg.startswith("-"): + key = arg[1:] + value = args.pop(0) + kvargs[key] = value + else: + print(f"Unknown argument: {arg}\n\n") + print(help.help_text()) + sys.exit(1) + + if "thread" in kvargs: + thread = int(kvargs["thread"]) + del kvargs["thread"] + if "t" in kvargs: + thread = int(kvargs["t"]) + del kvargs["t"] + + + if command in ["help", "-h", "--help"]: + print(help.help_text()) + sys.exit(0) + + + + if command == "preparegenome": + original_gfa_file_path = kvargs["gfa"] + prefix = kvargs["prefix"] + + fn_report = prefix + ".prepare.genome.report.txt" + freport = open(fn_report, "w") \ No newline at end of file diff --git a/src/mgmp.py b/src/mgmp.py new file mode 100644 index 0000000..650c187 --- /dev/null +++ b/src/mgmp.py @@ -0,0 +1,361 @@ + + +import os +import sys +import time +import multiprocessing + + + + +# write a framework for both single threaded and multi-threaded execution + +# 3 main class +# 1. Functions to execute +# 2. Worker class +# 3. Task class + + + + +# The single threaded execution is just one function after another +# The multi-processed execution is more complex + + + +def test_step1(iter): + + i = 0 + while i < 10000: + i += 1 + yield i + + +def test_step2(iter): + + for i in iter: + time.sleep(0.0001) + yield i**2 + + + + + +def worker_function(pid, name, function, input_queue, output_queue, batch_size=1024): + print(f"{name} worker {pid} started") + while True: + try: + tasks = input_queue.get() + if tasks is None: + break + + # print(pid, "tasks") + + result = [] + for r in function(tasks): + result.append(r) + + if len(result) >= batch_size: + output_queue.put(result) + result = [] + + if len(result) > 0: + output_queue.put(result) + + except Exception as e: + print(e) + break + + print(f"{name} worker {pid} finished") + + +def end_function(pid, last_queue): + print(f"End function {pid} started") + while True: + try: + tasks = last_queue.get() + if tasks is None: + break + + # print(pid, len(tasks)) + except Exception as e: + print(e) + break + print(f"End function {pid} finished") + + + +class TaskOLD: + + def __init__(self): + pass + + def run(self, threads=1): + maxsize = 100 + batch_size = 256 + + q0 = multiprocessing.Queue(maxsize=maxsize) + q1 = multiprocessing.Queue(maxsize=maxsize) + q2 = multiprocessing.Queue(maxsize=maxsize) + + + + pool1 = [] + for i in range(1): + p = multiprocessing.Process( + name=f"MGWorke1-{i}", + target=worker_function, + args=(i, "1", test_step1, q0, q1), + kwargs={"batch_size": batch_size} + ) + p.start() + pool1.append(p) + + pool2 = [] + for i in range(threads): + p = multiprocessing.Process( + name=f"MGWorker2-{i}", + target=worker_function, + args=(i, "2", test_step2, q1, q2), + kwargs={"batch_size": batch_size} + ) + p.start() + pool2.append(p) + + p = multiprocessing.Process( + name=f"MGEnd", + target=end_function, + args=(0, q2) + ) + p.start() + + + + + q0.put(1) + q0.put(None) + + for p in pool1: + p.join() + + for p in pool2: + q1.put(None) + + for p in pool2: + p.join() + + q2.put(None) + + + + print("Finished") + + return None + + + + +# TODO how to pass parameters +class Task: + + def __init__(self, steps, batch_size=1024, queue_max_size=100): + + # Example steps + steps_example = [ + {"name": "step1", "function": test_step1, "args": [], "kwargs": {}, "threads": 1}, + {"name": "step2", "function": test_step2, "args": [], "kwargs": {}, "threads": 2}, + ] + + self._steps = steps + self._batch_size = batch_size + self._queue_max_size = queue_max_size + + return + + def run(self): + threads = 10 + maxsize = 100 + batch_size = 256 + + + # N steps need N queues, but for formatting purposes, it need N+1 queues + q0 = multiprocessing.Queue(maxsize=self._queue_max_size) + q1 = multiprocessing.Queue(maxsize=self._queue_max_size) + q2 = multiprocessing.Queue(maxsize=self._queue_max_size) + + + # N steps need N pools + pool1 = [] + for i in range(1): + p = multiprocessing.Process( + name=f"MGWorke1-{i}", + target=worker_function, + args=(i, "1", test_step1, q0, q1), + kwargs={"batch_size": batch_size} + ) + p.start() + pool1.append(p) + + pool2 = [] + for i in range(3): + p = multiprocessing.Process( + name=f"MGWorker2-{i}", + target=worker_function, + args=(i, "2", test_step2, q1, q2), + kwargs={"batch_size": batch_size} + ) + p.start() + pool2.append(p) + + p = multiprocessing.Process( + name=f"MGEnd", + target=end_function, + args=(0, q2) + ) + p.start() + + + + + + # Let the steps begin + q0.put([1]) + q0.put(None) + + for p in pool1: + p.join() + + for p in pool2: + q1.put(None) + + for p in pool2: + p.join() + + q2.put(None) + + + + print("Finished") + + return None + + + +def myfunc(x, y, z): + print(x) + print(y) + print(z) + print(x,y,z) + return x + y + z + + +def test_args(a, b, c, *args, kw=3, **kwargs): + print(a) + print(b) + print(c) + print(kw) + print(args) + print(kwargs) + myfunc(*args) + return None + + + + +def to_worker_function(function, pid, name, input_queue, output_queue, *args, **kwargs): + + + def wrapper(function, pid, name, input_queue, output_queue, *args, **kwargs): + batch_size = kwargs.get("batch_size", 1024) + print(f"{name} worker {pid} started") + while True: + try: + tasks = input_queue.get() + if tasks is None: + break + + # print(pid, "tasks") + + result = [] + for r in function(tasks): + result.append(r) + + if len(result) >= batch_size: + output_queue.put(result) + result = [] + + if len(result) > 0: + output_queue.put(result) + + except Exception as e: + print(e) + break + + print(f"{name} worker {pid} finished") + + return wrapper + + + + + +test_args(1, 2, 3, 100, 200, 300, verbose=True, silent=False) + + +sys.exit(3) + +if __name__ == "__main__": + + + # Single threaded execution + + ts1 = time.time() + s1_gen = test_step1(lambda x: x) + s2_gen = test_step2(s1_gen) + for i in s2_gen: + pass + duration1 = time.time() - ts1 + + + # Multi-threaded execution + + ts2 = time.time() + task = Task(1) + task.run() + duration2 = time.time() - ts2 + + ts5 = time.time() + task = Task(1) + task.run() + duration5 = time.time() - ts5 + + ts10 = time.time() + task = Task(1) + task.run() + duration10 = time.time() - ts10 + + print(duration1, duration2, duration5, duration10) + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/utility.py b/src/utility.py index 0432a2c..c0d417f 100644 --- a/src/utility.py +++ b/src/utility.py @@ -131,8 +131,8 @@ def gfa_converter(input_gfa, output_file_prefix, compress=True, thread=1): l = l.strip().split('\t') if l[0] == 'S': - l[2] = l[2].replace(conversion[0], conversion[1]) - l[2] = l[2].replace(conversion[0].lower(), conversion[1].lower()) + sc = l[2].upper().replace(conversion[0], conversion[1]) + l[2] = sc newl = '\t'.join(l) + '\n' @@ -631,7 +631,7 @@ def header(self): """.strip() def version(self): - return "0.1.1" + return "0.1.2" def help_text_raw(self): return """