Skip to content

Commit

Permalink
fixes for gapclosing (rDNA & removal of nontip-nontip connections)
Browse files Browse the repository at this point in the history
  • Loading branch information
Dmitry-Antipov committed Dec 18, 2023
1 parent fe4928d commit c67aa94
Show file tree
Hide file tree
Showing 4 changed files with 71 additions and 11 deletions.
1 change: 1 addition & 0 deletions src/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -781,6 +781,7 @@ all: $(addprefix ${TARGET_DIR}/,${ALL_TGTS}) \
../lib/verkko/scripts/graph_functions.py \
../lib/verkko/scripts/hicverkko.py \
../lib/verkko/scripts/parse_sam_pairs.py \
../lib/verkko/scripts/graph_functions.py \
\
../lib/verkko/Snakefile \
../lib/verkko/Snakefiles/1-buildGraph.sm \
Expand Down
8 changes: 5 additions & 3 deletions src/Snakefiles/4-processONT.sm
Original file line number Diff line number Diff line change
Expand Up @@ -82,14 +82,16 @@ echo Step 2a

cat alns-ont-nogap-2.gaf alns-ont-gap1.gaf alns-ont-gap2.gaf | {PYTHON} {VERKKO}/scripts/maybe_trim_alignment.py gapped-twice-unitig-unrolled-hifi-resolved.gfa 100 selected-winnowmap.ids > alns-cut-trim.gaf

{PYTHON} {VERKKO}/scripts/insert_aln_gaps.py gapped-twice-unitig-unrolled-hifi-resolved.gfa alns-cut-trim.gaf 10 100 alns-ont-nogap.gaf alns-ont-gap3.gaf cutgap y > gapped-unitig-unrolled-hifi-resolved.gfa
#{PYTHON} {VERKKO}/scripts/insert_aln_gaps.py gapped-twice-unitig-unrolled-hifi-resolved.gfa alns-cut-trim.gaf 10 100 alns-ont-nogap.gaf alns-ont-gap3.gaf cutgap y > gapped-unitig-unrolled-hifi-resolved.gfa
cat gapped-twice-unitig-unrolled-hifi-resolved.gfa > gapped-unitig-unrolled-hifi-resolved.gfa


cat alns-ont-gap1.gaf alns-ont-gap2.gaf alns-ont-gap3.gaf > ../{output.ont_gap_align}
cat alns-ont-gap1.gaf alns-ont-gap2.gaf > ../{output.ont_gap_align}


echo ""
echo Step 2b
cat alns-ont-nogap.gaf ../{output.ont_gap_align} | awk -F "\\t" '{{if (\$4-\$3 >= \$2*0.8 && \$12 >= 20) print;}}' | {PYTHON} {VERKKO}/scripts/trim_dbg_alignment.py gapped-unitig-unrolled-hifi-resolved.gfa 1500 > alns-ont-filter-trim.gaf
cat ../{output.ont_gap_align} | awk -F "\\t" '{{if (\$4-\$3 >= \$2*0.8 && \$12 >= 20) print;}}' | {PYTHON} {VERKKO}/scripts/trim_dbg_alignment.py gapped-unitig-unrolled-hifi-resolved.gfa 1500 > alns-ont-filter-trim.gaf

echo ""
echo Step 2c
Expand Down
44 changes: 37 additions & 7 deletions src/scripts/graph_functions.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
import networkx as nx
import sys

def remove_large_tangles(G, MIN_LEN, MAX_SHORT_COMPONENT):
#Functions for indirect graph processing, nodes - "utig1-234"
def remove_large_tangles(G, MAX_LEN, MAX_SHORT_COMPONENT):
shorts = set()
for node in G.nodes():
if G.nodes[node]['length'] < MIN_LEN:
if G.nodes[node]['length'] < MAX_LEN:
shorts.add(node)
sh_G = G.subgraph(shorts)
nodes_deleted = 0
Expand All @@ -21,17 +22,46 @@ def remove_large_tangles(G, MIN_LEN, MAX_SHORT_COMPONENT):
f'number of nodes {G.number_of_nodes()}\n')
return set(to_delete)

def nodes_in_tangles(G, MAX_LEN, MIN_TANGLE_SIZE):
shorts = set()
for node in G.nodes():
if G.nodes[node]['length'] < MAX_LEN:
shorts.add(node)
res = set()
sh_G = G.subgraph(shorts)
for comp in nx.connected_components(sh_G):
if len(comp) > MIN_TANGLE_SIZE:
for n in comp:
res.add(n)
return res


def load_indirect_graph(gfa_file, G):
translate = open(gfa_file, 'r')
for line in translate:

for line in open(gfa_file, 'r'):
if "#" in line:
continue
line = line.strip().split()

if line[0] == "S":
G.add_node(line[1], length=int(line[3][5:]), coverage=float(line[5][5:]))
elif line[0] == "L":
#noseq graphs
ls = len(line[2])
cov = 0
for i in range(4, len(line)):
spl_tag = line[i].split(":")
if spl_tag[0] == "LN":
ls = int(spl_tag[2])
if spl_tag[0] == "ll":
cov = float(spl_tag[2])
G.add_node(line[1], length=int(ls), coverage=float(cov))

#L and S lines can be mixed
for line in open(gfa_file, 'r'):
if "#" in line:
continue
line = line.strip().split()
if line[0] == "L":
if line[1] not in G or line[3] not in G:
sys.stderr.write("Warning, skip link between nodes not in graph:%s" % (line))
sys.stderr.write("Error while graph loading; link between nodes not in graph:%s" % (line))
sys.exit(1)
G.add_edge(line[1], line[3])
29 changes: 28 additions & 1 deletion src/scripts/insert_aln_gaps.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
#!/usr/bin/env python

import sys
import networkx as nx
import graph_functions

graph_file = sys.argv[1]
input_aln_file = sys.argv[2]
Expand Down Expand Up @@ -33,13 +35,16 @@ def revcomp(s):
node_seqs = {}
edge_overlaps = {}

#not_tips = no outgoing nodes, assymetric!

with open(graph_file) as f:
for l in f:
parts = l.strip().split('\t')
if parts[0] == "S":
node_seqs[parts[1]] = parts[2]
if parts[0] == 'L':
fromnode = (">" if parts[2] == "+" else "<") + parts[1]
#this is not to_node but reverse(to_node)
tonode = ("<" if parts[4] == "+" else ">") + parts[3]
edge_overlaps[canontip(fromnode, tonode)] = int(parts[5][:-1])
if parts[1] != parts[3]: # we will skip self-edges for this consideration, if you have a loop at a gap, try to patch it anyway
Expand All @@ -49,6 +54,15 @@ def revcomp(s):

if allow_nontips: not_tips = set()


G = nx.Graph()
graph_functions.load_indirect_graph(graph_file, G)
tangles = graph_functions.nodes_in_tangles(G, 30000, 100)
or_tangles = set()
for node in tangles:
for pref in (">", "<"):
or_tangles.add(pref + node)

gaps = {}

count_per_read = {}
Expand All @@ -58,6 +72,7 @@ def revcomp(s):
if readname not in count_per_read: count_per_read[readname] = 0
count_per_read[readname] += 1

#Here we save rc to start of alignment. This allow to work
alns_per_read = {}
with open(input_aln_file) as f:
for l in f:
Expand All @@ -83,6 +98,15 @@ def revcomp(s):
used_names = list(alns_per_read)
used_names.sort()

def tangle_onetip(s_node, e_node, or_tangles, not_tips):
if (s_node in not_tips and e_node in or_tangles) or (e_node in not_tips and s_node in or_tangles):
# print (f"banning nodes near tangles {s_node} {e_node}")
# print (f"{s_node in not_tips} and {e_node in or_tangles} or {e_node in not_tips} ad {s_node in or_tangles}")
return True
else:
return False


for name in alns_per_read:
alns_per_read[name].sort(key=lambda x: x[0])
if len(alns_per_read[name]) < 2: continue
Expand All @@ -91,7 +115,9 @@ def revcomp(s):
assert alns[i][0] >= alns[i-1][0]
if alns[i][0] == alns[i-1][0]: continue
if alns[i][1] <= alns[i-1][1]: continue
if allow_onetip and alns[i-1][3] in not_tips and alns[i][2] in not_tips: continue
#alns[i][2] is RC to alignment, that's why it works.
if allow_onetip and alns[i-1][3] in not_tips and alns[i][2] in not_tips: continue
if allow_onetip and tangle_onetip(alns[i-1][3], alns[i][2], or_tangles, not_tips): continue
if not allow_onetip and (alns[i-1][3] in not_tips or alns[i][2] in not_tips): continue
if alns[i-1][5] > max_end_clip or alns[i][4] > max_end_clip: continue
gap_len = (alns[i][0] - alns[i][4]) - (alns[i-1][1] + alns[i-1][5])
Expand Down Expand Up @@ -172,6 +198,7 @@ def revcomp(s):
if alns[i][0] == alns[i-1][0]: continue
if alns[i][1] <= alns[i-1][1]: continue
if allow_onetip and alns[i-1][3] in not_tips and alns[i][2] in not_tips: continue
if allow_onetip and tangle_onetip(alns[i-1][3], alns[i][2], or_tangles, not_tips): continue
if not allow_onetip and (alns[i-1][3] in not_tips or alns[i][2] in not_tips): continue
if alns[i-1][5] > max_end_clip or alns[i][4] > max_end_clip: continue
key = canontip(alns[i-1][3], alns[i][2])
Expand Down

0 comments on commit c67aa94

Please sign in to comment.