Skip to content
This repository has been archived by the owner on May 3, 2024. It is now read-only.

Parallel processing in subsample.py #197

Open
wants to merge 17 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
*.o
*.pyc
*.so
*.swp

build/*
cupcake/tofu/branch/*
35 changes: 21 additions & 14 deletions annotation/subsample.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import math
from csv import DictReader
from collections import defaultdict
from multiprocessing import Pool

def get_counts(count_filename, min_fl_count=2, key='id', min_len=None, max_len=None):
total = 0
Expand All @@ -22,21 +23,24 @@ def get_counts(count_filename, min_fl_count=2, key='id', min_len=None, max_len=N

return total, counts

def subsample(total, counts, iter=100, min_fl_count=2, step=10**4):
def sampleBySize(s, iteration, counts):
    """Draw `iteration` random samples of size `s` from `counts` and
    summarize how many distinct keys each sample hits.

    Returns a tuple (s, min, max, mean, sd) over the per-sample
    distinct-key counts.
    """
    # len(set(sample)) is exactly the number of distinct keys a tally
    # dict over the sample would hold.
    distinct_counts = [len(set(random.sample(counts, s))) for _ in range(iteration)]
    n = len(distinct_counts)
    mean = sum(distinct_counts) / float(n)
    # population (not sample) standard deviation
    sd = math.sqrt(sum((d - mean) ** 2 for d in distinct_counts) / float(n))
    return (s, min(distinct_counts), max(distinct_counts), mean, sd)

def subsample_parallel(total, counts, iteration=100, min_fl_count=2, step=10**4, ncore=1):
    """Subsample `counts` at sizes 0, step, 2*step, ..., total, in parallel.

    Each size is handled by `sampleBySize` on a worker from a pool of
    `ncore` processes; returns the list of (size, min, max, mean, sd)
    tuples, one per size, in size order.

    NOTE(review): `min_fl_count` is only echoed to stdout here, never
    applied as a filter — kept for interface compatibility.
    """
    print("#min fl count:", min_fl_count)
    print("size", "min", "max", "mean", "sd")
    tasks = [(size, iteration, counts) for size in range(0, total + 1, step)]
    with Pool(ncore) as worker_pool:
        return worker_pool.starmap(sampleBySize, tasks)

if __name__ == "__main__":
from argparse import ArgumentParser
Expand All @@ -47,6 +51,7 @@ def subsample(total, counts, iter=100, min_fl_count=2, step=10**4):
parser.add_argument("--range", default=None, help="Length range (ex: (1000,2000), default None)")
parser.add_argument("--min_fl_count", default=1, type=int, help="Minimum FL count (default: 1)")
parser.add_argument("--step", default=10000, type=int, help="Step size (default: 10000)")
parser.add_argument("--ncores", default=1, type=int, help="Number of threads to use for parallelization")
args = parser.parse_args()

min_len, max_len = None, None
Expand All @@ -55,6 +60,8 @@ def subsample(total, counts, iter=100, min_fl_count=2, step=10**4):
assert 0 <= min_len < max_len

total, counts = get_counts(args.count_filename, args.min_fl_count, args.by, min_len, max_len)
subsample(total, counts, args.iterations, args.min_fl_count, args.step)
results = subsample_parallel(total, counts, args.iterations, args.min_fl_count, args.step, args.ncores)
for row in results:
print(' '.join(str(i) for i in row))


16 changes: 12 additions & 4 deletions cupcake/tofu/collapse_isoforms_by_sam.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,7 +214,12 @@ def multiprocess_helper(start_index, end_index, args, cov_threshold, f_good, f_b
max_5_diff=args.max_5_diff,
max_3_diff=args.max_3_diff)

iter = b.iter_gmap_sam(args.bam, ignored_fout, type='BAM', bam_start_index=start_index, bam_end_index=end_index)
if args.bam is not None:
iter = b.iter_gmap_sam(args.bam, ignored_fout, type='BAM', bam_start_index=start_index, bam_end_index=end_index)
else:
        # Deliberately force type='BAM' even for SAM input: the GMAP SAM
        # reader path mangles the identity values and breaks downstream.
        # NOTE: relying on the BAM reader for SAM input may break in the future.
iter = b.iter_gmap_sam(args.sam, ignored_fout, type='BAM', bam_start_index=start_index, bam_end_index=end_index)
for recs in iter: # recs is {'+': list of list of records, '-': list of list of records}
for v in recs.values():
for v2 in v:
Expand Down Expand Up @@ -319,7 +324,10 @@ def main(args):
if len(v2) > 0: b.process_records(v2, args.allow_extra_5exon, False, f_good, f_bad, f_txt)
else:
# need to first predefine the regions
region_list_ignore, chunk_regions_list = multiprocess_predefine_regions(args.bam, args.cpus)
if args.bam is not None:
region_list_ignore, chunk_regions_list = multiprocess_predefine_regions(args.bam, args.cpus)
else:
region_list_ignore, chunk_regions_list = multiprocess_predefine_regions(args.sam, args.cpus)
assert len(chunk_regions_list) == args.cpus

if args.flnc_coverage > 0:
Expand All @@ -334,8 +342,8 @@ def main(args):
pool = []
for i in range(args.cpus):
p = Process(target=multiprocess_helper,
args=(chunk_regions_list[i][0], chunk_regions_list[i][1], args, cov_threshold,
f_good_pool[i], f_bad_pool[i], f_txt_pool[i], f_ignore_pool[i], ))
args=(chunk_regions_list[i][0], chunk_regions_list[i][1], args, cov_threshold,
f_good_pool[i], f_bad_pool[i], f_txt_pool[i], f_ignore_pool[i], ))
p.start()
pool.append(p)
# NOTE: f_good_pool[i] and f_bad_pool[i] and f_txt_pool[i] actually will get file CLOSED
Expand Down
17 changes: 17 additions & 0 deletions singlecell/clean.mplstyle
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
font.size : 8.0
font.sans-serif : Arial
xtick.major.size : 2
xtick.major.width : 0.75
xtick.labelsize : 8.0
xtick.direction : out
ytick.major.size : 2
ytick.major.width : 0.75
ytick.labelsize : 8.0
ytick.direction : out
xtick.major.pad : 2
xtick.minor.pad : 2
ytick.major.pad : 2
ytick.minor.pad : 2
savefig.dpi : 600
axes.linewidth : 0.75
mathtext.default : regular
68 changes: 68 additions & 0 deletions singlecell/combine_same_diff.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
import pysam
from csv import DictReader, DictWriter
from collections import defaultdict
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--same', '-s', help='Prefix for same, eg. output.same.deconcat.tagged')
parser.add_argument('--diff', '-d', help='Prefix for diff, eg. output.diff.deconcat.tagged')
parser.add_argument('--out', '-o', help='Prefix for output, eg. combined.flt')
args = parser.parse_args()

# note: remember to use the "old_id"
# ex: m64182_210318_053335/0/ccs/1
in_diff = defaultdict(set)  # old zmw --> new read ids seen in the "diff" CSV
in_same = defaultdict(set)  # old zmw --> new read ids seen in the "same" CSV

touse_diff = set()  # new read ids to keep from the "diff" BAM
touse_same = set()  # new read ids to keep from the "same" BAM

# Input CSV columns:
# id old_id clip_len extra UMI BC BC_rev BC_match BC_top_rank TSO
fields = ['id', 'old_id', 'clip_len', 'extra', 'UMI', 'BC', 'BC_rev', 'BC_match', 'BC_top_rank', 'TSO']

dup = 0  # count of "same" records whose zmw was already claimed by "diff"
# newline='' per the csv module docs, so the writer controls line endings
with open(args.out + '.csv', 'w', newline='') as f1:
    writer = DictWriter(f1, fields, delimiter=',')
    writer.writeheader()

    # "diff" records always win: record and write every one of them.
    with open(args.diff + '.csv') as fin:
        for r in DictReader(fin, delimiter='\t'):
            raw = r['old_id'].split('/')
            zmw = raw[0] + '/' + raw[1]  # movie/hole prefix of movie/hole/ccs/1
            in_diff[zmw].add(r['id'])
            touse_diff.add(r['id'])
            writer.writerow(r)

    # "same" records are kept only when their zmw never appeared in "diff".
    with open(args.same + '.csv') as fin:
        for r in DictReader(fin, delimiter='\t'):
            raw = r['old_id'].split('/')
            zmw = raw[0] + '/' + raw[1]
            if zmw in in_diff:
                dup += 1
            else:
                writer.writerow(r)
                in_same[zmw].add(r['id'])
                touse_same.add(r['id'])

print(f"{dup} zmws showed up in both diff/same...")


def get_zmw(seqid):
    """Return the zmw identifier (movie/hole) of a full read id.

    e.g. 'm64182_210318_053335/0/ccs/1' -> 'm64182_210318_053335/0'
    """
    parts = seqid.split('/')
    return parts[0] + '/' + parts[1]

# Pass file PATHS to pysam.AlignmentFile: wrapping the path in a
# text-mode open() (especially mode 'w' for the binary output BAM)
# hands pysam a text handle and corrupts the BAM stream.
reader = pysam.AlignmentFile(args.diff + '.bam', 'rb', check_sq=False)
with pysam.AlignmentFile(args.out + '.bam', 'wb', header=reader.header) as f:
    # keep every "diff" record that survived the CSV pass
    for r in reader:
        if r.qname in touse_diff:
            f.write(r)
    reader.close()

    # then append the surviving "same" records
    with pysam.AlignmentFile(args.same + '.bam', 'rb', check_sq=False) as same_reader:
        for r in same_reader:
            if r.qname in touse_same:
                f.write(r)

Loading