-
Notifications
You must be signed in to change notification settings - Fork 0
/
kmer_scoring.py
137 lines (109 loc) · 3.2 KB
/
kmer_scoring.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
"""
Author: Caleb Kibet
A module for scoring sequences with k-mer scores; used by Assess_by_score.
"""
import pandas as pd
from MARSTools.utils import revcompl
def get_kmer_dict_rev(kmerscore, kmer_name):
    """
    Build a k-mer -> E-score dictionary from a PBM enrichment scores table.

    The table is read with ``pandas.read_table`` and must provide the columns
    ``8-mer``, ``8-mer.1`` and ``E-score``. Each row contributes two entries
    sharing the row's E-score: one keyed by the ``8-mer`` value and one keyed
    by the ``8-mer.1`` value. Missing E-scores are replaced with 0.

    NOTE(review): ``8-mer.1`` is presumably the name pandas gives to a second,
    duplicated ``8-mer`` header holding the reverse-strand k-mer -- confirm
    against the input files.

    :param kmerscore: Path or file-like object of the tab-delimited scores table
    :param kmer_name: Label returned unchanged alongside the dictionary
    :return: Tuple of (dictionary mapping k-mer to E-score, kmer_name)
    """
    # Read the table once and merge with plain dicts: DataFrame.append is not
    # available in pandas >= 2.0, and a single read also works for file-like
    # inputs that cannot be re-read from the start.
    table = pd.read_table(kmerscore, usecols=["8-mer", "8-mer.1", "E-score"])
    escores = table["E-score"].fillna(0)
    # Forward entries are inserted first so that, when a k-mer occurs in both
    # columns, the value paired with the ``8-mer.1`` column wins.
    combined_kmers_dict = dict(zip(table["8-mer"], escores))
    combined_kmers_dict.update(zip(table["8-mer.1"], escores))
    return combined_kmers_dict, kmer_name
def score_kmer(kmer, kmerdict):
    """
    Look up the score of a single k-mer.

    When the k-mer itself is not a key of ``kmerdict``, the key produced by
    ``revcompl(kmer)`` is looked up instead.

    :param kmer: K-mer to score
    :param kmerdict: Dictionary mapping k-mers to scores
    :return: The score converted to float
    """
    # Fall back to the alternative key only when the k-mer is absent.
    if kmer not in kmerdict:
        kmer = revcompl(kmer)
    return float(kmerdict[kmer])
def sum_kmer_score(kmerdict, seq):
    """
    Add up the scores of every 8-mer found in a sequence.

    An 8-mer that is not a key of ``kmerdict`` is looked up through the key
    returned by ``revcompl`` instead.

    :param kmerdict: A dictionary of k-mers
    :param seq: Sequence
    :return: Sum of the 8-mer scores (0 when the sequence yields no 8-mers)
    """
    running_total = 0
    for window in find_kmers(seq, 8):
        lookup_key = window if window in kmerdict else revcompl(window)
        running_total += float(kmerdict[lookup_key])
    return running_total
def max_kmer_score(kmerdict, seq):
    """
    Return the highest 8-mer score found in a sequence.

    An 8-mer that is not a key of ``kmerdict`` is looked up through the key
    returned by ``revcompl`` instead.

    :param kmerdict: Dictionary mapping k-mers to scores
    :param seq: Sequence to score
    :return: Largest score among the 8-mers of the sequence
    """
    window_scores = [
        float(kmerdict[window if window in kmerdict else revcompl(window)])
        for window in find_kmers(seq, 8)
    ]
    return max(window_scores)
def max_kmer_score_pos(kmerdict, seq):
    """
    Sum the 8-mer scores in a window around the best-scoring 8-mer.

    Every 8-mer of ``seq`` is scored from ``kmerdict``; 8-mers that are not
    keys of the dictionary score 0.0. The window covers the four positions
    before the first maximum, the maximum itself and the three positions
    after it, truncated at either end of the sequence.

    :param kmerdict: Dictionary mapping k-mers to scores
    :param seq: Sequence to score
    :return: Sum of the scores inside the window around the maximum
    :raises ValueError: If ``seq`` is too short to contain any 8-mer
    """
    scores = [float(kmerdict.get(kmer, 0.0)) for kmer in find_kmers(seq, 8)]
    max_pos = scores.index(max(scores))
    # Clamp the start at 0: a negative start index would be interpreted
    # relative to the end of the list and select the wrong (often empty)
    # window whenever the maximum lies within the first four positions.
    window_start = max(0, max_pos - 4)
    return sum(scores[window_start:max_pos + 4])
def find_kmers(string, kmer_size):
    """
    List every overlapping substring of length kmer_size, left to right.

    :param string: Sequence to split into k-mers
    :param kmer_size: Length of each k-mer
    :return: List of k-mers; empty when the string is shorter than kmer_size
    """
    last_start = len(string) - kmer_size
    return [string[start:start + kmer_size] for start in range(last_start + 1)]
def getkey(item):
    """
    Return the element at index 1 of an indexable item.

    :param item: Indexable object with at least two elements
    :return: The second element of item
    """
    second = item[1]
    return second