-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit be545db
Showing
90 changed files
with
22,702 additions
and
0 deletions.
There are no files selected for viewing
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
# | ||
# Copyright (C) 2010-2016 University of Southern California and | ||
# Andrew D. Smith | ||
# | ||
# Authors: Andrew D. Smith | ||
# | ||
# This program is free software: you can redistribute it and/or modify | ||
# it under the terms of the GNU General Public License as published by | ||
# the Free Software Foundation, either version 3 of the License, or | ||
# (at your option) any later version. | ||
# | ||
# This program is distributed in the hope that it will be useful, | ||
# but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
# GNU General Public License for more details. | ||
# | ||
# You should have received a copy of the GNU General Public License | ||
# along with this program. If not, see <http://www.gnu.org/licenses/>. | ||
# | ||
|
||
CREAD=$(shell dirname $(realpath $(lastword $(MAKEFILE_LIST)))) | ||
|
||
all: | ||
@make -C src SMITHLAB_CPP=$(SMITHLAB_CPP) CREAD=$(CREAD) OPT=1 | ||
|
||
install: | ||
@make -C src SMITHLAB_CPP=$(SMITHLAB_CPP) CREAD=$(CREAD) OPT=1 install | ||
|
||
clean: | ||
@make -C src SMITHLAB_CPP=$(SMITHLAB_CPP) CREAD=$(CREAD) clean |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,51 @@ | ||
# cread2 | ||
Software tools for regulatory sequence analysis | ||
|
||
CREAD: Comprehensive Regulatory Element Analysis and Discovery | ||
The CREAD software package is a computational pipeline for understanding | ||
how DNA and RNA elements participate in regulating gene expression. | ||
CREAD uses Pattern-Feature-Model framewrok machine learning and pattern | ||
visulization to to identify motifs involved in transcriptional | ||
regulation. CREAD also inclucdes codelibraries to facilitate | ||
implementation of new tools. | ||
|
||
Building and Installing | ||
======================= | ||
|
||
You may download the latest version of CREAD from: http://smithlabresearch.org/software/cread | ||
This software has been designed to run in a UNIX-like environment. | ||
|
||
* Step 0 | ||
|
||
This software package requires a functioning installation of the GNU | ||
Scientific Library (GSL). If you don't already have this installed, you | ||
will need to download and install it from http://www.gnu.org/software/gsl/ | ||
|
||
If gsl is not installed in the default path, | ||
``` | ||
export CPATH=/path_to_my_gsl/include | ||
export LIBRARY_PATH=/path_to_my_gsl/lib | ||
``` | ||
will add search paths for compiling and linking. | ||
|
||
* Step 1 | ||
|
||
To build the binaries type the following where '>' is your prompt and the CWD is the root of the distribution: | ||
|
||
> make | ||
|
||
This will create all of the executables needed for the CREAD pipleine. | ||
|
||
Contacts and bug reports | ||
======================== | ||
|
||
Andrew D. Smith | ||
[email protected] | ||
|
||
Copyright and License Information | ||
================================= | ||
|
||
Copyright (C) 2005-2016 | ||
University of Southern California, | ||
Andrew D. Smith | ||
# revert |
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
# | ||
# Copyright (C) 2010-2014 University of Southern California and | ||
# Andrew D. Smith | ||
# | ||
# Authors: Andrew D. Smith | ||
# | ||
# This program is free software: you can redistribute it and/or modify | ||
# it under the terms of the GNU General Public License as published by | ||
# the Free Software Foundation, either version 3 of the License, or | ||
# (at your option) any later version. | ||
# | ||
# This program is distributed in the hope that it will be useful, | ||
# but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
# GNU General Public License for more details. | ||
# | ||
# You should have received a copy of the GNU General Public License | ||
# along with this program. If not, see <http://www.gnu.org/licenses/>. | ||
# | ||
|
||
all_subdirs=common tools | ||
lib_subdirs=common | ||
app_subdirs=tools | ||
|
||
# check if a global copy of smithlab_cpp cannot be found and try to | ||
# use a copy that is in the current directory | ||
ifndef SMITHLAB_CPP | ||
SMITHLAB_CPP=$(abspath $(dir $(MAKEFILE_LIST)))/smithlab_cpp | ||
ifeq ("$(wildcard $(SMITHLAB_CPP))","") | ||
$(error SMITHLAB_CPP not set and smithlab_cpp not found) | ||
endif | ||
all_subdirs += $(SMITHLAB_CPP) | ||
lib_subdirs += $(SMITHLAB_CPP) | ||
endif | ||
|
||
all: | ||
@for i in $(app_subdirs); do \ | ||
make -C $${i} SMITHLAB_CPP=$(SMITHLAB_CPP) SRC_ROOT=$(CREAD) OPT=1; \ | ||
done; | ||
|
||
install: | ||
@for i in $(app_subdirs); do \ | ||
make -C $${i} SMITHLAB_CPP=$(SMITHLAB_CPP) SRC_ROOT=$(CREAD) OPT=1 install; \ | ||
done; | ||
|
||
clean: | ||
@for i in $(all_subdirs); do \ | ||
make -C $${i} SMITHLAB_CPP=$(SMITHLAB_CPP) SRC_ROOT=$(CREAD) clean; \ | ||
done; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,227 @@ | ||
/* | ||
* Copyright (C) 2006 Cold Spring Harbor Laboratory | ||
* Authors: Andrew D. Smith | ||
* | ||
* This file is part of CREAD. | ||
* | ||
* CREAD is free software; you can redistribute it and/or modify | ||
* it under the terms of the GNU General Public License as published by | ||
* the Free Software Foundation; either version 2 of the License, or | ||
* (at your option) any later version. | ||
* | ||
* CREAD is distributed in the hope that it will be useful, | ||
* but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
* GNU General Public License for more details. | ||
* | ||
* You should have received a copy of the GNU General Public License | ||
* along with CREAD; if not, write to the Free Software | ||
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA | ||
*/ | ||
|
||
#include "Alphabet.hpp" | ||
#include "cread.hpp" | ||
#include "smithlab_utils.hpp" | ||
|
||
#include <functional> // for "bind" | ||
|
||
using std::string; | ||
using std::vector; | ||
using std::transform; | ||
|
||
// private static members | ||
static const int b2i_size = 20; | ||
static const int b2i[] = { | ||
//A, b, C, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, T | ||
0,-1, 1,-1,-1,-1, 2,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, 3 | ||
};* | ||
static const int b2i_rc[] = { | ||
//A, b, C, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, T | ||
3,-1, 2,-1,-1,-1, 1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, 0 | ||
}; | ||
static const int b2c_size = 20; | ||
static const char b2c[] = { | ||
//A, b, C, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, T | ||
'T','N','G','N','N','N','C','N','N','N','N','N','N','N','N','N','N','N','N','A' | ||
}; | ||
static const char *i2b = "ACGTN"; | ||
|
||
|
||
/*int commented out | ||
base2int(char b) { | ||
b = std::toupper(b); | ||
if (b - 'A' >= 0 && b - 'A' < b2i_size) | ||
return b2i[b - 'A']; | ||
else | ||
return -1; | ||
} | ||
int | ||
base2int_rc(char b) { | ||
b = std::toupper(b); | ||
if (b - 'A' >= 0 && b - 'A' < b2i_size) | ||
return b2i_rc[b - 'A']; | ||
else | ||
return -1; | ||
}*/ | ||
|
||
|
||
char | ||
int2base(int i) { | ||
if (i < static_cast<int>(alphabet_size) && i >= 0) | ||
return i2b[i]; | ||
else return i2b[alphabet_size]; | ||
} | ||
|
||
|
||
bool | ||
valid_base_id(int c) { | ||
return (c < static_cast<int>(alphabet_size) && c >= 0); | ||
} | ||
|
||
|
||
/*char commented out | ||
complement(int i) { | ||
if (i - 'A' >= 0 && i - 'A' < b2c_size) | ||
return b2c[i - 'A']; | ||
else return 'N'; | ||
}*/ | ||
|
||
|
||
bool | ||
valid_base(char c) { | ||
char i = std::toupper(c); | ||
return (i == 'A' || i == 'C' || i == 'G' || i == 'T'); | ||
} | ||
|
||
bool | ||
isgap(char c) { | ||
return c == '-'; | ||
} | ||
|
||
void | ||
get_base_comp(const vector<string>& sequences, float *base_comp) { | ||
std::fill(base_comp, base_comp + alphabet_size, 0.0); | ||
float total = 0; | ||
for (vector<string>::const_iterator i = sequences.begin(); | ||
i != sequences.end(); ++i) | ||
for (string::const_iterator j = i->begin(); j != i->end(); ++j) | ||
if (valid_base(*j)) { | ||
base_comp[base2int(*j)]++; | ||
total++; | ||
} | ||
transform(base_comp, base_comp + alphabet_size, base_comp, | ||
std::bind(std::divides<float>(), std::placeholders::_1, total)); | ||
} | ||
|
||
|
||
void | ||
get_base_comp(const vector<string>& sequences, vector<float>& base_comp) { | ||
vector<size_t> count(alphabet_size, 0); | ||
for (vector<string>::const_iterator i = sequences.begin(); | ||
i != sequences.end(); ++i) | ||
for (string::const_iterator j = i->begin(); j != i->end(); ++j) | ||
if (valid_base(*j)) { | ||
count[base2int(*j)]++; | ||
} | ||
const float total = std::accumulate(count.begin(), count.end(), 0.0); | ||
base_comp.clear(); | ||
transform(count.begin(), count.end(), back_inserter(base_comp), | ||
std::bind(std::divides<float>(), std::placeholders::_1, total)); | ||
} | ||
|
||
|
||
void | ||
get_base_comp(const vector<string>& sequences, vector<double>& base_comp) { | ||
vector<size_t> count(alphabet_size, 0); | ||
for (vector<string>::const_iterator i = sequences.begin(); | ||
i != sequences.end(); ++i) | ||
for (string::const_iterator j = i->begin(); j != i->end(); ++j) | ||
if (valid_base(*j)) { | ||
count[base2int(*j)]++; | ||
} | ||
const double total = std::accumulate(count.begin(), count.end(), 0.0); | ||
base_comp.clear(); | ||
transform(count.begin(), count.end(), back_inserter(base_comp), | ||
std::bind(std::divides<double>(), std::placeholders::_1, total)); | ||
} | ||
|
||
|
||
string | ||
reverse_complement(const string& s) { | ||
string r; | ||
transform(s.begin(), s.end(), back_inserter(r), complement); | ||
reverse(r.begin(), r.end()); | ||
return r; | ||
} | ||
|
||
|
||
/*string commented out | ||
revcomp(const string& s) { | ||
string r; | ||
transform(s.begin(), s.end(), back_inserter(r), complement); | ||
reverse(r.begin(), r.end()); | ||
return r; | ||
}*/ | ||
|
||
|
||
size_t | ||
count_valid_bases(const string& s) { | ||
return count_if(s.begin(), s.end(), &valid_base); | ||
} | ||
|
||
|
||
size_t | ||
count_valid_bases(const vector<string>& s) { | ||
size_t n_valid = 0; | ||
for (vector<string>::const_iterator i = s.begin(); i != s.end(); ++i) | ||
n_valid += count_valid_bases(*i); | ||
return n_valid; | ||
} | ||
|
||
|
||
size_t | ||
kmer_counts(const vector<string> &seqs, | ||
vector<size_t> &counts, size_t k) { | ||
counts.clear(); | ||
size_t nwords = static_cast<size_t>(pow(static_cast<float>(alphabet_size), | ||
static_cast<int>(k))); | ||
counts.resize(nwords, 0); | ||
size_t total = 0; | ||
for (size_t i = 0; i < seqs.size(); ++i) { | ||
char seq[seqs[i].length() + 1]; | ||
seq[seqs[i].length()] = '\0'; | ||
copy(seqs[i].begin(), seqs[i].end(), seq); | ||
for (size_t j = 0; j < seqs[i].length() - k + 1; ++j) | ||
if (std::count_if(seq + j, seq + j + k, &valid_base) == | ||
static_cast<int>(k)) { | ||
counts[mer2index(seq + j, k)]++; | ||
++total; | ||
} | ||
} | ||
return total; | ||
} | ||
|
||
|
||
size_t | ||
mer2index(const char *s, size_t n) { | ||
size_t multiplier = 1, index = 0; | ||
do { | ||
--n; | ||
index += base2int(s[n])*multiplier; | ||
multiplier *= alphabet_size; | ||
} while (n > 0); | ||
return index; | ||
} | ||
|
||
size_t | ||
mer2index_rc(const char *s, size_t n) { | ||
size_t multiplier = 1, index = 0; | ||
size_t k = 0; | ||
do { | ||
index += base2int_rc(s[k])*multiplier; | ||
multiplier *= alphabet_size; | ||
} while (++k < n); | ||
return index; | ||
} |
Oops, something went wrong.