Skip to content

Commit

Permalink
ordinal logit model
Browse files Browse the repository at this point in the history
  • Loading branch information
Bob Carpenter committed Sep 9, 2023
1 parent c6a1606 commit a1302a3
Show file tree
Hide file tree
Showing 8 changed files with 430 additions and 47 deletions.
7 changes: 5 additions & 2 deletions kmers/makefile
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,18 @@ O = 3
EIGEN_LIB = ~/github/stan-dev/cmdstan/stan/lib/stan_math/lib/eigen_3.3.9
TRANSCRIPTOME = data/unpacked/GRCh38_latest_rna.fna
BINARY_OUT_FILE = xt.bin
CPP = /usr/local/Cellar/llvm/13.0.0_1/bin/clang++
# CPP = /usr/local/Cellar/gcc/11.2.0_2/bin/g++-11
COMPILE = ${CPP} -O${O} -g -std=c++14 -I src/ -I ${EIGEN_LIB}

builder : src/builder.cpp
clang++ -O${O} -g -std=c++14 -I src/ -I ${EIGEN_LIB} -o builder src/builder.cpp
${COMPILE} -o builder src/builder.cpp

xt.bin: builder
time ./builder ${TRANSCRIPTOME} ${BINARY_OUT_FILE}

reader: src/reader.cpp src/kmers/multinomial-model.hpp
clang++ -O${O} -g -std=c++14 -I src/ -I ${EIGEN_LIB} -o reader src/reader.cpp
${COMPILE} -o reader src/reader.cpp

read: xt.bin reader
time ./reader ${BINARY_OUT_FILE}
Expand Down
48 changes: 38 additions & 10 deletions kmers/src/builder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@
#include <stdexcept>
#include <vector>

int64_t num_kmers(int64_t K) {
int64_t y = 1;
int32_t num_kmers(int32_t K) {
int32_t y = 1;
for (int k = 0; k < K; ++k) {
y *= 4;
}
Expand Down Expand Up @@ -126,8 +126,10 @@ struct validator {
struct counter {
size_t num_targets_;
size_t num_bases_;
counter() : num_targets_(0), num_bases_(0) { }
std::vector<int32_t> isoform_lengths_;
counter() : num_targets_(0), num_bases_(0), isoform_lengths_() { }
void operator()(const std::string id, const std::string& seq) {
isoform_lengths_.push_back(seq.size());
++num_targets_;
num_bases_ += seq.size();
if ((num_targets_ % 10000) == 0)
Expand All @@ -141,15 +143,33 @@ struct counter {
std::cout << "counter.report(): " << num_bases_ << " bases"
<< std::endl;
}
void write_isoform_lengths(const std::string& file) {
std::fstream out(file, std::ios::out);
if (!out) {
throw std::runtime_error("could not open output file");
return;
}
try {
out << "length" << std::endl;
for (int32_t n : isoform_lengths_)
out << n << "\n";
} catch (...) {
std::cout << "error writing to isoform file" << std::endl;
}
out.close();
}
};

struct triplet_counter {
int32_t K_;
int32_t ref_id_;
std::vector<Eigen::Triplet<float, int32_t>> kmer_count_;
triplet_counter(int32_t K) : K_(K), ref_id_(0), kmer_count_() { }
triplet_counter(int32_t K) : K_(K), ref_id_(0), kmer_count_() {
kmer_count_.reserve(300000000);
}
void operator()(const std::string& id, const std::string& seq) {
int32_t num_kmers = seq.size() - K_ + 1;
if (ref_id_ > 100) return;
float prob_kmer = 1.0f / num_kmers;
for (int32_t start = 0; start < num_kmers; ++start) {
std::string kmer = seq.substr(start, K_);
Expand Down Expand Up @@ -204,7 +224,7 @@ int main(int argc, char* argv[]) {
std::cout << "main: binary output file = " << binoutfile
<< std::endl;

std::size_t K = 10;
std::size_t K = 8;
std::cout << "main: K = " << K
<< std::endl;

Expand All @@ -214,13 +234,21 @@ int main(int argc, char* argv[]) {

// shredder shred_handler = shredder(K);
// validator validate_handler = validator();

counter count_handler = counter();
triplet_counter triplet_handler = triplet_counter(K);

coupler<counter, triplet_counter> handler = couple(count_handler, triplet_handler);
fasta::parse_file(fastafile, handler);
handler.report();
triplet_handler.write_matrix(binoutfile);
// triplet_counter triplet_handler = triplet_counter(K);
// coupler<counter, triplet_counter> handler = couple(count_handler, triplet_handler);

fasta::parse_file(fastafile, count_handler);

std::string file = "human-isoform-lengths-GRCh38.csv";
std::cout << "writing isoform lengths to csv file = " << file
<< std::endl;
count_handler.write_isoform_lengths(file);

// handler.report();
// triplet_handler.write_matrix(binoutfile);

std::cout << "main: FINI." << std::endl;
return 0;
Expand Down
33 changes: 25 additions & 8 deletions kmers/src/kmers/multinomial-model.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ Eigen::VectorXf softmax(const Eigen::VectorXf& alpha) {
using std::exp;
auto delta = Eigen::VectorXf::Constant(alpha.size(), alpha.maxCoeff());
Eigen::VectorXf phi = (alpha - delta).array().exp();
return delta + phi / phi.sum();
return phi / phi.sum();
}

/**
Expand Down Expand Up @@ -52,21 +52,38 @@ struct multinomial_model {
const Eigen::Map<Eigen::SparseMatrix<float, Eigen::RowMajor>>& xt,
const Eigen::VectorXf& y)
: xt_(xt), y_(y) {
if (xt.cols() != y.rows()) {
if (xt.rows() != y.rows()) {
throw std::runtime_error("xt rows must equal y cols");
}
}

float log_density(const Eigen::VectorXf& beta) {
Eigen::VectorXf theta = softmax(beta);
std::cout << "beta.size() = " << beta.size()
<< "; theta.size() = " << theta.size()
<< std::endl;

std::cout << "model: xt_.rows() = " << xt_.rows() << std::endl;
std::cout << "model: xt_.cols() = " << xt_.cols() << std::endl;
// Eigen::VectorXf xt_sm_a = (xt_ * theta).array().log();
float log_likelihood = 0;
// float log_likelihood = y_.transpose() * xt_sm_a;
std::cout << "model: theta.rows() = " << theta.rows() << std::endl;
std::cout << "model: theta.cols() = " << theta.cols()
<< std::endl;
std::cout << "model: theta.sum() = " << theta.sum() << std::endl;

Eigen::VectorXf xt_theta = 1e-6 * Eigen::VectorXf::Ones(xt_.rows())
+ (1 - 1e-6) * (xt_ * theta);

std::cout << "model:: xt_theta.sum() = " << xt_theta.sum() << std::endl;

Eigen::VectorXf log_xt_theta = xt_theta.array().log();
for (int i = 0; i < log_xt_theta.size(); ++i)
if (!(log_xt_theta(i) < 0))
std::cout << "UH log_xt_theta(" << i << ") = " << log_xt_theta(i)
<< std::endl;

std::cout << "model: log_xt_theta.rows() = " << log_xt_theta.rows()
<< std::endl;
std::cout << "model: log_xt_theta.cols() = " << log_xt_theta.cols()
<< std::endl;

float log_likelihood = y_.transpose() * log_xt_theta;
float log_prior = -0.125 * beta.transpose() * beta;
return log_likelihood + log_prior;
}
Expand Down
25 changes: 20 additions & 5 deletions kmers/src/reader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,15 +40,25 @@ read_xt(const std::string& filename) {
<< std::endl;

std::vector<int> outerIndexPtr = read_array<int>(in, rows + 1);
for (size_t i = 0; i < outerIndexPtr.size(); ++i) {
if (outerIndexPtr[i] > nnz || outerIndexPtr[i] < 0) {
throw std::runtime_error("OOPS: outerIndexPtr out of range");
}
}
std::cout << "reader: read outerIndexPtr"
<< std::endl;

std::vector<int> innerIndices = read_array<int>(in, nnz);
for (size_t i = 0; i < innerIndices.size(); ++i) {
if (innerIndices[i] < 0 || innerIndices[i] >= cols) {
throw std::runtime_error("innerIndices exceeded cols");
}
}
std::cout << "reader: read innerIndices"
<< std::endl;

std::vector<float> values = read_array<float>(in, nnz);
std::cout << "reader: read innerIndices"
std::cout << "reader: read values"
<< std::endl;

in.close();
Expand Down Expand Up @@ -86,14 +96,19 @@ int main(int argc, char* argv[]) {
std::cout << "reader: building multinomial model"
<< std::endl;

Eigen::VectorXf y = Eigen::VectorXf::Ones(xt.cols());
for (int32_t i = 0; i < 10; ++i)
for (int32_t j = 0; j < 10; ++j)
std::cout << "xt[" << i << ", " << j << "] = " << xt.coeffRef(i, j)
<< std::endl;

Eigen::VectorXf y = Eigen::VectorXf::Ones(xt.rows());
std::cout << "reader: y.rows() = " << y.rows()
<< std::endl;
multinomial_model model(xt, y);
std::cout << "reader: model.y_.size() = " << model.y_.size()
<< std::endl;

Eigen::VectorXf beta = Eigen::VectorXf::Ones(xt.rows());
float lp = model.log_density(beta);
std::cout << "lp = " << lp << std::endl;
// Eigen::VectorXf beta = Eigen::VectorXf::Ones(xt.cols());
// float lp = model.log_density(beta);
// std::cout << "lp = " << lp << std::endl;
}
4 changes: 2 additions & 2 deletions sushi-rating/makefile
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
rating.html: rating.qmd
rating.html: rating.qmd *.stan
quarto render rating.qmd --to html

rating.pdf: rating.qmd
rating.pdf: rating.qmd *.stan
quarto render rating.qmd --to pdf

clean:
Expand Down
8 changes: 4 additions & 4 deletions sushi-rating/ordinal-logit.stan
Original file line number Diff line number Diff line change
Expand Up @@ -3,18 +3,18 @@ data {
int<lower=1> K; // # items per rater
int<lower=0> R; // # raters
array[R, K] int<lower=1, upper=I> u; // items rated
array[R, K] int<lower=1, upper=5> z; // ordinal ratings
array[R, K] int<lower=1, upper=K> z; // ordinal ratings
}
parameters {
vector[I - 1] eta_pre; // item quality
ordered[K - 1] c; // cutpoints
vector[I - 1] eta_pre; // item quality
}
transformed parameters {
vector[I] eta = append_row(eta_pre, -sum(eta_pre));
}
model {
eta ~ normal(0, 5);
c ~ normal(0, 5);
eta ~ normal(0, 3);
c ~ normal(0, 3);
for (r in 1:R) {
z[r] ~ ordered_logistic(eta[u[r]], c);
}
Expand Down
Loading

0 comments on commit a1302a3

Please sign in to comment.