Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added cumulative hazard #7

Merged
merged 4 commits into from
Dec 18, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/make.jl
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ makedocs(
"Home" => "index.md",
"Event Times" => "events.md",
"Kaplan-Meier" => "km.md",
"Nelson-Aalen" => "na.md",
"Cox" => "cox.md",
],
)
Expand Down
6 changes: 3 additions & 3 deletions docs/src/km.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,10 @@ using Greenwood's formula:

```@docs
Survival.KaplanMeier
StatsBase.fit(::Type{KaplanMeier},
StatsBase.fit(::Type{S},
times::AbstractVector{T},
status::AbstractVector{<:Integer}) where {T}
StatsBase.confint
status::AbstractVector{<:Integer}) where {S<:NonparametricEstimator, T}
StatsBase.confint(km::KaplanMeier, α::Float64=0.05)
```

## References
Expand Down
36 changes: 36 additions & 0 deletions docs/src/na.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# Nelson-Aalen Estimator

The [Nelson-Aalen estimator](https://en.wikipedia.org/wiki/Nelson%E2%80%93Aalen_estimator)
is a nonparametric estimator of the cumulative hazard function.

The estimate is given by

```math
\hat{H}(t) = \sum_{i: t_i < t} \frac{d_i}{n_i}
```

where ``d_i`` is the number of observed events at time ``t_i`` and ``n_i`` is the
number of subjects at risk for the event just before time ``t_i``.

The pointwise standard error of the log of the survivor function can be computed
directly as the standard error or a Bernoulli random variable with `d_i` successes
from `n_i` samples:

```math
\text{SE}(\hat{H}(t)) = \sqrt{\sum_{i: t_i < t} \frac{d_i(n_i-d_i)}{n_i^3}}
```

## API

```@docs
Survival.NelsonAalen
StatsBase.fit(::Type{S},
times::AbstractVector{T},
status::AbstractVector{<:Integer}) where {S<:NonparametricEstimator, T}
StatsBase.confint(na::NelsonAalen, α::Float64=0.05)
```

## References

* Nelson, W. (1969). *Hazard plotting for incomplete failure data*.
Journal of Quality Technology 1, 27–52.
4 changes: 4 additions & 0 deletions src/Survival.jl
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,9 @@ export
isevent,
iscensored,

NonparametricEstimator,
KaplanMeier,
NelsonAalen,
fit,
confint,

Expand All @@ -30,7 +32,9 @@ abstract type AbstractEstimator end
abstract type NonparametricEstimator <: AbstractEstimator end

include("eventtimes.jl")
include("estimator.jl")
include("kaplanmeier.jl")
include("nelsonaalen.jl")
include("cox.jl")
include("optimization.jl")

Expand Down
2 changes: 1 addition & 1 deletion src/cox.jl
Original file line number Diff line number Diff line change
Expand Up @@ -196,7 +196,7 @@ StatsModels.drop_intercept(::Type{CoxModel}) = true
fit(::Type{CoxModel}, M::AbstractMatrix, y::AbstractVector; kwargs...)

Given a matrix `M` of predictors and a corresponding vector of events, compute the
Cox proportional hazard model estimate of coefficients. Returns a [`CoxModel`](@ref)
Cox proportional hazard model estimate of coefficients. Returns a `CoxModel`
object.
"""
function StatsBase.fit(::Type{CoxModel}, M::AbstractMatrix, y::AbstractVector; kwargs...)
Expand Down
91 changes: 91 additions & 0 deletions src/estimator.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
# Estimating functions with the following assumptions:
# * The input is nonempty
# * Time 0 is not included

function _estimator(::Type{S}, tte::AbstractVector{T}, status::BitVector) where {S, T}
nobs = length(tte)
dᵢ = 0 # Number of observed events at time t
cᵢ = 0 # Number of censored events at time t
nᵢ = nobs # Number remaining at risk at time t
es = estimator_start(S) # Estimator starting point
gw = stderr_start(S) # Standard Error starting point

times = T[] # The set of unique event times
nevents = Int[] # Total observed events at each time
ncensor = Int[] # Total censored events at each time
natrisk = Int[] # Number at risk at each time
estimator = Float64[] # Estimates
stderr = Float64[] # Pointwise standard errors

t_prev = zero(T)

@inbounds for i = 1:nobs
t = tte[i]
s = status[i]
# Aggregate over tied times
if t == t_prev
dᵢ += s
cᵢ += !s
continue
elseif !iszero(t_prev)
es = estimator_update(S, es, dᵢ, nᵢ)
gw = stderr_update(S, gw, dᵢ, nᵢ)
push!(times, t_prev)
push!(nevents, dᵢ)
push!(ncensor, cᵢ)
push!(natrisk, nᵢ)
push!(estimator, es)
push!(stderr, sqrt(gw))
end
nᵢ -= dᵢ + cᵢ
dᵢ = s
cᵢ = !s
t_prev = t
end

# We need to do this one more time to capture the last time
# since everything in the loop is lagged
push!(times, t_prev)
push!(nevents, dᵢ)
push!(ncensor, cᵢ)
push!(natrisk, nᵢ)
push!(estimator, es)
push!(stderr, sqrt(gw))

return S{T}(times, nevents, ncensor, natrisk, estimator, stderr)
end

"""
fit(::Type{S}, times, status) where S<:NonparametricEstimator

Given a vector of times to events and a corresponding vector of indicators that
dictate whether each time is an observed event or is right censored, compute the
model of type `S`. Return an object of type `S`: [`KaplanMeier`](@ref) and
[`NelsonAalen`](@ref) are supported so far.
"""
function StatsBase.fit(::Type{S},
times::AbstractVector{T},
status::AbstractVector{<:Integer}) where {S<:NonparametricEstimator, T}
nobs = length(times)
if length(status) != nobs
throw(DimensionMismatch("there must be as many event statuses as times"))
end
if nobs == 0
throw(ArgumentError("the sample must be nonempty"))
end
p = sortperm(times)
t = times[p]
s = BitVector(status[p])
return _estimator(S, t, s)
end

function StatsBase.fit(::Type{S}, ets::AbstractVector{<:EventTime}) where S<:NonparametricEstimator
length(ets) > 0 || throw(ArgumentError("the sample must be nonempty"))
x = sort(ets)
# TODO: Refactor, since iterating over the EventTime objects directly in
# the _km loop may actually be easier/more efficient than working with
# the times and statuses as separate vectors. Plus it might be nice to
# make this method the One True Method™ so that folks are encouraged to
# use EventTimes instead of raw values.
return fit(S, map(t->t.time, x), BitVector(map(t->t.status, x)))
end
93 changes: 4 additions & 89 deletions src/kaplanmeier.jl
Original file line number Diff line number Diff line change
Expand Up @@ -24,96 +24,11 @@ struct KaplanMeier{T<:Real} <: NonparametricEstimator
stderr::Vector{Float64}
end

# Internal Kaplan-Meier function with the following assumptions:
# * The input is nonempty
# * Time 0 is not included
function _km(tte::AbstractVector{T}, status::BitVector) where {T}
nobs = length(tte)
dᵢ = 0 # Number of observed events at time t
cᵢ = 0 # Number of censored events at time t
nᵢ = nobs # Number remaining at risk at time t
km = 1.0 # Ŝ(t)
gw = 0.0 # SE(log(Ŝ(t)))
estimator_start(::Type{KaplanMeier}) = 1.0 # Estimator starting point
stderr_start(::Type{KaplanMeier}) = 0.0 # StdErr starting point

times = T[] # The set of unique event times
nevents = Int[] # Total observed events at each time
ncensor = Int[] # Total censored events at each time
natrisk = Int[] # Number at risk at each time
survival = Float64[] # Survival estimates
stderr = Float64[] # Pointwise standard errors for log(Ŝ(t))

t_prev = zero(T)

@inbounds for i = 1:nobs
t = tte[i]
s = status[i]
# Aggregate over tied times
if t == t_prev
dᵢ += s
cᵢ += !s
continue
elseif !iszero(t_prev)
km *= 1 - dᵢ / nᵢ
gw += dᵢ / (nᵢ * (nᵢ - dᵢ))
push!(times, t_prev)
push!(nevents, dᵢ)
push!(ncensor, cᵢ)
push!(natrisk, nᵢ)
push!(survival, km)
push!(stderr, sqrt(gw))
end
nᵢ -= dᵢ + cᵢ
dᵢ = s
cᵢ = !s
t_prev = t
end

# We need to do this one more time to capture the last time
# since everything in the loop is lagged
push!(times, t_prev)
push!(nevents, dᵢ)
push!(ncensor, cᵢ)
push!(natrisk, nᵢ)
push!(survival, km)
push!(stderr, sqrt(gw))

return KaplanMeier{T}(times, nevents, ncensor, natrisk, survival, stderr)
end

"""
fit(::Type{KaplanMeier}, times, status)

Given a vector of times to events and a corresponding vector of indicators that
dictate whether each time is an observed event or is right censored, compute the
Kaplan-Meier estimate of the survivor function. Returns a [`KaplanMeier`](@ref)
object.
"""
function StatsBase.fit(::Type{KaplanMeier},
times::AbstractVector{T},
status::AbstractVector{<:Integer}) where {T}
nobs = length(times)
if length(status) != nobs
throw(DimensionMismatch("there must be as many event statuses as times"))
end
if nobs == 0
throw(ArgumentError("the sample must be nonempty"))
end
p = sortperm(times)
t = times[p]
s = BitVector(status[p])
return _km(t, s)
end

function StatsBase.fit(::Type{KaplanMeier}, ets::AbstractVector{<:EventTime})
length(ets) > 0 || throw(ArgumentError("the sample must be nonempty"))
x = sort(ets)
# TODO: Refactor, since iterating over the EventTime objects directly in
# the _km loop may actually be easier/more efficient than working with
# the times and statuses as separate vectors. Plus it might be nice to
# make this method the One True Method™ so that folks are encouraged to
# use EventTimes instead of raw values.
return _km(map(t->t.time, x), BitVector(map(t->t.status, x)))
end
estimator_update(::Type{KaplanMeier}, es, dᵢ, nᵢ) = es * (1 - dᵢ / nᵢ) # Estimator update rule
stderr_update(::Type{KaplanMeier}, gw, dᵢ, nᵢ) = gw + dᵢ / (nᵢ * (nᵢ - dᵢ)) # StdErr update rule

"""
confint(km::KaplanMeier, α=0.05)
Expand Down
43 changes: 43 additions & 0 deletions src/nelsonaalen.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
"""
NelsonAalen

An immutable type containing cumulative hazard function estimates computed
using the Nelson-Aalen method.
The type has the following fields:

* `times`: Distinct event times
* `nevents`: Number of observed events at each time
* `ncensor`: Number of right censored events at each time
* `natrisk`: Size of the risk set at each time
* `chaz`: Estimate of the cumulative hazard at each time
* `stderr`: Standard error of the cumulative hazard
Use `fit(NelsonAalen, ...)` to compute the estimates and construct
this type.
"""
struct NelsonAalen{T<:Real} <: NonparametricEstimator
times::Vector{T}
nevents::Vector{Int}
ncensor::Vector{Int}
natrisk::Vector{Int}
chaz::Vector{Float64}
stderr::Vector{Float64}
end

estimator_start(::Type{NelsonAalen}) = 0.0 # Estimator starting point
stderr_start(::Type{NelsonAalen}) = 0.0 # StdErr starting point

estimator_update(::Type{NelsonAalen}, es, dᵢ, nᵢ) = es + dᵢ / nᵢ # Estimator update rule
stderr_update(::Type{NelsonAalen}, gw, dᵢ, nᵢ) = gw + dᵢ * (nᵢ - dᵢ) / (nᵢ^3) # StdErr update rule

"""
confint(na::NelsonAalen, α=0.05)

Compute the pointwise confidence intervals for the cumulative hazard
function as a vector of tuples.
"""
function StatsBase.confint(na::NelsonAalen, α::Float64=0.05)
q = quantile(Normal(), 1 - α/2)
return map(na.chaz, na.stderr) do srv, se
srv - q * se, srv + q * se
end
end
39 changes: 38 additions & 1 deletion test/runtests.jl
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
using Survival
using Compat
using Compat.Test
using Distributions
using DataFrames, StatsModels, StatsBase, CSV

@testset "Event times" begin
Expand Down Expand Up @@ -145,6 +146,42 @@ end
@test_throws ArgumentError fit(KaplanMeier, EventTime{Int}[])
end

@testset "Nelson-Aalen" begin
t = [
310, 361, 654, 728, 61, 81, 520, 473, 107, 122, 965, 731, 153, 433, 145, 95, 765,
735, 5, 687, 345, 444, 60, 208, 821, 305, 226, 426, 705, 363, 167, 641, 740, 245,
588, 166, 559, 450, 529, 351, 201, 524, 199, 550, 551, 543, 293, 511, 511, 371, 201,
62, 356, 340, 315, 182, 364, 376, 384, 268, 266, 194, 348, 382, 296, 186, 145, 269,
350, 272, 292, 332, 285, 243, 276, 79, 240, 202, 235, 224, 239, 173, 252, 92, 192,
211, 175, 203, 105, 177,
]
s = [
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1,
0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0,
0, 0, 0, 0, 0, 0,
]

na = fit(NelsonAalen, t, s)
km = fit(KaplanMeier, t, s)

@test na.times == km.times
@test na.nevents == km.nevents
@test na.ncensor == km.ncensor
@test na.natrisk == km.natrisk
@test exp.(-na.chaz[1:50]) ≈ km.survival[1:50] rtol = 1e-2
@test na.stderr[1:50] ≈ km.stderr[1:50] rtol = 2e-2
na_conf = confint(na)
na_lower, na_upper = getindex.(na_conf, 1), getindex.(na_conf, 2)
@test cdf.(Normal.(na.chaz, na.stderr), na_lower) ≈ fill(0.025, length(na.chaz)) rtol = 1e-8
@test cdf.(Normal.(na.chaz, na.stderr), na_upper) ≈ fill(0.975, length(na.chaz)) rtol = 1e-8
na_conf = confint(na, 0.01)
na_lower, na_upper = getindex.(na_conf, 1), getindex.(na_conf, 2)
@test cdf.(Normal.(na.chaz, na.stderr), na_lower) ≈ fill(0.005, length(na.chaz)) rtol = 1e-8
@test cdf.(Normal.(na.chaz, na.stderr), na_upper) ≈ fill(0.995, length(na.chaz)) rtol = 1e-8
end


@testset "Cox" begin
filepath = joinpath(@__DIR__, "rossi.csv")
rossi = CSV.read(filepath)
Expand Down Expand Up @@ -220,4 +257,4 @@ end
end
@test_throws ErrorException Survival.newton_raphson(wrong_fgh!, [2.2])

end
end