From 3e9934638ba2aba2cef308021fc71d1cbd2d6e32 Mon Sep 17 00:00:00 2001 From: Tim Holy Date: Wed, 21 Aug 2024 09:24:56 -0500 Subject: [PATCH 1/2] Switch to BioStructures for PDB/structure rep BioStructures is poised to become the standard package for protein structure I/O and representation. While MIToS is retained for the MSA functionality and a few very useful data tables (vanderwaals radii and atom-level amino acid traits), otherwise this is a complete switch. It's somewhat surprising how few changes were needed to the tests (some of which were essentially cleanups that could have happened earlier). --- NEWS.md | 7 +++ Project.toml | 6 +-- README.md | 4 +- src/GPCRAnalysis.jl | 25 +++++++---- src/align.jl | 103 +++++++++++++++++++++++++------------------- src/alphafold.jl | 12 +++--- src/analyze.jl | 73 ++++++++++++++----------------- src/deprecated.jl | 4 -- src/features.jl | 73 ++++++++++++++++--------------- src/forces.jl | 12 +++--- src/msa.jl | 35 ++++++++++++--- src/pocket.jl | 20 +++++---- src/properties.jl | 40 ++++++++--------- src/tmalign.jl | 28 ++++++------ src/utils.jl | 32 +++++++++++--- test/deprecated.jl | 11 ----- test/runtests.jl | 56 ++++++++++++------------ 17 files changed, 302 insertions(+), 239 deletions(-) create mode 100644 NEWS.md delete mode 100644 src/deprecated.jl delete mode 100644 test/deprecated.jl diff --git a/NEWS.md b/NEWS.md new file mode 100644 index 0000000..9f593ad --- /dev/null +++ b/NEWS.md @@ -0,0 +1,7 @@ +# v0.5.0 + +Breaking changes: +- BioStructures and MIToS implement different parsers for PDBFiles and different representations of structures. As of BioStructures v4, there is momentum for standardizing the Julia ecosystem around BioStructures. GPCRAnalysis version 0.5.0 switches from MIToS to Biostructures. `getchain` now returns the BioStructures representation, which requires users to switch to their API for further manipulations. + + MIToS is still used for its Multiple Sequence Alignment (MSA) module. +- The `align*` utilities now return a transformation, rather than the transformed structure. Users should use `applytransform!(chain, tform)` or `applytransform!(copy(chain), tform)` manually. An advantage of this approach is that it can also be used to align complexes based on a superposition computed from alignment of individual chains. diff --git a/Project.toml b/Project.toml index 2e530da..5a5e5bd 100644 --- a/Project.toml +++ b/Project.toml @@ -1,11 +1,11 @@ name = "GPCRAnalysis" uuid = "c1d73f9e-d42a-418a-8d5b-c7b00ec0358f" authors = ["Tim Holy and contributors"] -version = "0.4.3" +version = "0.5.0" [deps] +BioStructures = "de9282ab-8554-53be-b2d6-f6c222edabfc" ColorTypes = "3da002f7-5984-5a60-b8a6-cbb66c0b333f" -CoordinateTransformations = "150eb455-5306-5404-9cee-2592286d6298" Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7" Downloads = "f43a241f-c20a-4ad4-852c-f6b1247861c6" FixedPointNumbers = "53c48c17-4a7d-5ca2-90c5-79b7896eea93" @@ -34,8 +34,8 @@ JuMP = "4076af6c-e467-56ae-b986-b466b2749572" GPCRAnalysisJuMPExt = ["JuMP", "HiGHS"] [compat] +BioStructures = "4.2" ColorTypes = "0.11" -CoordinateTransformations = "0.6" Distances = "0.10" Downloads = "1" FixedPointNumbers = "0.8" diff --git a/README.md b/README.md index f882163..d3dc1b8 100644 --- a/README.md +++ b/README.md @@ -5,4 +5,6 @@ [![Coverage](https://codecov.io/gh/HolyLab/GPCRAnalysis.jl/branch/main/graph/badge.svg)](https://codecov.io/gh/HolyLab/GPCRAnalysis.jl) A suite for analyzing G-protein coupled receptors. Much of this is just a convenience wrapper and/or extension of -[MIToS](https://github.com/diegozea/MIToS.jl), and some of this code may move there or elsewhere eventually. +[BioStructures](https://github.com/BioJulia/BioStructures.jl) and [MIToS](https://github.com/diegozea/MIToS.jl). + +See NEWS.md for major changes. diff --git a/src/GPCRAnalysis.jl b/src/GPCRAnalysis.jl index 0f41f1d..fab9be1 100644 --- a/src/GPCRAnalysis.jl +++ b/src/GPCRAnalysis.jl @@ -4,10 +4,14 @@ using Downloads using Statistics using LinearAlgebra -using MIToS -using MIToS.Pfam -using MIToS.MSA -using MIToS.PDB +using BioStructures + +using MIToS: MIToS, Pfam, MSA +using MIToS.MSA: AbstractMultipleSequenceAlignment, AnnotatedAlignedSequence, AnnotatedMultipleSequenceAlignment, + ReducedAlphabet, GAP, XAA +using MIToS.MSA: getsequence, getannotsequence, getsequencemapping, getresidues, three2residue, sequencenames, + filtersequences!, percentsimilarity +using MIToS.PDB: vanderwaalsradius, ishydrophobic, isaromatic, iscationic, isanionic, ishbonddonor, ishbondacceptor using MultivariateStats using Distances @@ -20,7 +24,6 @@ using TravelingSalesmanHeuristics using Interpolations using MutableConvexHulls -using CoordinateTransformations using GaussianMixtureAlignment # For querying the Uniprot and Alphafold REST APIs @@ -32,7 +35,11 @@ using GZip using FixedPointNumbers using ColorTypes -export @res_str +const ChainLike = Union{Chain, AbstractVector{<:AbstractResidue}} # an iterable of residues +const ResidueLike = Union{Residue, AbstractVector{<:AbstractAtom}} # an iterable of atoms +const StructureLike = Union{ChainLike, Model, MolecularStructure} + +# export @res_str export SequenceMapping, AccessionCode, MSACode, NWGapCosts export species, uniprotX, query_uniprot_accession, query_ebi_proteins, query_ncbi @@ -40,8 +47,9 @@ export try_download_alphafold, query_alphafold_latest, download_alphafolds, alph findall_subseq, pLDDT, pLDDTcolor export align_to_axes, align_to_membrane, align_nw, align_ranges export filter_species!, filter_long!, sortperm_msa, chimerax_script -export project_sequences, columnwise_entropy, align, residue_centroid, residue_centroid_matrix, alphacarbon_coordinates, alphacarbon_coordinates_matrix, mapclosest, chargelocations, positive_locations, negative_locations -export StructAlign, residueindex, ismapped +export project_sequences, columnwise_entropy, align, residue_centroid, residue_centroid_matrix, alphacarbon_coordinates, + alphacarbon_coordinates_matrix, mapclosest, chargelocations, positive_locations, negative_locations +export StructAlign, residueindex, ismapped, skipnothing export BWScheme, lookupbw export aa_properties, aa_properties_zscored export sidechaincentroid, scvector, inward_tm_residues, inward_ecl_residues @@ -61,6 +69,5 @@ include("chimerax.jl") include("pocket.jl") include("features.jl") include("forces.jl") -include("deprecated.jl") end diff --git a/src/align.jl b/src/align.jl index e8aef64..ed099f3 100644 --- a/src/align.jl +++ b/src/align.jl @@ -1,13 +1,16 @@ ## Minimum-distance alignment """ - align(fixedpos::AbstractMatrix{Float64}, moving::AbstractVector{PDBResidue}, sm::SequenceMapping) - align(fixed::AbstractVector{PDBResidue}, moving::AbstractVector{PDBResidue}, sm::SequenceMapping) - -Return a rotated and shifted version of `moving` so that the α-carbons of residues `moving[sm]` have least -mean square error deviation from positions `fixedpos` or those of residues `fixed`. + tform = align(fixedpos::AbstractMatrix{Float64}, moving::Chain, sm::SequenceMapping) + tform = align(fixed::Chain, moving::Chain, sm::SequenceMapping) + +Return a rotated and shifted version of `moving` so that the centroids of +residues `moving[sm]` have least mean square error deviation from positions +`fixedpos` or those of residues `fixed`. `fixed` or `fixedpos` should include +just the residues or positions you want to align to, but `moving` should be an +entire chain. """ -function align(fixedpos::AbstractMatrix{Float64}, moving::AbstractVector{PDBResidue}, sm::SequenceMapping; seqname=nothing, warn::Bool=true) +function align(fixedpos::AbstractMatrix{Float64}, moving::StructureLike, sm::SequenceMapping; seqname=nothing, warn::Bool=true) length(sm) == size(fixedpos, 2) || throw(DimensionMismatch("reference has $(size(fixedpos, 2)) positions, but `sm` has $(length(sm))")) movres = moving[sm] keep = map(!isnothing, movres) @@ -16,36 +19,33 @@ function align(fixedpos::AbstractMatrix{Float64}, moving::AbstractVector{PDBResi end movpos = residue_centroid_matrix(movres[keep]) fixedpos = fixedpos[:,keep] - refmean = mean(fixedpos; dims=2) - movmean = mean(movpos; dims=2) - R = MIToS.PDB.kabsch((fixedpos .- refmean)', (movpos .- movmean)') - return change_coordinates(moving, (coordinatesmatrix(moving) .- movmean') * R .+ refmean') + return Transformation(movpos, fixedpos, sm[keep], findall(keep)) end -align(fixed::AbstractVector{PDBResidue}, moving::AbstractVector{PDBResidue}, sm::SequenceMapping; kwargs...) = - align(residue_centroid_matrix(fixed), moving, sm; kwargs...) +align(fixed, moving::StructureLike, sm::SequenceMapping; kwargs...) = align(residue_centroid_matrix(fixed), moving, sm; kwargs...) -function mapclosest(mapto::AbstractVector{PDBResidue}, mapfrom::AbstractVector{PDBResidue}) +function mapclosest(mapto, mapfrom) refcenters = residue_centroid_matrix(mapto) seqcenters = residue_centroid_matrix(mapfrom) D = pairwise(Euclidean(), refcenters, seqcenters; dims=2) assignment, _ = hungarian(D) - return collect(zip(assignment, [j == 0 ? convert(eltype(D), NaN) : D[i, j] for (i, j) in enumerate(assignment)])) - # d, m = findmin(D; dims=1) - # return [mi[1] => di for (mi, di) in zip(vec(m), vec(d))] + fillval = convert(eltype(D), NaN) + return collect(zip(assignment, [j == 0 ? fillval : D[i, j] for (i, j) in enumerate(assignment)])) end ## Align to membrane """ - newchain = align_to_membrane(chain::AbstractVector{PDBResidue}, tms; extracellular=true) + tform = align_to_membrane(chain::ChainLike, tms; extracellular=true) -Align the `chain` to the membrane, given the transmembrane segments `tms` as -residue-indexes (e.g., `[37:61, 74:96, ...]`). `extracelluar` should be true if -the N-terminus of the protein is extracellular (`chain[first(tms[1])]` is at the -extracellular face), and false otherwise. +Compute the rigid transformation `tform` needed to align `chain` to the +membrane, given the transmembrane segments `tms` as residue-indexes (e.g., +`[37:61, 74:96, ...]`). `extracelluar` should be true if the N-terminus of the +protein is extracellular (`chain[first(tms[1])]` is at the extracellular face), +and false otherwise. -`newchain` is oriented so that the center of the membrane is `z=0` and -extracellular is positive. +`applytransform!(chain, tform)` (or the `model` that includes `chain`) will +re-orient `chain` so that the center of the membrane is `z=0` and extracellular +is positive. The algorithm finds the membrane normal `u` by maximizing the ratio @@ -53,13 +53,13 @@ The algorithm finds the membrane normal `u` by maximizing the ratio ----------------- sumᵢ (u ⋅ δᵢ)² -where `vᵢ` is a vector parallel to the `i`th TM helix, and `δᵢ` is a within-leaflet -atomic displacement. +where `vᵢ` is a vector parallel to the `i`th TM helix, and `δᵢ` is a +within-leaflet atomic displacement. """ -function align_to_membrane(chain::AbstractVector{PDBResidue}, tms; extracellular::Bool=true) +function align_to_membrane(chain::ChainLike, tms; extracellular::Bool=true) function leaflet_displacements(residues) - coords = reduce(vcat, [coordinatesmatrix(r) for r in residues]) - meanpos = mean(coords, dims=1) + coords = reduce(hcat, [coordarray(r) for r in residues]) + meanpos = mean(coords, dims=2) return coords .- meanpos end @@ -79,33 +79,38 @@ function align_to_membrane(chain::AbstractVector{PDBResidue}, tms; extracellular # eigenvector corresponding to the smallest eigenvalue of the matrix. μtm = mean(tm_vectors) A = sum(u -> (Δu = u - μtm; Δu * Δu'), tm_vectors) - B = delta_i'*delta_i + delta_e'*delta_e + B = delta_i*delta_i' + delta_e*delta_e' λ, u = eigen(A, B) u = u[:, argmax(λ)] u = u / norm(u) # Find the projection by `u` of atom coordinates in each leaflet - proj_e = reduce(vcat, [coordinatesmatrix(r) for r in leaflet_e]) * u - proj_i = reduce(vcat, [coordinatesmatrix(r) for r in leaflet_i]) * u + proj_e = reduce(hcat, [coordarray(r) for r in leaflet_e])' * u + proj_i = reduce(hcat, [coordarray(r) for r in leaflet_i])' * u # Compute a rotation that aligns u to the z-axis t = [0, 0, mean(proj_e) > mean(proj_i) ? 1 : -1] # extracellular is positive q = QuatRotation(1 + u'*t, cross(u, t)...) # Apply the rotation to the coordinates, and offset the z-coordinate to the mean of the leaflets - coords = coordinatesmatrix(chain) - μ = vec(mean(coords, dims=1)) - newcoords = (q * (coords' .- μ))' .+ [0, 0, (median(proj_e) + median(proj_i))/2 - μ'*u]' - return change_coordinates(chain, newcoords) + ccoords = coordarray(chain) + μ = vec(mean(ccoords, dims=2)) + return Transformation(μ, [0, 0, (median(proj_e) + median(proj_i))/2 - μ'*u], RotMatrix(q)) end +""" + tform = align_to_axes(strct) + +Compute the transformation needed to align the principle axes of inertia of +`strct` with the coordinate axes. +""" function align_to_axes(strct) - coords = coordinatesmatrix(strct) - coords = coords .- mean(coords, dims=1) - tform = GaussianMixtureAlignment.inertial_transforms(coords')[1] - tform = AffineMap(Matrix(tform.linear),[tform.translation...]) - return change_coordinates(strct, tform(coords')') + scoords = coordarray(strct) + μ = mean(scoords, dims=2) + scoords = scoords .- μ + rotmtrx = GaussianMixtureAlignment.inertial_transforms(scoords)[1] + return Transformation(μ, [0, 0, 0], rotmtrx) end -function collect_leaflet_residues(chain, tms, extracellular) - leaflet_e, leaflet_i = PDBResidue[], PDBResidue[] +function collect_leaflet_residues(chain, tms, extracellular::Bool) + leaflet_e, leaflet_i = Residue[], Residue[] for r in tms start, stop = extrema(r) if extracellular @@ -122,6 +127,11 @@ end ## Needleman-Wunsch alignment +# This implements sequence alignment based on three-dimensional structure, where +# the cost of aligning two residues is the Euclidean distance between their +# centroids. The Needleman-Wunsch algorithm is used to find the optimal alignment +# of two sequences. + @enum NWParent NONE DIAG LEFT UP Base.show(io::IO, p::NWParent) = print(io, p == NONE ? 'X' : p == DIAG ? '↖' : @@ -203,8 +213,11 @@ Find the optimal `ϕ` matching `seq1[ϕ[k][1]]` to `seq2[ϕ[k][2]]` for all `k`. `mode` controls the computation of pairwise matching penalties, and can be either `:distance` or `:distance_orientation`, where the latter adds any mismatch in sidechain orientation to the distance penalty. + +`seq1` and `seq2` must be aligned to each other in 3D space before calling this +function. See [`align`](@ref). """ -function align_nw(seq1::AbstractVector{PDBResidue}, seq2::AbstractVector{PDBResidue}, gapcosts::NWGapCosts; +function align_nw(seq1::ChainLike, seq2::ChainLike, gapcosts::NWGapCosts; mode::Symbol=:distance_orientation) supported_modes = (:distance, :distance_orientation) mode ∈ supported_modes || throw(ArgumentError("supported modes are $(supported_modes), got $mode")) @@ -227,7 +240,7 @@ Transfer `refranges`, a list of reside index spans in `seq2`, to `seq1`. `seq1` `seq2` must be spatially aligned, and the assignment is made by minimizing inter-chain distance subject to the constraint of preserving sequence order. """ -function align_ranges(seq1::AbstractVector{PDBResidue}, seq2::AbstractVector{PDBResidue}, refranges::AbstractVector{<:AbstractUnitRange}; kwargs...) +function align_ranges(seq1::ChainLike, seq2::AbstractVector{<:AbstractResidue}, refranges::AbstractVector{<:AbstractUnitRange}; kwargs...) anchoridxs = sizehint!(Int[], length(refranges)*2) for r in refranges push!(anchoridxs, first(r), last(r)) @@ -237,6 +250,8 @@ function align_ranges(seq1::AbstractVector{PDBResidue}, seq2::AbstractVector{PDB @assert last.(ϕ) == eachindex(anchoridxs) return [ϕ[i][1]:ϕ[i+1][1] for i in 1:2:length(ϕ)] end +align_ranges(seq1::ChainLike, seq2::Chain, refranges::AbstractVector{<:AbstractUnitRange}; kwargs...) = + align_ranges(seq1, collectresidues(seq2), refranges; kwargs...) function score_nw(D::AbstractMatrix, gapcosts::NWGapCosts) Base.require_one_based_indexing(D) diff --git a/src/alphafold.jl b/src/alphafold.jl index a644e55..ebd4dd8 100644 --- a/src/alphafold.jl +++ b/src/alphafold.jl @@ -89,7 +89,7 @@ end Attempt to download an [AlphaFold](https://alphafold.com/) structure. Returns `nothing` if no entry corresponding to `uniprotXname` exists; otherwise -it return `path`, the pathname of the saved file. +it returns `path`, the pathname of the saved file. In general, a better approach is to use [`download_alphafolds`](@ref) for multiple proteins, or [`query_alphafold_latest`](@ref) combined with `Downloads.download` for a single protein. @@ -154,13 +154,13 @@ function pLDDTcolor(pLDDT::Real) end """ - pLDDTcolor(r::PDBResidue) + pLDDTcolor(r::Residue) pLDDTcolor(score::Real) Return the color corresponding to the pLDDT score (a measure of confidence) of a residue. """ -pLDDTcolor(r::PDBResidue) = pLDDTcolor(pLDDT(r)) -pLDDTcolor(a::PDBAtom) = pLDDTcolor(pLDDT(a)) +pLDDTcolor(r::Residue) = pLDDTcolor(pLDDT(r)) +pLDDTcolor(a::Atom) = pLDDTcolor(pLDDT(a)) -pLDDT(r::PDBResidue) = pLDDT(only(unique(a -> a.B, r.atoms))) -pLDDT(a::PDBAtom) = parse(Float64, a.B) +pLDDT(r::Residue) = only(unique(pLDDT, r)) +pLDDT(a::Atom) = tempfactor(a) diff --git a/src/analyze.jl b/src/analyze.jl index c41e95c..b7d4a67 100644 --- a/src/analyze.jl +++ b/src/analyze.jl @@ -41,10 +41,8 @@ function columnwise_entropy(msa::AbstractMultipleSequenceAlignment, aacode=reduc return map(_entropy, eachcol(resnum)) end -MSA.Residue(r::PDBResidue) = three2residue(r.id.name) - """ - residue_centroid(r::PDBResidue) + residue_centroid(r::AbstractResidue) Compute the mean position of all atoms in `r` excluding hydrogen. @@ -53,7 +51,7 @@ they incorporate the orientation of the side chain relative to the overall fold. See also [`residue_centroid_matrix`](@ref). """ -residue_centroid(r::PDBResidue) = mean(atom -> atom.coordinates, Iterators.filter(atom -> atom.element != "H", r.atoms)) +residue_centroid(r::AbstractResidue) = vec(mean(coordarray(r, heavyatomselector); dims=2)) """ residue_centroid_matrix(seq) @@ -63,23 +61,21 @@ Return a matrix of all residue centroids as columns. See also [`residue_centroid residue_centroid_matrix(seq) = reduce(hcat, map(residue_centroid, seq)) """ - alphacarbon_coordinates(res::PDBResidue) + alphacarbon_coordinates(res::AbstractResidue) Return the coordinates of the α-carbon in `res`. """ -alphacarbon_coordinates(r) = r.atoms[findfirst(a -> a.atom === "CA", r.atoms)].coordinates +alphacarbon_coordinates(r) = coords(firstmatch(calphaselector, r)) """ alphacarbon_coordinates_matrix(seq) Return a matrix of αC coordinates as columns across all residues. See also [`alphacarbon_coordinates`](@ref). """ -function alphacarbon_coordinates_matrix(seq) - return hcat([[r.atoms[findfirst(a -> a.atom === "CA", r.atoms)].coordinates...] for r in seq]...) -end +alphacarbon_coordinates_matrix(seq) = reduce(hcat, map(alphacarbon_coordinates, seq)) """ - chargelocations(pdb::AbstractVector{PDBResidue}; include_his::Bool=false) + chargelocations(chain::ChainLike; include_his::Bool=false) Return a list of potential charge locations in the protein structure. Each is a tuple `(position, residueindex, AAname)`. The positions are those of `N` (in positively-charged residues like Arg & Lys) or `O` (in negatively-charged residues like @@ -88,43 +84,40 @@ the location of each potential charge will be listed (1 for Lys, 2 each for Arg, By default, histidine is not considered charged, but you can include it by setting `include_His=true`. """ -function chargelocations(pdb::AbstractVector{PDBResidue}; include_His::Bool=false) - argpat(atom) = startswith(atom.atom, "NH") # arginine - lyspat(atom) = atom.atom == "NZ" # lysine - asppat(atom) = startswith(atom.atom, "OD") # aspartate - glupat(atom) = startswith(atom.atom, "OE") # glutamate - hispat(atom) = startswith(atom.atom, r"N[DE]") # histidine - - locs = Tuple{Coordinates,Int,String}[] - for (i, res) in pairs(pdb) - name = res.id.name +function chargelocations(chain::ChainLike; include_His::Bool=false) + argpat(atom) = startswith(atomname(atom), "NH") # arginine + lyspat(atom) = atomname(atom) == "NZ" # lysine + asppat(atom) = startswith(atomname(atom), "OD") # aspartate + glupat(atom) = startswith(atomname(atom), "OE") # glutamate + hispat(atom) = startswith(atomname(atom), r"N[DE]") # histidine + + locs = Tuple{typeof(coords(first(first(chain)))),Int,String}[] + for (i, res) in enumerate(chain) + name = resname(res) if name == "ARG" - idx = findfirst(argpat, res.atoms) - push!(locs, (res.atoms[idx].coordinates, i, name)) - idx = findnext(argpat, res.atoms, idx+1) - push!(locs, (res.atoms[idx].coordinates, i, name)) + for a in res + argpat(a) && push!(locs, (coords(a), i, name)) + end elseif name == "LYS" - idx = findfirst(lyspat, res.atoms) - push!(locs, (res.atoms[idx].coordinates, i, name)) + for a in res + lyspat(a) && push!(locs, (coords(a), i, name)) + end elseif name == "ASP" - idx = findfirst(asppat, res.atoms) - push!(locs, (res.atoms[idx].coordinates, i, name)) - idx = findnext(asppat, res.atoms, idx+1) - push!(locs, (res.atoms[idx].coordinates, i, name)) + for a in res + asppat(a) && push!(locs, (coords(a), i, name)) + end elseif name == "GLU" - idx = findfirst(glupat, res.atoms) - push!(locs, (res.atoms[idx].coordinates, i, name)) - idx = findnext(glupat, res.atoms, idx+1) - push!(locs, (res.atoms[idx].coordinates, i, name)) + for a in res + glupat(a) && push!(locs, (coords(a), i, name)) + end elseif include_His && name == "HIS" - idx = findfirst(hispat, res.atoms) - push!(locs, (res.atoms[idx].coordinates, i, name)) - idx = findnext(hispat, res.atoms, idx+1) - push!(locs, (res.atoms[idx].coordinates, i, name)) + for a in res + hispat(a) && push!(locs, (coords(a), i, name)) + end end end return locs end -positive_locations(chargelocs::AbstractVector{Tuple{Coordinates,Int,String}}) = [loc[1] for loc in chargelocs if loc[3] ∈ ("ARG", "LYS", "HIS")] -negative_locations(chargelocs::AbstractVector{Tuple{Coordinates,Int,String}}) = [loc[1] for loc in chargelocs if loc[3] ∈ ("ASP", "GLU")] +positive_locations(chargelocs::AbstractVector{<:Tuple{AbstractVector,Int,String}}) = [loc[1] for loc in chargelocs if loc[3] ∈ ("ARG", "LYS", "HIS")] +negative_locations(chargelocs::AbstractVector{<:Tuple{AbstractVector,Int,String}}) = [loc[1] for loc in chargelocs if loc[3] ∈ ("ASP", "GLU")] diff --git a/src/deprecated.jl b/src/deprecated.jl deleted file mode 100644 index ac878c5..0000000 --- a/src/deprecated.jl +++ /dev/null @@ -1,4 +0,0 @@ -function markers(modelnum::Integer, positions::AbstractVector{<:AbstractVector{<:Real}}, radius::Real, color) - Base.depwarn("`markers` is deprecated, use `marker.(modelnum, positions, radius, color)` instead", :markers) - return [marker(modelnum, pos, radius, color) for pos in positions] -end diff --git a/src/features.jl b/src/features.jl index d2f6f0a..7b9bd8f 100644 --- a/src/features.jl +++ b/src/features.jl @@ -1,22 +1,23 @@ const σdflt = (; hbond=1, ionic=5, aromatic=3) # rough guesses for the interaction distance, in Å (separation is 2σ) equalvolumeradius(radii) = sum(radii.^3)^(1/3) -equalvolumeradius(atoms::AbstractVector{PDBAtom}, r::PDBResidue) = equalvolumeradius([vanderwaalsradius[(r.id.name, a.atom)] for a in atoms]) -equalvolumeradius(atoms::AbstractVector{PDBAtom}, r::PDBResidue, ::Symbol) = equalvolumeradius(atoms, r) +equalvolumeradius(atoms::ResidueLike, r::AbstractResidue) = equalvolumeradius([vanderwaalsradius[(resname(r), atomname(a))] for a in atoms]) +equalvolumeradius(atoms::ResidueLike, r::AbstractResidue, ::Symbol) = equalvolumeradius(atoms, r) function atomic_σfun(a, r, f::Symbol; σhbond=σdflt.hbond, σionic=σdflt.ionic, σaromatic=σdflt.aromatic) - f === :Steric && return vanderwaalsradius[(r.id.name, a.atom)] # repulsive - f === :Hydrophobe && return 2 * vanderwaalsradius[(r.id.name, a.atom)] # attractive - f ∈ (:Donor, :Acceptor) && return sqrt(σhbond^2 + vanderwaalsradius[(r.id.name, a.atom)]^2) + aname, rname = atomname(a), resname(r) + f === :Steric && return vanderwaalsradius[(rname, aname)] # repulsive + f === :Hydrophobe && return 2 * vanderwaalsradius[(rname, aname)] # attractive + f ∈ (:Donor, :Acceptor) && return sqrt(σhbond^2 + vanderwaalsradius[(rname, aname)]^2) f ∈ (:PosIonizable, :NegIonizable) && return σionic f === :Aromatic && return σaromatic # https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3530875/ throw(ArgumentError("Unknown feature type: $f")) end function combined_σfun(atoms, r, f::Symbol; σhbond=σdflt.hbond, σionic=σdflt.ionic, σaromatic=σdflt.aromatic) - μ = mean(a -> a.coordinates, atoms) + μ = mean(a -> coords(a), atoms) length(atoms) == 1 && return equalvolumeradius(atoms, r) - σ = std([norm(a.coordinates .- μ) for a in atoms]) + equalvolumeradius(atoms, r) + σ = std([norm(coords(a) .- μ) for a in atoms]) + equalvolumeradius(atoms, r) f === :Hydrophobe && return σ + 3//2 f === :Steric && return σ f ∈ (:Donor, :Acceptor) && return sqrt(σhbond^2 + σ^2) @@ -26,7 +27,7 @@ function combined_σfun(atoms, r, f::Symbol; σhbond=σdflt.hbond, σionic=σdfl end function atomic_ϕfun(a, r, f::Symbol) - name = r.id.name + name = resname(r) if f === :PosIonizable name == "HIS" && return 0.1 name == "ARG" && return 0.5 # arginine has two ionizable atoms, but the net charge is 1 @@ -39,7 +40,7 @@ function atomic_ϕfun(a, r, f::Symbol) end function combined_ϕfun(atoms, r, f::Symbol) - name = r.id.name + name = resname(r) if f === :PosIonizable name == "HIS" && return 0.1 name == "ARG" && return 1.0 @@ -59,38 +60,39 @@ function add_feature!(mgmm::IsotropicMultiGMM, key, μ, σ, ϕ) push!(gmm, eltype(gmm)(vec(μ), σ, ϕ)) end -function add_features_from_atom!(mgmm::IsotropicMultiGMM, a::PDBAtom, r::PDBResidue, σfun, ϕfun) +function add_features_from_atom!(mgmm::IsotropicMultiGMM, a::AbstractAtom, r::AbstractResidue, σfun, ϕfun) ϕ = ϕfun(a, r, :Steric) - !iszero(ϕ) && add_feature!(mgmm, :Steric, a.coordinates, σfun(a, r, :Steric), ϕ) - if ishydrophobic(a, r.id.name) + rname, acoords = resname(r), coords(a) + !iszero(ϕ) && add_feature!(mgmm, :Steric, acoords, σfun(a, r, :Steric), ϕ) + if ishydrophobic(a, rname) ϕ = ϕfun(a, r, :Hydrophobe) - !iszero(ϕ) && add_feature!(mgmm, :Hydrophobe, a.coordinates, σfun(a, r, :Hydrophobe), ϕ) + !iszero(ϕ) && add_feature!(mgmm, :Hydrophobe, acoords, σfun(a, r, :Hydrophobe), ϕ) end - if isaromatic(a, r.id.name) + if isaromatic(a, rname) ϕ = ϕfun(a, r, :Aromatic) - !iszero(ϕ) && add_feature!(mgmm, :Aromatic, a.coordinates, σfun(a, r, :Aromatic), ϕ) + !iszero(ϕ) && add_feature!(mgmm, :Aromatic, acoords, σfun(a, r, :Aromatic), ϕ) end - if iscationic(a, r.id.name) + if iscationic(a, rname) ϕ = ϕfun(a, r, :PosIonizable) - !iszero(ϕ) && add_feature!(mgmm, :PosIonizable, a.coordinates, σfun(a, r, :PosIonizable), ϕ) + !iszero(ϕ) && add_feature!(mgmm, :PosIonizable, acoords, σfun(a, r, :PosIonizable), ϕ) end - if isanionic(a, r.id.name) + if isanionic(a, rname) ϕ = ϕfun(a, r, :NegIonizable) - !iszero(ϕ) && add_feature!(mgmm, :NegIonizable, a.coordinates, σfun(a, r, :NegIonizable), ϕ) + !iszero(ϕ) && add_feature!(mgmm, :NegIonizable, acoords, σfun(a, r, :NegIonizable), ϕ) end - if ishbonddonor(a, r.id.name) + if ishbonddonor(a, rname) ϕ = ϕfun(a, r, :Donor) - !iszero(ϕ) && add_feature!(mgmm, :Donor, a.coordinates, σfun(a, r, :Donor), ϕ) + !iszero(ϕ) && add_feature!(mgmm, :Donor, acoords, σfun(a, r, :Donor), ϕ) end - if ishbondacceptor(a, r.id.name) + if ishbondacceptor(a, rname) ϕ = ϕfun(a, r, :Acceptor) - !iszero(ϕ) && add_feature!(mgmm, :Acceptor, a.coordinates, σfun(a, r, :Acceptor), ϕ) + !iszero(ϕ) && add_feature!(mgmm, :Acceptor, acoords, σfun(a, r, :Acceptor), ϕ) end end function add_features_from_atoms!(mgmm::IsotropicMultiGMM, key, atoms, r, σfun, ϕfun) isempty(atoms) && return - μ = mean(a -> a.coordinates, atoms) + μ = mean(coords, atoms) σ = σfun(atoms, r, key) ϕ = ϕfun(atoms, r, key) add_feature!(mgmm, key, μ, σ, ϕ) @@ -98,30 +100,31 @@ end function add_features_from_residue!( mgmm::IsotropicMultiGMM, - r::PDBResidue; + r::AbstractResidue; combined = false, σfun = combined ? combined_σfun : atomic_σfun, ϕfun = combined ? combined_ϕfun : atomic_ϕfun ) if !combined - for a in r.atoms + for a in r add_features_from_atom!(mgmm, a, r, σfun, ϕfun) end else - add_features_from_atoms!(mgmm, :Steric, r.atoms, r, σfun, ϕfun) - add_features_from_atoms!(mgmm, :Hydrophobe, filter(x -> ishydrophobic(x, r.id.name), r.atoms), r, σfun, ϕfun) - add_features_from_atoms!(mgmm, :Aromatic, filter(x -> isaromatic(x, r.id.name), r.atoms), r, σfun, ϕfun) - add_features_from_atoms!(mgmm, :PosIonizable, filter(x -> iscationic(x, r.id.name), r.atoms), r, σfun, ϕfun) - add_features_from_atoms!(mgmm, :NegIonizable, filter(x -> isanionic(x, r.id.name), r.atoms), r, σfun, ϕfun) - add_features_from_atoms!(mgmm, :Donor, filter(x -> ishbonddonor(x, r.id.name), r.atoms), r, σfun, ϕfun) - add_features_from_atoms!(mgmm, :Acceptor, filter(x -> ishbondacceptor(x, r.id.name), r.atoms), r, σfun, ϕfun) + rname = resname(r) + add_features_from_atoms!(mgmm, :Steric, r, r, σfun, ϕfun) + add_features_from_atoms!(mgmm, :Hydrophobe, collectatoms(r, x -> ishydrophobic(x, rname)) , r, σfun, ϕfun) + add_features_from_atoms!(mgmm, :Aromatic, collectatoms(r, x -> isaromatic(x, rname)) , r, σfun, ϕfun) + add_features_from_atoms!(mgmm, :PosIonizable, collectatoms(r, x -> iscationic(x, rname)) , r, σfun, ϕfun) + add_features_from_atoms!(mgmm, :NegIonizable, collectatoms(r, x -> isanionic(x, rname)) , r, σfun, ϕfun) + add_features_from_atoms!(mgmm, :Donor, collectatoms(r, x -> ishbonddonor(x, rname)) , r, σfun, ϕfun) + add_features_from_atoms!(mgmm, :Acceptor, collectatoms(r, x -> ishbondacceptor(x, rname)), r, σfun, ϕfun) end end """ - mgmm = features_from_structure(seq, idxs=1:length(seq); combined=false) + mgmm = features_from_structure(seq::ChainLike, idxs=1:length(seq); combined=false) -Construct an `IsotropicMultiGMM` from a `PDBResidue` sequence `seq` by adding +Construct an `IsotropicMultiGMM` from `seq` by adding features for each residue in `idxs`. `combined=true` causes all atoms sharing the same feature to be combined, diff --git a/src/forces.jl b/src/forces.jl index 6aea990..0070b5d 100644 --- a/src/forces.jl +++ b/src/forces.jl @@ -41,11 +41,11 @@ The output is a list of 3×4 matrices, one for each residue in `seq`. See also: [`optimize_weights`](@ref), [`forcedict`](@ref). """ -function forcecomponents(seq::AbstractVector{PDBResidue}, interactions::InteractionList, residueindexes=eachindex(seq); kwargs...) +function forcecomponents(seq::AbstractVector{<:AbstractResidue}, interactions::InteractionList, residueindexes=eachindex(seq); kwargs...) # Create a MGMM for each residue mgmms = [IsotropicMultiGMM(Dict{Symbol,IsotropicGMM{3,Float64}}()) for _ in seq] - for i in eachindex(seq) - add_features_from_residue!(mgmms[i], seq[i]; kwargs...) + for (i, seqi) in enumerate(seq) + add_features_from_residue!(mgmms[i], seqi; kwargs...) end forces = forcecomponents(mgmms, interactions, residueindexes) # Project out the component of the force that would stretch or compress the covalent bonds between residues @@ -53,14 +53,14 @@ function forcecomponents(seq::AbstractVector{PDBResidue}, interactions::Interact force = forces[ii] if i > firstindex(seq) # Get the alpha-carbon to nitrogen vector - v = alpha_nitrogen(seq[i-1]).coordinates - alpha_carbon(seq[i]).coordinates + v = coords(alpha_nitrogen(seq[i-1])) - coords(alpha_carbon(seq[i])) v /= norm(v) # Project out the component of the force that would stretch or compress the bond force -= v * (v' * force) end if i < lastindex(seq) # Get the nitrogen to alpha-carbon vector - v = alpha_carbon(seq[i+1]).coordinates - alpha_nitrogen(seq[i]).coordinates + v = coords(alpha_carbon(seq[i+1])) - coords(alpha_nitrogen(seq[i])) v /= norm(v) # Project out the component of the force that would stretch or compress the bond force -= v * (v' * force) @@ -69,6 +69,8 @@ function forcecomponents(seq::AbstractVector{PDBResidue}, interactions::Interact end return forces end +forcecomponents(seq, interactions::InteractionList, residueindexes=eachindex(seq); kwargs...) = + forcecomponents(collectresidues(seq), interactions, residueindexes; kwargs...) function forcecomponents(mgmms::AbstractVector{<:IsotropicMultiGMM{N}}, interactions::InteractionList, residueindexes=eachindex(mgmms)) where N GaussianMixtureAlignment.validate_interactions(Dict(interactions)) || throw(ArgumentError("interactions must appear in only one order, got $interactions")) diff --git a/src/msa.jl b/src/msa.jl index a9da0ec..35fb5c7 100644 --- a/src/msa.jl +++ b/src/msa.jl @@ -36,7 +36,7 @@ Remove all sequences from `msa` with fewer than `minres` matching residues. function filter_long!(msa::AbstractMultipleSequenceAlignment, minres::Real) # Get rid of short sequences nresidues = map(eachrow(msa)) do v - sum(!=(Residue('-')), v) + sum(!=(MSA.Residue('-')), v) end mask = nresidues .> minres filtersequences!(msa, mask) @@ -77,6 +77,22 @@ end struct SequenceMapping <: AbstractVector{Int} seqmap::Vector{Int} end +""" + sm = SequenceMapping([4, 5, 0, ...]) + sm = SequenceMapping(seq::AnnotatedAlignedSequence) + +A `SequenceMapping` is a vector of indexes within a full sequence that map to a +reference. Specifically, `sm[i]` is the index of the residue in the full +sequence that maps to the `i`-th position in the reference. 0 is a placeholder +for a position in the reference that has no mapping to the full sequence. + +# Example + +`SequenceMapping([4, 5, 0, ...])` indicates that: +- the first position in the reference maps to the fourth residue in the full sequence, +- the second position in the reference maps to the fifth residue in the full sequence, and +- the third position in the reference lacks a corresponding residue in the full sequence. +""" SequenceMapping(seq::AnnotatedAlignedSequence) = SequenceMapping(getsequencemapping(seq)) Base.size(sm::SequenceMapping) = size(sm.seqmap) @@ -85,11 +101,18 @@ Base.setindex!(sm::SequenceMapping, v, i::Int) = sm.seqmap[i] = v Base.similar(::SequenceMapping, ::Type{Int}, dims::Dims{1}) = SequenceMapping(Vector{Int}(undef, dims)) # We restrict the following to `Vector` to prevent ambiguities -Base.getindex(fullseqvec::Vector{T}, sm::SequenceMapping) where T = any(iszero, sm.seqmap) ? - Union{T,Nothing}[i == 0 ? nothing : fullseqvec[i] for i in sm.seqmap] : - T[fullseqvec[i] for i in sm.seqmap] +Base.getindex(fullseqvec::Vector{T}, sm::SequenceMapping) where T = + Union{T,Nothing}[i == 0 ? nothing : fullseqvec[i] for i in sm.seqmap] + +Base.getindex(fullseq::Chain, sm::SequenceMapping) = getindex(collectresidues(fullseq), sm) +Base.getindex(fullseq::Model, sm::SequenceMapping) = getindex(only(fullseq), sm) +Base.getindex(fullseq::MolecularStructure, sm::SequenceMapping) = getindex(only(fullseq), sm) function Base.setindex!(fullseqvec::Vector, colvalues::AbstractVector, sm::SequenceMapping) - idxnz = sm.seqmap .> 0 - fullseqvec[sm.seqmap[idxnz]] = colvalues[idxnz] + eachindex(colvalues) === eachindex(sm) || throw(DimensionMismatch("`colvalues` and `sm` must have the same indices")) + for (v, i) in zip(colvalues, sm.seqmap) + i == 0 && continue + fullseqvec[i] = v + end + return fullseqvec end diff --git a/src/pocket.jl b/src/pocket.jl index 61d870b..07aa945 100644 --- a/src/pocket.jl +++ b/src/pocket.jl @@ -17,12 +17,12 @@ function z2tmcoords(z, allitps) return [itpos2coord(p, tmitps) for (p,tmitps) in zip(itpos, allitps)] end -function sidechaincentroid(r::PDBResidue) # ignoring mass for now - scatoms = filter(a -> a.atom ∉ ["N","CA","C","O"], r.atoms) - return length(scatoms) == 0 ? nothing : sum(a.coordinates for a in scatoms) / length(scatoms) +function sidechaincentroid(r::AbstractResidue) # ignoring mass for now + scatoms = collectatoms(r, !backboneselector) + return length(scatoms) == 0 ? nothing : sum(coords(a) for a in scatoms) / length(scatoms) end -function scvector(r::PDBResidue) +function scvector(r::AbstractResidue) cacoord = alphacarbon_coordinates(r) sccoord = sidechaincentroid(r) return sccoord === nothing ? zero(cacoord) : sccoord - cacoord @@ -33,7 +33,7 @@ function res_inside_hull(sccoord, tmcoords) # tmcoords is a list of 2D points in return insidehull(sccoord[1:2], h) end -function tm_res_is_inward(r::PDBResidue, itps) +function tm_res_is_inward(r::AbstractResidue, itps) scvec = scvector(r) all(iszero, scvec) && return false sccoord = alphacarbon_coordinates(r) .+ scvec @@ -47,14 +47,14 @@ Return an array of boolean[] indicating which residues (of those specified by `t `tmidxs` is a vector (typically of length 7, with each entry corresponding to a transmembrane region) of ranges of residue indices. """ -function inward_tm_residues(seq, tmidxs) +function inward_tm_residues(seq::AbstractVector{<:AbstractResidue}, tmidxs) tmcoords = [alphacarbon_coordinates_matrix(seq[tm]) for tm in tmidxs] itps = [[interpolate(tm[i,:], BSpline((i === 3 ? Linear() : Cubic()))) for i=1:3] for tm in tmcoords] return [[tm_res_is_inward(r, itps) for r in seq[tm]] for tm in tmidxs] end +inward_tm_residues(seq::Chain, tmidxs) = inward_tm_residues(collectresidues(seq), tmidxs) - -function ecl_res_is_inward(r::PDBResidue, topcenter) +function ecl_res_is_inward(r::AbstractResidue, topcenter) scvec = scvector(r) all(iszero, scvec) && return false accoord = alphacarbon_coordinates(r) @@ -69,8 +69,10 @@ Return an array of boolean[] indicating which residues (of those specified by `e `eclidxs` is a vector (with each entry corresponding to an extracellular loop) of ranges of residue indices. """ -function inward_ecl_residues(seq, eclidxs; includes_amino_terminus::Bool=false) +function inward_ecl_residues(seq::AbstractVector{<:AbstractResidue}, eclidxs; includes_amino_terminus::Bool=false) tm_top_idxs = filter(x -> 0 < x < length(seq), vcat([[ecl[1] - 1, ecl[end] + 1] for ecl in eclidxs[begin+includes_amino_terminus:end]]...)) topcenter = mean(alphacarbon_coordinates_matrix(seq[tm_top_idxs]); dims=2) return [[ecl_res_is_inward(r, topcenter) for r in seq[ecl]] for ecl in eclidxs] end +inward_ecl_residues(seq, eclidxs; includes_amino_terminus::Bool=false) = + inward_ecl_residues(collectresidues(seq), eclidxs; includes_amino_terminus) diff --git a/src/properties.jl b/src/properties.jl index ffd3b3f..4ef06e3 100644 --- a/src/properties.jl +++ b/src/properties.jl @@ -8,26 +8,26 @@ Base.getindex(v::AAProperties, i::Int) = i == 1 ? v.charge : i == 2 ? v.hydropathy : i == 3 ? v.volume : Base.throw_boundserror(v, i) const aa_properties = Dict( - Residue('A') => AAProperties(0, 1.8, 67), - Residue('R') => AAProperties(1, -4.5, 148), - Residue('N') => AAProperties(0, -3.5, 96), - Residue('D') => AAProperties(-1, -3.5, 91), - Residue('C') => AAProperties(0, 2.5, 86), - Residue('Q') => AAProperties(0, -3.5, 114), - Residue('E') => AAProperties(-1, -3.5, 109), - Residue('G') => AAProperties(0, -0.4, 48), - Residue('H') => AAProperties(0.1, -3.2, 118), # at pH 7.4, His has a 10% chance of being charged - Residue('I') => AAProperties(0, 4.5, 124), - Residue('L') => AAProperties(0, 3.8, 124), - Residue('K') => AAProperties(1, -3.9, 135), - Residue('M') => AAProperties(0, 1.9, 124), - Residue('F') => AAProperties(0, 2.8, 135), - Residue('P') => AAProperties(0, -1.6, 90), - Residue('S') => AAProperties(0, -0.8, 73), - Residue('T') => AAProperties(0, -0.7, 93), - Residue('W') => AAProperties(0, -0.9, 163), - Residue('Y') => AAProperties(0, -1.3, 141), - Residue('V') => AAProperties(0, 4.2, 105), + MSA.Residue('A') => AAProperties(0, 1.8, 67), + MSA.Residue('R') => AAProperties(1, -4.5, 148), + MSA.Residue('N') => AAProperties(0, -3.5, 96), + MSA.Residue('D') => AAProperties(-1, -3.5, 91), + MSA.Residue('C') => AAProperties(0, 2.5, 86), + MSA.Residue('Q') => AAProperties(0, -3.5, 114), + MSA.Residue('E') => AAProperties(-1, -3.5, 109), + MSA.Residue('G') => AAProperties(0, -0.4, 48), + MSA.Residue('H') => AAProperties(0.1, -3.2, 118), # at pH 7.4, His has a 10% chance of being charged + MSA.Residue('I') => AAProperties(0, 4.5, 124), + MSA.Residue('L') => AAProperties(0, 3.8, 124), + MSA.Residue('K') => AAProperties(1, -3.9, 135), + MSA.Residue('M') => AAProperties(0, 1.9, 124), + MSA.Residue('F') => AAProperties(0, 2.8, 135), + MSA.Residue('P') => AAProperties(0, -1.6, 90), + MSA.Residue('S') => AAProperties(0, -0.8, 73), + MSA.Residue('T') => AAProperties(0, -0.7, 93), + MSA.Residue('W') => AAProperties(0, -0.9, 163), + MSA.Residue('Y') => AAProperties(0, -1.3, 141), + MSA.Residue('V') => AAProperties(0, 4.2, 105), ) # z-score the amino acid features using the full covariance matrix diff --git a/src/tmalign.jl b/src/tmalign.jl index e6bdb69..e8472eb 100644 --- a/src/tmalign.jl +++ b/src/tmalign.jl @@ -25,43 +25,47 @@ function parse_tma(filename) lines = lines[idx+1:end] end length(lines) == 3 || error("expecting 3 lines, got $(length(lines))") - seq1 = [Residue(c) for c in lines[1]] - seq2 = [Residue(c) for c in lines[3]] + seq1 = [MSA.Residue(c) for c in lines[1]] + seq2 = [MSA.Residue(c) for c in lines[3]] quality = [c for c in lines[2]] append!(quality, fill('-', length(seq1) - length(quality))) return seq1, seq2, quality end const alignsentinel = -1 -function MapAlign(s::AbstractVector{PDBResidue}, a::AbstractVector{Residue}, quality) +function MapAlign(s::ChainLike, a::AbstractVector{MSA.Residue}, quality) s2a, a2s = fill(alignsentinel, length(s)), Int[] # we only retain the high-quality matches + sj, state = iterate(s) j = 0 for (r, q) in zip(a, quality) - r == Residue('-') && continue + r == MSA.Residue('-') && continue j += 1 - r == three2residue(s[j].id.name) || error("at position $j, residue was $(s[j].id.name) but alignment was $r") - q == ':' || continue - push!(a2s, j) - s2a[j] = length(a2s) + r == three2residue(String(resname(sj))) || error("at position $j, residue was $(resname(sj)) but alignment was $r") + if q == ':' + push!(a2s, j) + s2a[j] = length(a2s) + end + ret = iterate(s, state) + sj, state = ret === nothing ? (nothing, nothing) : ret # if we've reached s[end], continuing should be an error end return MapAlign(s2a, a2s) end -function StructAlign(struct1::AbstractVector{PDBResidue}, struct2::AbstractVector{PDBResidue}, - align1::AbstractVector{Residue}, align2::AbstractVector{Residue}, +function StructAlign(struct1::ChainLike, struct2::ChainLike, + align1::AbstractVector{MSA.Residue}, align2::AbstractVector{MSA.Residue}, quality) StructAlign(MapAlign(struct1, align1, quality), MapAlign(struct2, align2, quality)) end """ - StructAlign(struct1::AbstractVector{PDBResidue}, struct2::AbstractVector{PDBResidue}, filename::AbstractString) + StructAlign(struct1::ChainLike, struct2::ChainLike, filename::AbstractString) Create a structure-based alignment between `struct1` and `struct2`. `filename` is the name of the TM-align "results" file (e.g., https://zhanggroup.org//TM-align/example/873772.html). See also [`residueindex`](@ref). """ -StructAlign(struct1::AbstractVector{PDBResidue}, struct2::AbstractVector{PDBResidue}, filename::AbstractString) = +StructAlign(struct1::ChainLike, struct2::ChainLike, filename::AbstractString) = StructAlign(struct1, struct2, parse_tma(filename)...) """ diff --git a/src/utils.jl b/src/utils.jl index fbc4cdd..c99df90 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -38,16 +38,16 @@ function query_ncbi(id) end """ - getchain(filename::AbstractString; model="1", chain="A") + getchain(filename::AbstractString; model=1, chain="A") Read a PDB file `filename` and extract the specified chain. """ -getchain(filename::AbstractString; model="1", chain="A") = read_file(filename, PDBFile; model, chain) +getchain(filename::AbstractString; model=1, chain="A") = read(filename, PDBFormat)[model][chain] -function validate_seq_residues(seq::AnnotatedAlignedSequence, chain::AbstractVector{PDBResidue}) +function validate_seq_residues(seq::AnnotatedAlignedSequence, chain) for (i, r) in zip(getsequencemapping(seq), seq) (r == GAP || r == XAA) && continue - res = three2residue(chain[i].id.name) + res = three2residue(String(resname(chain[i]))) res == r || return false end return true @@ -78,5 +78,25 @@ function findall_subseq(subseq, seq) return starts end -alpha_nitrogen(r) = r.atoms[findfirst(a -> a.atom == "N", r.atoms)] -alpha_carbon(r) = r.atoms[findfirst(a -> a.atom == "CA", r.atoms)] \ No newline at end of file +alpha_nitrogen(r) = firstmatch(a -> atomname(a) == "N", r) +alpha_carbon(r) = firstmatch(a -> atomname(a) == "CA", r) + +function firstmatch(f, iter) + for x in iter + if f(x) + return x + end + end + return nothing +end + +function skipnothing(v::AbstractVector{Union{T,Nothing}}) where T + return T[x for x in v if x !== nothing] +end + +MIToS.PDB.ishydrophobic(a::AbstractAtom, rname::AbstractString) = (rname, atomname(a)) in MIToS.PDB._hydrophobic +MIToS.PDB.isaromatic(a::AbstractAtom, rname::AbstractString) = (rname, atomname(a)) in MIToS.PDB._aromatic +MIToS.PDB.iscationic(a::AbstractAtom, rname::AbstractString) = (rname, atomname(a)) in MIToS.PDB._cationic +MIToS.PDB.isanionic(a::AbstractAtom, rname::AbstractString) = (rname, atomname(a)) in MIToS.PDB._anionic +MIToS.PDB.ishbonddonor(a::AbstractAtom, rname::AbstractString) = (rname, atomname(a)) in keys(MIToS.PDB._hbond_donor) +MIToS.PDB.ishbondacceptor(a::AbstractAtom, rname::AbstractString) = (rname, atomname(a)) in keys(MIToS.PDB._hbond_acceptor) diff --git a/test/deprecated.jl b/test/deprecated.jl deleted file mode 100644 index 34ce5b4..0000000 --- a/test/deprecated.jl +++ /dev/null @@ -1,11 +0,0 @@ -@testset "Deprecated" begin - @info "Testing deprecated functions (warnings are expected)" - @test only(GPCRAnalysis.markers(2, [Coordinates(5, 4, 3)], 0.5, "blue")) == "marker #2 position 5.0,4.0,3.0 radius 0.5 color blue" - dot = GPCRAnalysis.marker(2, Coordinates(5, 4, 3), 0.5, "blue") - tmpfile = tempname() * ".cxc" - chimerax_script(tmpfile, ["align_ABCD.pdb", "align_EFGH.pdb"], [[5, 10, 15], [7, 11]]; extras=[dot]) - script = read(tmpfile, String) - @test !occursin("matchmaker #2-2 to #1", script) - @test occursin("marker #2 position 5.0,4.0,3.0 radius 0.5 color blue", script) - rm(tmpfile) -end diff --git a/test/runtests.jl b/test/runtests.jl index d593adb..b697607 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,7 +1,8 @@ using GPCRAnalysis -using MIToS -using MIToS.MSA -using MIToS.PDB +using MIToS: MSA, Pfam +using MIToS.MSA: @res_str, three2residue, coverage, GappedAlphabet, filtersequences!, percentsimilarity, + nsequences, sequencenames, getsequencemapping +using BioStructures using GaussianMixtureAlignment using InvertedIndices using Statistics @@ -56,9 +57,9 @@ using Test @testset "MSA" begin # The test file is copied from MIToS/test/data, with gratitude pf09645_sto = "PF09645_full.stockholm" - msa = MIToS.MSA.read_file(pf09645_sto, MIToS.Pfam.Stockholm) - @test MIToS.MSA.nsequences(filter_species!(deepcopy(msa), "ATV")) == 1 - @test MIToS.MSA.nsequences(filter_long!(deepcopy(msa), 70)) == 3 + msa = MSA.read_file(pf09645_sto, Pfam.Stockholm) + @test MSA.nsequences(filter_species!(deepcopy(msa), "ATV")) == 1 + @test MSA.nsequences(filter_long!(deepcopy(msa), 70)) == 3 idx = SequenceMapping([0, 4, 5, 0]) seqvals = fill(NaN, 9) @@ -80,7 +81,7 @@ using Test @test AccessionCode(msa, MSACode("Y070_ATV/2-70")) == AccessionCode("Q3V4T1") @test MSACode(msa, AccessionCode("Q3V4T1")) == MSACode("Y070_ATV/2-70") - @test msa[MSACode("Y070_ATV/2-70")][8] == msa[AccessionCode("Q3V4T1")][8] == Residue('V') + @test msa[MSACode("Y070_ATV/2-70")][8] == msa[AccessionCode("Q3V4T1")][8] == MSA.Residue('V') end @testset "ChimeraX" begin tmpfile = tempname() * ".cxc" @@ -92,11 +93,11 @@ using Test @test occursin("show #2 :11", script) @test occursin("transparency #1-2 80 target c", script) @test occursin("matchmaker #2-2 to #1", script) - dot = GPCRAnalysis.marker(2, Coordinates(5, 4, 3), 0.5, "blue") + dot = GPCRAnalysis.marker(2, [5, 4, 3], 0.5, "blue") chimerax_script(tmpfile, ["ABCD.pdb", "EFGH.pdb"], [[5, 10, 15], [7, 11]]; align=false, extras=[dot]) script = read(tmpfile, String) @test !occursin("matchmaker #2-2 to #1", script) - @test occursin("marker #2 position 5.0,4.0,3.0 radius 0.5 color blue", script) + @test occursin("marker #2 position 5,4,3 radius 0.5 color blue", script) rm(tmpfile) end @testset "Needleman-Wunsch" begin @@ -137,7 +138,7 @@ using Test D = [1 0 1 2 3; 3 2 1 0 1; 4 3 2 1 0; 5 4 3 2 1] @test align_nw(D, gapcosts) == [(1, 2), (2, 3), (3, 4), (4, 5)] - opsd = read_file("AF-P15409-F1-model_v4.pdb", PDBFile) + opsd = getchain("AF-P15409-F1-model_v4.pdb") opsd_tms = [37:61, 74:96, 111:133, 153:173, 203:224, 253:274, 287:308] @test align_ranges(opsd, opsd, opsd_tms) == opsd_tms @@ -153,8 +154,8 @@ using Test testscore(S, P, D, gapcosts) end @testset "Pocket residues and features" begin - opsd = read_file("AF-P15409-F1-model_v4.pdb", PDBFile) - opsdr = [three2residue(r.id.name) for r in opsd] + opsd = getchain("AF-P15409-F1-model_v4.pdb") + opsdr = [three2residue(String(resname(r))) for r in opsd] # These are not quite accurate, from https://www.uniprot.org/uniprotkb/P15409/entry#subcellular_location # the actual answer is # opsd_tms = [37:61, 74:96, 111:133, 153:173, 203:224, 253:274, 287:308] @@ -167,13 +168,13 @@ using Test only(findall_subseq(res"AEKE", opsdr)):only(findall_subseq(res"YIFT", opsdr))+3, only(findall_subseq(res"PIFM", opsdr)):only(findall_subseq(res"YIML", opsdr))+3, ] - opsd = align_to_membrane(opsd, opsd_tms) + applytransform!(opsd, align_to_membrane(opsd, opsd_tms)) # Check the alignment leaflet_e, leaflet_i = GPCRAnalysis.collect_leaflet_residues(opsd, opsd_tms, true) - ce = reduce(vcat, [coordinatesmatrix(r) for r in leaflet_e]) - ci = reduce(vcat, [coordinatesmatrix(r) for r in leaflet_i]) - ze = median(ce[:,3]) - zi = median(ci[:,3]) + ce = reduce(hcat, [coordarray(r) for r in leaflet_e]) + ci = reduce(hcat, [coordarray(r) for r in leaflet_i]) + ze = median(ce[3,:]) + zi = median(ci[3,:]) @test ze > zi # extracellular is positive @test abs(ze + zi) < 1e-6 * (ze - zi) # center of membrane is at z=0 @@ -277,7 +278,7 @@ using Test @test all(abs.(wF*ones(3)) .< 1e-7) # gradient is zero at the minimum even under re-tuning interactions = [(:Steric, :Steric) => 1, (:Hydrophobe, :Hydrophobe) => -1, (:Donor, :Acceptor) => -1] # drop the ionic - opsd = read_file("AF-P15409-F1-model_v4.pdb", PDBFile) + opsd = getchain("AF-P15409-F1-model_v4.pdb") opsd_tms = [37:61, 74:96, 111:133, 153:173, 203:224, 253:274, 287:308] tm_res = inward_tm_residues(opsd, opsd_tms) tm_idxs = reduce(vcat, [tm[idx] for (tm, idx) in zip(opsd_tms, tm_res)]) @@ -328,7 +329,7 @@ using Test absfn = joinpath(dir, fn) @test try_download_alphafold(uname, absfn; version=v) == absfn @test isfile(absfn) - @test getchain(absfn) isa AbstractVector{MIToS.PDB.PDBResidue} + @test getchain(absfn) isa GPCRAnalysis.ChainLike @test only(alphafoldfiles(dir)) == fn end end @@ -340,7 +341,7 @@ using Test if !isfile(pfampath) Pfam.downloadpfam("PF03402"; filename=pfampath) end - msa = MIToS.MSA.read_file(pfampath, MSA.Stockholm, generatemapping=true, useidcoordinates=true) + msa = MSA.read_file(pfampath, MSA.Stockholm, generatemapping=true, useidcoordinates=true) filter_species!(msa, "MOUSE") # Make small enough for a decent download test filtersequences!(msa, coverage(msa) .>= 0.9) @@ -364,18 +365,19 @@ using Test @test @inferred(pLDDTcolor(first(c1))) isa AbstractRGB conserved_residues = c1[SequenceMapping(getsequencemapping(msa, 1))[conserved_cols]] badidx = findall(==(nothing), conserved_residues) - conserved_residues = convert(Vector{PDBResidue}, conserved_residues[Not(badidx)]) + conserved_residues = convert(Vector{Residue}, conserved_residues[Not(badidx)]) conserved_cols = conserved_cols[Not(badidx)] sm = SequenceMapping(getsequencemapping(msa, 2)) - c2a = align(conserved_residues, c2, sm[conserved_cols]) - @test rmsd(conserved_residues, c2a[sm[conserved_cols]]; superimposed=true) < rmsd(conserved_residues, c2[sm[conserved_cols]]; superimposed=true) + c2a = applytransform!(copy(c2), align(conserved_residues, c2, sm[conserved_cols])) + @test rmsd(conserved_residues, skipnothing(c2a[sm[conserved_cols]]); superimpose=false) < + rmsd(conserved_residues, skipnothing(c2[sm[conserved_cols]]); superimpose=false) mc = mapclosest(c1, c2a) cloc = chargelocations(c1) lpos = positive_locations(cloc) - @test !isempty(lpos) && all(item -> isa(item, Coordinates), lpos) + @test !isempty(lpos) && all(item -> isa(item, AbstractVector), lpos) lneg = negative_locations(cloc) - @test !isempty(lneg) && all(item -> isa(item, Coordinates), lneg) + @test !isempty(lneg) && all(item -> isa(item, AbstractVector), lneg) @test isempty(lpos ∩ lneg) # Choose a sufficiently-divergent pair that structural alignment is nontrivial @@ -401,7 +403,7 @@ using Test smref = smref[keep] conserved_residues = cref[smref] smcmp = SequenceMapping(getsequencemapping(msa, idxcmp)) - ccmpa = align(conserved_residues, ccmp, smcmp[conserved_cols]) + ccmpa = applytransform!(copy(ccmp), align(conserved_residues, ccmp, smcmp[conserved_cols])) mc = mapclosest(cref, ccmpa) idxclose = first.(filter(item -> item[2] < 5.0, mc)) # TMAlign scores those closer than 5Å as a match n = 0; for i in Iterators.drop(eachindex(idxclose), 1) @@ -425,5 +427,3 @@ using Test end end end - -include("deprecated.jl") From 32765a806bfcd78ba452231a98c2287cf0426498 Mon Sep 17 00:00:00 2001 From: Tim Holy Date: Thu, 22 Aug 2024 06:37:14 -0500 Subject: [PATCH 2/2] Add `align_closest` and rename `map_closest` --- src/GPCRAnalysis.jl | 4 ++-- src/align.jl | 45 ++++++++++++++++++++++++++++++++++++++++++++- test/runtests.jl | 5 +++-- 3 files changed, 49 insertions(+), 5 deletions(-) diff --git a/src/GPCRAnalysis.jl b/src/GPCRAnalysis.jl index fab9be1..45b9be6 100644 --- a/src/GPCRAnalysis.jl +++ b/src/GPCRAnalysis.jl @@ -45,10 +45,10 @@ export SequenceMapping, AccessionCode, MSACode, NWGapCosts export species, uniprotX, query_uniprot_accession, query_ebi_proteins, query_ncbi export try_download_alphafold, query_alphafold_latest, download_alphafolds, alphafoldfile, alphafoldfiles, getchain, findall_subseq, pLDDT, pLDDTcolor -export align_to_axes, align_to_membrane, align_nw, align_ranges +export align_to_axes, align_to_membrane, align_nw, align_ranges, map_closest, align_closest export filter_species!, filter_long!, sortperm_msa, chimerax_script export project_sequences, columnwise_entropy, align, residue_centroid, residue_centroid_matrix, alphacarbon_coordinates, - alphacarbon_coordinates_matrix, mapclosest, chargelocations, positive_locations, negative_locations + alphacarbon_coordinates_matrix, chargelocations, positive_locations, negative_locations export StructAlign, residueindex, ismapped, skipnothing export BWScheme, lookupbw export aa_properties, aa_properties_zscored diff --git a/src/align.jl b/src/align.jl index ed099f3..327f25f 100644 --- a/src/align.jl +++ b/src/align.jl @@ -23,15 +23,58 @@ function align(fixedpos::AbstractMatrix{Float64}, moving::StructureLike, sm::Seq end align(fixed, moving::StructureLike, sm::SequenceMapping; kwargs...) = align(residue_centroid_matrix(fixed), moving, sm; kwargs...) -function mapclosest(mapto, mapfrom) +""" + mapping = map_closest(mapto::StructureLike, mapfrom::StructureLike) + +Return a vector `mapping[i] = (j, distij)`, matching the `i`th residue +in `mapto` to the `j`th residue in `mapfrom` and reporting the +distance between them. The mapping minimizes the sum of distances. +`mapto` and `mapfrom` must already be aligned for this to be meaningful. + +If `mapfrom` is shorter than `mapto`, some `j`s will be 0, indicating +a skipped residue in `mapto`. +""" +function map_closest(mapto::StructureLike, mapfrom::StructureLike) refcenters = residue_centroid_matrix(mapto) seqcenters = residue_centroid_matrix(mapfrom) + return map_closest(refcenters, seqcenters) +end + +function map_closest(refcenters::AbstractMatrix, seqcenters::AbstractMatrix) D = pairwise(Euclidean(), refcenters, seqcenters; dims=2) assignment, _ = hungarian(D) fillval = convert(eltype(D), NaN) return collect(zip(assignment, [j == 0 ? fillval : D[i, j] for (i, j) in enumerate(assignment)])) end +""" + tform = align_closest(mapto::StructLike, mapfrom::StructLike; Dthresh=5) + tform = align_closest(coordsto, coordsfrom; Dthresh=5) + +Return the rigid transformation best aligning `mapfrom` to `mapto`. The transformation is computed from +residues matched by `map_closest`, using only residues closer than `Dthresh`. + +Because the mapping is determined by distance, this can only "tweak" an already-close alignment. +""" +align_closest(mapto, mapfrom; kwargs...) = align_closest(residue_centroid_matrix(mapto), mapfrom; kwargs...) +align_closest(coordsto::AbstractMatrix, mapfrom; kwargs...) = align_closest(coordsto, residue_centroid_matrix(mapfrom); kwargs...) +align_closest(coordsto::AbstractMatrix, coordsfrom::AbstractMatrix; kwargs...) = + align_closest(coordsto, coordsfrom, map_closest(coordsto, coordsfrom); kwargs...) + +align_closest(mapto, mapfrom, mapping; kwargs...) = align_closest(residue_centroid_matrix(mapto), mapfrom, mapping; kwargs...) +align_closest(coordsto::AbstractMatrix, mapfrom, mapping; kwargs...) = + align_closest(coordsto, residue_centroid_matrix(mapfrom), mapping; kwargs...) +function align_closest(coordsto::AbstractMatrix, coordsfrom::AbstractMatrix, mapping; Dthresh=5) + toidx, fromidx = Int[], Int[] + for (i, (j, d)) in pairs(mapping) + j == 0 && continue + d > Dthresh && continue + push!(toidx, i) + push!(fromidx, j) + end + return Transformation(coordsto[:,toidx], coordsfrom[:,fromidx], toidx, fromidx) +end + ## Align to membrane """ diff --git a/test/runtests.jl b/test/runtests.jl index b697607..1db1fa7 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -371,7 +371,8 @@ using Test c2a = applytransform!(copy(c2), align(conserved_residues, c2, sm[conserved_cols])) @test rmsd(conserved_residues, skipnothing(c2a[sm[conserved_cols]]); superimpose=false) < rmsd(conserved_residues, skipnothing(c2[sm[conserved_cols]]); superimpose=false) - mc = mapclosest(c1, c2a) + mc = map_closest(c1, c2a) + @test align_closest(c1, c2a) isa Transformation cloc = chargelocations(c1) lpos = positive_locations(cloc) @@ -404,7 +405,7 @@ using Test conserved_residues = cref[smref] smcmp = SequenceMapping(getsequencemapping(msa, idxcmp)) ccmpa = applytransform!(copy(ccmp), align(conserved_residues, ccmp, smcmp[conserved_cols])) - mc = mapclosest(cref, ccmpa) + mc = map_closest(cref, ccmpa) idxclose = first.(filter(item -> item[2] < 5.0, mc)) # TMAlign scores those closer than 5Å as a match n = 0; for i in Iterators.drop(eachindex(idxclose), 1) n += idxclose[i] < idxclose[i-1]