HolyLab · timholy · Aug 22, 2024 · Aug 21, 2024 · Aug 22, 2024
diff --git a/NEWS.md b/NEWS.md
@@ -0,0 +1,7 @@
+# v0.5.0
+
+Breaking changes:
+- BioStructures and MIToS implement different parsers for PDBFiles and different representations of structures. As of BioStructures v4, there is momentum for standardizing the Julia ecosystem around BioStructures. GPCRAnalysis version 0.5.0 switches from MIToS to Biostructures. `getchain` now returns the BioStructures representation, which requires users to switch to their API for further manipulations.
+
+  MIToS is still used for its Multiple Sequence Alignment (MSA) module.
+- The `align*` utilities now return a transformation, rather than the transformed structure. Users should use `applytransform!(chain, tform)` or `applytransform!(copy(chain), tform)` manually. An advantage of this approach is that it can also be used to align complexes based on a superposition computed from alignment of individual chains.
diff --git a/Project.toml b/Project.toml
@@ -1,11 +1,11 @@
 name = "GPCRAnalysis"
 uuid = "c1d73f9e-d42a-418a-8d5b-c7b00ec0358f"
 authors = ["Tim Holy <[email protected]> and contributors"]
-version = "0.4.3"
+version = "0.5.0"
 
 [deps]
+BioStructures = "de9282ab-8554-53be-b2d6-f6c222edabfc"
 ColorTypes = "3da002f7-5984-5a60-b8a6-cbb66c0b333f"
-CoordinateTransformations = "150eb455-5306-5404-9cee-2592286d6298"
 Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
 Downloads = "f43a241f-c20a-4ad4-852c-f6b1247861c6"
 FixedPointNumbers = "53c48c17-4a7d-5ca2-90c5-79b7896eea93"
@@ -34,8 +34,8 @@ JuMP = "4076af6c-e467-56ae-b986-b466b2749572"
 GPCRAnalysisJuMPExt = ["JuMP", "HiGHS"]
 
 [compat]
+BioStructures = "4.2"
 ColorTypes = "0.11"
-CoordinateTransformations = "0.6"
 Distances = "0.10"
 Downloads = "1"
 FixedPointNumbers = "0.8"

diff --git a/README.md b/README.md
@@ -5,4 +5,6 @@
 [![Coverage](https://codecov.io/gh/HolyLab/GPCRAnalysis.jl/branch/main/graph/badge.svg)](https://codecov.io/gh/HolyLab/GPCRAnalysis.jl)
 
 A suite for analyzing G-protein coupled receptors. Much of this is just a convenience wrapper and/or extension of
-[MIToS](https://github.com/diegozea/MIToS.jl), and some of this code may move there or elsewhere eventually.
+[BioStructures](https://github.com/BioJulia/BioStructures.jl) and [MIToS](https://github.com/diegozea/MIToS.jl).
+
+See NEWS.md for major changes.
diff --git a/src/GPCRAnalysis.jl b/src/GPCRAnalysis.jl
@@ -4,10 +4,14 @@ using Downloads
 using Statistics
 using LinearAlgebra
 
-using MIToS
-using MIToS.Pfam
-using MIToS.MSA
-using MIToS.PDB
+using BioStructures
+
+using MIToS: MIToS, Pfam, MSA
+using MIToS.MSA: AbstractMultipleSequenceAlignment, AnnotatedAlignedSequence, AnnotatedMultipleSequenceAlignment,
+                 ReducedAlphabet, GAP, XAA
+using MIToS.MSA: getsequence, getannotsequence, getsequencemapping, getresidues, three2residue, sequencenames,
+                 filtersequences!, percentsimilarity
+using MIToS.PDB: vanderwaalsradius, ishydrophobic, isaromatic, iscationic, isanionic, ishbonddonor, ishbondacceptor
 
 using MultivariateStats
 using Distances
@@ -20,7 +24,6 @@ using TravelingSalesmanHeuristics
 
 using Interpolations
 using MutableConvexHulls
-using CoordinateTransformations
 using GaussianMixtureAlignment
 
 # For querying the Uniprot and Alphafold REST APIs
@@ -32,16 +35,21 @@ using GZip
 using FixedPointNumbers
 using ColorTypes
 
-export @res_str
+const ChainLike = Union{Chain, AbstractVector{<:AbstractResidue}}   # an iterable of residues
+const ResidueLike = Union{Residue, AbstractVector{<:AbstractAtom}}  # an iterable of atoms
+const StructureLike = Union{ChainLike, Model, MolecularStructure}
+
+# export @res_str
 
 export SequenceMapping, AccessionCode, MSACode, NWGapCosts
 export species, uniprotX, query_uniprot_accession, query_ebi_proteins, query_ncbi
 export try_download_alphafold, query_alphafold_latest, download_alphafolds, alphafoldfile, alphafoldfiles, getchain,
        findall_subseq, pLDDT, pLDDTcolor
-export align_to_axes, align_to_membrane, align_nw, align_ranges
+export align_to_axes, align_to_membrane, align_nw, align_ranges, map_closest, align_closest
 export filter_species!, filter_long!, sortperm_msa, chimerax_script
-export project_sequences, columnwise_entropy, align, residue_centroid, residue_centroid_matrix, alphacarbon_coordinates, alphacarbon_coordinates_matrix, mapclosest, chargelocations, positive_locations, negative_locations
-export StructAlign, residueindex, ismapped
+export project_sequences, columnwise_entropy, align, residue_centroid, residue_centroid_matrix, alphacarbon_coordinates,
+       alphacarbon_coordinates_matrix, chargelocations, positive_locations, negative_locations
+export StructAlign, residueindex, ismapped, skipnothing
 export BWScheme, lookupbw
 export aa_properties, aa_properties_zscored
 export sidechaincentroid, scvector, inward_tm_residues, inward_ecl_residues
@@ -61,6 +69,5 @@ include("chimerax.jl")
 include("pocket.jl")
 include("features.jl")
 include("forces.jl")
-include("deprecated.jl")
 
 end
diff --git a/src/align.jl b/src/align.jl
@@ -1,13 +1,16 @@
 ## Minimum-distance alignment
 
 """
-    align(fixedpos::AbstractMatrix{Float64}, moving::AbstractVector{PDBResidue}, sm::SequenceMapping)
-    align(fixed::AbstractVector{PDBResidue}, moving::AbstractVector{PDBResidue}, sm::SequenceMapping)
-
-Return a rotated and shifted version of `moving` so that the α-carbons of residues `moving[sm]` have least
-mean square error deviation from positions `fixedpos` or those of residues `fixed`.
+    tform = align(fixedpos::AbstractMatrix{Float64}, moving::Chain, sm::SequenceMapping)
+    tform = align(fixed::Chain, moving::Chain, sm::SequenceMapping)
+
+Return a rotated and shifted version of `moving` so that the centroids of
+residues `moving[sm]` have least mean square error deviation from positions
+`fixedpos` or those of residues `fixed`. `fixed` or `fixedpos` should include
+just the residues or positions you want to align to, but `moving` should be an
+entire chain.
 """
-function align(fixedpos::AbstractMatrix{Float64}, moving::AbstractVector{PDBResidue}, sm::SequenceMapping; seqname=nothing, warn::Bool=true)
+function align(fixedpos::AbstractMatrix{Float64}, moving::StructureLike, sm::SequenceMapping; seqname=nothing, warn::Bool=true)
     length(sm) == size(fixedpos, 2) || throw(DimensionMismatch("reference has $(size(fixedpos, 2)) positions, but `sm` has $(length(sm))"))
     movres = moving[sm]
     keep = map(!isnothing, movres)
@@ -16,50 +19,90 @@ function align(fixedpos::AbstractMatrix{Float64}, moving::AbstractVector{PDBResi
     end
     movpos = residue_centroid_matrix(movres[keep])
     fixedpos = fixedpos[:,keep]
-    refmean = mean(fixedpos; dims=2)
-    movmean = mean(movpos; dims=2)
-    R = MIToS.PDB.kabsch((fixedpos .- refmean)', (movpos .- movmean)')
-    return change_coordinates(moving, (coordinatesmatrix(moving) .- movmean') * R .+ refmean')
+    return Transformation(movpos, fixedpos, sm[keep], findall(keep))
 end
-align(fixed::AbstractVector{PDBResidue}, moving::AbstractVector{PDBResidue}, sm::SequenceMapping; kwargs...) =
-    align(residue_centroid_matrix(fixed), moving, sm; kwargs...)
+align(fixed, moving::StructureLike, sm::SequenceMapping; kwargs...) = align(residue_centroid_matrix(fixed), moving, sm; kwargs...)
+
+"""
+    mapping = map_closest(mapto::StructureLike, mapfrom::StructureLike)
 
-function mapclosest(mapto::AbstractVector{PDBResidue}, mapfrom::AbstractVector{PDBResidue})
+Return a vector `mapping[i] = (j, distij)`, matching the `i`th residue
+in `mapto` to the `j`th residue in `mapfrom` and reporting the
+distance between them. The mapping minimizes the sum of distances.
+`mapto` and `mapfrom` must already be aligned for this to be meaningful.
+
+If `mapfrom` is shorter than `mapto`, some `j`s will be 0, indicating
+a skipped residue in `mapto`.
+"""
+function map_closest(mapto::StructureLike, mapfrom::StructureLike)
     refcenters = residue_centroid_matrix(mapto)
     seqcenters = residue_centroid_matrix(mapfrom)
+    return map_closest(refcenters, seqcenters)
+end
+
+function map_closest(refcenters::AbstractMatrix, seqcenters::AbstractMatrix)
     D = pairwise(Euclidean(), refcenters, seqcenters; dims=2)
     assignment, _ = hungarian(D)
-    return collect(zip(assignment, [j == 0 ? convert(eltype(D), NaN) : D[i, j] for (i, j) in enumerate(assignment)]))
-    # d, m = findmin(D; dims=1)
-    # return [mi[1] => di for (mi, di) in zip(vec(m), vec(d))]
+    fillval = convert(eltype(D), NaN)
+    return collect(zip(assignment, [j == 0 ? fillval : D[i, j] for (i, j) in enumerate(assignment)]))
+end
+
+"""
+    tform = align_closest(mapto::StructLike, mapfrom::StructLike; Dthresh=5)
+    tform = align_closest(coordsto, coordsfrom; Dthresh=5)
+
+Return the rigid transformation best aligning `mapfrom` to `mapto`. The transformation is computed from
+residues matched by `map_closest`, using only residues closer than `Dthresh`.
+
+Because the mapping is determined by distance, this can only "tweak" an already-close alignment.
+"""
+align_closest(mapto, mapfrom; kwargs...) = align_closest(residue_centroid_matrix(mapto), mapfrom; kwargs...)
+align_closest(coordsto::AbstractMatrix, mapfrom; kwargs...) = align_closest(coordsto, residue_centroid_matrix(mapfrom); kwargs...)
+align_closest(coordsto::AbstractMatrix, coordsfrom::AbstractMatrix; kwargs...) =
+    align_closest(coordsto, coordsfrom, map_closest(coordsto, coordsfrom); kwargs...)
+
+align_closest(mapto, mapfrom, mapping; kwargs...) = align_closest(residue_centroid_matrix(mapto), mapfrom, mapping; kwargs...)
+align_closest(coordsto::AbstractMatrix, mapfrom, mapping; kwargs...) =
+    align_closest(coordsto, residue_centroid_matrix(mapfrom), mapping; kwargs...)
+function align_closest(coordsto::AbstractMatrix, coordsfrom::AbstractMatrix, mapping; Dthresh=5)
+    toidx, fromidx = Int[], Int[]
+    for (i, (j, d)) in pairs(mapping)
+        j == 0 && continue
+        d > Dthresh && continue
+        push!(toidx, i)
+        push!(fromidx, j)
+    end
+    return Transformation(coordsto[:,toidx], coordsfrom[:,fromidx], toidx, fromidx)
 end
 
 ## Align to membrane
 
 """
-    newchain = align_to_membrane(chain::AbstractVector{PDBResidue}, tms; extracellular=true)
+    tform = align_to_membrane(chain::ChainLike, tms; extracellular=true)
 
-Align the `chain` to the membrane, given the transmembrane segments `tms` as
-residue-indexes (e.g., `[37:61, 74:96, ...]`). `extracelluar` should be true if
-the N-terminus of the protein is extracellular (`chain[first(tms[1])]` is at the
-extracellular face), and false otherwise.
+Compute the rigid transformation `tform` needed to align `chain` to the
+membrane, given the transmembrane segments `tms` as residue-indexes (e.g.,
+`[37:61, 74:96, ...]`). `extracelluar` should be true if the N-terminus of the
+protein is extracellular (`chain[first(tms[1])]` is at the extracellular face),
+and false otherwise.
 
-`newchain` is oriented so that the center of the membrane is `z=0` and
-extracellular is positive.
+`applytransform!(chain, tform)` (or the `model` that includes `chain`) will
+re-orient `chain` so that the center of the membrane is `z=0` and extracellular
+is positive.
 
 The algorithm finds the membrane normal `u` by maximizing the ratio
 
     sumᵢ (u ⋅ vᵢ)²
   -----------------
     sumᵢ (u ⋅ δᵢ)²
 
-where `vᵢ` is a vector parallel to the `i`th TM helix, and `δᵢ` is a within-leaflet
-atomic displacement.
+where `vᵢ` is a vector parallel to the `i`th TM helix, and `δᵢ` is a
+within-leaflet atomic displacement.
 """
-function align_to_membrane(chain::AbstractVector{PDBResidue}, tms; extracellular::Bool=true)
+function align_to_membrane(chain::ChainLike, tms; extracellular::Bool=true)
     function leaflet_displacements(residues)
-        coords = reduce(vcat, [coordinatesmatrix(r) for r in residues])
-        meanpos = mean(coords, dims=1)
+        coords = reduce(hcat, [coordarray(r) for r in residues])
+        meanpos = mean(coords, dims=2)
         return coords .- meanpos
     end
 
@@ -79,33 +122,38 @@ function align_to_membrane(chain::AbstractVector{PDBResidue}, tms; extracellular
     # eigenvector corresponding to the smallest eigenvalue of the matrix.
     μtm = mean(tm_vectors)
     A = sum(u -> (Δu = u - μtm; Δu * Δu'), tm_vectors)
-    B = delta_i'*delta_i + delta_e'*delta_e
+    B = delta_i*delta_i' + delta_e*delta_e'
     λ, u = eigen(A, B)
     u = u[:, argmax(λ)]
     u = u / norm(u)
     # Find the projection by `u` of atom coordinates in each leaflet
-    proj_e = reduce(vcat, [coordinatesmatrix(r) for r in leaflet_e]) * u
-    proj_i = reduce(vcat, [coordinatesmatrix(r) for r in leaflet_i]) * u
+    proj_e = reduce(hcat, [coordarray(r) for r in leaflet_e])' * u
+    proj_i = reduce(hcat, [coordarray(r) for r in leaflet_i])' * u
     # Compute a rotation that aligns u to the z-axis
     t = [0, 0, mean(proj_e) > mean(proj_i) ? 1 : -1]   # extracellular is positive
     q = QuatRotation(1 + u'*t, cross(u, t)...)
     # Apply the rotation to the coordinates, and offset the z-coordinate to the mean of the leaflets
-    coords = coordinatesmatrix(chain)
-    μ = vec(mean(coords, dims=1))
-    newcoords = (q * (coords' .- μ))' .+ [0, 0, (median(proj_e) + median(proj_i))/2 - μ'*u]'
-    return change_coordinates(chain, newcoords)
+    ccoords = coordarray(chain)
+    μ = vec(mean(ccoords, dims=2))
+    return Transformation(μ, [0, 0, (median(proj_e) + median(proj_i))/2 - μ'*u], RotMatrix(q))
 end
 
+"""
+    tform = align_to_axes(strct)
+
+Compute the transformation needed to align the principle axes of inertia of
+`strct` with the coordinate axes.
+"""
 function align_to_axes(strct)
-    coords = coordinatesmatrix(strct)
-    coords = coords .- mean(coords, dims=1)
-    tform = GaussianMixtureAlignment.inertial_transforms(coords')[1]
-    tform = AffineMap(Matrix(tform.linear),[tform.translation...])
-    return change_coordinates(strct, tform(coords')')
+    scoords = coordarray(strct)
+    μ = mean(scoords, dims=2)
+    scoords = scoords .- μ
+    rotmtrx = GaussianMixtureAlignment.inertial_transforms(scoords)[1]
+    return Transformation(μ, [0, 0, 0], rotmtrx)
 end
 
-function collect_leaflet_residues(chain, tms, extracellular)
-    leaflet_e, leaflet_i = PDBResidue[], PDBResidue[]
+function collect_leaflet_residues(chain, tms, extracellular::Bool)
+    leaflet_e, leaflet_i = Residue[], Residue[]
     for r in tms
         start, stop = extrema(r)
         if extracellular
@@ -122,6 +170,11 @@ end
 
 ## Needleman-Wunsch alignment
 
+# This implements sequence alignment based on three-dimensional structure, where
+# the cost of aligning two residues is the Euclidean distance between their
+# centroids. The Needleman-Wunsch algorithm is used to find the optimal alignment
+# of two sequences.
+
 @enum NWParent NONE DIAG LEFT UP
 Base.show(io::IO, p::NWParent) = print(io, p == NONE ? 'X' :
                                            p == DIAG ? '↖' :
@@ -203,8 +256,11 @@ Find the optimal `ϕ` matching `seq1[ϕ[k][1]]` to `seq2[ϕ[k][2]]` for all
 `k`. `mode` controls the computation of pairwise matching penalties, and can be
 either `:distance` or `:distance_orientation`, where the latter adds any
 mismatch in sidechain orientation to the distance penalty.
+
+`seq1` and `seq2` must be aligned to each other in 3D space before calling this
+function. See [`align`](@ref).
 """
-function align_nw(seq1::AbstractVector{PDBResidue}, seq2::AbstractVector{PDBResidue}, gapcosts::NWGapCosts;
+function align_nw(seq1::ChainLike, seq2::ChainLike, gapcosts::NWGapCosts;
                   mode::Symbol=:distance_orientation)
     supported_modes = (:distance, :distance_orientation)
     mode ∈ supported_modes || throw(ArgumentError("supported modes are $(supported_modes), got $mode"))
@@ -227,7 +283,7 @@ Transfer `refranges`, a list of reside index spans in `seq2`, to `seq1`. `seq1`
 `seq2` must be spatially aligned, and the assignment is made by minimizing
 inter-chain distance subject to the constraint of preserving sequence order.
 """
-function align_ranges(seq1::AbstractVector{PDBResidue}, seq2::AbstractVector{PDBResidue}, refranges::AbstractVector{<:AbstractUnitRange}; kwargs...)
+function align_ranges(seq1::ChainLike, seq2::AbstractVector{<:AbstractResidue}, refranges::AbstractVector{<:AbstractUnitRange}; kwargs...)
     anchoridxs = sizehint!(Int[], length(refranges)*2)
     for r in refranges
         push!(anchoridxs, first(r), last(r))
@@ -237,6 +293,8 @@ function align_ranges(seq1::AbstractVector{PDBResidue}, seq2::AbstractVector{PDB
     @assert last.(ϕ) == eachindex(anchoridxs)
     return [ϕ[i][1]:ϕ[i+1][1] for i in 1:2:length(ϕ)]
 end
+align_ranges(seq1::ChainLike, seq2::Chain, refranges::AbstractVector{<:AbstractUnitRange}; kwargs...) =
+    align_ranges(seq1, collectresidues(seq2), refranges; kwargs...)
 
 function score_nw(D::AbstractMatrix, gapcosts::NWGapCosts)
     Base.require_one_based_indexing(D)

diff --git a/src/alphafold.jl b/src/alphafold.jl
@@ -89,7 +89,7 @@ end
 
 Attempt to download an [AlphaFold](https://alphafold.com/) structure.
 Returns `nothing` if no entry corresponding to `uniprotXname` exists; otherwise
-it return `path`, the pathname of the saved file.
+it returns `path`, the pathname of the saved file.
 
 In general, a better approach is to use [`download_alphafolds`](@ref) for multiple proteins,
 or [`query_alphafold_latest`](@ref) combined with `Downloads.download` for a single protein.
@@ -154,13 +154,13 @@ function pLDDTcolor(pLDDT::Real)
 end
 
 """
-    pLDDTcolor(r::PDBResidue)
+    pLDDTcolor(r::Residue)
     pLDDTcolor(score::Real)
 
 Return the color corresponding to the pLDDT score (a measure of confidence) of a residue.
 """
-pLDDTcolor(r::PDBResidue) = pLDDTcolor(pLDDT(r))
-pLDDTcolor(a::PDBAtom) = pLDDTcolor(pLDDT(a))
+pLDDTcolor(r::Residue) = pLDDTcolor(pLDDT(r))
+pLDDTcolor(a::Atom) = pLDDTcolor(pLDDT(a))
 
-pLDDT(r::PDBResidue) = pLDDT(only(unique(a -> a.B, r.atoms)))
-pLDDT(a::PDBAtom) = parse(Float64, a.B)
+pLDDT(r::Residue) = only(unique(pLDDT, r))
+pLDDT(a::Atom) = tempfactor(a)