JuliaMolSim · mfherbst · Jun 5, 2024 · May 24, 2024 · May 24, 2024 · May 24, 2024
diff --git a/Project.toml b/Project.toml
@@ -24,7 +24,6 @@ Libxc = "66e17ffc-8502-11e9-23b5-c9248d0eb96d"
 LineSearches = "d3d80556-e9d4-5f37-9878-2ab0fcc64255"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 LinearMaps = "7a12625a-238d-50fd-b39a-03d52299707e"
-LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890"
 MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195"
 Markdown = "d6f4376e-aef5-505a-96c1-9c027394607a"
 Optim = "429524aa-4258-5aef-a3af-852621145aeb"
@@ -95,7 +94,6 @@ Libxc = "0.3.17"
 LineSearches = "7"
 LinearAlgebra = "1"
 LinearMaps = "3"
-LoopVectorization = "0.12"
 MPI = "0.20.13"
 Markdown = "1"
 Optim = "1"

diff --git a/docs/src/guide/self_consistent_field.jl b/docs/src/guide/self_consistent_field.jl
@@ -414,7 +414,7 @@ plot!(p, x -> ε(χ0_SiO2, x),  label="silica (SiO₂)", ls=:dashdot)
 # metal, such that `KerkerMixing` is indeed appropriate. We will thus employ it as
 # the preconditioner $P$ in the setting
 # ```math
-# rho_{n+1} = \rho_n + \alpha P^{-1} (D(V(\rho_n)) - \rho_n),
+# \rho_{n+1} = \rho_n + \alpha P^{-1} (D(V(\rho_n)) - \rho_n),
 # ```
 # In DFTK this is done by running an SCF as follows:
 # ```julia

diff --git a/docs/src/publications.md b/docs/src/publications.md
@@ -21,8 +21,8 @@ The current DFTK reference paper to cite is
 Additionally the following publications describe DFTK or one of its algorithms:
 
 - E. Cancès, M. Hassan and L. Vidal.
-  [*Modified-Operator Method for the Calculation of Band Diagrams of Crystalline Materials.*](https://arxiv.org/abs/2210.00442)
-  Mathematics of Computation (2023).
+  [*Modified-Operator Method for the Calculation of Band Diagrams of Crystalline Materials.*](https://doi.org/10.1090/mcom/3897)
+  Math. Comp. **93**, 1203 (2024).
   [ArXiv:2210.00442](https://arxiv.org/abs/2210.00442).
   ([Supplementary material and computational scripts](https://github.com/LaurentVidal95/ModifiedOp).
 
@@ -54,11 +54,16 @@ The following publications report research employing DFTK as a core component.
 Feel free to drop us a line if you want your work to be added here.
 
 - J. Cazalis.
-  [*Dirac cones for a mean-field model of graphene*](https://arxiv.org/abs/2207.09893)
-  (Submitted).
+  [*Dirac cones for a mean-field model of graphene*](https//doi.org/10.2140/paa.2024.6.129)
+  Pure and Appl. Anal., **6**, 1 (2024).
   [ArXiv:2207.09893](https://arxiv.org/abs/2207.09893).
   ([Computational script](https://github.com/JuliaMolSim/DFTK.jl/blob/f7fcc31c79436b2582ac1604d4ed8ac51a6fd3c8/examples/publications/2022_cazalis.jl)).
 
+- E. Cancès, G. Kemlin, A. Levitt.
+  [*A Priori Error Analysis of Linear and Nonlinear Periodic Schr\"{o}dinger Equations with Analytic Potentials*](https://doi.org/10.1007/s10915-023-02421-0)
+  J. Sci. Comp., **98**, 25 (2024).
+  [ArXiv:2206.04954](https://arxiv.org/abs/2206.04954).
+
 - E. Cancès, L. Garrigue, D. Gontier.
   [*A simple derivation of moiré-scale continuous models for twisted bilayer graphene*](https://doi.org/10.1103/PhysRevB.107.155403)
   Physical Review B, **107**, 155403 (2023).

diff --git a/src/common/hankel.jl b/src/common/hankel.jl
@@ -10,6 +10,8 @@ as input the radial grid `r`, the precomputed quantity r²f(r) `r2_f`, angular
 momentum / spherical bessel order `l`, and the Hankel coordinate `p`.
 """
 function hankel(r::AbstractVector, r2_f::AbstractVector, l::Integer, p::T)::T where {T<:Real}
-    integrand = r2_f .* sphericalbesselj_fast.(l, p .* r)
-    return 4T(π) * simpson(r, integrand)
+    @assert length(r) == length(r2_f)
+    4T(π) * simpson(r, r2_f) do i, ri, r2_f
+        r2_f[i] * sphericalbesselj_fast(l, p * ri)
+    end
 end
diff --git a/src/common/quadrature.jl b/src/common/quadrature.jl
@@ -1,5 +1,3 @@
-using LoopVectorization
-
 # NumericalIntegration.jl also implements this (in a slightly different way)
 # however, it is unmaintained and has stale, conflicting version requirements
 # for Interpolations.jl
@@ -9,17 +7,20 @@ using LoopVectorization
 Integrate y(x) over x using trapezoidal method quadrature.
 """
 trapezoidal
-@inbounds function trapezoidal(x::AbstractVector, y::AbstractVector)
+@inbounds function trapezoidal(integrand, x::AbstractVector, args...)
     n = length(x)
-    n == length(y) || error("vectors `x` and `y` must have the same number of elements")
-    n == 1 && return zero(promote_type(eltype(x), eltype(y)))
-    I = (x[2] - x[1]) * y[1]
-    @turbo for i = 2:(n-1)
-        # dx[i] + dx[i - 1] = (x[i + 1] - x[i]) + (x[i] - x[i - 1])
-        #                   = x[i + 1] - x[i - 1]
-        I += (x[i + 1] - x[i - 1]) * y[i]
+    Tint = eltype(integrand(1, x[1], args...))
+    n == 1 && return zero(promote_type(eltype(x), Tint))
+    I = (x[2] - x[1]) * integrand(1, x[1], args...)
+    # Note: We used @turbo here before, but actually the allocation overhead
+    #       needed to get all the data into an array is worse than what one gains
+    #       with LoopVectorization
+    @fastmath for i = 2:(n-1)
+        # dx[i] + dx[i-1] = (x[i+1] - x[i]) + (x[i] - x[i-1])
+        #                 = x[i+1] - x[i-1]
+        I += (x[i+1] - x[i-1]) * integrand(i, x[i], args...)
     end
-    I += (x[n] - x[n - 1]) * y[n]
+    I += (x[n] - x[n-1]) * integrand(n, x[n], args...)
     I / 2
 end
 
@@ -29,62 +30,71 @@ end
 Integrate y(x) over x using Simpson's method quadrature.
 """
 simpson
-@inbounds function simpson(x::AbstractVector, y::AbstractVector)
+@inbounds function simpson(integrand, x::AbstractVector, args...)
     n = length(x)
-    n == length(y) || error("vectors `x` and `y` must have the same number of elements")
-    n == 1 && return zero(promote_type(eltype(x), eltype(y)))
-    n <= 4 && return trapezoidal(x, y)
-    (x[2] - x[1]) ≈ (x[3] - x[2]) && return simpson_uniform(x, y)
-    return simpson_nonuniform(x, y)
+    n <= 4 && return trapezoidal(integrand, x, args...)
+    if (x[2] - x[1]) ≈ (x[3] - x[2])
+        simpson_uniform(integrand, x, args...)
+    else
+        simpson_nonuniform(integrand, x, args...)
+    end
 end
 
-@inbounds function simpson_uniform(x::AbstractVector, y::AbstractVector)
+@inbounds function simpson_uniform(integrand, x::AbstractVector, args...)
     dx = x[2] - x[1]
     n = length(x)
     n_intervals = n - 1
 
     istop = isodd(n_intervals) ? n - 1 : n - 2
 
-    I = 1 / 3 * dx * y[1]
-    @turbo for i = 2:2:istop
-        I += 4 / 3 * dx * y[i]
+    I = 1 / 3 * dx * integrand(1, x[1], args...)
+    # Note: We used @turbo here before, but actually the allocation overhead
+    #       needed to get all the data into an array is worse than what one gains
+    #       with LoopVectorization
+    @fastmath for i = 2:2:istop
+        I += 4 / 3 * dx * integrand(i, x[i], args...)
     end
-    @turbo for i = 3:2:istop
-        I += 2 / 3 * dx * y[i]
+    @fastmath for i = 3:2:istop
+        I += 2 / 3 * dx * integrand(i, x[i], args...)
     end
 
     if isodd(n_intervals)
-        I += 5 / 6 * dx * y[n - 1]
-        I += 1 / 2 * dx * y[n]
+        I += 5 / 6 * dx * integrand(n, x[n-1], args...)
+        I += 1 / 2 * dx * integrand(n, x[n], args...)
     else
-        I += 1 / 3 * dx * y[n]
+        I += 1 / 3 * dx * integrand(n, x[n], args...)
     end
     return I
 end
 
-@inbounds function simpson_nonuniform(x::AbstractVector, y::AbstractVector)
+@inbounds function simpson_nonuniform(integrand, x::AbstractVector, args...)
     n = length(x)
-    n_intervals = n - 1
+    n_intervals = n-1
 
-    istop = isodd(n_intervals) ? n - 3 : n - 2
+    istop = isodd(n_intervals) ? n-3 : n-2
 
-    I = zero(promote_type(eltype(x), eltype(y)))
-    # This breaks when @turbo'd
-    @simd for i = 1:2:istop
+    Tint = eltype(integrand(1, x[1], args...))
+    I = zero(promote_type(eltype(x), Tint))
+    # Note: We used @simd here before, but actually the allocation overhead
+    #       needed to get all the data into an array is worse than what one gains
+    #       wtih vectorization
+    @fastmath for i = 1:2:istop
         dx0 = x[i + 1] - x[i]
-        dx1 = x[i + 2] - x[i + 1]
+        dx1 = x[i+2] - x[i+1]
         c = (dx0 + dx1) / 6
-        I += c * (2 - dx1 / dx0) * y[i]
-        I += c * (dx0 + dx1)^2 / (dx0 * dx1) * y[i + 1]
-        I += c * (2 - dx0 / dx1) * y[i + 2]
+        I += c * (2 - dx1 / dx0) * integrand(i, x[i], args...)
+        I += c * (dx0 + dx1)^2 / (dx0 * dx1) * integrand(i+1, x[i+1], args...)
+
+        I += c * (2 - dx0 / dx1) * integrand(i+2, x[i+2], args...)
+
     end
 
     if isodd(n_intervals)
-        dxn = x[end] - x[end - 1]
-        dxnm1 = x[end - 1] - x[end - 2]
-        I += (2 * dxn^2 + 3 * dxn * dxnm1) / (6 * (dxnm1 + dxn)) * y[end]
-        I += (dxn^2 + 3 * dxn * dxnm1) / (6 * dxnm1) * y[end - 1]
-        I -= dxn^3 / (6 * dxnm1 * (dxnm1 + dxn)) * y[end - 2]
+        dxn = x[end] - x[end-1]
+        dxnm1 = x[end - 1] - x[end-2]
+        I += (2 * dxn^2 + 3 * dxn * dxnm1) / (6 * (dxnm1 + dxn)) * integrand(n, x[n], args...)
+        I += (dxn^2 + 3 * dxn * dxnm1) / (6 * dxnm1) * integrand(n-1, x[n-1], args...)
+        I -= dxn^3 / (6 * dxnm1 * (dxnm1 + dxn)) * integrand(n-2, x[n-2], args...)
     end
 
     return I

diff --git a/src/densities.jl b/src/densities.jl
@@ -10,9 +10,13 @@ grid `basis`, where the individual k-points are occupied according to `occupatio
 It is possible to ask only for occupations higher than a certain level to be computed by
 using an optional `occupation_threshold`. By default all occupation numbers are considered.
 """
-@views @timing function compute_density(basis::PlaneWaveBasis{T}, ψ, occupation;
-                                        occupation_threshold=zero(T)) where {T}
-    Tρ = promote_type(T, real(eltype(ψ[1])))
+@views @timing function compute_density(basis::PlaneWaveBasis{T,VT}, ψ, occupation;
+                                        occupation_threshold=zero(T)) where {T,VT}
+    Tρ = promote_type(T,  real(eltype(ψ[1])))
+    Tψ = promote_type(VT, real(eltype(ψ[1])))
+    # Note, that we special-case Tψ, since when T is Dual and eltype(ψ[1]) is not
+    # (e.g. stress calculation), then only the normalisation factor introduces
+    # dual numbers, but not yet the FFT
 
     # Occupation should be on the CPU as we are going to be doing scalar indexing.
     occupation = [to_cpu(oc) for oc in occupation]
@@ -21,18 +25,18 @@ using an optional `occupation_threshold`. By default all occupation numbers are
 
     function allocate_local_storage()
         (; ρ=zeros_like(basis.G_vectors, Tρ, basis.fft_size..., basis.model.n_spin_components),
-         ψnk_real=zeros_like(basis.G_vectors, complex(Tρ), basis.fft_size...))
+         ψnk_real=zeros_like(basis.G_vectors, complex(Tψ), basis.fft_size...))
     end
     # We split the total iteration range (ik, n) in chunks, and parallelize over them.
     range = [(ik, n) for ik = 1:length(basis.kpoints) for n = mask_occ[ik]]
 
     storages = parallel_loop_over_range(range; allocate_local_storage) do kn, storage
         (ik, n) = kn
         kpt = basis.kpoints[ik]
-
-        ifft!(storage.ψnk_real, basis, kpt, ψ[ik][:, n])
+        ifft!(storage.ψnk_real, basis, kpt, ψ[ik][:, n]; normalize=false)
         storage.ρ[:, :, :, kpt.spin] .+= (occupation[ik][n] .* basis.kweights[ik]
-                                              .* abs2.(storage.ψnk_real))
+                                          .* (basis.ifft_normalization)^2
+                                          .* abs2.(storage.ψnk_real))
 
         synchronize_device(basis.architecture)
     end

diff --git a/src/eigen/lobpcg_hyper_impl.jl b/src/eigen/lobpcg_hyper_impl.jl
@@ -306,7 +306,7 @@ function final_retval(X, AX, BX, resid_history, niter, n_matvec)
         p = sortperm(λ)
         λ = λ[p]
         residuals = residuals[:, p]
-        X = X[:, p]
+        X  = X[:, p]
         AX = AX[:, p]
         BX = BX[:, p]
         resid_history = resid_history[p, :]

diff --git a/src/external/atomsbase.jl b/src/external/atomsbase.jl
@@ -45,7 +45,7 @@ function parse_system(system::AbstractSystem{D}) where {D}
         magnetic_moments = normalize_magnetic_moment.(magnetic_moments)
     end
 
-    sum_atomic_charge = sum(atom -> get(atom, :charge, 0.0u"e_au"), system)
+    sum_atomic_charge = sum(atom -> get(atom, :charge, 0.0u"e_au"), system; init=0.0u"e_au")
     if abs(sum_atomic_charge) > 1e-6u"e_au"
         error("Charged systems not yet supported in DFTK.")
     end

diff --git a/src/external/spglib.jl b/src/external/spglib.jl
@@ -5,6 +5,7 @@
 import Spglib
 
 function spglib_cell(lattice, atom_groups, positions, magnetic_moments)
+    @assert !isempty(atom_groups)  # Otherwise spglib cannot work properly
     magnetic_moments = normalize_magnetic_moment.(magnetic_moments)
 
     spg_atoms     = Int[]

diff --git a/src/fft.jl b/src/fft.jl
@@ -33,8 +33,7 @@ function ifft!(f_real::AbstractArray3, basis::PlaneWaveBasis,
     fill!(f_real, 0)
     f_real[kpt.mapping] = f_fourier
 
-    # Perform an IFFT
-    mul!(f_real, basis.ipBFFT, f_real)
+    mul!(f_real, basis.ipBFFT, f_real)  # perform IFFT
     normalize && (f_real .*= basis.ifft_normalization)
     f_real
 end

diff --git a/src/postprocess/stresses.jl b/src/postprocess/stresses.jl
@@ -7,15 +7,14 @@ is given by
 σ_{yx} σ_{yy} σ_{yz} \\
 σ_{zx} σ_{zy} σ_{zz}
 \end{array}
-\right) = \frac{1}{|Ω|} \left. \frac{dE[ (I+ϵ) * L]}{dM}\right|_{ϵ=0}
+\right) = \frac{1}{|Ω|} \left. \frac{dE[ (I+ϵ) * L]}{dϵ}\right|_{ϵ=0}
 ```
 where ``ϵ`` is the strain.
 See [O. Nielsen, R. Martin Phys. Rev. B. **32**, 3792 (1985)](https://doi.org/10.1103/PhysRevB.32.3792)
 for details. In Voigt notation one would use the vector
 ``[σ_{xx} σ_{yy} σ_{zz} σ_{zy} σ_{zx} σ_{yx}]``.
 """
 @timing function compute_stresses_cart(scfres)
-    # TODO optimize by only computing derivatives wrt 6 independent parameters
     # compute the Hellmann-Feynman energy (with fixed ψ/occ/ρ)
     function HF_energy(lattice::AbstractMatrix{T}) where {T}
         basis = scfres.basis
@@ -26,12 +25,38 @@ for details. In Voigt notation one would use the vector
                                    basis.use_symmetries_for_kpoint_reduction,
                                    basis.comm_kpts, basis.architecture)
         ρ = compute_density(new_basis, scfres.ψ, scfres.occupation)
-        energies = energy_hamiltonian(new_basis, scfres.ψ, scfres.occupation;
-                                      ρ, scfres.eigenvalues, scfres.εF).energies
+        (; energies) = energy(new_basis, scfres.ψ, scfres.occupation;
+                              ρ, scfres.eigenvalues, scfres.εF)
         energies.total
     end
-    L = scfres.basis.model.lattice
-    Ω = scfres.basis.model.unit_cell_volume
-    stresses = ForwardDiff.gradient(M -> HF_energy((I+M) * L), zero(L)) / Ω
-    symmetrize_stresses(scfres.basis, stresses)
+    L  = scfres.basis.model.lattice
+    Ω  = scfres.basis.model.unit_cell_volume
+
+    # Define f(ϵ) = E[ (I+ϵ) * L]. Since the strain is symmetric (same as σ) it has only
+    # 6 free components which we collect as in Voigt notation
+    #    M = [ϵ_{xx} ϵ_{yy} ϵ_{zz} ϵ_{zy} ϵ_{zx} ϵ_{yx}]
+    # Then
+    function HF_energy_voigt(M)
+        D = [1+M[1]   M[6]   M[5];  # Lattice distortion matrix
+               M[6] 1+M[2]   M[4];
+               M[5]   M[4] 1+M[3]]
+        HF_energy(D * L)
+    end
+    # The derivative of this function wrt. M is by the chain rule
+    #    [df/dϵ_{xx}, df/dϵ_{yy}, df/dϵ_{zz},
+    #     df/dϵ_{zy}+df/dϵ_{yz}, df/dϵ_{zx}+df/dϵ_{xz}, df/dϵ_{yx}+df/dϵ_{xy}]
+    # Therefore
+    stress_voigt = 1/Ω * ForwardDiff.gradient(HF_energy_voigt, zeros(eltype(L), 6))
+    symmetrize_stresses(scfres.basis, voigt_to_full(stress_voigt))
+end
+function voigt_to_full(v::AbstractVector{T}) where {T}
+    @SArray[v[1]       v[6]/T(2)  v[5]/T(2);
+            v[6]/T(2)  v[2]       v[4]/T(2);
+            v[5]/T(2)  v[4]/T(2)  v[3]     ]
+end
+function full_to_voigt(ε::AbstractMatrix{T}) where {T}
+    [ε[1, 1], ε[2, 2], ε[3, 3],
+     ε[3, 2] + ε[2, 3],
+     ε[3, 1] + ε[1, 3],
+     ε[1, 2] + ε[2, 1]]
 end
diff --git a/src/pseudo/PspUpf.jl b/src/pseudo/PspUpf.jl
@@ -168,7 +168,7 @@ function eval_psp_projector_fourier(psp::PspUpf, i, l, p::T)::T where {T<:Real}
     ircut_proj = min(psp.ircut, length(psp.r2_projs[l+1][i]))
     rgrid = @view psp.rgrid[1:ircut_proj]
     r2_proj = @view psp.r2_projs[l+1][i][1:ircut_proj]
-    return hankel(rgrid, r2_proj, l, p)
+    hankel(rgrid, r2_proj, l, p)
 end
 
 function eval_psp_pswfc_real(psp::PspUpf, i, l, r::T)::T where {T<:Real}
@@ -180,7 +180,7 @@ function eval_psp_pswfc_fourier(psp::PspUpf, i, l, p::T)::T where {T<:Real}
     # quantities. They are the reason that PseudoDojo UPF files have a much
     # larger radial grid than their psp8 counterparts.
     # If issues arise, try cutting them off too.
-    return hankel(psp.rgrid, psp.r2_pswfcs[l+1][i], l, p)
+    hankel(psp.rgrid, psp.r2_pswfcs[l+1][i], l, p)
 end
 
 eval_psp_local_real(psp::PspUpf, r::T) where {T<:Real} = psp.vloc_interp(r)
@@ -193,12 +193,10 @@ function eval_psp_local_fourier(psp::PspUpf, p::T)::T where {T<:Real}
     # ABINIT uses a more 'pure' Coulomb term with the same asymptotic behavior
     # C(r) = -Z/r; H[-Z/r] = -Z/p^2
     rgrid = @view psp.rgrid[1:psp.ircut]
-    vloc = @view psp.vloc[1:psp.ircut]
-    f = (
-        rgrid .* (rgrid .* vloc .- -psp.Zion * erf.(rgrid))
-        .* sphericalbesselj_fast.(0, p .* rgrid)
-    )
-    I = trapezoidal(rgrid, f)
+    vloc  = @view psp.vloc[1:psp.ircut]
+    I = trapezoidal(rgrid, vloc, psp.Zion) do i, r, vloc, Zion
+         r * (r * vloc[i] - -Zion * erf(r)) * sphericalbesselj_fast(0, p * r)
+    end
     4T(π) * (I + -psp.Zion / p^2 * exp(-p^2 / T(4)))
 end
 
@@ -225,6 +223,7 @@ end
 function eval_psp_energy_correction(T, psp::PspUpf, n_electrons)
     rgrid = @view psp.rgrid[1:psp.ircut]
     vloc = @view psp.vloc[1:psp.ircut]
-    f = rgrid .* (rgrid .* vloc .- -psp.Zion)
-    4T(π) * n_electrons * trapezoidal(rgrid, f)
+    4T(π) * n_electrons * trapezoidal(rgrid, vloc, psp.Zion) do i, r, vloc, Zion
+        r * (r * vloc[i] - -Zion)
+    end
 end
diff --git a/src/scf/self_consistent_field.jl b/src/scf/self_consistent_field.jl
@@ -188,8 +188,7 @@ Overview of parameters:
 
         # Compute the energy of the new state
         if compute_consistent_energies
-            energies = energy_hamiltonian(basis, ψ, occupation;
-                                          ρ=ρout, eigenvalues, εF).energies
+            (; energies) = energy(basis, ψ, occupation; ρ=ρout, eigenvalues, εF)
         end
 
         # Push energy and density change of this step.