# [1] https://link.springer.com/article/10.1007/s00214-016-1847-3
# Birkholz, 2016
# [2] Geometry optimization in Cartesian coordinates: Constrained optimization
# Baker, 1992
# [3] https://epubs.siam.org/doi/pdf/10.1137/S1052623496306450
# BFGS WITH UPDATE SKIPPING AND VARYING MEMORY
# Kolda, 1998
# [4] https://link.springer.com/article/10.1186/1029-242X-2012-241
# New cautious BFGS algorithm based on modified Armijo-type line search
# Wan, 2012
# [5] Numerical optimization, 2nd ed.
# Nocedal, Wright
# [6] https://arxiv.org/abs/2006.08877
# Goldfarb, 2020
# [7] https://pubs.acs.org/doi/10.1021/acs.jctc.9b00869
# Hermes, Zádor, 2019
# [8] https://doi.org/10.1002/(SICI)1096-987X(199802)19:3<349::AID-JCC8>3.0.CO;2-T
# Bofill, 1998
# [9] http://dx.doi.org/10.1016/S0166-1280(02)00209-9
# Bungay, Poirier
import numpy as np
from pysisyphus.optimizers.closures import bfgs_multiply
def bfgs_update(H, dx, dg):
    """Standard BFGS update of Hessian H.

    Built from the step dx and the gradient difference dg. Returns the
    additive correction to H and a label identifying the update kind.
    """
    ys = dg.dot(dx)  # curvature along the step
    yyT = np.outer(dg, dg)
    HssTH = H @ np.outer(dx, dx) @ H
    sHs = dx.dot(H).dot(dx)
    return yyT / ys - HssTH / sHs, "BFGS"
def damped_bfgs_update(H, dx, dg):
    """Powell-damped BFGS update, see [5].

    Keeps the update well behaved by interpolating the gradient difference
    towards H @ dx whenever the curvature condition is weak.
    """
    sy = dx.dot(dg)
    sHs = dx.dot(H).dot(dx)
    # Damping factor: 1 means plain BFGS, < 1 mixes in H @ dx.
    theta = 0.8 * sHs / (sHs - sy) if sy < 0.2 * sHs else 1
    r = theta * dg + (1 - theta) * (H @ dx)
    update = np.outer(r, r) / r.dot(dx) - (H @ np.outer(dx, dx) @ H) / sHs
    return update, "damped BFGS"
def double_damp(
    s, y, H=None, s_list=None, y_list=None, mu_1=0.2, mu_2=0.2, logger=None
):
    """Double damped step 's' and gradient differences 'y'.

    H is the inverse Hessian! See [6]. Potentially updates s and y;
    y is only updated if mu_2 is not None.

    Parameters
    ----------
    s : np.array, shape (N, ), floats
        Coordinate differences/step.
    y : np.array, shape (N, ), floats
        Gradient differences.
    H : np.array, shape (N, N), floats, optional
        Inverse Hessian.
    s_list : list of np.array, shape (K, N), optional
        List of K previous steps. If no H is supplied, the matrix-vector
        product Hy is calculated via the two-loop LBFGS recursion from
        s_list and y_list.
    y_list : list of np.array, shape (K, N), optional
        List of K previous gradient differences. See s_list.
    mu_1 : float, optional
        Parameter for 's' damping.
    mu_2 : float, optional
        Parameter for 'y' damping.
    logger : logging.Logger, optional
        Logger to be used.

    Returns
    -------
    s : np.array, shape (N, ), floats
        Damped coordinate differences/step.
    y : np.array, shape (N, ), floats
        Damped gradient differences.
    """
    # Hy is needed for the first damping step. Either use the explicit
    # inverse Hessian, or fall back to the two-loop L-BFGS recursion.
    if H is None:
        Hy = bfgs_multiply(s_list, y_list, y, logger=logger)
    else:
        Hy = H.dot(y)
    yHy = y.dot(Hy)

    # First damping: pull s towards Hy when s·y is too small.
    sy = s.dot(y)
    theta_1 = 1
    if sy < mu_1 * yHy:
        theta_1 = (1 - mu_1) * yHy / (yHy - sy)
        s = theta_1 * s + (1 - theta_1) * Hy
    damped_s = ", damped s" if theta_1 < 1.0 else ""
    msg = f"damped BFGS\n\ttheta_1={theta_1:.4f} {damped_s}"

    # Second damping: pull y towards the (possibly already damped) s.
    if mu_2 is not None:
        sy = s.dot(y)
        ss = s.dot(s)
        theta_2 = 1
        if sy < mu_2 * ss:
            theta_2 = (1 - mu_2) * ss / (ss - sy)
            y = theta_2 * y + (1 - theta_2) * s
        damped_y = ", damped y" if theta_2 < 1.0 else ""
        msg = "double " + msg + f"\n\ttheta_2={theta_2:.4f} {damped_y}"

    if logger is not None:
        logger.debug(msg.capitalize())
    return s, y
def sr1_update(z, dx):
    """Symmetric rank-one (SR1) update from residual z = dg - H @ dx."""
    denom = z.dot(dx)
    return np.outer(z, z) / denom, "SR1"
def psb_update(z, dx):
    """Powell-symmetric-Broyden (PSB) update from residual z = dg - H @ dx."""
    ss = dx.dot(dx)
    zx = np.outer(z, dx)
    sym_term = (zx.T + zx) / ss
    proj_term = dx.dot(z) * np.outer(dx, dx) / ss ** 2
    return sym_term - proj_term, "PSB"
def flowchart_update(H, dx, dg):
    """Flowchart update of Birkholz [1], Sec. 2, Eqs. (1)-(3).

    Picks SR1, BFGS or PSB depending on two cosine-like overlap measures.
    """
    z = dg - H.dot(dx)
    norm_dx = np.linalg.norm(dx)
    sr1_quot = z.dot(dx) / (np.linalg.norm(z) * norm_dx)
    bfgs_quot = dg.dot(dx) / (np.linalg.norm(dg) * norm_dx)
    if sr1_quot < -0.1:
        return sr1_update(z, dx)
    if bfgs_quot > 0.1:
        return bfgs_update(H, dx, dg)
    return psb_update(z, dx)
def mod_flowchart_update(H, dx, dg):
    """Variant of the flowchart update with swapped SR1/BFGS branches.

    This version seems to work too, at least for minimizations starting
    from a geometry near a transition state. Interesting.
    """
    z = dg - H.dot(dx)
    quot = z.dot(dx) / (np.linalg.norm(z) * np.linalg.norm(dx))
    if quot < -0.1:
        return bfgs_update(H, dx, dg)
    if quot > 0.1:
        return sr1_update(z, dx)
    return psb_update(z, dx)
def bofill_update(H, dx, dg):
    """Bofill update [8]: interpolation between SR1 and PSB."""
    z = dg - H.dot(dx)
    sr1, _ = sr1_update(z, dx)
    powell, _ = psb_update(z, dx)
    # Mixing factor: 1 -> pure SR1, 0 -> pure PSB.
    mix = z.dot(dx) ** 2 / (z.dot(z) * dx.dot(dx))
    mixed = mix * sr1 + (1 - mix) * powell
    return mixed, "Bofill"
[docs]
def ts_bfgs_update(H, dx, dg):
"""As described in [7]"""
dx = dx[:, None]
dg = dg[:, None]
j = dg - H @ dx
jdx = j.T @ dx
# Diagonalize Hessian, to construct positive definite version of it
w, v = np.linalg.eigh(H)
Hdx = np.abs(w) * v @ (v.T @ dx)
M = dg @ dg.T + Hdx @ Hdx.T
dxTM = dx.T @ M
u = np.linalg.solve(dxTM @ dx, dxTM).T
juT = j @ u.T
ts_bfgs_update = juT + juT.T - jdx * u @ u.T
return ts_bfgs_update, "TS-BFGS"
[docs]
def ts_bfgs_update_org(H, dx, dg):
"""Do not use! Implemented as described in the 1998 bofill paper [8].
This does not seem to work too well."""
dx = dx[:, None]
dg = dg[:, None]
u = dg
j = H @ dx
j = u - j
# jTdx = float(j.T @ dx)
jTdx = j.T @ dx
# dxTdx = float(dx.T @ dx)
dxTdx = dx.T @ dx
# jTj = float(j.T @ j)
jTj = j.T @ j
phi = jTdx ** 2 / dxTdx / jTj
u = phi * dg * dg.T @ dx
w, v = np.linalg.eigh(H)
u = u + (1 - phi) * np.abs(w) * v @ (v.T @ dx)
u = u / (u.T @ dx)
juT = j @ u.T
ts_bfgs_update = juT + juT.T - jTdx * u @ u.T
return ts_bfgs_update, "TS-BFGS"
[docs]
def ts_bfgs_update_revised(H, dx, dg):
"""TS-BFGS update as described in [9].
Better than the original formula of Bofill, worse than the implementation
in [7]. a is caluclated as described in the footnote 1 on page 38. Eq. (8)
looks suspicious as it contains the inverse of a vector?! As also outlined
in the paper abs(a) is used (|a| in the paper)."""
dx = dx[:, None]
dg = dg[:, None]
dgTdg = dg.T @ dg
dgTdx = dg.T @ dx
a = (dgTdg - dg.T @ H @ dx) / (dgTdg * dgTdx)
a = abs(a)
# Diagonalize Hessian, to construct positive definite version of it
w, v = np.linalg.eigh(H)
H_pos_dx = np.abs(w) * v @ (v.T @ dx)
# Mixing factor
j = dg - H @ dx
jTdx = j.T @ dx
dxTdx = dx.T @ dx
jTj = j.T @ j
phi = jTdx ** 2 / dxTdx / jTj
u = ((1 - phi) * H_pos_dx + a * phi * dgTdx * dg) / (
(1 - phi) * dx.T @ H_pos_dx + phi * a * dgTdx ** 2
)
juT = j @ u.T
ts_bfgs_update = juT + juT.T - jTdx * u @ u.T
return ts_bfgs_update, "TS-BFGS"