# Source code for pysisyphus.optimizers.hessian_updates

# [1] https://link.springer.com/article/10.1007/s00214-016-1847-3
#     Birkholz, 2016
# [2] Geometry optimization in Cartesian coordinates: Constrained optimization
#     Baker, 1992
# [3] https://epubs.siam.org/doi/pdf/10.1137/S1052623496306450
#     BFGS WITH UPDATE SKIPPING AND VARYING MEMORY
#     Kolda, 1998
# [4] https://link.springer.com/article/10.1186/1029-242X-2012-241
#     New cautious BFGS algorithm based on modified Armijo-type line search
#     Wan, 2012
# [5] Numerical optimization, 2nd ed.
#     Nocedal, Wright
# [6] https://arxiv.org/abs/2006.08877
#     Goldfarb, 2020
# [7] https://pubs.acs.org/doi/10.1021/acs.jctc.9b00869
#     Hermes, Zádor, 2019
# [8] https://doi.org/10.1002/(SICI)1096-987X(199802)19:3<349::AID-JCC8>3.0.CO;2-T
#     Bofill, 1998
# [9] http://dx.doi.org/10.1016/S0166-1280(02)00209-9
#     Bungay, Poirier


import numpy as np

from pysisyphus.optimizers.closures import bfgs_multiply


def bfgs_update(H, dx, dg):
    """BFGS update for the Hessian H from step dx and gradient difference dg.

    Returns the additive update matrix (the caller applies H + update)
    together with a short label identifying the update type.
    """
    # Rank-1 term built from the gradient difference.
    dg_term = np.outer(dg, dg) / dg.dot(dx)
    # Term removing the current curvature along the step direction.
    H_term = H.dot(np.outer(dx, dx)).dot(H) / dx.dot(H).dot(dx)
    return dg_term - H_term, "BFGS"
def damped_bfgs_update(H, dx, dg):
    """Damped BFGS update of the Hessian H. See [5].

    When the curvature condition dx·dg >= 0.2 * dx·H·dx is violated, dg is
    replaced by an interpolation r between dg and H·dx, which keeps the
    update well-behaved.
    """
    dx_dg = dx.dot(dg)
    dx_H_dx = dx.dot(H).dot(dx)
    # Damping factor; stays 1 (no damping) when the curvature condition holds.
    if dx_dg < 0.2 * dx_H_dx:
        theta = 0.8 * dx_H_dx / (dx_H_dx - dx_dg)
    else:
        theta = 1
    r = theta * dg + (1 - theta) * H.dot(dx)
    update = np.outer(r, r) / r.dot(dx) - H.dot(np.outer(dx, dx)).dot(H) / dx_H_dx
    return update, "damped BFGS"
def double_damp(
    s, y, H=None, s_list=None, y_list=None, mu_1=0.2, mu_2=0.2, logger=None
):
    """Double damped step 's' and gradient differences 'y'.

    H is the inverse Hessian!
    See [6]. Potentially updates s and y. y is only updated if mu_2 is
    not None.

    Parameters
    ----------
    s : np.array, shape (N, ), floats
        Coordinate differences/step.
    y : np.array, shape (N, ), floats
        Gradient differences.
    H : np.array, shape (N, N), floats, optional
        Inverse Hessian.
    s_list : list of nd.array, shape (K, N), optional
        List of K previous steps. If no H is supplied and prev_ys is given
        the matrix-vector product Hy will be calculated through the
        two-loop LBFGS-recursion.
    y_list : list of nd.array, shape (K, N), optional
        List of K previous gradient differences. See s_list.
    mu_1 : float, optional
        Parameter for 's' damping.
    mu_2 : float, optional
        Parameter for 'y' damping.
    logger : logging.Logger, optional
        Logger to be used.

    Returns
    -------
    s : np.array, shape (N, ), floats
        Damped coordinate differences/step.
    y : np.array, shape (N, ), floats
        Damped gradient differences.
    """
    sy = s.dot(y)
    # Calculate Hy directly
    if H is not None:
        Hy = H.dot(y)
    # Calculate Hy via bfgs_multiply as in LBFGS (two-loop recursion)
    else:
        Hy = bfgs_multiply(s_list, y_list, y, logger=logger)
    yHy = y.dot(Hy)
    theta_1 = 1
    damped_s = ""
    # First damping: interpolate s towards Hy when s·y is too small
    # compared to y·H·y.
    if sy < mu_1 * yHy:
        theta_1 = (1 - mu_1) * yHy / (yHy - sy)
        s = theta_1 * s + (1 - theta_1) * Hy
        if theta_1 < 1.0:
            damped_s = ", damped s"
    msg = f"damped BFGS\n\ttheta_1={theta_1:.4f} {damped_s}"

    # Double damping: second damping of y, using the (possibly already
    # damped) s from above.
    damped_y = ""
    if mu_2 is not None:
        sy = s.dot(y)
        ss = s.dot(s)
        theta_2 = 1
        if sy < mu_2 * ss:
            theta_2 = (1 - mu_2) * ss / (ss - sy)
            y = theta_2 * y + (1 - theta_2) * s
            if theta_2 < 1.0:
                damped_y = ", damped y"
        msg = "double " + msg + f"\n\ttheta_2={theta_2:.4f} {damped_y}"

    if logger is not None:
        logger.debug(msg.capitalize())

    return s, y
def sr1_update(z, dx):
    """Symmetric rank-1 (SR1) update from residual z = dg - H·dx and step dx."""
    update = np.outer(z, z) / z.dot(dx)
    return update, "SR1"
def psb_update(z, dx):
    """Powell-symmetric-Broyden (PSB) update from residual z = dg - H·dx and step dx."""
    dxdx = dx.dot(dx)
    # Symmetrized rank-2 part ...
    sym_term = (np.outer(dx, z) + np.outer(z, dx)) / dxdx
    # ... minus the correction along the step direction.
    corr_term = dx.dot(z) * np.outer(dx, dx) / dxdx ** 2
    return sym_term - corr_term, "PSB"
def flowchart_update(H, dx, dg):
    """Flowchart Hessian update; see [1], Sec. 2, equations 1 to 3.

    Chooses between SR1, BFGS and PSB updates based on two angle-like
    quotients between the involved vectors.
    """
    z = dg - H.dot(dx)
    dx_norm = np.linalg.norm(dx)
    sr1_quot = z.dot(dx) / (np.linalg.norm(z) * dx_norm)
    bfgs_quot = dg.dot(dx) / (np.linalg.norm(dg) * dx_norm)
    if sr1_quot < -0.1:
        return sr1_update(z, dx)
    if bfgs_quot > 0.1:
        return bfgs_update(H, dx, dg)
    return psb_update(z, dx)
def mod_flowchart_update(H, dx, dg):
    """Modified flowchart update with swapped BFGS/SR1 branches.

    This version seems to work too, at least for minimizations starting
    from a geometry near a transition state. Interesting.
    """
    z = dg - H.dot(dx)
    quot = z.dot(dx) / (np.linalg.norm(z) * np.linalg.norm(dx))
    if quot < -0.1:
        return bfgs_update(H, dx, dg)
    if quot > 0.1:
        return sr1_update(z, dx)
    return psb_update(z, dx)
def bofill_update(H, dx, dg):
    """Bofill update: interpolation between SR1 and PSB updates."""
    z = dg - H.dot(dx)
    sr1, _ = sr1_update(z, dx)
    powell, _ = psb_update(z, dx)
    # Bofill mixing factor: 1 -> pure SR1, 0 -> pure PSB.
    mix = z.dot(dx) ** 2 / (z.dot(z) * dx.dot(dx))
    return mix * sr1 + (1 - mix) * powell, "Bofill"
[docs] def ts_bfgs_update(H, dx, dg): """As described in [7]""" dx = dx[:, None] dg = dg[:, None] j = dg - H @ dx jdx = j.T @ dx # Diagonalize Hessian, to construct positive definite version of it w, v = np.linalg.eigh(H) Hdx = np.abs(w) * v @ (v.T @ dx) M = dg @ dg.T + Hdx @ Hdx.T dxTM = dx.T @ M u = np.linalg.solve(dxTM @ dx, dxTM).T juT = j @ u.T ts_bfgs_update = juT + juT.T - jdx * u @ u.T return ts_bfgs_update, "TS-BFGS"
[docs] def ts_bfgs_update_org(H, dx, dg): """Do not use! Implemented as described in the 1998 bofill paper [8]. This does not seem to work too well.""" dx = dx[:, None] dg = dg[:, None] u = dg j = H @ dx j = u - j # jTdx = float(j.T @ dx) jTdx = j.T @ dx # dxTdx = float(dx.T @ dx) dxTdx = dx.T @ dx # jTj = float(j.T @ j) jTj = j.T @ j phi = jTdx ** 2 / dxTdx / jTj u = phi * dg * dg.T @ dx w, v = np.linalg.eigh(H) u = u + (1 - phi) * np.abs(w) * v @ (v.T @ dx) u = u / (u.T @ dx) juT = j @ u.T ts_bfgs_update = juT + juT.T - jTdx * u @ u.T return ts_bfgs_update, "TS-BFGS"
[docs] def ts_bfgs_update_revised(H, dx, dg): """TS-BFGS update as described in [9]. Better than the original formula of Bofill, worse than the implementation in [7]. a is caluclated as described in the footnote 1 on page 38. Eq. (8) looks suspicious as it contains the inverse of a vector?! As also outlined in the paper abs(a) is used (|a| in the paper).""" dx = dx[:, None] dg = dg[:, None] dgTdg = dg.T @ dg dgTdx = dg.T @ dx a = (dgTdg - dg.T @ H @ dx) / (dgTdg * dgTdx) a = abs(a) # Diagonalize Hessian, to construct positive definite version of it w, v = np.linalg.eigh(H) H_pos_dx = np.abs(w) * v @ (v.T @ dx) # Mixing factor j = dg - H @ dx jTdx = j.T @ dx dxTdx = dx.T @ dx jTj = j.T @ j phi = jTdx ** 2 / dxTdx / jTj u = ((1 - phi) * H_pos_dx + a * phi * dgTdx * dg) / ( (1 - phi) * dx.T @ H_pos_dx + phi * a * dgTdx ** 2 ) juT = j @ u.T ts_bfgs_update = juT + juT.T - jTdx * u @ u.T return ts_bfgs_update, "TS-BFGS"