Source code for pysisyphus.io.molden

import collections
import re
from typing import Optional

import numpy as np
import pyparsing as pp
from scipy.special import gamma

from pysisyphus.elem_data import ATOMIC_NUMBERS, nuc_charges_for_atoms
from pysisyphus.Geometry import Geometry
from pysisyphus.io.xyz import parse_xyz
from pysisyphus.helpers_pure import file_or_str
from pysisyphus.wavefunction import (
    get_l,
    MoldenShells,
    ORCAMoldenShells,
    Shell,
    Wavefunction,
)
from pysisyphus.wavefunction.helpers import BFType


[docs] class MoldenError(Exception): pass
HEADER_RE = re.compile( r"\[Molden Format]\s+" r"\[Title\](?P<title>.*)\s+" r"\[Atoms\]\s*(?P<unit>Angs|Au){0,1}\s*" r"(?P<first_atom>\w)\s+", re.IGNORECASE | re.DOTALL, ) MOLDEN_FROM = collections.defaultdict() MOLDEN_FROM.update( { # key: # (atoms lowercase, title of length 0, unit) # # XTB capitalizes atoms, has no title and uses AU as unit (False, True, "AU"): "xtb", # TURBOMOLE has lowercase atoms, a title full of whitespace and uses AU as unit (True, False, "AU"): "turbomole", } ) @file_or_str(".molden", ".input") def molden_is_from(text: str) -> Optional[str]: mobj = HEADER_RE.match(text) if not mobj: raise MoldenError("Parsing molden header failed!") title = mobj.group("title") unit = mobj.group("unit") first_atom = mobj.group("first_atom") # Quick check for ORCA orca_title = "Molden file created by orca_2mkl" if title.strip().startswith(orca_title): return "orca" atom_lower = first_atom[0] == first_atom[0].lower() len0_title = len(title) == 0 key = (atom_lower, len0_title, unit) return MOLDEN_FROM[key] @file_or_str(".molden", ".input") def parse_molden_atoms_coords(molden_str): _, geometries = re.split(r"\[GEOMETRIES\] \(XYZ\)", molden_str) atoms_coords, comments = parse_xyz(geometries, with_comment=True) return atoms_coords, comments
[docs] def geoms_from_molden(fn, **kwargs): atoms_coords, comments = parse_molden_atoms_coords(fn) geoms = [ Geometry(atoms, coords.flatten(), comment=comment, **kwargs) for (atoms, coords), comment in zip(atoms_coords, comments) ] return geoms
[docs] def parse_mo(mo_lines): # sym, ene_key, energy, spin_key, spin, occup_key, occup, *rest = mo_lines sym, ene_line, spin_line, occup_line, *rest = mo_lines ene_key, energy = ene_line.split("=") spin_key, spin = spin_line.split("=") occup_key, occup = occup_line.split("=") assert ( (ene_key.strip() == "ene") and (spin_key.strip() == "spin") and (occup_key.strip() == "occup") ) prev_ind = 0 coeffs = list() for line in rest: ind, coeff = line.split() ind = int(ind) assert ind == prev_ind + 1 prev_ind = ind coeffs.append(coeff) return { "sym": sym.strip(), "ene": float(energy), "spin": spin.strip(), "occup": float(occup), "coeffs": np.array(coeffs), }
@file_or_str(".molden", ".input") def parse_molden(text, with_mos=True): int_ = pp.common.integer real = pp.common.real sci_real = pp.common.sci_real def get_section(name): return pp.CaselessLiteral(f"[{name}]") molden_format = get_section("Molden Format") n_atoms = get_section("N_Atoms") + int_ atoms_header = get_section("Atoms") title = get_section("Title") + pp.Optional( pp.ZeroOrMore(~atoms_header + pp.Word(pp.printables)) ) alpha_re = re.compile("([a-zA-Z]+)") def drop_number(toks): """OpenMolcas writes atoms like H1, C2, ... We don't want the number.""" return alpha_re.match(toks[0]).group(1) # [Atoms] # element_name number atomic_number x y z unit = pp.one_of(("AU Angs (AU) (Angs)"), caseless=True) atom_symbol = pp.Word(pp.alphanums).set_parse_action(drop_number) xyz = pp.Group(sci_real + sci_real + sci_real) atom_line = pp.Group( atom_symbol.set_results_name("symbol") + int_.set_results_name("number") + int_.set_results_name("atomic_number") + xyz.set_results_name("xyz") ) atoms = ( atoms_header + pp.Optional(unit).set_results_name("unit") + pp.OneOrMore(atom_line).set_results_name("atoms") ) # [Charge], written by OpenMOLCAS charge = get_section("Charge") charge_kind = pp.one_of(("(Mulliken)",), caseless=True) charges = charge + charge_kind + pp.OneOrMore(sci_real) def pgto_exps_coeffs(s, loc, toks): data = toks.asList() exps, coeffs = np.array(data, dtype=float).reshape(-1, 2).T pr = pp.ParseResults.from_dict( { "exponents": exps, "coeffs": coeffs, } ) # Somehow this function will result in exponents and coeffs being both # stored in lists of length 1. return pr # [GTO] ang_mom = pp.one_of("s p sp d f g h i", caseless=True) pgto = sci_real + sci_real shell = pp.Group( ang_mom.set_results_name("ang_mom") + int_.set_results_name("contr_depth") # Usually 1.0, can be left out + pp.Optional(real.set_whitespace_chars(" ").suppress() + pp.LineEnd()) + pp.OneOrMore(pgto).set_parse_action(pgto_exps_coeffs) ) atom_gtos = pp.Group( int_.set_results_name("number") # Center + pp.Optional(pp.Literal("0")) + pp.OneOrMore(shell).set_results_name("shells") ) gto = ( pp.CaselessLiteral("[GTO]") + pp.Optional(unit) + pp.Group(pp.OneOrMore(atom_gtos)).set_results_name("gto") ) # [5D] [7F] [9G] etc. ang_mom_flags = pp.ZeroOrMore(pp.one_of("[5D] [7F] [9G]", caseless=True)) # Pyparsing code to parse MOs ... slow. Regex-based code is much faster. # [MO] # sym_label = pp.Word(pp.printables) # spin = pp.one_of("Alpha Beta", caseless=True) # mo_coeff = pp.Suppress(int_) + real # mo = pp.Group( # pp.CaselessLiteral("Sym=").suppress() # + sym_label.set_results_name("sym") # + pp.CaselessLiteral("Ene=").suppress() # + sci_real.set_results_name("ene") # + pp.CaselessLiteral("Spin=").suppress() # + spin.set_results_name("spin") # + pp.CaselessLiteral("Occup=").suppress() # + real.set_results_name("occup") # + pp.Group(pp.OneOrMore(mo_coeff)).set_results_name("coeffs") # ) # mos = pp.CaselessLiteral("[MO]") + pp.OneOrMore(mo).set_results_name("mos") mos = pp.CaselessLiteral("[MO]") + pp.OneOrMore( pp.Regex(r"[^\[]+") # Match everything until the next [ is encountered ).set_results_name("mos") # Actual parser parser = molden_format + pp.Each( [ pp.Optional(expr) for expr in (n_atoms, title, atoms, charges, gto, ang_mom_flags) ] ) if with_mos: parser += mos res = parser.parse_string(text) as_dict = res.asDict() if with_mos: mos_raw = as_dict["mos"][0] by_mo = [bm.strip().split("\n") for bm in mos_raw.lower().strip().split("sym=")] assert by_mo[0] == [""], by_mo[0] by_mo = by_mo[1:] as_dict["mos"] = [parse_mo(mo_lines) for mo_lines in by_mo] return as_dict
[docs] def get_xtb_nuc_charges(atoms, as_ecp_electrons=False): """Modified nuclear charges w/o core electrons. Adapated from Multiwfn 3.8 """ org_nuc_charges = nuc_charges_for_atoms(atoms) ecp_electrons = np.zeros_like(org_nuc_charges, dtype=int) for i, Z in enumerate(org_nuc_charges): # H, He if 1 <= Z <= 2: ecpel = 0 # Li - Ne elif 3 <= Z <= 10: ecpel = 2 # Na - Ar elif 11 <= Z <= 18: ecpel = 10 # K - Cu elif 19 <= Z <= 29: ecpel = 18 # Zn - Kr elif 30 <= Z <= 36: ecpel = 28 # Rb - Ag elif 37 <= Z <= 47: ecpel = 36 # Cd - Xe elif 48 <= Z <= 54: ecpel = 46 # Cs - Ba elif 55 <= Z <= 56: ecpel = 54 # La - Lu, crazy elif 57 <= Z <= 71: ecpel = Z - 3 # Resulting charge is always 3 # Hf - Au elif 72 <= Z <= 79: ecpel = 68 # Hg - Rn elif 80 <= Z <= 86: ecpel = 78 ecp_electrons[i] = ecpel if as_ecp_electrons: result = ecp_electrons else: result = org_nuc_charges - ecp_electrons return result
[docs] def parse_molden_atoms(data): atoms = list() coords = list() nuc_charges = list() for atom in data["atoms"]: atoms.append(atom["symbol"]) coords.extend(atom["xyz"]) Z = atom["atomic_number"] nuc_charges.append(Z) atoms = tuple(atoms) coords = np.array(coords, dtype=float).flatten() nuc_charges = np.array(nuc_charges, dtype=int) return atoms, coords, nuc_charges
@file_or_str(".molden", ".input") def shells_from_molden(text, **kwargs): data = parse_molden(text, with_mos=False) atoms, coords, _ = parse_molden_atoms(data) atomic_numbers = [ATOMIC_NUMBERS[atom.lower()] for atom in atoms] coords3d = coords.reshape(-1, 3) _shells = list() for atom_gtos, center, atomic_num in zip(data["gto"], coords3d, atomic_numbers): center_ind = atom_gtos["number"] - 1 for shell in atom_gtos["shells"]: L = shell["ang_mom"] exps = shell["exponents"][0] coeffs = shell["coeffs"][0] shell = Shell( L=L, center=center, coeffs=coeffs, exps=exps, center_ind=center_ind, atomic_num=atomic_num, ) _shells.append(shell) shells = MoldenShells(_shells, **kwargs) return shells
[docs] def radial_integral(l, exponent): """ Integrates (r r**l * exp(-exponent * r**2))**2 dr from r=0 to r=oo as described in the SI of the JANPA paper [1] (see top of page 8, second integral in the square root. In my opinion, the integrals lacks a factor 'r'. Below, some sympy code can be found to solve this integral (including 1*r). import sympy as sym r, z = sym.symbols("r z", positive=True) l = sym.symbols("l", integer=True, positive=True) sym.integrate((r * r**l * sym.exp(-z*r**2))**2, (r, 0, sym.oo)) The 'solved' integral on page 8 is correct again. ⎮ 2 ⎮ 2 2⋅l -2⋅r ⋅z ⎮ r ⋅r ⋅ℯ dr = (2*z)**(-l - 1/2)*gamma(l + 3/2)/(4*z) 0 """ return (2 * exponent) ** (-l - 1 / 2) * gamma(l + 3 / 2) / (4 * exponent)
@file_or_str(".molden", ".input") def shells_from_orca_molden(text, **kwargs): molden_shells = shells_from_molden(text) dividers = { 0: 1, 1: 1, 2: 3, 3: 15, 4: 35, } def fix_contr_coeffs(l, coeffs, exponents): """Fix contraction coefficients. Based on equations found in the SI of the JANPA paper [1].""" l = get_l(l) divider = dividers[l] rad_ints = radial_integral(l, exponents) norms2 = rad_ints * 4 * np.pi / (2 * l + 1) / divider norms = np.sqrt(norms2) normed_coeffs = coeffs * norms return normed_coeffs _shells = list() for shell in molden_shells.shells: L, center, _, exps, *_ = shell.as_tuple() fixed_coeffs = fix_contr_coeffs(L, shell.coeffs_org, exps) fixed_shell = Shell( L=L, center=center, coeffs=fixed_coeffs, exps=exps, center_ind=shell.center_ind, atomic_num=shell.atomic_num, ) _shells.append(fixed_shell) shells = ORCAMoldenShells(_shells, **kwargs) return shells @file_or_str(".molden", ".input") def wavefunction_from_molden( text, charge=None, orca_contr_renorm=False, xtb_nuc_charges: bool = False, **kwargs, ): """Construct Wavefunction object from .molden file. orca_contr_renorm fixes contraction coefficients, as required for ORCA and XTB. """ kwargs = kwargs.copy() shell_kwargs = kwargs.pop("shell_kwargs", {}) is_from = molden_is_from(text) if is_from in ("orca", "xtb"): orca_contr_renorm = True elif is_from == "turbomole": orca_contr_renorm = False else: raise Exception("Could not determine origin of molden file!") data = parse_molden(text) atoms, coords, nuc_charges = parse_molden_atoms(data) nuc_charge = sum(nuc_charges) # XTB only describes valence electrons. The missing electrons will be # treated as ECP electrons. if xtb_nuc_charges: ecp_electrons = get_xtb_nuc_charges(atoms, as_ecp_electrons=True) kwargs["ecp_electrons"] = ecp_electrons nuc_charge = nuc_charge - sum(ecp_electrons) spins = list() occ_a = 0.0 occ_b = 0.0 occ_a_unpaired = 0.0 occ_b_unpaired = 0.0 Ca = list() Cb = list() for mo in data["mos"]: spin = mo["spin"].lower() occ = mo["occup"] assert occ >= 0.0 is_unpaired = 0.0 < occ < 2.0 spins.append(spin) coeffs = mo["coeffs"] if spin == "alpha": occ_a += occ Ca.append(coeffs) if is_unpaired: occ_a_unpaired += occ elif spin == "beta": occ_b += occ Cb.append(coeffs) if is_unpaired: occ_b_unpaired += occ else: raise Exception(f"Spin can only be 'alpha' or 'beta', but got '{spin}'!") assert occ_a.is_integer assert occ_b.is_integer occ_a = int(occ_a) occ_b = int(occ_b) occ_a_unpaired = int(occ_a_unpaired) occ_b_unpaired = int(occ_b_unpaired) unpaired_electrons = occ_a_unpaired or occ_b_unpaired unrestricted = set(spins) == {"Alpha", "Beta"} or unpaired_electrons restricted = not unrestricted # MOs must be in columns Ca = np.array(Ca, dtype=float).T Cb = np.array(Cb, dtype=float).T # Restricted calculation if restricted: Cb = Ca.copy() occ_a = occ_b = occ_a // 2 # The second case is hit w/ XTB molden files and restriced-open-shell calculatons elif unrestricted and Cb.size == 0: assert occ_a >= occ_b common_electrons = (occ_a - occ_a_unpaired) // 2 Cb = Ca.copy() occ_a -= common_electrons occ_b = common_electrons C = np.stack((Ca, Cb)) molden_charge = nuc_charge - (occ_a + occ_b) if charge is None: charge = molden_charge # Multiplicity = (2S + 1), S = number of unpaired elecs. * 0.5 mult = (occ_a - occ_b) + 1 if orca_contr_renorm: shells_func = shells_from_orca_molden else: shells_func = shells_from_molden shells = shells_func(text, **shell_kwargs) # Molden defaults to Cartesian bfs. Below, we pick the correct basis function type, # depending on the number of present basis functions. _, nao, _ = C.shape bf_type = { shells.cart_size: BFType.CARTESIAN, shells.sph_size: BFType.PURE_SPHERICAL, }[nao] return Wavefunction( atoms=atoms, coords=coords, charge=charge, mult=mult, unrestricted=unrestricted, occ=(occ_a, occ_b), C=C, bf_type=bf_type, shells=shells, **kwargs, )