import collections
import re
from typing import Optional
import numpy as np
import pyparsing as pp
from scipy.special import gamma
from pysisyphus.elem_data import ATOMIC_NUMBERS, nuc_charges_for_atoms
from pysisyphus.Geometry import Geometry
from pysisyphus.io.xyz import parse_xyz
from pysisyphus.helpers_pure import file_or_str
from pysisyphus.wavefunction import (
get_l,
MoldenShells,
ORCAMoldenShells,
Shell,
Wavefunction,
)
from pysisyphus.wavefunction.helpers import BFType
[docs]
class MoldenError(Exception):
pass
HEADER_RE = re.compile(
r"\[Molden Format]\s+"
r"\[Title\](?P<title>.*)\s+"
r"\[Atoms\]\s*(?P<unit>Angs|Au){0,1}\s*"
r"(?P<first_atom>\w)\s+",
re.IGNORECASE | re.DOTALL,
)
MOLDEN_FROM = collections.defaultdict()
MOLDEN_FROM.update(
{
# key:
# (atoms lowercase, title of length 0, unit)
#
# XTB capitalizes atoms, has no title and uses AU as unit
(False, True, "AU"): "xtb",
# TURBOMOLE has lowercase atoms, a title full of whitespace and uses AU as unit
(True, False, "AU"): "turbomole",
}
)
@file_or_str(".molden", ".input")
def molden_is_from(text: str) -> Optional[str]:
mobj = HEADER_RE.match(text)
if not mobj:
raise MoldenError("Parsing molden header failed!")
title = mobj.group("title")
unit = mobj.group("unit")
first_atom = mobj.group("first_atom")
# Quick check for ORCA
orca_title = "Molden file created by orca_2mkl"
if title.strip().startswith(orca_title):
return "orca"
atom_lower = first_atom[0] == first_atom[0].lower()
len0_title = len(title) == 0
key = (atom_lower, len0_title, unit)
return MOLDEN_FROM[key]
@file_or_str(".molden", ".input")
def parse_molden_atoms_coords(molden_str):
_, geometries = re.split(r"\[GEOMETRIES\] \(XYZ\)", molden_str)
atoms_coords, comments = parse_xyz(geometries, with_comment=True)
return atoms_coords, comments
[docs]
def geoms_from_molden(fn, **kwargs):
atoms_coords, comments = parse_molden_atoms_coords(fn)
geoms = [
Geometry(atoms, coords.flatten(), comment=comment, **kwargs)
for (atoms, coords), comment in zip(atoms_coords, comments)
]
return geoms
[docs]
def parse_mo(mo_lines):
# sym, ene_key, energy, spin_key, spin, occup_key, occup, *rest = mo_lines
sym, ene_line, spin_line, occup_line, *rest = mo_lines
ene_key, energy = ene_line.split("=")
spin_key, spin = spin_line.split("=")
occup_key, occup = occup_line.split("=")
assert (
(ene_key.strip() == "ene")
and (spin_key.strip() == "spin")
and (occup_key.strip() == "occup")
)
prev_ind = 0
coeffs = list()
for line in rest:
ind, coeff = line.split()
ind = int(ind)
assert ind == prev_ind + 1
prev_ind = ind
coeffs.append(coeff)
return {
"sym": sym.strip(),
"ene": float(energy),
"spin": spin.strip(),
"occup": float(occup),
"coeffs": np.array(coeffs),
}
@file_or_str(".molden", ".input")
def parse_molden(text, with_mos=True):
int_ = pp.common.integer
real = pp.common.real
sci_real = pp.common.sci_real
def get_section(name):
return pp.CaselessLiteral(f"[{name}]")
molden_format = get_section("Molden Format")
n_atoms = get_section("N_Atoms") + int_
atoms_header = get_section("Atoms")
title = get_section("Title") + pp.Optional(
pp.ZeroOrMore(~atoms_header + pp.Word(pp.printables))
)
alpha_re = re.compile("([a-zA-Z]+)")
def drop_number(toks):
"""OpenMolcas writes atoms like H1, C2, ...
We don't want the number."""
return alpha_re.match(toks[0]).group(1)
# [Atoms]
# element_name number atomic_number x y z
unit = pp.one_of(("AU Angs (AU) (Angs)"), caseless=True)
atom_symbol = pp.Word(pp.alphanums).set_parse_action(drop_number)
xyz = pp.Group(sci_real + sci_real + sci_real)
atom_line = pp.Group(
atom_symbol.set_results_name("symbol")
+ int_.set_results_name("number")
+ int_.set_results_name("atomic_number")
+ xyz.set_results_name("xyz")
)
atoms = (
atoms_header
+ pp.Optional(unit).set_results_name("unit")
+ pp.OneOrMore(atom_line).set_results_name("atoms")
)
# [Charge], written by OpenMOLCAS
charge = get_section("Charge")
charge_kind = pp.one_of(("(Mulliken)",), caseless=True)
charges = charge + charge_kind + pp.OneOrMore(sci_real)
def pgto_exps_coeffs(s, loc, toks):
data = toks.asList()
exps, coeffs = np.array(data, dtype=float).reshape(-1, 2).T
pr = pp.ParseResults.from_dict(
{
"exponents": exps,
"coeffs": coeffs,
}
)
# Somehow this function will result in exponents and coeffs being both
# stored in lists of length 1.
return pr
# [GTO]
ang_mom = pp.one_of("s p sp d f g h i", caseless=True)
pgto = sci_real + sci_real
shell = pp.Group(
ang_mom.set_results_name("ang_mom")
+ int_.set_results_name("contr_depth")
# Usually 1.0, can be left out
+ pp.Optional(real.set_whitespace_chars(" ").suppress() + pp.LineEnd())
+ pp.OneOrMore(pgto).set_parse_action(pgto_exps_coeffs)
)
atom_gtos = pp.Group(
int_.set_results_name("number") # Center
+ pp.Optional(pp.Literal("0"))
+ pp.OneOrMore(shell).set_results_name("shells")
)
gto = (
pp.CaselessLiteral("[GTO]")
+ pp.Optional(unit)
+ pp.Group(pp.OneOrMore(atom_gtos)).set_results_name("gto")
)
# [5D] [7F] [9G] etc.
ang_mom_flags = pp.ZeroOrMore(pp.one_of("[5D] [7F] [9G]", caseless=True))
# Pyparsing code to parse MOs ... slow. Regex-based code is much faster.
# [MO]
# sym_label = pp.Word(pp.printables)
# spin = pp.one_of("Alpha Beta", caseless=True)
# mo_coeff = pp.Suppress(int_) + real
# mo = pp.Group(
# pp.CaselessLiteral("Sym=").suppress()
# + sym_label.set_results_name("sym")
# + pp.CaselessLiteral("Ene=").suppress()
# + sci_real.set_results_name("ene")
# + pp.CaselessLiteral("Spin=").suppress()
# + spin.set_results_name("spin")
# + pp.CaselessLiteral("Occup=").suppress()
# + real.set_results_name("occup")
# + pp.Group(pp.OneOrMore(mo_coeff)).set_results_name("coeffs")
# )
# mos = pp.CaselessLiteral("[MO]") + pp.OneOrMore(mo).set_results_name("mos")
mos = pp.CaselessLiteral("[MO]") + pp.OneOrMore(
pp.Regex(r"[^\[]+") # Match everything until the next [ is encountered
).set_results_name("mos")
# Actual parser
parser = molden_format + pp.Each(
[
pp.Optional(expr)
for expr in (n_atoms, title, atoms, charges, gto, ang_mom_flags)
]
)
if with_mos:
parser += mos
res = parser.parse_string(text)
as_dict = res.asDict()
if with_mos:
mos_raw = as_dict["mos"][0]
by_mo = [bm.strip().split("\n") for bm in mos_raw.lower().strip().split("sym=")]
assert by_mo[0] == [""], by_mo[0]
by_mo = by_mo[1:]
as_dict["mos"] = [parse_mo(mo_lines) for mo_lines in by_mo]
return as_dict
[docs]
def get_xtb_nuc_charges(atoms, as_ecp_electrons=False):
"""Modified nuclear charges w/o core electrons.
Adapated from Multiwfn 3.8
"""
org_nuc_charges = nuc_charges_for_atoms(atoms)
ecp_electrons = np.zeros_like(org_nuc_charges, dtype=int)
for i, Z in enumerate(org_nuc_charges):
# H, He
if 1 <= Z <= 2:
ecpel = 0
# Li - Ne
elif 3 <= Z <= 10:
ecpel = 2
# Na - Ar
elif 11 <= Z <= 18:
ecpel = 10
# K - Cu
elif 19 <= Z <= 29:
ecpel = 18
# Zn - Kr
elif 30 <= Z <= 36:
ecpel = 28
# Rb - Ag
elif 37 <= Z <= 47:
ecpel = 36
# Cd - Xe
elif 48 <= Z <= 54:
ecpel = 46
# Cs - Ba
elif 55 <= Z <= 56:
ecpel = 54
# La - Lu, crazy
elif 57 <= Z <= 71:
ecpel = Z - 3 # Resulting charge is always 3
# Hf - Au
elif 72 <= Z <= 79:
ecpel = 68
# Hg - Rn
elif 80 <= Z <= 86:
ecpel = 78
ecp_electrons[i] = ecpel
if as_ecp_electrons:
result = ecp_electrons
else:
result = org_nuc_charges - ecp_electrons
return result
[docs]
def parse_molden_atoms(data):
atoms = list()
coords = list()
nuc_charges = list()
for atom in data["atoms"]:
atoms.append(atom["symbol"])
coords.extend(atom["xyz"])
Z = atom["atomic_number"]
nuc_charges.append(Z)
atoms = tuple(atoms)
coords = np.array(coords, dtype=float).flatten()
nuc_charges = np.array(nuc_charges, dtype=int)
return atoms, coords, nuc_charges
@file_or_str(".molden", ".input")
def shells_from_molden(text, **kwargs):
data = parse_molden(text, with_mos=False)
atoms, coords, _ = parse_molden_atoms(data)
atomic_numbers = [ATOMIC_NUMBERS[atom.lower()] for atom in atoms]
coords3d = coords.reshape(-1, 3)
_shells = list()
for atom_gtos, center, atomic_num in zip(data["gto"], coords3d, atomic_numbers):
center_ind = atom_gtos["number"] - 1
for shell in atom_gtos["shells"]:
L = shell["ang_mom"]
exps = shell["exponents"][0]
coeffs = shell["coeffs"][0]
shell = Shell(
L=L,
center=center,
coeffs=coeffs,
exps=exps,
center_ind=center_ind,
atomic_num=atomic_num,
)
_shells.append(shell)
shells = MoldenShells(_shells, **kwargs)
return shells
[docs]
def radial_integral(l, exponent):
"""
Integrates
(r r**l * exp(-exponent * r**2))**2 dr from r=0 to r=oo
as described in the SI of the JANPA paper [1] (see top of page 8,
second integral in the square root.
In my opinion, the integrals lacks a factor 'r'. Below, some sympy code
can be found to solve this integral (including 1*r).
import sympy as sym
r, z = sym.symbols("r z", positive=True)
l = sym.symbols("l", integer=True, positive=True)
sym.integrate((r * r**l * sym.exp(-z*r**2))**2, (r, 0, sym.oo))
The 'solved' integral on page 8 is correct again.
∞
⌠
⎮ 2
⎮ 2 2⋅l -2⋅r ⋅z
⎮ r ⋅r ⋅ℯ dr = (2*z)**(-l - 1/2)*gamma(l + 3/2)/(4*z)
⌡
0
"""
return (2 * exponent) ** (-l - 1 / 2) * gamma(l + 3 / 2) / (4 * exponent)
@file_or_str(".molden", ".input")
def shells_from_orca_molden(text, **kwargs):
molden_shells = shells_from_molden(text)
dividers = {
0: 1,
1: 1,
2: 3,
3: 15,
4: 35,
}
def fix_contr_coeffs(l, coeffs, exponents):
"""Fix contraction coefficients. Based on equations found in the SI
of the JANPA paper [1]."""
l = get_l(l)
divider = dividers[l]
rad_ints = radial_integral(l, exponents)
norms2 = rad_ints * 4 * np.pi / (2 * l + 1) / divider
norms = np.sqrt(norms2)
normed_coeffs = coeffs * norms
return normed_coeffs
_shells = list()
for shell in molden_shells.shells:
L, center, _, exps, *_ = shell.as_tuple()
fixed_coeffs = fix_contr_coeffs(L, shell.coeffs_org, exps)
fixed_shell = Shell(
L=L,
center=center,
coeffs=fixed_coeffs,
exps=exps,
center_ind=shell.center_ind,
atomic_num=shell.atomic_num,
)
_shells.append(fixed_shell)
shells = ORCAMoldenShells(_shells, **kwargs)
return shells
@file_or_str(".molden", ".input")
def wavefunction_from_molden(
text,
charge=None,
orca_contr_renorm=False,
xtb_nuc_charges: bool = False,
**kwargs,
):
"""Construct Wavefunction object from .molden file.
orca_contr_renorm fixes contraction coefficients, as required for
ORCA and XTB.
"""
kwargs = kwargs.copy()
shell_kwargs = kwargs.pop("shell_kwargs", {})
is_from = molden_is_from(text)
if is_from in ("orca", "xtb"):
orca_contr_renorm = True
elif is_from == "turbomole":
orca_contr_renorm = False
else:
raise Exception("Could not determine origin of molden file!")
data = parse_molden(text)
atoms, coords, nuc_charges = parse_molden_atoms(data)
nuc_charge = sum(nuc_charges)
# XTB only describes valence electrons. The missing electrons will be
# treated as ECP electrons.
if xtb_nuc_charges:
ecp_electrons = get_xtb_nuc_charges(atoms, as_ecp_electrons=True)
kwargs["ecp_electrons"] = ecp_electrons
nuc_charge = nuc_charge - sum(ecp_electrons)
spins = list()
occ_a = 0.0
occ_b = 0.0
occ_a_unpaired = 0.0
occ_b_unpaired = 0.0
Ca = list()
Cb = list()
for mo in data["mos"]:
spin = mo["spin"].lower()
occ = mo["occup"]
assert occ >= 0.0
is_unpaired = 0.0 < occ < 2.0
spins.append(spin)
coeffs = mo["coeffs"]
if spin == "alpha":
occ_a += occ
Ca.append(coeffs)
if is_unpaired:
occ_a_unpaired += occ
elif spin == "beta":
occ_b += occ
Cb.append(coeffs)
if is_unpaired:
occ_b_unpaired += occ
else:
raise Exception(f"Spin can only be 'alpha' or 'beta', but got '{spin}'!")
assert occ_a.is_integer
assert occ_b.is_integer
occ_a = int(occ_a)
occ_b = int(occ_b)
occ_a_unpaired = int(occ_a_unpaired)
occ_b_unpaired = int(occ_b_unpaired)
unpaired_electrons = occ_a_unpaired or occ_b_unpaired
unrestricted = set(spins) == {"Alpha", "Beta"} or unpaired_electrons
restricted = not unrestricted
# MOs must be in columns
Ca = np.array(Ca, dtype=float).T
Cb = np.array(Cb, dtype=float).T
# Restricted calculation
if restricted:
Cb = Ca.copy()
occ_a = occ_b = occ_a // 2
# The second case is hit w/ XTB molden files and restriced-open-shell calculatons
elif unrestricted and Cb.size == 0:
assert occ_a >= occ_b
common_electrons = (occ_a - occ_a_unpaired) // 2
Cb = Ca.copy()
occ_a -= common_electrons
occ_b = common_electrons
C = np.stack((Ca, Cb))
molden_charge = nuc_charge - (occ_a + occ_b)
if charge is None:
charge = molden_charge
# Multiplicity = (2S + 1), S = number of unpaired elecs. * 0.5
mult = (occ_a - occ_b) + 1
if orca_contr_renorm:
shells_func = shells_from_orca_molden
else:
shells_func = shells_from_molden
shells = shells_func(text, **shell_kwargs)
# Molden defaults to Cartesian bfs. Below, we pick the correct basis function type,
# depending on the number of present basis functions.
_, nao, _ = C.shape
bf_type = {
shells.cart_size: BFType.CARTESIAN,
shells.sph_size: BFType.PURE_SPHERICAL,
}[nao]
return Wavefunction(
atoms=atoms,
coords=coords,
charge=charge,
mult=mult,
unrestricted=unrestricted,
occ=(occ_a, occ_b),
C=C,
bf_type=bf_type,
shells=shells,
**kwargs,
)