import re
import numpy as np
import pyparsing as pp
from pysisyphus.elem_data import ATOMIC_NUMBERS
from pysisyphus.Geometry import Geometry
from pysisyphus.io.xyz import parse_xyz
from pysisyphus.helpers_pure import file_or_str
from pysisyphus.wavefunction import MoldenShells, Shell, Wavefunction
from pysisyphus.wavefunction.helpers import BFType
@file_or_str(".molden", ".input")
def parse_molden_atoms_coords(molden_str):
_, geometries = re.split(r"\[GEOMETRIES\] \(XYZ\)", molden_str)
atoms_coords, comments = parse_xyz(geometries, with_comment=True)
return atoms_coords, comments
[docs]
def geoms_from_molden(fn, **kwargs):
atoms_coords, comments = parse_molden_atoms_coords(fn)
geoms = [
Geometry(atoms, coords.flatten(), comment=comment, **kwargs)
for (atoms, coords), comment in zip(atoms_coords, comments)
]
return geoms
[docs]
def parse_mo(mo_lines):
# sym, ene_key, energy, spin_key, spin, occup_key, occup, *rest = mo_lines
sym, ene_line, spin_line, occup_line, *rest = mo_lines
ene_key, energy = ene_line.split("=")
spin_key, spin = spin_line.split("=")
occup_key, occup = occup_line.split("=")
assert (
(ene_key.strip() == "ene")
and (spin_key.strip() == "spin")
and (occup_key.strip() == "occup")
)
prev_ind = 0
coeffs = list()
for line in rest:
ind, coeff = line.split()
ind = int(ind)
assert ind == prev_ind + 1
prev_ind = ind
coeffs.append(coeff)
return {
"sym": sym.strip(),
"ene": float(energy),
"spin": spin.strip(),
"occup": float(occup),
"coeffs": np.array(coeffs),
}
@file_or_str(".molden", ".input")
def parse_molden(text, with_mos=True):
int_ = pp.common.integer
real = pp.common.real
sci_real = pp.common.sci_real
def get_section(name):
return pp.CaselessLiteral(f"[{name}]")
molden_format = get_section("Molden Format")
n_atoms = get_section("N_Atoms") + int_
atoms_header = get_section("Atoms")
title = get_section("Title") + pp.Optional(
pp.ZeroOrMore(~atoms_header + pp.Word(pp.printables))
)
alpha_re = re.compile("([a-zA-Z]+)")
def drop_number(toks):
"""OpenMolcas writes atoms like H1, C2, ...
We don't want the number."""
return alpha_re.match(toks[0]).group(1)
# [Atoms]
# element_name number atomic_number x y z
unit = pp.one_of(("AU Angs (AU) (Angs)"), caseless=True)
atom_symbol = pp.Word(pp.alphanums).set_parse_action(drop_number)
xyz = pp.Group(sci_real + sci_real + sci_real)
atom_line = pp.Group(
atom_symbol.set_results_name("symbol")
+ int_.set_results_name("number")
+ int_.set_results_name("atomic_number")
+ xyz.set_results_name("xyz")
)
atoms = (
atoms_header
+ pp.Optional(unit).set_results_name("unit")
+ pp.OneOrMore(atom_line).set_results_name("atoms")
)
# [Charge], written by OpenMOLCAS
charge = get_section("Charge")
charge_kind = pp.one_of(("(Mulliken)",), caseless=True)
charges = charge + charge_kind + pp.OneOrMore(sci_real)
def pgto_exps_coeffs(s, loc, toks):
data = toks.asList()
exps, coeffs = np.array(data, dtype=float).reshape(-1, 2).T
pr = pp.ParseResults.from_dict(
{
"exponents": exps,
"coeffs": coeffs,
}
)
# Somehow this function will result in exponents and coeffs being both
# stored in lists of length 1.
return pr
# [GTO]
ang_mom = pp.one_of("s p sp d f g h i", caseless=True)
pgto = sci_real + sci_real
shell = pp.Group(
ang_mom.set_results_name("ang_mom")
+ int_.set_results_name("contr_depth")
# Usually 1.0, can be left out
+ pp.Optional(real.set_whitespace_chars(" ").suppress() + pp.LineEnd())
+ pp.OneOrMore(pgto).set_parse_action(pgto_exps_coeffs)
)
atom_gtos = pp.Group(
int_.set_results_name("number") # Center
+ pp.Optional(pp.Literal("0"))
+ pp.OneOrMore(shell).set_results_name("shells")
)
gto = (
pp.CaselessLiteral("[GTO]")
+ pp.Optional(unit)
+ pp.Group(pp.OneOrMore(atom_gtos)).set_results_name("gto")
)
# [5D] [7F] [9G] etc.
ang_mom_flags = pp.ZeroOrMore(pp.one_of("[5D] [7F] [9G]", caseless=True))
# Pyparsing code to parse MOs ... slow. Regex-based code is much faster.
# [MO]
# sym_label = pp.Word(pp.printables)
# spin = pp.one_of("Alpha Beta", caseless=True)
# mo_coeff = pp.Suppress(int_) + real
# mo = pp.Group(
# pp.CaselessLiteral("Sym=").suppress()
# + sym_label.set_results_name("sym")
# + pp.CaselessLiteral("Ene=").suppress()
# + sci_real.set_results_name("ene")
# + pp.CaselessLiteral("Spin=").suppress()
# + spin.set_results_name("spin")
# + pp.CaselessLiteral("Occup=").suppress()
# + real.set_results_name("occup")
# + pp.Group(pp.OneOrMore(mo_coeff)).set_results_name("coeffs")
# )
# mos = pp.CaselessLiteral("[MO]") + pp.OneOrMore(mo).set_results_name("mos")
mos = pp.CaselessLiteral("[MO]") + pp.OneOrMore(
pp.Regex(r"[^\[]+") # Match everything until the next [ is encountered
).set_results_name("mos")
# Actual parser
parser = molden_format + pp.Each(
[
pp.Optional(expr)
for expr in (n_atoms, title, atoms, charges, gto, ang_mom_flags)
]
)
if with_mos:
parser += mos
res = parser.parse_string(text)
as_dict = res.asDict()
if with_mos:
mos_raw = as_dict["mos"][0]
by_mo = [bm.strip().split("\n") for bm in mos_raw.lower().strip().split("sym=")]
assert by_mo[0] == [""], by_mo[0]
by_mo = by_mo[1:]
as_dict["mos"] = [parse_mo(mo_lines) for mo_lines in by_mo]
return as_dict
[docs]
def parse_molden_atoms(data):
atoms = list()
coords = list()
nuc_charges = list()
for atom in data["atoms"]:
atoms.append(atom["symbol"])
coords.extend(atom["xyz"])
Z = atom["atomic_number"]
nuc_charges.append(Z)
atoms = tuple(atoms)
coords = np.array(coords, dtype=float).flatten()
nuc_charges = np.array(nuc_charges, dtype=int)
return atoms, coords, nuc_charges
@file_or_str(".molden", ".input")
def shells_from_molden(text):
data = parse_molden(text, with_mos=False)
atoms, coords, _ = parse_molden_atoms(data)
atomic_numbers = [ATOMIC_NUMBERS[atom.lower()] for atom in atoms]
coords3d = coords.reshape(-1, 3)
_shells = list()
for atom_gtos, center, atomic_num in zip(data["gto"], coords3d, atomic_numbers):
center_ind = atom_gtos["number"] - 1
for shell in atom_gtos["shells"]:
L = shell["ang_mom"]
exps = shell["exponents"][0]
coeffs = shell["coeffs"][0]
shell = Shell(
L=L,
center=center,
coeffs=coeffs,
exps=exps,
center_ind=center_ind,
atomic_num=atomic_num,
)
_shells.append(shell)
shells = MoldenShells(_shells)
return shells
@file_or_str(".molden", ".input")
def wavefunction_from_molden(text, charge=None, shells_func=None, **wf_kwargs):
"""Construct Wavefunction object from .molden file.
shells_func is used for ORCA, to modify the contraction coefficients.
"""
data = parse_molden(text)
atoms, coords, nuc_charges = parse_molden_atoms(data)
nuc_charge = sum(nuc_charges)
spins = list()
occ_a = 0.0
occ_b = 0.0
Ca = list()
Cb = list()
for mo in data["mos"]:
spin = mo["spin"].lower()
occ = mo["occup"]
spins.append(spin)
coeffs = mo["coeffs"]
if spin == "alpha":
occ_a += occ
Ca.append(coeffs)
elif spin == "beta":
occ_b += occ
Cb.append(coeffs)
else:
raise Exception(f"Spin can only be 'alpha' or 'beta', but got '{spin}'!")
assert occ_a.is_integer
assert occ_b.is_integer
occ_a = int(occ_a)
occ_b = int(occ_b)
# MOs must be in columns
Ca = np.array(Ca, dtype=float).T
Cb = np.array(Cb, dtype=float).T
# Restricted calculation
if Cb.size == 0:
Cb = Ca.copy()
occ_a = occ_b = occ_a // 2
C = np.stack((Ca, Cb))
molden_charge = nuc_charge - (occ_a + occ_b)
if charge is None:
charge = molden_charge
# Multiplicity = (2S + 1), S = number of unpaired elecs. * 0.5
mult = (occ_a - occ_b) + 1
unrestricted = set(spins) == {"Alpha", "Beta"}
if shells_func is None:
shells_func = shells_from_molden
shells = shells_func(text)
return Wavefunction(
atoms=atoms,
coords=coords,
charge=charge,
mult=mult,
unrestricted=unrestricted,
occ=(occ_a, occ_b),
C=C,
bf_type=BFType.PURE_SPHERICAL,
shells=shells,
**wf_kwargs,
)