Source code for pysisyphus.io.pdb

from collections import OrderedDict
import re
import struct
import textwrap

from jinja2 import Template
import numpy as np

from pysisyphus.elem_data import KNOWN_ATOMS
from pysisyphus.constants import ANG2BOHR
from pysisyphus.Geometry import Geometry
from pysisyphus.helpers_pure import chunks, file_or_str
from pysisyphus.intcoords.setup_fast import find_bonds


AMINOS = (
    "ALA CYS ASP GLU PHE GLY HIS ILE LYS LEU"
    "MET ASN PRO GLN ARG SER THR VAL TRP TYR".split()
)



[docs]
def get_parser(widths):
    fmt = " ".join("{}{}".format(width, "s") for width in widths)
    fieldstruct = struct.Struct(fmt)
    parse = lambda line: tuple(s.decode() for s in fieldstruct.unpack(line.encode()))
    return parse



STRIP_RE = re.compile(r"[\d\s]*")  # To remove numbers and whitespace
NAME_MAP = {
    "hb": "H",
    "he": "H",
    "hh": "H",
    "hd": "H",
    "hg": "H",
    "so": "Na",
}
FULL_NAME = {
    " sod",
    " cla",
    " cal",
}



[docs]
def parse_atom_name(name):
    org_name = name
    assert len(name) == 4
    name = name.lower()
    # Cases like " SOD" require special handling. Sticking to the PDB specification (!)
    # and using only the first two characters would result in S (sulphur).
    use_full_name = name in FULL_NAME
    if not use_full_name:
        name = name[:2]
    stripped = STRIP_RE.sub("", name).lower()
    if use_full_name:
        stripped = stripped[:2]
    try:
        mapped = NAME_MAP[stripped]
    except KeyError:
        assert stripped in KNOWN_ATOMS, f"Could not parse atom name '{org_name}'"
        mapped = stripped
    return mapped.capitalize()



@file_or_str(".pdb")
def parse_pdb(text):
    atm_lines = list()
    for line in text.split("\n"):
        if line.startswith("HETATM") or line.startswith("ATOM"):
            atm_lines.append(line.strip())
            continue
        elif line.startswith("MASTER"):
            master_line = line.strip()

    """
        0  Record name
        1  serial
        2  name
        3  altLoc
        4  ResName
        5  chainID
        6  resSeq
        7  iCode
        8  x
        9  y
        10 z
        11 occupancy
        12 tempFactor
        13 element
        (14 charge), not parsed

        See https://stackoverflow.com/questions/4914008
    """
    atm_widths = (6, 6, 4, 1, 4, 1, 4, 1, 11, 8, 8, 6, 6, 12)
    atm_parse = get_parser(atm_widths)

    """
        0  Record name
        1  Num. of REMARK
        2  "0"
        3  Num. of HET
        4  Num. of HELIX
        5  Num. of SHEET
        6  deprecated
        7  Num. of SITE
        8  numXform
        9  Num. of atomic coordinates (HETATM + ATOMS)
        10 Num. of CONECT
        11 Num. of SEQRES
    """
    master_widths = (6, 9, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5)
    master_parse = get_parser(master_widths)
    try:
        master_fields = master_parse(master_line)
    except UnboundLocalError:
        master_fields = None
        print(f"Warning! No MASTER line found in PDB!")

    atoms = list()
    coords = list()
    fragments = {}
    atom_map = dict()

    for i, line in enumerate(atm_lines):
        # Charge is not considered right now ...
        fields = atm_parse(f"{line[:78]: <78}")
        res_name = fields[4].strip()
        chain = fields[5].strip()
        res_seq = int(fields[6])
        frag = f"{chain}_{res_name}{res_seq}"

        xyz = np.array(fields[8:11], dtype=float)
        atom = fields[13].strip()
        if not atom.lower() in KNOWN_ATOMS:
            name = fields[2]
            atom = parse_atom_name(name)
        atoms.append(atom)
        coords.append(xyz)
        id_ = int(fields[1])
        atom_map[id_] = i

        frag = fragments.setdefault(frag, list())
        frag.append(i)

    # Verification using MASTER record, if present.
    if master_fields is not None:
        num_coord = int(master_fields[9])
        try:
            assert len(atoms) == num_coord
        except AssertionError as err:
            if len(atoms) < 99_999:  # See, e.g., 1HTQ
                raise err

    coords = np.array(coords, dtype=float) * ANG2BOHR
    return atoms, coords.flatten(), fragments, atom_map



[docs]
def geom_from_pdb(fn, **kwargs):
    atoms, coords, fragments, _ = parse_pdb(fn)
    kwargs["fragments"] = fragments
    geom = Geometry(atoms, coords.flatten(), **kwargs)
    return geom




[docs]
def get_conect_lines(atoms, coords):
    bonds = find_bonds(atoms, coords)
    # PDB indexing is 1-based
    bonds += 1
    # Bring in a suitable order for CONECT entries
    bonds.sort(axis=1)
    bonds = sorted(bonds, key=lambda row: row[0])
    conect = OrderedDict()
    for from_, to_ in bonds:
        conect.setdefault(from_, list()).append(to_)

    conect_lines = list()
    for from_, to_ in conect.items():
        to_iter = chunks(sorted(to_), 4)
        for to_chunk in to_iter:
            conect_lines.append([from_] + to_chunk)
    return conect_lines




[docs]
def atoms_coords_to_pdb_str(atoms, coords, fragments=None, resname="", conect=True):
    coords3d = coords.reshape(-1, 3)
    coords3d_ang = coords3d / ANG2BOHR

    coord_fmt = "{: >8.3f}" * 3
    #            serial  name  altLoc      chainID     iCode
    #                                resName     resSeq
    hetatm_fmt = "HETATM{: >5d} {: >4}{: >1}{: >3}{: >1} {: >4d}{: >1}   "
    #      xyz
    hetatm_fmt += coord_fmt
    #      occupancy tempFactor            atom
    hetatm_fmt += "{: >6.2f}{: >6.2f}" + 10 * " " + "{: >2s}"
    ter_fmt = "TER   {: >5d}      {: >3s} {:1s}{: >4d}"

    if fragments is None:
        fragments = [
            range(len(atoms)),
        ]

    # Fixed for now
    altLoc = ""
    resName = resname
    chainID = ""
    iCode = ""
    occupancy = 1.0
    tempFactor = 0.0

    lines = list()
    serial = 1
    for resSeq, fragment in enumerate(fragments, 1):
        for id_ in fragment:
            name = atoms[id_]
            xyz = coords3d_ang[id_]
            line = hetatm_fmt.format(
                serial,
                name,
                altLoc,
                resName,
                chainID,
                resSeq,
                iCode,
                *xyz,
                occupancy,
                tempFactor,
                name,
            )
            lines.append(line)
            serial += 1
        lines.append(ter_fmt.format(serial, resName, chainID, resSeq))

    def fmt_conect_line(conect_line):
        return "CONECT" + ("{: >5d}" * len(conect_line)).format(*conect_line)

    if conect:
        conect_lines = get_conect_lines(atoms, coords)
    else:
        conect_lines = []
    conect_str = "\n".join([fmt_conect_line(cl) for cl in conect_lines])

    pdb_tpl = Template(
        textwrap.dedent(
            """\
    REMARK 1 Created by pysisyphus
    {%+ for line in lines -%}
    {{ line }}
    {%+ endfor -%}
    {{- conect_str }}
    END"""
        )
    )

    pdb_str = pdb_tpl.render(lines=lines, conect_str=conect_str)
    return pdb_str




[docs]
def geom_to_pdb_str(geom, detect_fragments=False, **kwargs):
    fragments = None
    if detect_fragments:
        try:
            fragments = geom.internal.fragments
        except AttributeError:
            geom_ = geom.copy(coord_type="redund")
            fragments = geom_.internal.fragments
    pdb_str = atoms_coords_to_pdb_str(
        geom.atoms, geom.coords3d, fragments=fragments, **kwargs
    )
    return pdb_str