Source code for aiida_aimall.workchains.calcfunctions

"""Calcfunctions used throughout workchains"""
# pylint: disable=c-extension-no-member
# pylint:disable=no-member
import io
from string import digits

import ase.io
from aiida.engine import calcfunction
from aiida.orm import Dict, SinglefileData, Str, StructureData
from rdkit import Chem
from rdkit.Chem import AllChem, rdmolops, rdqueries
from rdkit.Chem.MolKey.MolKey import BadMoleculeException
from subproptools.sub_reor import rotate_substituent_aiida


@calcfunction
[docs]def generate_rotated_structure_aiida(FolderData, atom_dict, cc_dict):
    """Rotates the fragment to the defined coordinate system

    Args:
        FolderData (aiida.orm.FolderData): aim calculation folder
        atom_dict: AIM atom dict
        cc_dict: AIM cc_dict

    Returns:
        Dict with keys 'atom_symbols' and 'geom' containing atomic symbols and the
            the rotated geometry.

    """
    return Dict(rotate_substituent_aiida(FolderData, atom_dict, cc_dict))


[docs]def remove_numcharss_from_strlist(in_list):
    """Remove digits from a list of strings. e.g. ['O1','H2','H3'] -> ['O','H','H']

    Args:
        in_list: input list to remove digits from

    Returns:
        output list with the numerical digits removed from each element

    Note:
        The intention for this list is to convert numered atomic symbols, e.g. from Gaussian
            to just symbols

    """
    remove_digits = str.maketrans("", "", digits)
    out_list = [i.translate(remove_digits) for i in in_list]
    return out_list


@calcfunction
[docs]def dict_to_structure(fragment_dict):
    """Generate a StructureData for Gaussian inputs

    Args:
        fragment_dict (aiida.orm.Dict): AiiDA orm.Dict with keys 'atom_symbols' and 'geom'

    Returns:
        aiida.orm.StructureData for the molecule

    Note:
        input can be generated, for example, by
            :func:`aiida_aimall.workchains.calcfunctions.generate_rotated_structure_aiida`

    """
    inp_dict = fragment_dict.get_dict()
    symbols = inp_dict["atom_symbols"]
    symbols = remove_numcharss_from_strlist(symbols)
    coords = inp_dict["geom"]
    # outstr is xyz file contents
    outstr = ""
    # numatoms then blank line, then coordinates and symbols for each atom
    outstr += f"{len(symbols)}\n\n"
    for i, symbol in enumerate(symbols):
        if i != len(symbols) - 1:
            outstr = (
                outstr
                + symbol
                + "   "
                + str(coords[i][0])
                + "   "
                + str(coords[i][1])
                + "   "
                + str(coords[i][2])
                + "\n"
            )
        else:
            outstr = (
                outstr
                + symbol
                + "   "
                + str(coords[i][0])
                + "   "
                + str(coords[i][1])
                + "   "
                + str(coords[i][2])
            )
    # create StructureData from .xyz file string
    f = io.StringIO(outstr)
    struct_data = StructureData(ase=ase.io.read(f, format="xyz"))
    f.close()
    return struct_data


[docs]def calc_multiplicity(mol):
    """Calculate the multiplicity of a molecule as 2S +1

    Loops over the atoms in the molecule and gets number of radical electrons,
    then converts that number to the multiplicity.

    Args:
        mol: rdkit.Chem molecule object

    Returns:
        integer number of multiplicity

    """
    num_radicals = 0
    for atom in mol.GetAtoms():
        num_radicals += atom.GetNumRadicalElectrons()
    multiplicity = num_radicals + 1
    return multiplicity


[docs]def find_attachment_atoms(mol):
    """Given molecule object, find the atoms corresponding to a * and the atom to which that is bound

    Args:
        mol: rdkit molecule object

    Returns:
        molecule with added hydrogens, the * atom object, and the atom object to which that is attached

    Note:
        Assumes that only one * is present in the molecule
    """
    # * has atomic number 0
    query = rdqueries.AtomNumEqualsQueryAtom(0)
    # add hydrogens now
    h_mol_rw = Chem.RWMol(mol)  # Change type of molecule object
    h_mol_rw = Chem.AddHs(h_mol_rw)
    query_ats = h_mol_rw.GetAtomsMatchingQuery(query)
    if len(query_ats) != 1:
        raise ValueError(
            f"Molecule should have one placeholder atom with atomic number 0, found {len(query_ats)}"
        )
    zero_at = query_ats[0]
    # this will be bonded to one atom - whichever atom in the bond is not *, is the one we are looking for
    bond = zero_at.GetBonds()[0]
    begin_atom = bond.GetBeginAtom()
    if begin_atom.GetSymbol() != "*":
        attached_atom = begin_atom
    else:
        attached_atom = bond.GetEndAtom()
    return h_mol_rw, zero_at, attached_atom


[docs]def reorder_molecule(h_mol_rw, zero_at, attached_atom):
    """Reindexes the atoms in a molecule, setting attached_atom to index 0, and zero_at to index 1

    Args:
        h_mol_rw: RWMol rdkit object with explicit hydrogens
        zero_at: the placeholder * atom
        attached_atom: the atom bonded to *

    Returns:
        molecule with reordered indices
    """
    zero_at_idx = zero_at.GetIdx()
    zero_at.SetAtomicNum(1)

    attached_atom_idx = attached_atom.GetIdx()
    # Initialize the new index so that our desired atoms are at the indices we want
    first_two_atoms = [attached_atom_idx, zero_at_idx]
    # Add the rest of the indices in original order
    remaining_idx = [
        atom.GetIdx()
        for atom in h_mol_rw.GetAtoms()
        if atom.GetIdx() not in first_two_atoms
    ]
    out_atom_order = first_two_atoms + remaining_idx
    reorder_mol = rdmolops.RenumberAtoms(h_mol_rw, out_atom_order)
    return reorder_mol


[docs]def get_xyz(reorder_mol):
    """MMFF optimize the molecule to generate xyz coordiantes

    Args:
        reorder_mol: rdkit.Chem molecule output, output of :func:`aiida_aimall.workchains.calcfunctions.reorder_molecule`

    Returns:
        string of the geometry block of an .xyz file

    """
    AllChem.EmbedMolecule(reorder_mol)
    # not_optimized will be 0 if done, 1 if more steps needed
    max_iters = 200
    for i in range(0, 6):
        not_optimized = AllChem.MMFFOptimizeMolecule(
            reorder_mol, maxIters=max_iters
        )  # Optimize with MMFF94
        # -1 is returned for molecules where there are no heavy atom-heavy atom bonds
        # for these, hopefully the embed geometry is good enough
        # 0 is returned on successful opt
        if not_optimized in [0, -1]:
            break
        if i == 5:
            return "Could not determine xyz coordinates"
        max_iters = max_iters + 200
    xyz_block = AllChem.rdmolfiles.MolToXYZBlock(
        reorder_mol
    )  # pylint:disable=no-member  # Store xyz coordinates
    split_xyz_block = xyz_block.split("\n")
    # first two lines are: number of atoms and blank. Last line is blank
    xyz_lines = split_xyz_block[2 : len(split_xyz_block) - 1]
    xyz_string = "\n".join([str(item) for item in xyz_lines])
    return xyz_string


@calcfunction
[docs]def get_substituent_input(smiles: str) -> dict:
    """For a given smiles, determine xyz structure, charge, and multiplicity

    Args:
        smiles (str): SMILEs of substituent to run

    Returns:
        Dict with keys xyz, charge, multiplicity

    Raise:
        ValueError: if molecule cannot be built from SMILES

    """
    mol = Chem.MolFromSmiles(smiles.value)
    # If the mol could not be built, mol will be None
    if not mol:
        raise ValueError(
            f"Molecule could not be constructed for substituent input SMILES {smiles.value}"
        )
    # add hydrogens to the molecule, and find the atom to put at origin and the atom attached to it
    h_mol_rw, zero_at, attached_atom = find_attachment_atoms(mol)
    # Set zero_at to be the first atom, and attached atom as the second, by number
    reorder_mol = reorder_molecule(h_mol_rw, zero_at, attached_atom)
    xyz_string = get_xyz(reorder_mol)
    if xyz_string == "Could not determine xyz coordinates":
        raise BadMoleculeException(
            "Maximum iterations exceeded, could not determine xyz coordinates for f{smiles.value}"
        )
    # get store charge, multiplicity and geometry
    reorder_mol.UpdatePropertyCache()
    charge = Chem.GetFormalCharge(h_mol_rw)
    multiplicity = calc_multiplicity(h_mol_rw)
    out_dict = Dict({"xyz": xyz_string, "charge": charge, "multiplicity": multiplicity})
    return out_dict


@calcfunction
[docs]def generate_structure_data(smiles_dict):
    """Take an input xyz string and convert it to StructureData

    Args:
        smiles_dict: output of :func:`aiida_aimall.workchains.calcfunctions.get_substituent_input`

    Returns:
        StructureData of the molecule

    """
    structure_Str = smiles_dict["xyz"]
    structure_str = structure_Str
    # Use the geometry string and create the xyz string for a full .xyz file
    num_atoms = len(structure_str.split("\n"))
    xyz_string = f"{num_atoms}\n\n" + structure_str
    # Convert string to StructureData by encoding it as a file
    f = io.StringIO(xyz_string)
    struct_data = StructureData(ase=ase.io.read(f, format="xyz"))
    f.close()
    return struct_data


@calcfunction
[docs]def parameters_with_cm(parameters, smiles_dict):
    """Add charge and multiplicity keys to Gaussian Input

    Args:
        parameters: dictionary to be provided to GaussianCalculation
        smiles_dict: `aiida_aimall.workchains.calcfunctions.get_substituent_input`

    Returns:
        Dict of Gaussian parameters updated with charge and multiplicity

    """
    parameters_dict = parameters.get_dict()
    smiles_dict_dict = smiles_dict.get_dict()
    parameters_dict["charge"] = smiles_dict_dict["charge"]
    parameters_dict["multiplicity"] = smiles_dict_dict["multiplicity"]
    return Dict(parameters_dict)


@calcfunction
[docs]def get_wfxname_from_gaussianinputs(gaussian_parameters):
    """Find the .wfx filename from gaussian_parameters

    Check if input parameters was provided to gaussian_parameters, and if so, look for
    .wfx file names supplied. If it was, return the first .wfx filename found

    Args:
        gaussian_parameters: input dictionary to be provided to GaussianCalculation

    Returns:
        Str of .wfx filename

    """
    gaussian_dict = gaussian_parameters.get_dict()
    if "input_parameters" not in gaussian_dict:
        return Str("")
    object_names = list(gaussian_dict["input_parameters"].keys())
    wfx_files = [x for x in object_names if "wfx" in x]
    if len(wfx_files) >= 1:
        return Str(wfx_files[0])
    if len(wfx_files) == 0:
        wfn_files = [x for x in object_names if "wfn" in x]
        if len(wfn_files) >= 1:
            return Str(wfn_files[0])
        if len(wfn_files) == 0:
            return Str("")
    # should not get here
    return Str("")


@calcfunction
[docs]def create_wfx_from_retrieved(wfxname, retrieved_folder):
    """Create wavefunction SinglefileData from retrieved folder

    Args:
        wfxname: Str of the name of a .wfx file to get from the retrieved folder
        retrieved_folder: FolderData of a completed GaussianCalculation

    Returns:
        SinglefileData of the .wfx file to find in the FolderData

    """
    wfx_file_string = retrieved_folder.get_object_content(wfxname.value.strip())
    return SinglefileData(io.BytesIO(wfx_file_string.encode()))


[docs]def validate_shell_code(node, _):
    """Validate the shell code, ensuring that it is ShellCode or Str

    Args:
        node: input node to check the type for ShellCode or Str

    Returns:
        None if the type is ShellCode or Str, or error string if node is not

    """
    if node.node_type not in [
        "data.core.code.installed.shell.ShellCode.",
        "data.core.str.Str.",
    ]:
        return "the `shell_code` input must be either ShellCode or Str of the command."
    return None


[docs]def validate_parser(node, _):
    """Validate the parser, ensuring that the provided value is one of the accepted values.

    Args:
        node: input node to check the type for ShellCode or Str

    Returns:
        None if the value is aimall.base or aimall.group, or an error string if it is not
    """

    if node.value not in ["aimall.base", "aimall.group"]:
        return "the `aim_parser` input must be either aimall.base or aimall.group"
    return None


[docs]def validate_file_ext(node, _):
    """Validates that the file extension provided for AIM is wfx, wfn or fchk

    Args:
        node: node to check the value of to ensure it is in a supported format

    Returns:
        None if the type is ShellCode or Str, or error string if node is not

    """
    if node.value not in ["wfx", "wfn", "fchk"]:
        return "the `aim_file_ext` input must be a valid file format for AIMQB: wfx, wfn, or fchk"
    return None


@calcfunction
[docs]def get_molecule_str_from_smiles(smiles):
    """For a given smiles, determine xyz structure, charge, and multiplicity

    Args:
        smiles: SMILEs of substituent to run

    Returns:
        Dict with keys xyz, charge, multiplicity

    """
    mol = Chem.MolFromSmiles(smiles.value)
    if not mol:
        raise ValueError(
            f"Molecule could not be constructed for substituent input SMILES {smiles.value}"
        )
    h_mol_rw = Chem.RWMol(mol)  # Change type of molecule object
    h_mol_rw = Chem.AddHs(h_mol_rw)
    xyz_string = get_xyz(h_mol_rw)
    if xyz_string == "Could not determine xyz coordinates":
        raise BadMoleculeException(
            "Maximum iterations exceeded, could not determine xyz coordinates for f{smiles.value}"
        )
    h_mol_rw.UpdatePropertyCache()
    charge = Chem.GetFormalCharge(h_mol_rw)
    multiplicity = calc_multiplicity(h_mol_rw)
    out_dict = Dict({"xyz": xyz_string, "charge": charge, "multiplicity": multiplicity})
    return out_dict


@calcfunction
[docs]def xyzfile_to_StructureData(xyz_SFD):
    """Convert the xyz file provided as SinglefileData to StructureData"""
    with xyz_SFD.as_path() as filepath:
        return StructureData(ase=ase.io.read(filepath, format="xyz"))