Source code for nfp.preprocessing.xtb_preprocessor

import json
from typing import TYPE_CHECKING, Dict, List, Optional

import networkx as nx
import numpy as np
from nfp.frameworks import tf
from nfp.preprocessing import features
from nfp.preprocessing.mol_preprocessor import MolPreprocessor, SmilesPreprocessor

if TYPE_CHECKING:  # only for type checking
    import rdkit.Chem


class xTBPreprocessor(MolPreprocessor):
    """Molecule preprocessor that attaches values read from a JSON file.

    Compared with ``MolPreprocessor`` this class

    * builds graph edges from the ``"Wiberg matrix"`` entry of the JSON file
      (entries strictly greater than ``cutoff``) instead of from RDKit bonds,
    * tokenizes bonds with ``features.bond_features_wbo``, and
    * adds three float32 outputs, ``atom_xtb``, ``bond_xtb`` and ``mol_xtb``,
      holding the per-atom, per-bond and per-molecule values selected by
      ``xtb_atom_features``, ``xtb_bond_features`` and ``xtb_mol_features``.

    NOTE(review): judging by the key names the JSON is presumably produced by
    an xTB calculation; its schema is not defined in this module -- confirm
    against the code that writes these files.
    """

    def __init__(
        self,
        *args,
        explicit_hs: bool = True,
        xtb_atom_features: Optional[List[str]] = None,
        xtb_bond_features: Optional[List[str]] = None,
        xtb_mol_features: Optional[List[str]] = None,
        cutoff: float = 0.3,
        **kwargs
    ):
        """Store the feature selections and forward the rest to the parent.

        Args:
            *args: Positional arguments forwarded to the parent ``__init__``.
            explicit_hs: Stored unchanged as ``self.explicit_hs``.
            xtb_atom_features: JSON keys read per atom; ``None`` selects the
                default list below.
            xtb_bond_features: JSON keys read per atom pair; ``None`` selects
                ``["Wiberg matrix", "bond_dist"]``.
            xtb_mol_features: JSON keys read once per molecule; ``None``
                selects the default list below.
            cutoff: An atom pair becomes an edge when its ``"Wiberg matrix"``
                entry is strictly greater than this value.
            **kwargs: Keyword arguments forwarded to the parent ``__init__``.
        """
        super().__init__(*args, **kwargs)
        self.explicit_hs = explicit_hs
        self.cutoff = cutoff

        # Only the bond featurizer is replaced: edges come from the Wiberg
        # matrix rather than from RDKit bond objects.
        self.bond_features = features.bond_features_wbo

        if xtb_atom_features is None:
            xtb_atom_features = [
                "mulliken charges",
                "cm5 charges",
                "FUKUI+",
                "FUKUI-",
                "FUKUIrad",
                "s proportion",
                "p proportion",
                "d proportion",
                "FOD",
                "FOD s proportion",
                "FOD p proportion",
                "FOD d proportion",
                "Dispersion coefficient C6",
                "Polarizability alpha",
            ]
        self.xtb_atom_features = xtb_atom_features

        if xtb_bond_features is None:
            xtb_bond_features = ["Wiberg matrix", "bond_dist"]
        self.xtb_bond_features = xtb_bond_features

        if xtb_mol_features is None:
            xtb_mol_features = [
                "total energy",
                "electronic energy",
                "HOMO",
                "LUMO",
            ]
        self.xtb_mol_features = xtb_mol_features

    def create_nx_graph(
        self, mol: "rdkit.Chem.Mol", jsonfile: str, **kwargs
    ) -> nx.DiGraph:
        """Build a directed graph of ``mol`` annotated with values from ``jsonfile``.

        Args:
            mol: Molecule whose atoms become the graph nodes. Atom indices are
                used directly to index the per-atom and per-pair JSON arrays.
                NOTE(review): the original comment mentions adding hydrogens,
                but nothing in this method adds them -- presumably the caller
                supplies a molecule that already matches the JSON arrays.
            jsonfile: Path of the JSON file holding the feature values.
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            ``nx.DiGraph`` built from an undirected graph whose graph
            attributes are ``mol`` and ``mol_xtb``, whose nodes carry ``atom``
            and ``atom_xtb``, and whose edges carry ``bondatoms`` and
            ``bond_xtb``.
        """
        with open(jsonfile, "r", encoding="utf-8") as f:
            json_data = json.load(f)

        g = nx.Graph(
            mol=mol,
            mol_xtb=[json_data[prop] for prop in self.xtb_mol_features],
        )

        for atom in mol.GetAtoms():
            idx = atom.GetIdx()
            g.add_node(
                idx,
                atom=atom,
                atom_xtb=[json_data[prop][idx] for prop in self.xtb_atom_features],
            )

        # Add edges based on Wiberg bond orders. ``argwhere`` walks the matrix
        # in row-major order, and ``tolist`` turns the indices into plain
        # Python ints for the graph keys and the RDKit lookups.
        wbo = np.array(json_data["Wiberg matrix"])
        for i, j in np.argwhere(wbo > self.cutoff).tolist():
            # The graph is undirected, so when both (i, j) and (j, i) exceed
            # the cutoff the attributes written last replace the earlier ones.
            g.add_edge(
                i,
                j,
                bondatoms=(mol.GetAtomWithIdx(i), mol.GetAtomWithIdx(j)),
                bond_xtb=[json_data[prop][i][j] for prop in self.xtb_bond_features],
            )

        return nx.DiGraph(g)

    def get_edge_features(
        self, edge_data: list, max_num_edges: int
    ) -> Dict[str, np.ndarray]:
        """Assemble the tokenized bond classes and the per-bond JSON values.

        Args:
            edge_data: Sequence of ``(start_atom, end_atom, bond_dict)``
                triples, where ``bond_dict`` holds the ``bondatoms`` and
                ``bond_xtb`` entries written by ``create_nx_graph``.
            max_num_edges: Number of rows allocated in both output arrays.

        Returns:
            ``{"bond": ..., "bond_xtb": ...}``. ``bond`` has shape
            ``(max_num_edges,)`` and dtype ``self.output_dtype``; ``bond_xtb``
            has shape ``(max_num_edges, len(self.xtb_bond_features))`` and
            dtype float32. Rows beyond ``len(edge_data)`` stay zero.
        """
        bond_feature_matrix = np.zeros(max_num_edges, dtype=self.output_dtype)
        bond_feature_matrix_xtb = np.zeros(
            (max_num_edges, len(self.xtb_bond_features)), dtype="float32"
        )

        for n, (start_atom, end_atom, bond_dict) in enumerate(edge_data):
            bond_feature_matrix[n] = self.bond_tokenizer(
                self.bond_features(start_atom, end_atom, bond_dict["bondatoms"])
            )
            bond_feature_matrix_xtb[n] = bond_dict["bond_xtb"]

        return {"bond": bond_feature_matrix, "bond_xtb": bond_feature_matrix_xtb}

    def get_node_features(
        self, node_data: list, max_num_nodes: int
    ) -> Dict[str, np.ndarray]:
        """Extend the parent's node features with the per-atom JSON values.

        Args:
            node_data: Iterable of ``(node_index, atom_dict)`` pairs, where
                ``atom_dict`` holds the ``atom_xtb`` entry written by
                ``create_nx_graph``.
            max_num_nodes: Number of rows allocated for ``atom_xtb``.

        Returns:
            The dictionary returned by the parent implementation plus
            ``atom_xtb``, a float32 array of shape
            ``(max_num_nodes, len(self.xtb_atom_features))``. Rows with no
            entry in ``node_data`` stay zero.
        """
        node_features = super().get_node_features(node_data, max_num_nodes)
        atom_xtb = np.zeros(
            [max_num_nodes, len(self.xtb_atom_features)], dtype="float32"
        )
        for n, atom_dict in node_data:
            atom_xtb[n] = atom_dict["atom_xtb"]
        node_features["atom_xtb"] = atom_xtb
        return node_features

    def get_graph_features(self, graph_data: dict) -> Dict[str, np.ndarray]:
        """Return the per-molecule JSON values as a float32 array.

        The array is cast to float32 so that it matches the dtype declared for
        ``mol_xtb`` in ``output_signature`` and the dtype of the ``atom_xtb``
        and ``bond_xtb`` arrays.

        Args:
            graph_data: Graph attributes; only ``graph_data["mol_xtb"]`` is read.

        Returns:
            ``{"mol_xtb": array}``.
        """
        return {"mol_xtb": np.asarray(graph_data["mol_xtb"], dtype="float32")}

    @property
    def output_signature(self) -> Dict[str, tf.TensorSpec]:
        """Parent signature plus float32 specs for the three added outputs."""
        output_signature = super().output_signature
        output_signature["atom_xtb"] = tf.TensorSpec(
            shape=(None, None), dtype="float32"
        )
        output_signature["bond_xtb"] = tf.TensorSpec(
            shape=(None, None), dtype="float32"
        )
        output_signature["mol_xtb"] = tf.TensorSpec(shape=(None,), dtype="float32")
        return output_signature

    @property
    def padding_values(self) -> Dict[str, tf.constant]:
        """Parent padding values plus float32 NaN for the three added outputs."""
        padding_values = super().padding_values
        for key in ("atom_xtb", "bond_xtb", "mol_xtb"):
            padding_values[key] = tf.constant(np.nan, dtype="float32")
        return padding_values

    @property
    def tfrecord_features(self) -> Dict[str, tf.io.FixedLenFeature]:
        """Parent TFRecord features plus entries for the three added outputs.

        An output whose signature has rank 0 is described as a float32
        feature; any other rank is described as ``tf.string``. All three
        added outputs are declared float32 in ``output_signature``, so the
        rank-0 dtype is float32 for each of them.
        """
        tfrecord_features = super().tfrecord_features
        signature = self.output_signature  # build the signature dict once
        for key in ("atom_xtb", "bond_xtb", "mol_xtb"):
            is_scalar = len(signature[key].shape) == 0
            tfrecord_features[key] = tf.io.FixedLenFeature(
                [], dtype="float32" if is_scalar else tf.string
            )
        return tfrecord_features
class xTBSmilesPreprocessor(SmilesPreprocessor, xTBPreprocessor):
    """Combine ``SmilesPreprocessor`` with ``xTBPreprocessor``.

    Defines no members of its own; all behavior is inherited from the two
    base classes, with ``SmilesPreprocessor`` first in the resolution order.
    """