Source code for rxn_insight.reaction

"""Reaction module"""

import hashlib
import warnings
from typing import Optional, Union

import numpy as np
import pandas as pd
from rdkit import Chem
from rxnmapper import RXNMapper
from tqdm import tqdm

from rxn_insight.classification import ReactionClassifier
from rxn_insight.utils import (
    atom_remover,
    curate_smirks,
    draw_chemical_reaction,
    get_catalyst_ranking,
    get_fp,
    get_reagent_ranking,
    get_ring_systems,
    get_scaffold,
    get_similarity,
    get_solvent_ranking,
    maccs_fp,
    morgan_fp,
    remove_atom_mapping,
    sanitize_ring,
)


[docs] class Reaction: """Handles operations related to chemical reactions. This class facilitates various operations on chemical reactions, such as parsing reaction strings, identifying components like solvents and reagents, classifying reactions, and analyzing ring structures. Attributes: reaction (str): The SMILES representation of the reaction. solvent (str): Solvents used in the reaction. reagent (str): Reagents used in the reaction. catalyst (str): Catalysts used in the reaction. reference (str): Reference or note associated with the reaction. smirks_db (pd.DataFrame): Database of SMIRKS transformations. fg_db (pd.DataFrame): Functional group data. classifier (ReactionClassifier): Reaction classification object. reactants (str): SMILES string of the reactants. products (str): SMILES string of the products. mapped_reaction (str): Reaction with atom mappings included. reaction_class (str): Class of the reaction. template (str): Reaction template derived from the classifier. reaction_info (dict): Additional information about the reaction. tag (str): Optional tag for the reaction. name (str): Optional name of the reaction. byproducts (tuple): Tuple of byproducts in the reaction. scaffold (str): Molecular scaffold of the reaction. neighbors (Any): Placeholder for reaction neighborhood information. suggested_solvent (str): Suggested solvent for the reaction. suggested_catalyst (str): Suggested catalyst for the reaction. suggested_reagent (str): Suggested reagent for the reaction. Example: >>> import rxn_insight as ri >>> rxn = ri.Reaction("OB(O)c1ccccc1.Brc1ccccc1>>c1ccc(-c2ccccc2)cc1") >>> ri = rxn.get_reaction_info() >>> print(ri) {'REACTION': 'Brc1ccccc1.OB(O)c1ccccc1>>c1ccc(-c2ccccc2)cc1', 'MAPPED_REACTION': 'Br[c:5]1[cH:6][cH:7][cH:8][cH:9][cH:10]1.OB(O)[c:4]1[cH:3][cH:2][cH:1][cH:12][cH:11]1>>[cH:1]1[cH:2][cH:3][c:4](-[c:5]2[cH:6][cH:7][cH:8][cH:9][cH:10]2)[cH:11][cH:12]1', 'N_REACTANTS': 2, 'N_PRODUCTS': 1, 'FG_REACTANTS': ['Aromatic halide', 'Boronic acid'], 'FG_PRODUCTS': [], 'PARTICIPATING_RINGS_REACTANTS': ['c1ccccc1', 'c1ccccc1'], 'PARTICIPATING_RINGS_PRODUCTS': ['c1ccccc1', 'c1ccccc1'], 'ALL_RINGS_PRODUCTS': ['c1ccccc1', 'c1ccccc1'], 'BY-PRODUCTS': ['HBr', 'B'], 'CLASS': 'C-C Coupling', 'TAG': 'd79a78c79f0c392f0911481acf5c300cc98205269acdb93c24fb610a61c4c868', 'SOLVENT': [''], 'REAGENT': [''], 'CATALYST': [''], 'REF': '', 'NAME': 'Suzuki coupling with boronic acids', 'SCAFFOLD': 'c1ccc(-c2ccccc2)cc1'} """ def __init__( self, reaction: str, solvent: str = "", reagent: str = "", catalyst: str = "", ref: str = "", rxn_mapper: Optional[RXNMapper] = None, keep_mapping: bool = False, smirks: pd.DataFrame = None, fg: pd.DataFrame = None, search_template: bool = True ): """Initializes a Reaction object with details of the reaction. Args: reaction (str): A string representing the reaction in SMILES format. solvent (str, optional): Solvent(s) used in the reaction. Defaults to an empty string. reagent (str, optional): Reagent(s) used in the reaction. Defaults to an empty string. catalyst (str, optional): Catalyst(s) used in the reaction. Defaults to an empty string. ref (str, optional): Reference or note associated with the reaction. Defaults to an empty string. rxn_mapper (RXNMapper, optional): Object for reaction mapping. Defaults to None. keep_mapping (bool, optional): Whether to retain atom mappings in the reaction. Defaults to False. smirks (pd.DataFrame, optional): DataFrame of SMIRKS transformations. Defaults to None. fg (pd.DataFrame, optional): DataFrame of functional groups data. Defaults to None. search_template (bool, optional): Whether to search for reaction templates. Defaults to True. """ self.reaction = "" self.solvent = solvent self.reagent = reagent self.catalyst = catalyst self.reference = ref self.read_reaction(reaction) if ":" in self.reaction and not keep_mapping: self.reaction = remove_atom_mapping( self.reaction ) # Remove atom mapping for consistency else: self.reaction = self.reaction self.smirks_db = smirks self.fg_db = fg self.classifier = ReactionClassifier( reaction, rxn_mapper=rxn_mapper, keep_mapping=keep_mapping, search_template=search_template ) self.add_agents() self.reactants, self.products = self.classifier.sanitized_reaction.split(">>") self.mapped_reaction = self.classifier.sanitized_mapped_reaction self.reaction_class = "" self.template = self.classifier.template self.reaction_info: dict[str, tuple[str, ...] | str] = dict() self.tag = "" self.name = "" self.byproducts: tuple[str, ...] = tuple() self.scaffold = self.get_scaffold() self.neighbors = None self.suggested_solvent = "" self.suggested_catalyst = "" self.suggested_reagent = ""
[docs] def read_reaction( self, reaction: str ) -> None: """Processes a reaction string in SMILES format. Args: reaction (str): Reaction string in SMILES format, with components separated by `>`. """ reaction_elements = reaction.split(">") self.reaction = f"{reaction_elements[0]}>>{reaction_elements[2]}" reagents = reaction_elements[1].split(".") if len(reagents) == 1 and reagents[0] == "": self.reagent = "" else: solvents = self.solvent.split(".") catalysts = self.catalyst.split(".") agents = [] for reagent in reagents: if reagent in solvents or reagent in catalysts: continue else: agents.append(reagent) self.reagent = ".".join(agents)
[docs] def add_agents(self) -> None: """Adds agents identified by the classifier to the reagent list.""" reagents = self.reagent.split(".") reagents += self.classifier.extra_agents self.reagent = ".".join(reagents)
[docs] def get_class(self) -> str: """Determines and returns the class of the reaction.""" self.reaction_class = self.classifier.classify_reaction() return self.reaction_class
[docs] def get_rings_in_products(self) -> list[str]: """Identifies and returns ring structures in the reaction products.""" return self.classifier.get_ring_type(self.classifier.mol_product)
[docs] def get_rings_in_reactants(self) -> list[str]: """Identifies and returns ring structures in the reaction reactants.""" return self.classifier.get_ring_type(self.classifier.mol_reactant)
[docs] def get_rings_in_reaction_center( self, ) -> tuple[list[str], ...]: """Identifies and returns rings in the reaction center for reactants and products.""" return tuple( [ self.classifier.get_ring_type( self.classifier.mol_reactant, self.classifier.reactant_map_dict ), self.classifier.get_ring_type( self.classifier.mol_product, self.classifier.product_map_dict ), ] )
[docs] def get_functional_groups(self) -> tuple[list[str], ...]: """Identifies and returns functional groups in reactants and products.""" if self.fg_db is None: from importlib import resources with resources.path( f"{__package__}.data", "functional_groups.json" ) as path: self.fg_db = pd.read_json(path, orient="records", lines=True) c = self.classifier return tuple( [ c.get_functional_groups( c.mol_reactant, c.reactant_map_dict, self.fg_db ), c.get_functional_groups(c.mol_product, c.product_map_dict, self.fg_db), ] )
[docs] def get_byproducts(self) -> list[str]: """Calculates and returns byproducts of the reaction based on functional group analysis.""" fg_r, fg_p = self.get_functional_groups() calculated_byproducts = self.classifier.balance_reaction(fg_r, fg_p) self.byproducts = calculated_byproducts return calculated_byproducts
[docs] def get_scaffold(self) -> Optional[str]: """Extracts and returns the molecular scaffold of the product.""" return get_scaffold(self.classifier.mol_product)
[docs] def get_name(self) -> str: """Determines and returns the name of the reaction based on SMIRKS data.""" if self.smirks_db is None: from importlib import resources with resources.path(f"{__package__}.data", "smirks.json") as path: self.smirks_db = curate_smirks( pd.read_json(path, orient="records", lines=True) ) self.name = self.classifier.name_reaction(self.smirks_db) return self.name
[docs] def get_reaction_info(self) -> dict[str, list[str] | str]: """This function compiles all reaction-related information at once. Upon calling this function, the T-matrix of the reaction will be calculated, a class and name will be assigned, the functional groups, rings, and scaffold of the reaction are determined. All information is returned as a dictionary.""" if self.fg_db is None: from importlib import resources with resources.path( f"{__package__}.data", "functional_groups.json" ) as path: self.fg_db = pd.read_json(path, orient="records", lines=True) info_dict = self.classifier.get_reaction_center_info(self.fg_db) self.tag = info_dict["TAG"] self.reaction_class = info_dict["CLASS"] try: info_dict["SOLVENT"] = self.solvent.split(".") except AttributeError: info_dict["SOLVENT"] = [] try: info_dict["REAGENT"] = self.reagent.split(".") except AttributeError: info_dict["REAGENT"] = [] try: info_dict["CATALYST"] = self.catalyst.split(".") except AttributeError: info_dict["CATALYST"] = [] try: info_dict["REF"] = self.reference except AttributeError: info_dict["REF"] = "" if self.name == "": info_dict["NAME"] = self.get_name() else: info_dict["NAME"] = self.name info_dict["SCAFFOLD"] = self.get_scaffold() self.name = info_dict["NAME"] self.scaffold = info_dict["SCAFFOLD"] self.reaction_info = info_dict return info_dict
[docs] def get_reaction_center(self) -> Optional[str]: """Returns the reaction center SMILES string if available.""" return self.classifier.template_smiles
[docs] def find_neighbors( self, df: pd.DataFrame, fp: str = "MACCS", concatenate: bool = True, max_return: int = 100, threshold: float = 0.3, broaden: bool = False, full_search: bool = False, ) -> pd.DataFrame: """Finds and returns similar reactions in the database. Args: df: The DataFrame to search within. fp: The type of fingerprint to use, 'MACCS' or 'Morgan'. concatenate: Whether to concatenate patterns in fingerprinting. max_return: Maximum number of similar reactions to return. threshold: The similarity threshold to consider for matching. broaden: Whether to use a broadened search criteria based on tags. full_search: If true, performs an exhaustive search across the database. Example: >>> from rxn_insight.reaction import Reaction >>> df_uspto = pd.read_parquet("uspto_rxn_insight.gzip") # Download: https://zenodo.org/records/10171745 >>> rxn = Reaction("OB(O)c1ccccc1.Brc1ccccc1>>c1ccc(-c2ccccc2)cc1") >>> df_neighbors = rxn.find_neighbors(df_uspto) """ self.get_reaction_info() if full_search: warnings.warn("Full database search is activated. This may take long.") df_tag = df.copy() elif broaden: tag = self.give_broad_tag() df_tag = df[df["TAG2"] == tag].copy() else: tag = self.tag df_tag = df[df["TAG"] == tag].copy() if len(df_tag.index) == 0: print("No similar reactions found...") return None fps = [] if fp.lower() == "maccs" and concatenate: if "rxn_str_patt_fp" in df_tag: fps = [ np.fromiter(fp, dtype=np.int64) for fp in tqdm( df_tag["rxn_str_patt_fp"].tolist(), desc="Loading fingerprints...", ) ] elif fp.lower() == "maccs" and not concatenate: if "rxn_dif_patt_fp" in df_tag: fps = [ np.fromiter(fp, dtype=np.int64) for fp in tqdm( df_tag["rxn_dif_patt_fp"].tolist(), desc="Loading fingerprints...", ) ] elif fp.lower() == "morgan" and concatenate: if "rxn_str_morgan_fp" in df_tag: fps = [ np.fromiter(fp, dtype=np.int64) for fp in tqdm( df_tag["rxn_str_morgan_fp"].tolist(), desc="Loading fingerprints...", ) ] elif fp.lower() == "morgan" and not concatenate: if "rxn_dif_morgan_fp" in df_tag: fps = [ np.fromiter(fp, dtype=np.int64) for fp in tqdm( df_tag["rxn_dif_morgan_fp"].tolist(), desc="Loading fingerprints...", ) ] else: raise KeyError( f"Fingerprint choice {fp} is not supported. Select either MACCS or Morgan." ) if len(fps) == 0: fps = [ get_fp(r, fp, concatenate) for r in tqdm( df_tag["REACTION"].tolist(), desc="Creating fingerprints..." ) ] rxnfp = get_fp(self.reaction, fp, concatenate) sims = [ get_similarity(rxnfp, fp) for fp in tqdm(fps, desc="Calculating Tanimoto similarity") ] df_tag["SIMILARITY"] = sims df_tag = df_tag.sort_values(by="SIMILARITY", ascending=False) df_tag["SOLVENT"].fillna("", inplace=True) df_tag["CATALYST"].fillna("", inplace=True) df_tag["REAGENT"].fillna("", inplace=True) max_similarity = df_tag["SIMILARITY"].max() df_tag = df_tag[df_tag["SIMILARITY"] > threshold].copy() print( f"Reaction found with similarity of {max_similarity:.3f}. This will be our best match." ) df_return = df_tag.iloc[:max_return].copy() if "rxn_str_patt_fp" in df_return.keys(): df_return = df_return.drop(columns=["rxn_str_patt_fp"]) if "rxn_dif_patt_fp" in df_return.keys(): df_return = df_return.drop(columns=["rxn_dif_patt_fp"]) if "rxn_str_morgan_fp" in df_return.keys(): df_return = df_return.drop(columns=["rxn_str_morgan_fp"]) if "rxn_dif_morgan_fp" in df_return.keys(): df_return = df_return.drop(columns=["rxn_dif_morgan_fp"]) if "TAG" in df_return.keys(): df_return = df_return.drop(columns=["TAG"]) if "TAG2" in df_return.keys(): df_return = df_return.drop(columns=["TAG2"]) self.neighbors = df_return return df_return
[docs] def give_broad_tag(self) -> str: """Generates a broadened tag for the reaction based on its characteristics.""" rxn_info = self.reaction_info tag = f"{rxn_info['CLASS']} " try: fg_r = sorted(list(rxn_info["FG_REACTANTS"])) except AttributeError: fg_r = "" try: fg_p = sorted(list(rxn_info["FG_PRODUCTS"])) except AttributeError: fg_p = "" tag += " ".join(fg_r) + " " tag += " ".join(fg_p) tag_bytes = tag.encode("UTF-8") hashtag = hashlib.sha256(tag_bytes).hexdigest() return str(hashtag)
[docs] def suggest_conditions(self, df: pd.DataFrame) -> dict[str, pd.DataFrame]: """Suggests reaction conditions based on similar reactions found. Args: df: The DataFrame containing reaction data to analyze. Example: >>> import rxn_insight as ri >>> df_uspto = pd.read_parquet("uspto_rxn_insight.gzip") # Download: https://zenodo.org/records/10171745 >>> rxn = ri.Reaction("OB(O)c1ccccc1.Brc1ccccc1>>c1ccc(-c2ccccc2)cc1") >>> df_conditions = rxn.suggest_conditions(df_uspto) """ if self.neighbors is None or len(self.neighbors.index) == 0: nbs = self.find_neighbors(df, max_return=5000, threshold=0.3, broaden=True) else: nbs = self.neighbors solvent_rank = get_solvent_ranking(nbs) solvent_rank = solvent_rank.copy().sort_values(by="COUNT", ascending=False) catalyst_rank = get_catalyst_ranking(nbs) catalyst_rank = catalyst_rank.copy().sort_values(by="COUNT", ascending=False) reagent_rank = get_reagent_ranking(nbs) reagent_rank = reagent_rank.copy().sort_values(by="COUNT", ascending=False) conditions_dict = { "Solvent": solvent_rank["NAME"][solvent_rank.index[0]], "Catalyst": catalyst_rank["NAME"][catalyst_rank.index[0]], "Reagent": reagent_rank["NAME"][reagent_rank.index[0]], } self.suggested_solvent = solvent_rank self.suggested_catalyst = catalyst_rank self.suggested_reagent = reagent_rank return conditions_dict
[docs] def draw(self, include_mapping: bool = False, filename: Union[str, None] = None) -> pd.DataFrame: try: from IPython.display import SVG, display display(SVG(draw_chemical_reaction(self.reaction))) except ImportError: print("This function requires IPython to be installed: pip install ipython") return None