Source code for rxn_insight.reaction

"""Reaction module"""

import hashlib
import warnings
from typing import Optional, Union

import numpy as np
import pandas as pd
from rdkit import Chem
from rxnmapper import RXNMapper
from tqdm import tqdm

from rxn_insight.classification import ReactionClassifier
from rxn_insight.utils import (
    atom_remover,
    curate_smirks,
    draw_chemical_reaction,
    get_catalyst_ranking,
    get_fp,
    get_reagent_ranking,
    get_ring_systems,
    get_scaffold,
    get_similarity,
    get_solvent_ranking,
    maccs_fp,
    morgan_fp,
    remove_atom_mapping,
    sanitize_ring,
)



[docs]
class Reaction:

    """Handles operations related to chemical reactions.

    This class facilitates various operations on chemical reactions, such as
    parsing reaction strings, identifying components like solvents and reagents,
    classifying reactions, and analyzing ring structures.

    Attributes:
        reaction (str): The SMILES representation of the reaction.
        solvent (str): Solvents used in the reaction.
        reagent (str): Reagents used in the reaction.
        catalyst (str): Catalysts used in the reaction.
        reference (str): Reference or note associated with the reaction.
        smirks_db (pd.DataFrame): Database of SMIRKS transformations.
        fg_db (pd.DataFrame): Functional group data.
        classifier (ReactionClassifier): Reaction classification object.
        reactants (str): SMILES string of the reactants.
        products (str): SMILES string of the products.
        mapped_reaction (str): Reaction with atom mappings included.
        reaction_class (str): Class of the reaction.
        template (str): Reaction template derived from the classifier.
        reaction_info (dict): Additional information about the reaction.
        tag (str): Optional tag for the reaction.
        name (str): Optional name of the reaction.
        byproducts (tuple): Tuple of byproducts in the reaction.
        scaffold (str): Molecular scaffold of the reaction.
        neighbors (Any): Placeholder for reaction neighborhood information.
        suggested_solvent (str): Suggested solvent for the reaction.
        suggested_catalyst (str): Suggested catalyst for the reaction.
        suggested_reagent (str): Suggested reagent for the reaction.

    Example:
        >>> import rxn_insight as ri
        >>> rxn = ri.Reaction("OB(O)c1ccccc1.Brc1ccccc1>>c1ccc(-c2ccccc2)cc1")
        >>> ri = rxn.get_reaction_info()
        >>> print(ri)
        {'REACTION': 'Brc1ccccc1.OB(O)c1ccccc1>>c1ccc(-c2ccccc2)cc1',
        'MAPPED_REACTION': 'Br[c:5]1[cH:6][cH:7][cH:8][cH:9][cH:10]1.OB(O)[c:4]1[cH:3][cH:2][cH:1][cH:12][cH:11]1>>[cH:1]1[cH:2][cH:3][c:4](-[c:5]2[cH:6][cH:7][cH:8][cH:9][cH:10]2)[cH:11][cH:12]1',
        'N_REACTANTS': 2, 'N_PRODUCTS': 1, 'FG_REACTANTS': ['Aromatic halide', 'Boronic acid'], 'FG_PRODUCTS': [],
        'PARTICIPATING_RINGS_REACTANTS': ['c1ccccc1', 'c1ccccc1'], 'PARTICIPATING_RINGS_PRODUCTS': ['c1ccccc1', 'c1ccccc1'],
        'ALL_RINGS_PRODUCTS': ['c1ccccc1', 'c1ccccc1'], 'BY-PRODUCTS': ['HBr', 'B'], 'CLASS': 'C-C Coupling',
        'TAG': 'd79a78c79f0c392f0911481acf5c300cc98205269acdb93c24fb610a61c4c868', 'SOLVENT': [''], 'REAGENT': [''],
        'CATALYST': [''], 'REF': '', 'NAME': 'Suzuki coupling with boronic acids', 'SCAFFOLD': 'c1ccc(-c2ccccc2)cc1'}

    """

    def __init__(
        self,
        reaction: str,
        solvent: str = "",
        reagent: str = "",
        catalyst: str = "",
        ref: str = "",
        rxn_mapper: Optional[RXNMapper] = None,
        keep_mapping: bool = False,
        smirks: Optional[pd.DataFrame] = None,
        fg: Optional[pd.DataFrame] = None,
        search_template: bool = True,
    ) -> None:
        """Initializes a Reaction object with details of the reaction.

        Args:
            reaction (str): A string representing the reaction in SMILES format.
            solvent (str, optional): Solvent(s) used in the reaction. Defaults to an empty string.
            reagent (str, optional): Reagent(s) used in the reaction. Defaults to an empty string.
            catalyst (str, optional): Catalyst(s) used in the reaction. Defaults to an empty string.
            ref (str, optional): Reference or note associated with the reaction. Defaults to an empty string.
            rxn_mapper (RXNMapper, optional): Object for reaction mapping. Defaults to None.
            keep_mapping (bool, optional): Whether to retain atom mappings in the reaction. Defaults to False.
            smirks (pd.DataFrame, optional): DataFrame of SMIRKS transformations. Defaults to None,
                in which case `get_name` loads the packaged data on first use.
            fg (pd.DataFrame, optional): DataFrame of functional groups data. Defaults to None,
                in which case the packaged data is loaded on first use.
            search_template (bool, optional): Whether to search for reaction templates. Defaults to True.
        """
        self.reaction = ""
        self.solvent = solvent
        self.reagent = reagent
        self.catalyst = catalyst
        self.reference = ref

        # read_reaction sets self.reaction ("reactants>>products") and updates
        # self.reagent from the agents section of the input string, so it must
        # run after solvent/catalyst/reagent have been stored.
        self.read_reaction(reaction)
        if ":" in self.reaction and not keep_mapping:
            # Remove atom mapping for consistency
            self.reaction = remove_atom_mapping(self.reaction)

        self.smirks_db = smirks
        self.fg_db = fg

        # The classifier receives the raw input string (not self.reaction) and
        # applies `keep_mapping` itself.
        self.classifier = ReactionClassifier(
            reaction,
            rxn_mapper=rxn_mapper,
            keep_mapping=keep_mapping,
            search_template=search_template,
        )
        self.add_agents()
        self.reactants, self.products = self.classifier.sanitized_reaction.split(">>")
        self.mapped_reaction = self.classifier.sanitized_mapped_reaction
        self.reaction_class = ""
        self.template = self.classifier.template
        self.reaction_info: dict[str, tuple[str, ...] | str] = dict()
        self.tag = ""
        self.name = ""
        self.byproducts: tuple[str, ...] = tuple()
        self.scaffold = self.get_scaffold()
        # Filled by find_neighbors; None means "not searched yet".
        self.neighbors: Optional[pd.DataFrame] = None
        # Empty until suggest_conditions stores its ranking tables here.
        self.suggested_solvent = ""
        self.suggested_catalyst = ""
        self.suggested_reagent = ""


[docs]
    def read_reaction(
            self,
            reaction: str
    ) -> None:

        """Processes a reaction string in SMILES format.

        Args:
            reaction (str): Reaction string in SMILES format, with components separated by `>`.
        """

        reaction_elements = reaction.split(">")
        self.reaction = f"{reaction_elements[0]}>>{reaction_elements[2]}"
        reagents = reaction_elements[1].split(".")
        if len(reagents) == 1 and reagents[0] == "":
            self.reagent = ""
        else:
            solvents = self.solvent.split(".")
            catalysts = self.catalyst.split(".")
            agents = []
            for reagent in reagents:
                if reagent in solvents or reagent in catalysts:
                    continue
                else:
                    agents.append(reagent)
            self.reagent = ".".join(agents)



[docs]
    def add_agents(self) -> None:
        """Adds agents identified by the classifier to the reagent list."""
        reagents = self.reagent.split(".")
        reagents += self.classifier.extra_agents
        self.reagent = ".".join(reagents)



[docs]
    def get_class(self) -> str:
        """Determines and returns the class of the reaction."""
        self.reaction_class = self.classifier.classify_reaction()
        return self.reaction_class



[docs]
    def get_rings_in_products(self) -> list[str]:
        """Identifies and returns ring structures in the reaction products."""
        return self.classifier.get_ring_type(self.classifier.mol_product)



[docs]
    def get_rings_in_reactants(self) -> list[str]:
        """Identifies and returns ring structures in the reaction reactants."""
        return self.classifier.get_ring_type(self.classifier.mol_reactant)



[docs]
    def get_rings_in_reaction_center(
        self,
    ) -> tuple[list[str], ...]:
        """Identifies and returns rings in the reaction center for reactants and products."""
        return tuple(
            [
                self.classifier.get_ring_type(
                    self.classifier.mol_reactant, self.classifier.reactant_map_dict
                ),
                self.classifier.get_ring_type(
                    self.classifier.mol_product, self.classifier.product_map_dict
                ),
            ]
        )



[docs]
    def get_functional_groups(self) -> tuple[list[str], ...]:
        """Identifies and returns functional groups in reactants and products."""
        if self.fg_db is None:
            from importlib import resources

            with resources.path(
                f"{__package__}.data", "functional_groups.json"
            ) as path:
                self.fg_db = pd.read_json(path, orient="records", lines=True)
        c = self.classifier
        return tuple(
            [
                c.get_functional_groups(
                    c.mol_reactant, c.reactant_map_dict, self.fg_db
                ),
                c.get_functional_groups(c.mol_product, c.product_map_dict, self.fg_db),
            ]
        )



[docs]
    def get_byproducts(self) -> list[str]:
        """Calculates and returns byproducts of the reaction based on functional group analysis."""
        fg_r, fg_p = self.get_functional_groups()
        calculated_byproducts = self.classifier.balance_reaction(fg_r, fg_p)
        self.byproducts = calculated_byproducts
        return calculated_byproducts



[docs]
    def get_scaffold(self) -> Optional[str]:
        """Returns the scaffold of the product molecule.

        Passes ``self.classifier.mol_product`` to the module-level
        ``get_scaffold`` helper and returns its result unchanged.
        """
        product_mol = self.classifier.mol_product
        return get_scaffold(product_mol)



[docs]
    def get_name(self) -> str:
        """Determines and returns the name of the reaction based on SMIRKS data."""
        if self.smirks_db is None:
            from importlib import resources

            with resources.path(f"{__package__}.data", "smirks.json") as path:
                self.smirks_db = curate_smirks(
                    pd.read_json(path, orient="records", lines=True)
                )
        self.name = self.classifier.name_reaction(self.smirks_db)
        return self.name



[docs]
    def get_reaction_info(self) -> dict[str, list[str] | str]:
        """Compiles all reaction-related information into one dictionary.

        The dictionary returned by ``self.classifier.get_reaction_center_info``
        is extended with the keys ``SOLVENT``, ``REAGENT``, ``CATALYST``,
        ``REF``, ``NAME`` and ``SCAFFOLD``. As side effects, ``self.tag``,
        ``self.reaction_class``, ``self.name``, ``self.scaffold`` and
        ``self.reaction_info`` are updated, and the functional-group table is
        loaded from the package data if none was supplied.

        Returns:
            The completed information dictionary (also stored on
            ``self.reaction_info``).
        """
        if self.fg_db is None:
            from importlib import resources

            data_package = f"{__package__}.data"
            with resources.path(data_package, "functional_groups.json") as path:
                self.fg_db = pd.read_json(path, orient="records", lines=True)

        info = self.classifier.get_reaction_center_info(self.fg_db)
        self.tag = info["TAG"]
        self.reaction_class = info["CLASS"]

        # Conditions are stored as "."-joined strings; anything without a
        # .split method (e.g. None) is reported as an empty list.
        for key, attribute in (
            ("SOLVENT", "solvent"),
            ("REAGENT", "reagent"),
            ("CATALYST", "catalyst"),
        ):
            try:
                info[key] = getattr(self, attribute).split(".")
            except AttributeError:
                info[key] = []

        try:
            info["REF"] = self.reference
        except AttributeError:
            info["REF"] = ""

        # Only look the name up when none has been assigned yet.
        if self.name == "":
            self.name = self.get_name()
        info["NAME"] = self.name

        self.scaffold = self.get_scaffold()
        info["SCAFFOLD"] = self.scaffold

        self.reaction_info = info
        return info



[docs]
    def get_reaction_center(self) -> Optional[str]:
        """Returns the reaction center SMILES string if available."""
        return self.classifier.template_smiles



[docs]
    def find_neighbors(
        self,
        df: pd.DataFrame,
        fp: str = "MACCS",
        concatenate: bool = True,
        max_return: int = 100,
        threshold: float = 0.3,
        broaden: bool = False,
        full_search: bool = False,
    ) -> Optional[pd.DataFrame]:
        """Finds and returns similar reactions in the database.

        Candidates are selected by tag (``TAG``, or ``TAG2`` when `broaden` is
        set, or the whole table when `full_search` is set), scored against this
        reaction's fingerprint, sorted by similarity and cut at `threshold`
        and `max_return`. The result is also stored on ``self.neighbors``.

        Args:
            df: The DataFrame to search within.
            fp: The type of fingerprint to use, 'MACCS' or 'Morgan'.
            concatenate: Whether to concatenate patterns in fingerprinting.
            max_return: Maximum number of similar reactions to return.
            threshold: The similarity threshold to consider for matching.
            broaden: Whether to use a broadened search criteria based on tags.
            full_search: If true, performs an exhaustive search across the database.

        Returns:
            The matching rows with a ``SIMILARITY`` column added and the
            fingerprint/tag helper columns removed, or ``None`` when no row
            passes the tag filter.

        Raises:
            KeyError: If `fp` is neither 'MACCS' nor 'Morgan' (case-insensitive).

        Example:
            >>> from rxn_insight.reaction import Reaction
            >>> df_uspto = pd.read_parquet("uspto_rxn_insight.gzip")  # Download: https://zenodo.org/records/10171745
            >>> rxn = Reaction("OB(O)c1ccccc1.Brc1ccccc1>>c1ccc(-c2ccccc2)cc1")
            >>> df_neighbors = rxn.find_neighbors(df_uspto)
        """
        # Refreshes self.tag and self.reaction_info, which the filters below use.
        self.get_reaction_info()
        if full_search:
            warnings.warn("Full database search is activated. This may take long.")
            df_tag = df.copy()
        elif broaden:
            df_tag = df[df["TAG2"] == self.give_broad_tag()].copy()
        else:
            df_tag = df[df["TAG"] == self.tag].copy()
        if len(df_tag.index) == 0:
            print("No similar reactions found...")
            return None

        # Column holding precomputed fingerprints for each (type, concatenate) choice.
        precomputed_columns = {
            ("maccs", True): "rxn_str_patt_fp",
            ("maccs", False): "rxn_dif_patt_fp",
            ("morgan", True): "rxn_str_morgan_fp",
            ("morgan", False): "rxn_dif_morgan_fp",
        }
        try:
            fp_column = precomputed_columns[(fp.lower(), bool(concatenate))]
        except KeyError:
            raise KeyError(
                f"Fingerprint choice {fp} is not supported. Select either MACCS or Morgan."
            ) from None

        if fp_column in df_tag:
            fps = [
                np.fromiter(stored_fp, dtype=np.int64)
                for stored_fp in tqdm(
                    df_tag[fp_column].tolist(),
                    desc="Loading fingerprints...",
                )
            ]
        else:
            # No precomputed fingerprints in the table: build them from the SMILES.
            fps = [
                get_fp(rxn_smiles, fp, concatenate)
                for rxn_smiles in tqdm(
                    df_tag["REACTION"].tolist(), desc="Creating fingerprints..."
                )
            ]
        rxnfp = get_fp(self.reaction, fp, concatenate)

        df_tag["SIMILARITY"] = [
            get_similarity(rxnfp, candidate_fp)
            for candidate_fp in tqdm(fps, desc="Calculating Tanimoto similarity")
        ]
        df_tag = df_tag.sort_values(by="SIMILARITY", ascending=False)

        # Assign the filled columns back instead of calling fillna(inplace=True)
        # on a column selection, which does not reliably modify df_tag.
        condition_columns = [
            column
            for column in ("SOLVENT", "CATALYST", "REAGENT")
            if column in df_tag.columns
        ]
        if condition_columns:
            df_tag[condition_columns] = df_tag[condition_columns].fillna("")

        # Taken before thresholding, so it reports the best score overall.
        max_similarity = df_tag["SIMILARITY"].max()
        df_tag = df_tag[df_tag["SIMILARITY"] > threshold].copy()
        print(
            f"Reaction found with similarity of {max_similarity:.3f}. This will be our best match."
        )

        helper_columns = [
            "rxn_str_patt_fp",
            "rxn_dif_patt_fp",
            "rxn_str_morgan_fp",
            "rxn_dif_morgan_fp",
            "TAG",
            "TAG2",
        ]
        df_return = df_tag.iloc[:max_return].drop(
            columns=helper_columns, errors="ignore"
        )

        self.neighbors = df_return

        return df_return



[docs]
    def give_broad_tag(self) -> str:
        """Generates a broadened tag for the reaction based on its characteristics."""
        rxn_info = self.reaction_info
        tag = f"{rxn_info['CLASS']} "
        try:
            fg_r = sorted(list(rxn_info["FG_REACTANTS"]))
        except AttributeError:
            fg_r = ""
        try:
            fg_p = sorted(list(rxn_info["FG_PRODUCTS"]))
        except AttributeError:
            fg_p = ""
        tag += " ".join(fg_r) + " "
        tag += " ".join(fg_p)
        tag_bytes = tag.encode("UTF-8")
        hashtag = hashlib.sha256(tag_bytes).hexdigest()
        return str(hashtag)



[docs]
    def suggest_conditions(self, df: pd.DataFrame) -> dict[str, str]:
        """Suggests reaction conditions based on similar reactions found.

        Neighbors cached on ``self.neighbors`` by an earlier search are reused;
        otherwise `df` is searched with a broadened tag. The solvent, catalyst
        and reagent rankings (sorted by ``COUNT``, highest first) are stored on
        ``self.suggested_solvent``, ``self.suggested_catalyst`` and
        ``self.suggested_reagent``.

        Args:
            df: The DataFrame containing reaction data to analyze.

        Returns:
            A dictionary with the keys ``"Solvent"``, ``"Catalyst"`` and
            ``"Reagent"``, each holding the top-ranked ``NAME`` entry. A value
            is the empty string when no neighbors were found or the
            corresponding ranking is empty.

        Example:
            >>> import rxn_insight as ri
            >>> df_uspto = pd.read_parquet("uspto_rxn_insight.gzip")  # Download: https://zenodo.org/records/10171745
            >>> rxn = ri.Reaction("OB(O)c1ccccc1.Brc1ccccc1>>c1ccc(-c2ccccc2)cc1")
            >>> df_conditions = rxn.suggest_conditions(df_uspto)

        """
        nbs = self.neighbors
        if nbs is None or len(nbs.index) == 0:
            nbs = self.find_neighbors(df, max_return=5000, threshold=0.3, broaden=True)
        if nbs is None or len(nbs.index) == 0:
            # find_neighbors returns None when nothing matches the tag, and may
            # return an empty frame when nothing passes the threshold.
            warnings.warn(
                "No similar reactions available; no conditions can be suggested."
            )
            return {"Solvent": "", "Catalyst": "", "Reagent": ""}

        solvent_rank = get_solvent_ranking(nbs).sort_values(by="COUNT", ascending=False)
        catalyst_rank = get_catalyst_ranking(nbs).sort_values(by="COUNT", ascending=False)
        reagent_rank = get_reagent_ranking(nbs).sort_values(by="COUNT", ascending=False)

        def _top_name(ranking: pd.DataFrame) -> str:
            # An empty ranking has no first row to report.
            if len(ranking.index) == 0:
                return ""
            return ranking["NAME"].iloc[0]

        conditions_dict = {
            "Solvent": _top_name(solvent_rank),
            "Catalyst": _top_name(catalyst_rank),
            "Reagent": _top_name(reagent_rank),
        }
        self.suggested_solvent = solvent_rank
        self.suggested_catalyst = catalyst_rank
        self.suggested_reagent = reagent_rank

        return conditions_dict



[docs]
    def draw(self, include_mapping: bool = False, filename: Union[str, None] = None) -> pd.DataFrame:
        """Displays the reaction as an SVG image in an IPython environment.

        The output of ``draw_chemical_reaction(self.reaction)`` is wrapped in
        ``IPython.display.SVG`` and shown with ``IPython.display.display``.
        If IPython cannot be imported, an installation hint is printed and
        ``None`` is returned.

        Args:
            include_mapping: Not read by the code shown here.
            filename: Not read by the code shown here.

        NOTE(review): both parameters are unused and every visible path returns
        ``None`` although the annotation says ``pd.DataFrame`` - confirm whether
        mapping display / file export were meant to be implemented.
        """
        try:
            # Imported lazily so that IPython stays an optional dependency.
            from IPython.display import SVG, display
            display(SVG(draw_chemical_reaction(self.reaction)))
        except ImportError:
            print("This function requires IPython to be installed: pip install ipython")
            return None