Source code for torch_molecule.generator.graph_ga.oracle

import numpy as np
from sklearn.ensemble import RandomForestRegressor
from rdkit import Chem
from typing import List, Any
from ...utils.graph.features import getmorganfingerprint


class Oracle:
    """The default Oracle class for scoring molecules in GraphGA.

    This class wraps predictive models (like RandomForestRegressor) to score
    molecules based on their properties. It handles conversion of SMILES to
    fingerprints.

    Parameters
    ----------
    models : List[Any], optional
        List of trained models that implement a ``predict`` method. If None,
        one untrained RandomForestRegressor per task is created immediately
        and has to be trained with :meth:`fit` before scoring.
    num_task : int, default=1
        Number of properties to predict. Only used when ``models`` is None;
        otherwise the number of tasks is ``len(models)``.
    """

    def __init__(self, models=None, num_task=1):
        if models is None:
            self.models = [RandomForestRegressor() for _ in range(num_task)]
            self.num_task = num_task
        else:
            # The supplied models define the task count; ``num_task`` is ignored.
            self.models = models
            self.num_task = len(models)

    def _convert_to_fingerprint(self, molecules):
        """Convert SMILES or RDKit molecules to fingerprints.

        The type of the first element decides how the whole (non-empty)
        sequence is interpreted: ``str`` means SMILES, anything else is passed
        to the fingerprint function unchanged.
        """
        if isinstance(molecules[0], str):
            # NOTE(review): an unparsable SMILES presumably yields None from
            # RDKit here; how getmorganfingerprint treats None is not visible
            # in this file -- confirm before relying on it.
            mols = [Chem.MolFromSmiles(smiles) for smiles in molecules]
        else:
            mols = molecules
        return np.array([getmorganfingerprint(mol) for mol in mols])

    @staticmethod
    def _predict_batch(model, fingerprints):
        """Return one prediction per fingerprint as a 1-D float array."""
        predictions = np.asarray(model.predict(fingerprints), dtype=float)
        # Accept both (n,) and (n, 1) outputs; anything else raises.
        return predictions.reshape(len(fingerprints))

    def fit(self, X_train, y_train) -> "Oracle":
        """Fit the underlying models with training data.

        Parameters
        ----------
        X_train : List[str] or List[RDKit.Mol]
            Training molecules as SMILES strings or RDKit Mol objects.
        y_train : array-like
            Training labels with shape (n_samples, num_task). A 1-D sequence
            of length n_samples is accepted as a single-task label column.
            NaN entries mark missing labels and are skipped per task.

        Returns
        -------
        self : Oracle
            Fitted oracle.
        """
        X_train_fp = self._convert_to_fingerprint(X_train)

        y_train = np.asarray(y_train)
        if y_train.ndim == 1:
            y_train = y_train.reshape(-1, 1)

        for task_idx in range(self.num_task):
            labels = y_train[:, task_idx]
            # Each task trains only on the molecules that have a label for it.
            has_label = ~np.isnan(labels)
            self.models[task_idx].fit(X_train_fp[has_label], labels[has_label])
        return self

    def __call__(self, molecules, target_values) -> List[float]:
        """Score molecules based on their predicted properties.

        Parameters
        ----------
        molecules : List[str] or List[RDKit.Mol]
            Molecules to score as SMILES strings or RDKit Mol objects.
        target_values : array-like or None
            Target property values; the first row (or the vector itself when
            1-D) is used. NaN entries disable the corresponding task. Unused
            when the oracle has a single task.

        Returns
        -------
        List[float]
            One score per molecule. With a single task this is the raw model
            prediction. With several tasks it is the mean, over tasks that
            have a non-NaN target, of ``|pred - target| / (|target| + 1e-8)``
            (lower is better); NaN when no task has a usable target.
        """
        if len(molecules) == 0:
            return []

        fps = self._convert_to_fingerprint(molecules)

        if self.num_task == 1:
            return self._predict_batch(self.models[0], fps).tolist()

        no_score = [float("nan")] * len(fps)
        if target_values is None:
            return no_score

        targets = np.atleast_2d(np.asarray(target_values, dtype=float))[0]
        # One batched predict per task instead of one call per molecule.
        distances = [
            np.abs(self._predict_batch(self.models[idx], fps) - targets[idx])
            / (abs(targets[idx]) + 1e-8)
            for idx in range(self.num_task)
            if not np.isnan(targets[idx])
        ]
        if not distances:
            return no_score

        # Rows are tasks, columns are molecules; average over tasks.
        return np.nanmean(distances, axis=0).tolist()