class Oracle:
    """Oracle class for scoring molecules.

    This class wraps predictive models (like RandomForestRegressor) to score
    molecules based on their properties. It handles conversion of SMILES to
    fingerprints.

    Parameters
    ----------
    models : List[Any], optional
        List of models that implement a ``predict`` method (and a ``fit``
        method if :meth:`fit` is used). If None, one default
        ``RandomForestRegressor`` per task is created at construction time.
    num_task : int, default=1
        Number of properties to predict. Only used when ``models`` is None;
        otherwise the number of tasks is ``len(models)``.
    """

    def __init__(self, models=None, num_task=1):
        if models is None:
            models = [RandomForestRegressor() for _ in range(num_task)]
        else:
            # Explicit models win: the task count always matches the model list.
            num_task = len(models)
        self.models = models
        self.num_task = num_task

    def _convert_to_fingerprint(self, molecules):
        """Convert SMILES or RDKit molecules to fingerprints.

        The type of the first element decides how the whole batch is
        handled: if it is a ``str`` every entry is parsed as SMILES,
        otherwise every entry is passed to the fingerprint function as-is.

        Parameters
        ----------
        molecules : List[str] or List[RDKit.Mol]
            Non-empty sequence of molecules.

        Returns
        -------
        np.ndarray
            One fingerprint per molecule, in input order.
        """
        if isinstance(molecules[0], str):
            molecules = [Chem.MolFromSmiles(smiles) for smiles in molecules]
        return np.array([getmorganfingerprint(mol) for mol in molecules])

    def _predict_task(self, task_idx, fps):
        """Predict one task for a whole batch of fingerprints.

        Returns a flat float array with one prediction per fingerprint, so
        models that return shape ``(n,)`` or ``(n, 1)`` are both accepted.
        """
        preds = self.models[task_idx].predict(fps)
        return np.asarray(preds, dtype=float).reshape(len(fps))

    def fit(self, X_train, y_train):
        """Fit the underlying models with training data.

        Each task's model is trained only on the samples whose label for
        that task is not NaN.

        Parameters
        ----------
        X_train : List[str] or List[RDKit.Mol]
            Training molecules as SMILES strings or RDKit Mol objects.
        y_train : np.ndarray
            Training labels with shape (n_samples, num_task). A 1-D array of
            shape (n_samples,) is treated as a single task column.

        Returns
        -------
        self : Oracle
            Fitted oracle.
        """
        X_train_fp = self._convert_to_fingerprint(X_train)

        y_train = np.asarray(y_train)
        if y_train.ndim == 1:
            # Allow plain label vectors in the single-task case.
            y_train = y_train.reshape(-1, 1)

        for task_idx in range(self.num_task):
            labels = y_train[:, task_idx]
            has_label = ~np.isnan(labels)
            self.models[task_idx].fit(X_train_fp[has_label], labels[has_label])
        return self

    def __call__(self, molecules, target_values=None):
        """Score molecules based on their predicted properties.

        With a single task the score is the raw model prediction and
        ``target_values`` is not used. With several tasks the score is the
        mean relative distance ``|pred - target| / (|target| + 1e-8)`` over
        the tasks whose target is not NaN, so lower is better.

        Parameters
        ----------
        molecules : List[str] or List[RDKit.Mol]
            Molecules to score as SMILES strings or RDKit Mol objects.
        target_values : np.ndarray, optional
            Target property values. Either shape (num_task,) or
            (n, num_task); in the 2-D case only the first row is used.
            NaN entries mark tasks that are excluded from the score.

        Returns
        -------
        List[float]
            Scores for each molecule. In the multi-task case every score is
            NaN when no usable target is provided. An empty input yields an
            empty list.
        """
        if len(molecules) == 0:
            return []

        fps = self._convert_to_fingerprint(molecules)
        n_molecules = len(fps)

        if self.num_task == 1:
            return [float(pred) for pred in self._predict_task(0, fps)]

        if target_values is None:
            # No target to measure a distance against.
            return [float("nan")] * n_molecules

        # Accept both a flat target vector and a 2-D array (first row used).
        targets = np.atleast_2d(np.asarray(target_values, dtype=float))[0]

        distances = []
        for task_idx in range(self.num_task):
            target = targets[task_idx]
            if np.isnan(target):
                continue
            # One batched predict per task instead of one call per molecule.
            preds = self._predict_task(task_idx, fps)
            # Lower score for values closer to target.
            distances.append(np.abs(preds - target) / (abs(target) + 1e-8))

        if not distances:
            return [float("nan")] * n_molecules

        # Rows are tasks, columns are molecules; average over tasks.
        # Lower is better when using distances.
        return np.nanmean(np.array(distances), axis=0).tolist()