class Oracle:
    """The default Oracle class for scoring molecules in GraphGA.

    This class wraps predictive models (like RandomForestRegressor) to score
    molecules based on their properties. It handles conversion of SMILES to
    fingerprints.

    Parameters
    ----------
    models : List[Any], optional
        List of models, one per task, that implement a ``predict`` method
        (and a ``fit`` method if :meth:`fit` is going to be called). If None,
        ``num_task`` unfitted RandomForestRegressors are created immediately;
        call :meth:`fit` before scoring with them.
    num_task : int, default=1
        Number of properties to predict. Only used when ``models`` is None;
        otherwise the number of tasks is ``len(models)``.
    """

    def __init__(self, models=None, num_task=1):
        if models is None:
            self.models = [RandomForestRegressor() for _ in range(num_task)]
            self.num_task = num_task
        else:
            # The supplied models define the task count; ``num_task`` is ignored.
            self.models = models
            self.num_task = len(models)

    def _convert_to_fingerprint(self, molecules):
        """Convert SMILES or RDKit molecules to fingerprints.

        The type of the first element decides how the whole sequence is
        treated: if it is a ``str`` every element is parsed as SMILES,
        otherwise every element is passed to the fingerprint function as is.

        Parameters
        ----------
        molecules : List[str] or List[RDKit.Mol]
            Molecules as SMILES strings or RDKit Mol objects.

        Returns
        -------
        np.ndarray
            One fingerprint per molecule, stacked along the first axis. An
            empty array is returned for empty input.
        """
        # ``len`` instead of truthiness so that numpy arrays are accepted too.
        if len(molecules) == 0:
            return np.array([])
        if isinstance(molecules[0], str):
            # NOTE(review): Chem.MolFromSmiles returns None for unparsable
            # SMILES; whether getmorganfingerprint tolerates None is not
            # visible from here -- confirm and validate upstream if it doesn't.
            return np.array(
                [getmorganfingerprint(Chem.MolFromSmiles(mol)) for mol in molecules]
            )
        return np.array([getmorganfingerprint(mol) for mol in molecules])

    def fit(self, X_train, y_train):
        """Fit the underlying models with training data.

        Each task's model is trained only on the samples whose label for that
        task is not NaN.

        Parameters
        ----------
        X_train : List[str] or List[RDKit.Mol]
            Training molecules as SMILES strings or RDKit Mol objects.
        y_train : np.ndarray
            Training labels with shape (n_samples, num_task). A 1-D array of
            shape (n_samples,) is also accepted and treated as a single task.

        Returns
        -------
        self : Oracle
            Fitted oracle.
        """
        X_train_fp = self._convert_to_fingerprint(X_train)
        labels = np.asarray(y_train)
        if labels.ndim == 1:
            # Single-task labels given as a flat vector.
            labels = labels.reshape(-1, 1)

        for task_idx in range(self.num_task):
            task_labels = labels[:, task_idx]
            # True where a label is present (i.e. not NaN) for this task.
            has_label = ~np.isnan(task_labels)
            self.models[task_idx].fit(X_train_fp[has_label], task_labels[has_label])
        return self

    def __call__(self, molecules, target_values):
        """Score molecules based on their predicted properties.

        With a single task the score is the raw model prediction and
        ``target_values`` is not used. With several tasks the score is the
        mean, over all tasks that have a non-NaN target, of the relative
        distance ``|prediction - target| / (|target| + 1e-8)``; lower is
        better.

        Parameters
        ----------
        molecules : List[str] or List[RDKit.Mol]
            Molecules to score as SMILES strings or RDKit Mol objects.
        target_values : np.ndarray
            Targets of shape (1, num_task); only the first row is used. A 1-D
            array of length num_task is also accepted. NaN entries mark tasks
            that do not contribute to the score.

        Returns
        -------
        List[float]
            Scores for each molecule. Empty for empty input. In the multi-task
            case every score is NaN when ``target_values`` is None or contains
            no non-NaN target.
        """
        if len(molecules) == 0:
            return []

        fps = self._convert_to_fingerprint(molecules)
        n_molecules = len(fps)

        if self.num_task == 1:
            # One batched call instead of one predict call per molecule.
            return [float(pred) for pred in self.models[0].predict(fps)]

        # Collect (task index, target) pairs for the tasks that have a target.
        active_targets = []
        if target_values is not None:
            target_row = np.atleast_2d(np.asarray(target_values, dtype=float))[0]
            for task_idx in range(self.num_task):
                target = target_row[task_idx]
                if not np.isnan(target):
                    active_targets.append((task_idx, float(target)))

        if not active_targets:
            # Nothing to measure a distance against: keep the NaN result the
            # mean of an empty score list would give, but without the
            # RuntimeWarning and without running the models.
            return [float("nan")] * n_molecules

        distances = np.empty((n_molecules, len(active_targets)), dtype=float)
        for column, (task_idx, target) in enumerate(active_targets):
            preds = np.asarray(self.models[task_idx].predict(fps), dtype=float)
            preds = preds.reshape(n_molecules)
            # Relative distance to the target; the epsilon guards target == 0.
            distances[:, column] = np.abs(preds - target) / (abs(target) + 1e-8)

        # Lower is better when using distances.
        return [float(score) for score in np.nanmean(distances, axis=1)]