In [ ]:
from io import StringIO
import io #Python3
from rdkit import Chem
from rdkit.Chem import AllChem
from numpy import linalg
from rdkit.Chem import PandasTools
from rdkit import RDConfig
from rdkit.Chem import Descriptors
from pathlib import Path
%matplotlib inline
In [ ]:
import os
# Setup Imports
import pandas as pd
import numpy as np
from scipy import stats, integrate
import matplotlib.pyplot as plt
import seaborn as sns
#requires scikit-learn==1.5.2
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import (
    accuracy_score,
    mean_absolute_error,
    mean_squared_error,
    root_mean_squared_error,
    r2_score,
    roc_auc_score,
)
from matplotlib.colors import ListedColormap
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.datasets import fetch_openml
from sklearn.preprocessing import LabelEncoder
from IPython.display import display, Markdown, Latex
# Baseline Imports
from xgboost import XGBClassifier, XGBRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from catboost import CatBoostClassifier, CatBoostRegressor
import torch
from tabpfn import TabPFNClassifier, TabPFNRegressor
In [ ]:
def getDeviceType() -> str:
    """Get the device type to use for training and inference."""
    if torch.cuda.is_available():
        return "cuda"
    elif torch.backends.mps.is_available(): #comment out to use cpu
        return "mps" #comment out to use cpu
    else:
        return "cpu"
In [ ]:
device_type = getDeviceType()
print(f"Training and inference completed using: {device_type}")
#For this work all testing was done on cpu.
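In [ ]:
# Optional sanity check (not part of the original workflow): report the installed
# package versions, since results can differ between releases (scikit-learn 1.5.2
# is noted above as a requirement). Distribution names are assumptions; the
# try/except handles packages installed under a different name.
from importlib.metadata import version, PackageNotFoundError
for pkg in ("scikit-learn", "torch", "tabpfn", "rdkit", "xgboost", "catboost"):
    try:
        print(f"{pkg}: {version(pkg)}")
    except PackageNotFoundError:
        print(f"{pkg}: not found under this distribution name")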
In [ ]:
#Enter the path to sdf file
sdfFilePath = '/Users/chrisswain/Projects/Polaris/Data/RatLM.sdf'
In [ ]:
moldf = PandasTools.LoadSDF(sdfFilePath,molColName='Molecule')
moldf
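In [ ]:
# Optional check (not in the original notebook): confirm how many records were
# loaded and which SDF fields came through as DataFrame columns.
print(f"Loaded {len(moldf)} molecules")
print(list(moldf.columns))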
In [ ]:
def getMolDescriptors(mol, missingVal=None):
    ''' calculate the full list of descriptors for a molecule
        missingVal is used if the descriptor cannot be calculated
    '''
    res = {}
    for nm, fn in Descriptors._descList:
        # some of the descriptor functions can throw errors if they fail, catch those here:
        try:
            val = fn(mol)
        except Exception:
            # print the error message:
            import traceback
            traceback.print_exc()
            # and set the descriptor value to whatever missingVal is
            val = missingVal
        res[nm] = val
    return res
In [ ]:
#Calc Descriptors using RDKit
suppl = Chem.SDMolSupplier(sdfFilePath, removeHs=False)
mols = [m for m in suppl if m is not None]  # skip any records RDKit cannot parse
allDescrs = [getMolDescriptors(m) for m in mols]
In [ ]:
#allDescrs
In [ ]:
df = pd.DataFrame(allDescrs)
print(df.shape)
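In [ ]:
# Optional check (not in the original notebook): see whether any descriptors
# failed to compute (failures would appear as missing values here).
print(df.isna().sum().sum(), "missing descriptor values")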
In [ ]:
allDatadf = pd.concat([moldf, df], axis=1)
In [ ]:
allDatadf
In [ ]:
allDatadf = allDatadf.drop('ID', axis=1)
allDatadf.dtypes
In [ ]:
allDatadf['LOG RLM_CLint (mL/min/kg)'] = allDatadf['LOG RLM_CLint (mL/min/kg)'].astype(float)
# drop rows that contain NaN values
allDatadf = allDatadf.dropna()
allDatadf
In [ ]:
y = allDatadf['LOG RLM_CLint (mL/min/kg)']
In [ ]:
y.dtypes
In [ ]:
X = allDatadf.iloc[:,-217:]
results=X.dtypes
X
In [ ]:
#allDatadf.to_csv("y_file", encoding='utf-8', index=False, header=True)
In [ ]:
# Optionally write the descriptor column dtypes to a text file to check them
with open('your_file.txt', 'w') as f:
    for line in results:
        f.write(f"{line}\n")
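In [ ]:
# Alternative check (not in the original notebook): list any non-numeric
# descriptor columns directly in the notebook instead of writing to a file.
non_numeric = X.select_dtypes(exclude='number').columns
print(list(non_numeric))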
In [ ]:
# Replace NaN values and clip extreme/infinite values to the largest value
# representable in float16, so they do not overflow downstream.
X = X.fillna(0)
y = y.fillna(0)
max_value = torch.finfo(torch.half).max
X = X.clip(-max_value, max_value)
In [ ]:
#split to train/test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)
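In [ ]:
# Optional check (not in the original notebook): confirm the train/test split sizes.
print(f"Train: {X_train.shape}, Test: {X_test.shape}")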
In [ ]:
# Train and evaluate TabPFN
reg = TabPFNRegressor(random_state=42)
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)
# Calculate the root mean squared error (RMSE)
score = root_mean_squared_error(y_test, y_pred)
print(f"TabPFN RMSE: {score:.4f}")
In [ ]:
# Compare different machine learning models by training each one multiple times
# on different parts of the data and averaging their performance scores for a
# more reliable performance estimate
# Define models
models = [
    ('TabPFN', TabPFNRegressor(random_state=42)),
    ('RandomForest', RandomForestRegressor(random_state=42)),
    ('XGBoost', XGBRegressor(random_state=42)),
    ('CatBoost', CatBoostRegressor(random_state=42, verbose=0))
]
# Calculate scores
scoring = 'neg_root_mean_squared_error'
scores = {name: cross_val_score(model, X, y, cv=5, scoring=scoring, n_jobs=-1).mean() for name, model in models}
# Plot results
results_df = pd.DataFrame(list(scores.items()), columns=['Model', 'RMSE'])
results_df.RMSE = -results_df.RMSE  # neg_root_mean_squared_error is negative; flip the sign so smaller is better
ax = results_df.plot(x='Model', y='RMSE', kind='bar', figsize=(10, 6))
ax.set_ylim(0, results_df['RMSE'].max() * 1.2)
ax.set_title('Model Comparison - 5-fold Cross-validation \n (Root Mean Squared Error - Smaller is better)')
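In [ ]:
# Optional (not in the original notebook): print the cross-validated RMSE values
# as a table; the savefig call is left commented out and its filename is an
# illustrative placeholder.
print(results_df.to_string(index=False))
# ax.figure.savefig("model_comparison_rmse.png", dpi=150, bbox_inches="tight")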