In [ ]:
from io import StringIO
import io #Python3
from rdkit import Chem
from rdkit.Chem import AllChem
from numpy import linalg
from rdkit.Chem import PandasTools
from rdkit import RDConfig
from rdkit.Chem import Descriptors
from pathlib import Path
%matplotlib inline
In [ ]:
import os
# Setup Imports
import pandas as pd
import numpy as np
from scipy import stats, integrate
import matplotlib.pyplot as plt
import seaborn as sns
#requires scikit-learn==1.5.2
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import (
    accuracy_score,
    mean_absolute_error,
    mean_squared_error,
    root_mean_squared_error,
    r2_score,
    roc_auc_score,
)
from matplotlib.colors import ListedColormap
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.datasets import fetch_openml
from sklearn.preprocessing import LabelEncoder
from IPython.display import display, Markdown, Latex
# Baseline Imports
from xgboost import XGBClassifier, XGBRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from catboost import CatBoostClassifier, CatBoostRegressor
import torch
from tabpfn import TabPFNClassifier, TabPFNRegressor
In [ ]:
def getDeviceType() -> str:
    """Get the device type to use for training and inference."""
    if torch.cuda.is_available():
        return "cuda"
    elif torch.backends.mps.is_available(): #comment out to use cpu
        return "mps" #comment out to use cpu
    else:
        return "cpu"
In [ ]:
device_type = getDeviceType()
print(f"Training and inference completed using: {device_type}")
#For this work all testing was done on cpu.
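In [ ]:
# Optional sanity check (not part of the original workflow): report the installed
# package versions, since results can differ between releases (scikit-learn 1.5.2
# is noted above as a requirement). Distribution names are assumptions; the
# try/except handles packages installed under a different name.
from importlib.metadata import version, PackageNotFoundError
for pkg in ("scikit-learn", "torch", "tabpfn", "rdkit", "xgboost", "catboost"):
    try:
        print(f"{pkg}: {version(pkg)}")
    except PackageNotFoundError:
        print(f"{pkg}: not found under this distribution name")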
In [ ]:
#Enter the path to sdf file
sdfFilePath = '/Users/chrisswain/Projects/Polaris/Data/RatLM.sdf'
In [ ]:
moldf = PandasTools.LoadSDF(sdfFilePath,molColName='Molecule')
moldf
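In [ ]:
# Optional check (not in the original notebook): confirm how many records were
# loaded and which SDF fields came through as DataFrame columns.
print(f"Loaded {len(moldf)} molecules")
print(list(moldf.columns))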
In [ ]:
def getMolDescriptors(mol, missingVal=None):
    ''' calculate the full list of descriptors for a molecule
        missingVal is used if the descriptor cannot be calculated
    '''
    res = {}
    for nm, fn in Descriptors._descList:
        # some of the descriptor functions can throw errors if they fail, catch those here:
        try:
            val = fn(mol)
        except Exception:
            # print the error message:
            import traceback
            traceback.print_exc()
            # and set the descriptor value to whatever missingVal is
            val = missingVal
        res[nm] = val
    return res
In [ ]:
#Calc Descriptors using RDKit
suppl = Chem.SDMolSupplier(sdfFilePath, removeHs=False)
mols = [m for m in suppl if m is not None]  # skip any records RDKit cannot parse
allDescrs = [getMolDescriptors(m) for m in mols]
In [ ]:
#allDescrs
In [ ]:
df = pd.DataFrame(allDescrs)
print(df.shape)
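In [ ]:
# Optional check (not in the original notebook): see whether any descriptors
# failed to compute (failures would appear as missing values here).
print(df.isna().sum().sum(), "missing descriptor values")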
In [ ]:
allDatadf = pd.concat([moldf, df], axis=1)
In [ ]:
allDatadf
In [ ]:
allDatadf = allDatadf.drop('ID', axis=1)
allDatadf.dtypes
In [ ]:
allDatadf['LOG RLM_CLint (mL/min/kg)'] = allDatadf['LOG RLM_CLint (mL/min/kg)'].astype(float)
# drop rows that contain NaN values
allDatadf = allDatadf.dropna()
allDatadf
In [ ]:
y = allDatadf['LOG RLM_CLint (mL/min/kg)']
In [ ]:
y.dtypes
In [ ]:
X = allDatadf.iloc[:,-217:]
results=X.dtypes
X
In [ ]:
#allDatadf.to_csv("y_file", encoding='utf-8', index=False, header=True)
In [ ]:
# Optionally write the descriptor column dtypes to a text file to check them
with open('your_file.txt', 'w') as f:
    for line in results:
        f.write(f"{line}\n")
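In [ ]:
# Alternative check (not in the original notebook): list any non-numeric
# descriptor columns directly in the notebook instead of writing to a file.
non_numeric = X.select_dtypes(exclude='number').columns
print(list(non_numeric))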
In [ ]:
# Replace NaN values and clip extreme/infinite values to the largest value
# representable in float16, so they do not overflow downstream.
X = X.fillna(0)
y = y.fillna(0)
max_value = torch.finfo(torch.half).max
X = X.clip(-max_value, max_value)
In [ ]:
#split to train/test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)
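In [ ]:
# Optional check (not in the original notebook): confirm the train/test split sizes.
print(f"Train: {X_train.shape}, Test: {X_test.shape}")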
In [ ]:
# Train and evaluate TabPFN
reg = TabPFNRegressor(random_state=42)
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)
# Calculate the root mean squared error (RMSE)
score = root_mean_squared_error(y_test, y_pred)
print(f"TabPFN RMSE: {score:.4f}")
In [ ]:
# Compare different machine learning models by training each one multiple times
# on different parts of the data and averaging their performance scores for a
# more reliable performance estimate
# Define models
models = [
    ('TabPFN', TabPFNRegressor(random_state=42)),
    ('RandomForest', RandomForestRegressor(random_state=42)),
    ('XGBoost', XGBRegressor(random_state=42)),
    ('CatBoost', CatBoostRegressor(random_state=42, verbose=0))
]
# Calculate scores
scoring = 'neg_root_mean_squared_error'
scores = {name: cross_val_score(model, X, y, cv=5, scoring=scoring, n_jobs=-1).mean() for name, model in models}
# Plot results
results_df = pd.DataFrame(list(scores.items()), columns=['Model', 'RMSE'])
results_df.RMSE = -results_df.RMSE  # neg_root_mean_squared_error is negative; flip the sign so smaller is better
ax = results_df.plot(x='Model', y='RMSE', kind='bar', figsize=(10, 6))
ax.set_ylim(0, results_df['RMSE'].max() * 1.2)
ax.set_title('Model Comparison - 5-fold Cross-validation \n (Root Mean Squared Error - Smaller is better)')
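In [ ]:
# Optional (not in the original notebook): print the cross-validated RMSE values
# as a table; the savefig call is left commented out and its filename is an
# illustrative placeholder.
print(results_df.to_string(index=False))
# ax.figure.savefig("model_comparison_rmse.png", dpi=150, bbox_inches="tight")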