Notebook to tidy PubChem downloaded structures

In [ ]:
#To download the structures
# wget "ftp://ftp.ncbi.nlm.nih.gov/pubchem/Compound/CURRENT-Full/SDF/*.sdf.gz" 
In [ ]:
from os import listdir
import os
from os.path import isfile, join
from pathlib import Path
In [ ]:
 
In [ ]:
# Folder that is scanned for the downloaded archives. The current directory is
# used, so the notebook presumably has to be started from the download folder.
current_working_directory = str(Path.cwd())
In [ ]:
# Collect the gzipped SDF archives (e.g. Compound_049500001_050000000.sdf.gz) that sit
# directly in the working directory.
# - Matching the full ".sdf.gz" suffix keeps unrelated .gz files out of the
#   conversion loops further down.
# - Sorting makes the processing order reproducible, because os.listdir returns
#   entries in arbitrary order.
onlysdfFiles = sorted(
    f
    for f in listdir(current_working_directory)
    if f.endswith(".sdf.gz") and isfile(join(current_working_directory, f))
)
onlysdfFiles
In [ ]:
#Read each file (filename something like Compound_049500001_050000000.sdf.gz)
#Structures with errors --> Errors/filename_errors.sdf
#Correct structures --> Valid/filename.sdf
#Correct structures --> save as SMILES/filename.smi
#Correct structures --> save as InChiKey/filename.smi
In [ ]:
 
In [ ]:
# Output folder for structures that have errors
directory_name = "Errors"

# Try to create the folder and report the outcome rather than raising
try:
    os.mkdir(directory_name)
except FileExistsError:
    status = f"Directory '{directory_name}' already exists."
except PermissionError:
    status = f"Permission denied: Unable to create '{directory_name}'."
except Exception as err:
    status = f"An error occurred: {err}"
else:
    status = f"Directory '{directory_name}' created successfully."
print(status)
In [ ]:
# Output folder for the correct structures
directory_name = "Valid"

# Try to create the folder and report the outcome rather than raising
try:
    os.mkdir(directory_name)
except FileExistsError:
    status = f"Directory '{directory_name}' already exists."
except PermissionError:
    status = f"Permission denied: Unable to create '{directory_name}'."
except Exception as err:
    status = f"An error occurred: {err}"
else:
    status = f"Directory '{directory_name}' created successfully."
print(status)
In [ ]:
# Output folder for the .smi files holding SMILES
directory_name = "SMILES"

# Try to create the folder and report the outcome rather than raising
try:
    os.mkdir(directory_name)
except FileExistsError:
    status = f"Directory '{directory_name}' already exists."
except PermissionError:
    status = f"Permission denied: Unable to create '{directory_name}'."
except Exception as err:
    status = f"An error occurred: {err}"
else:
    status = f"Directory '{directory_name}' created successfully."
print(status)
In [ ]:
# Output folder for the .smi files that also carry the InChIKey
directory_name = "InChiKey"

# Try to create the folder and report the outcome rather than raising
try:
    os.mkdir(directory_name)
except FileExistsError:
    status = f"Directory '{directory_name}' already exists."
except PermissionError:
    status = f"Permission denied: Unable to create '{directory_name}'."
except Exception as err:
    status = f"An error occurred: {err}"
else:
    status = f"Directory '{directory_name}' created successfully."
print(status)
In [ ]:
#Generate SMILES only
for r in range(0, int(len(onlysdfFiles))):
#for r in range(0, 3): #for testing
    sdfFile = onlysdfFiles[r]
#remove .sdf.gz
    fileName = Path(sdfFile).stem
    fileName = Path(fileName).stem 
    smiFile = "/Users/chrisswain/Projects/Pubchem/SMILES/" + fileName + ".smi"
    sdfPath = '/Users/chrisswain/Projects/Pubchem/' + sdfFile
    !/Users/chrisswain/miniconda3/bin/obabel  '{sdfPath}'   -osmi -O   '{smiFile}' --canonical
In [ ]:
 
In [ ]:
#Generate SMILES and InChiKey
for r in range(0, int(len(onlysdfFiles))):
#for r in range(0, 3): #for testing
    sdfFile = onlysdfFiles[r]
#remove .sdf.gz
    fileName = Path(sdfFile).stem
    fileName = Path(fileName).stem 
    smiFile = "/Users/chrisswain/Projects/Pubchem/InChiKey/" + fileName + ".smi"
    sdfPath = '/Users/chrisswain/Projects/Pubchem/' + sdfFile
    !/Users/chrisswain/miniconda3/bin/obabel  '{sdfPath}'   -osmi -O   '{smiFile}' --canonical --append "PUBCHEM_IUPAC_INCHIKEY"
In [ ]:
 
In [ ]:
 
In [ ]: