import sqlite3
import time
import csv
import pandas as pd
from rdkit import Chem
from rdkit.Chem import PandasTools
# Open the SQLite database file and load the ChemicaLite extension into the connection
connection = sqlite3.connect("Pubchem3.sqlite")
connection.enable_load_extension(True)
connection.load_extension('/Users/chrisswain/miniconda3/envs/rdkitenv/lib/chemicalite.dylib')
# Now disable extension loading again because of potential security risks
connection.enable_load_extension(False)
# Create the compound table; the molecule column is declared with the MOL type
connection.execute(
    "CREATE TABLE ID_DATA (SMILES TEXT NOT NULL,ID INT PRIMARY KEY,INCHIKEY TEXT NOT NULL, molecule MOL)")
<sqlite3.Cursor at 0x13cf0c840>
def pubchem(path):
    """Yield ``(SMILES, ID, INCHIKEY)`` string triples from a tab-separated file.

    The first row is treated as a header and skipped. Only the first three
    columns of each data row are used; any further columns are ignored.
    Completely empty rows (for example a trailing blank line) are skipped,
    and an empty file simply yields nothing.

    Args:
        path: Location of the tab-delimited input file.

    Yields:
        Tuples of three strings (SMILES, ID, InChIKey) in file order.

    Raises:
        ValueError: If a non-empty row has fewer than three columns.
    """
    # newline='' leaves newline handling to the csv module, as its
    # documentation recommends for files passed to csv.reader.
    with open(path, 'rt', newline='') as inputfile:
        reader = csv.reader(inputfile, delimiter='\t')
        # The None default stops StopIteration from escaping the generator
        # (which Python would turn into RuntimeError) when the file is empty.
        next(reader, None)  # skip header
        for row in reader:
            if not row:
                # csv.reader returns [] for a blank line; nothing to load.
                continue
            SMILES, ID, INCHIKEY, *_ = row
            yield SMILES, ID, INCHIKEY
# Bulk-load the TSV rows into ID_DATA. ?1 is bound twice: once as the stored
# SMILES text and once as the argument of mol_from_smiles() for the molecule column.
pubchem_tsv = '/Users/chrisswain/Projects/Pubchem/InChiKey/AllInChiKey.tsv'
insert_sql = (
    "INSERT INTO ID_DATA(SMILES, ID, INCHIKEY, molecule)"
    "VALUES(?1, ?2, ?3, mol_from_smiles(?1))"
)
# Using the connection as a context manager wraps the whole load in one transaction.
with connection:
    connection.executemany(insert_sql, pubchem(pubchem_tsv))
[12:00:37] WARNING: not removing hydrogen atom without neighbors [12:00:37] WARNING: not removing hydrogen atom without neighbors [12:00:37] WARNING: not removing hydrogen atom without neighbors [12:00:37] WARNING: not removing hydrogen atom without neighbors [12:00:37] WARNING: not removing hydrogen atom without neighbors [14:14:05] WARNING: not removing hydrogen atom without neighbors [14:14:05] WARNING: not removing hydrogen atom without neighbors [14:14:05] WARNING: not removing hydrogen atom without neighbors [14:14:05] WARNING: not removing hydrogen atom without neighbors [14:14:05] Explicit valence for atom # 1 Cl, 3, is greater than permitted
# Look up a single compound by its InChIKey (passed as a bound query parameter)
cursor = connection.cursor()
targetinchikey = "WPQAOGZTDKTBHI-UHFFFAOYSA-N"
rows = cursor.execute("SELECT * FROM ID_DATA WHERE INCHIKEY == ?", (targetinchikey,),).fetchall()
# Bare expression so the notebook displays the fetched rows
rows
# SELECT * also returns the molecule column as a binary object
[('OC1(C(=CCC(C1)C(=C)C)C)C1(O)C(=CCC(C1)C(=C)C)C', 500061, 'WPQAOGZTDKTBHI-UHFFFAOYSA-N', b'MOL\x00\xef\xbe\xad\xde\x00\x00\x00\x00\x10\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x16\x00\x00\x00\x17\x00\x00\x00\x80\x01\x08\x00`\x00\x00\x00\x01\x01\x06\x00 \x00\x00\x00\x04\x06\x00(\x00\x00\x00\x03\x04\x06\x00h\x00\x00\x00\x03\x03\x01\x06\x00`\x00\x00\x00\x02\x02\x06\x00`\x00\x00\x00\x03\x01\x06\x00`\x00\x00\x00\x02\x02\x06\x00(\x00\x00\x00\x03\x04\x06\x00h\x00\x00\x00\x03\x02\x02\x06\x00`\x00\x00\x00\x01\x03\x06\x00`\x00\x00\x00\x01\x03\x06\x00 \x00\x00\x00\x04\x08\x00`\x00\x00\x00\x01\x01\x06\x00(\x00\x00\x00\x03\x04\x06\x00h\x00\x00\x00\x03\x03\x01\x06\x00`\x00\x00\x00\x02\x02\x06\x00`\x00\x00\x00\x03\x01\x06\x00`\x00\x00\x00\x02\x02\x06\x00(\x00\x00\x00\x03\x04\x06\x00h\x00\x00\x00\x03\x02\x02\x06\x00`\x00\x00\x00\x01\x03\x06\x00`\x00\x00\x00\x01\x03\x0b\x00\x01\x00\x01\x02\x00\x02\x03\x08\x02\x03\x04\x00\x04\x05\x00\x05\x06\x00\x05\x07\x00\x07\x08\x08\x02\x07\t\x00\x02\n\x00\x01\x0b\x00\x0b\x0c\x00\x0b\r\x00\r\x0e\x08\x02\x0e\x0f\x00\x0f\x10\x00\x10\x11\x00\x10\x12\x00\x12\x13\x08\x02\x12\x14\x00\r\x15\x00\x06\x01\x00\x11\x0b\x00B\x02\x00\x00\x00\x06\x02\x03\x04\x05\x06\x01\x06\r\x0e\x0f\x10\x11\x0b\x17\t\x00\x00\x00\x00\x00\x00\x00?\x00\x00\x00\x00\x12d\x00\x00\x00\x03\x00\x0f\x00\x00\x00__computedProps\x06\x02\x00\x00\x00\x00\x00\x00\x00\x07\x00\x00\x00numArom\x0f\x00\x00\x00_StereochemDone\x07\x00\x00\x00numArom\x01\x00\x00\x00\x00\x0f\x00\x00\x00_StereochemDone\x01\x01\x00\x00\x00\x13:0\x05\x00\x00\x02\x00\x0f\x00\x00\x00__computedProps\x06\x01\x00\x00\x00\x00\x00\x00\x00\x08\x00\x00\x00_CIPRank\x08\x00\x00\x00_CIPRank\x02\n\x00\x00\x00\x00\x02\x00\x0f\x00\x00\x00__computedProps\x06\x01\x00\x00\x00\x00\x00\x00\x00\x08\x00\x00\x00_CIPRank\x08\x00\x00\x00_CIPRank\x02\t\x00\x00\x00\x08\x01\x00\x02\x00\x0f\x00\x00\x00__computedProps\x06\x01\x00\x00\x00\x00\x00\x00\x00\x08\x00\x00\x00_CIPRank\x08\x00\x00\x00_CIPRank\x02\x08\x00\x00\x00\x00\x02\x00\x0f\x00\x00\x0
0__computedProps\x06\x01\x00\x00\x00\x00\x00\x00\x00\x08\x00\x00\x00_CIPRank\x08\x00\x00\x00_CIPRank\x02\x06\x00\x00\x00\x00\x02\x00\x0f\x00\x00\x00__computedProps\x06\x01\x00\x00\x00\x00\x00\x00\x00\x08\x00\x00\x00_CIPRank\x08\x00\x00\x00_CIPRank\x02\x02\x00\x00\x00\x00\x02\x00\x0f\x00\x00\x00__computedProps\x06\x01\x00\x00\x00\x00\x00\x00\x00\x08\x00\x00\x00_CIPRank\x08\x00\x00\x00_CIPRank\x02\x05\x00\x00\x00\x08\x01\x00\x02\x00\x0f\x00\x00\x00__computedProps\x06\x01\x00\x00\x00\x00\x00\x00\x00\x08\x00\x00\x00_CIPRank\x08\x00\x00\x00_CIPRank\x02\x04\x00\x00\x00\x00\x02\x00\x0f\x00\x00\x00__computedProps\x06\x01\x00\x00\x00\x00\x00\x00\x00\x08\x00\x00\x00_CIPRank\x08\x00\x00\x00_CIPRank\x02\x07\x00\x00\x00\x00\x02\x00\x0f\x00\x00\x00__computedProps\x06\x01\x00\x00\x00\x00\x00\x00\x00\x08\x00\x00\x00_CIPRank\x08\x00\x00\x00_CIPRank\x02\x03\x00\x00\x00\x00\x02\x00\x0f\x00\x00\x00__computedProps\x06\x01\x00\x00\x00\x00\x00\x00\x00\x08\x00\x00\x00_CIPRank\x08\x00\x00\x00_CIPRank\x02\x00\x00\x00\x00\x00\x02\x00\x0f\x00\x00\x00__computedProps\x06\x01\x00\x00\x00\x00\x00\x00\x00\x08\x00\x00\x00_CIPRank\x08\x00\x00\x00_CIPRank\x02\x01\x00\x00\x00\x00\x02\x00\x0f\x00\x00\x00__computedProps\x06\x01\x00\x00\x00\x00\x00\x00\x00\x08\x00\x00\x00_CIPRank\x08\x00\x00\x00_CIPRank\x02\t\x00\x00\x00\x08\x01\x00\x02\x00\x0f\x00\x00\x00__computedProps\x06\x01\x00\x00\x00\x00\x00\x00\x00\x08\x00\x00\x00_CIPRank\x08\x00\x00\x00_CIPRank\x02\n\x00\x00\x00\x00\x02\x00\x0f\x00\x00\x00__computedProps\x06\x01\x00\x00\x00\x00\x00\x00\x00\x08\x00\x00\x00_CIPRank\x08\x00\x00\x00_CIPRank\x02\x08\x00\x00\x00\x00\x02\x00\x0f\x00\x00\x00__computedProps\x06\x01\x00\x00\x00\x00\x00\x00\x00\x08\x00\x00\x00_CIPRank\x08\x00\x00\x00_CIPRank\x02\x06\x00\x00\x00\x00\x02\x00\x0f\x00\x00\x00__computedProps\x06\x01\x00\x00\x00\x00\x00\x00\x00\x08\x00\x00\x00_CIPRank\x08\x00\x00\x00_CIPRank\x02\x02\x00\x00\x00\x00\x02\x00\x0f\x00\x00\x00__computedProps\x06\x01\x00\x00\x00\x00\x00\x00\x00\x08\x00\x00\x00_CIPRank\
x08\x00\x00\x00_CIPRank\x02\x05\x00\x00\x00\x08\x01\x00\x02\x00\x0f\x00\x00\x00__computedProps\x06\x01\x00\x00\x00\x00\x00\x00\x00\x08\x00\x00\x00_CIPRank\x08\x00\x00\x00_CIPRank\x02\x04\x00\x00\x00\x00\x02\x00\x0f\x00\x00\x00__computedProps\x06\x01\x00\x00\x00\x00\x00\x00\x00\x08\x00\x00\x00_CIPRank\x08\x00\x00\x00_CIPRank\x02\x07\x00\x00\x00\x00\x02\x00\x0f\x00\x00\x00__computedProps\x06\x01\x00\x00\x00\x00\x00\x00\x00\x08\x00\x00\x00_CIPRank\x08\x00\x00\x00_CIPRank\x02\x03\x00\x00\x00\x00\x02\x00\x0f\x00\x00\x00__computedProps\x06\x01\x00\x00\x00\x00\x00\x00\x00\x08\x00\x00\x00_CIPRank\x08\x00\x00\x00_CIPRank\x02\x00\x00\x00\x00\x00\x02\x00\x0f\x00\x00\x00__computedProps\x06\x01\x00\x00\x00\x00\x00\x00\x00\x08\x00\x00\x00_CIPRank\x08\x00\x00\x00_CIPRank\x02\x01\x00\x00\x00\x00\x13\x16')]
# Substructure search: fetch the IDs of the rows for which mol_is_substruct() accepts the query SMILES
resID=cursor.execute("SELECT ID FROM ID_DATA WHERE mol_is_substruct(molecule, mol_from_smiles('Nc1nc2[nH]c(CN3CCN(Cc4cccc(F)c4)CC3)c(CN3CCN(Cc4cccc(F)c4)CC3)c2c(=O)[nH]1'))").fetchall()
# Bare expression so the notebook displays the result
resID
[(149878534,), (153408163,)]
# To return the SMILES column instead of the ID
resSMILES=cursor.execute("SELECT SMILES FROM ID_DATA WHERE mol_is_substruct(molecule, mol_from_smiles('O=C1NC=NC2=C1C(CN1CCNCC1)=CN2'))").fetchall()
# Bare expression so the notebook displays the result
resSMILES
[('c1ccc(cc1)N1CCN(CC1)Cc1c2c([nH]c1C)nc([nH]c2=O)N(C)C',), ('Clc1ccc(cc1)N1CCN(CC1)Cc1c2c([nH]c1C)nc([nH]c2=O)N(C)C',), ('c1ccc(c(c1)N1CCN(CC1)Cc1c2c([nH]c1C)nc([nH]c2=O)N(C)C)OC',), ('c1ccc(c(c1)N1CCN(CC1)Cc1c2c([nH]c1C)nc([nH]c2=O)N(C)C)Cl',), ('c1cc(cc(c1Cl)Cl)N1CCN(CC1)Cc1c2c([nH]c1C)nc([nH]c2=O)N(C)C',), ('c1ccc(cc1F)CN1CCN(CC1)Cc1c2c(=O)[nH]c(nc2[nH]c1CN1CCN(CC1)Cc1cccc(c1)F)NC(=O)CCCCCCC',), ('[O-][N+](=O)c1ccc(cc1)N1CCN(CC1)Cc1[nH]c2c(c1CN1CCN(CC1)c1ccc(cc1)[N+](=O)[O-])c(=O)[nH]c(n2)NC(=O)CCCCCCC',), ('c1ccc(cc1)CN1CCN(CC1)Cc1[nH]c2c(c1CN1CCN(CC1)Cc1ccccc1)c(=O)[nH]c(n2)NC(=O)CCCCCCC',), ('Fc1ccc(cc1)CN1CCN(CC1)Cc1[nH]c2c(c1CN1CCN(CC1)Cc1ccc(cc1)F)c(=O)[nH]c(n2)NC(=O)CCCCCCC',), ('c1ccc(c(c1)F)CN1CCN(CC1)Cc1c2c(=O)[nH]c(nc2[nH]c1CN1CCN(CC1)Cc1ccccc1F)NC(=O)CCCCCCC',), ('Nc1nc2[nH]c(c(c2c(=O)[nH]1)CN1CCN(CC1)c1cc(Cl)cc(c1)Cl)CN1CCN(CC1)c1cc(Cl)cc(c1)Cl',), ('c1c(Cl)cc(cc1Cl)N1CCN(CC1)Cc1[nH]c2c(c1CN1CCN(CC1)c1cc(Cl)cc(c1)Cl)c(=O)[nH]c(n2)NC(=O)CCCCCCC',), ('Nc1nc2[nH]c(c(c2c(=O)[nH]1)CN1CCN(CC1)Cc1ccccc1F)CN1CCN(CC1)Cc1ccccc1F',), ('Nc1nc2[nH]c(c(c2c(=O)[nH]1)CN1CCN(CC1)C(c1ccccc1)c1ccccc1)CN1CCN(CC1)C(c1ccccc1)c1ccccc1',), ('Nc1nc2[nH]c(c(c2c(=O)[nH]1)CN1CCN(CC1)Cc1ccccc1)CN1CCN(CC1)Cc1ccccc1',), ('c1ccc(cc1)C(N1CCN(CC1)Cc1[nH]c2c(c1CN1CCN(CC1)C(c1ccccc1)c1ccccc1)c(=O)[nH]c(n2)NC(=O)CCCCCCC)c1ccccc1',), ('Nc1nc2[nH]c(c(c2c(=O)[nH]1)CN1CCN(CC1)c1ccc(cc1)[N+](=O)[O-])CN1CCN(CC1)c1ccc(cc1)[N+](=O)[O-]',), ('Nc1nc2[nH]c(c(c2c(=O)[nH]1)CN1CCN(CC1)Cc1ccc(cc1)F)CN1CCN(CC1)Cc1ccc(cc1)F',), ('Nc1nc2[nH]c(c(c2c(=O)[nH]1)CN1CCN(CC1)Cc1cccc(c1)F)CN1CCN(CC1)Cc1cccc(c1)F',), ('c1ccc(cc1I)CN1CCN(CC1)Cc1c2c(=O)[nH]c(nc2[nH]c1CN1CCN(CC1)Cc1cccc(c1)I)NC(=O)CCCCCCC',)]
# To get an RDKit mol object, select mol_to_binary_mol(molecule) and pass the returned bytes to Chem.Mol
mols=cursor.execute("SELECT mol_to_binary_mol(molecule) FROM ID_DATA WHERE mol_is_substruct(molecule, mol_from_smiles('Nc1nc2[nH]c(CN3CCN(Cc4cccc(F)c4)CC3)c(CN3CCN(Cc4cccc(F)c4)CC3)c2c(=O)[nH]1'))").fetchall()
# Only one column is selected, so each row is a one-element tuple and b[0] is the binary molecule
rdmols = [Chem.Mol(b[0]) for b in mols]
mol = rdmols[0]
# Bare expression so the notebook displays the first molecule
mol
# ChemicaLite uses a virtual table mechanism to index binary fingerprints in an RD-tree data structure, which improves the performance of substructure and similarity queries.
%%time
# Baseline timing: substructure query directly on ID_DATA, run before the fingerprint index table is created.
# No fetchall() here, so resSMILES is the cursor itself and the rows are read when it is iterated later.
resSMILES=cursor.execute("SELECT SMILES FROM ID_DATA WHERE mol_is_substruct(molecule, mol_from_smiles('O=C1NC=NC2=C1C(CN1CCNCC1)=CN2'))")
CPU times: user 19min 2s, sys: 1min 10s, total: 20min 12s Wall time: 21min 7s
# Print every row of the baseline query; each row is a one-element tuple holding the SMILES.
for smiles in resSMILES:
    print(smiles)
('c1ccc(cc1)N1CCN(CC1)Cc1c2c([nH]c1C)nc([nH]c2=O)N(C)C',) ('Clc1ccc(cc1)N1CCN(CC1)Cc1c2c([nH]c1C)nc([nH]c2=O)N(C)C',) ('c1ccc(c(c1)N1CCN(CC1)Cc1c2c([nH]c1C)nc([nH]c2=O)N(C)C)OC',) ('c1ccc(c(c1)N1CCN(CC1)Cc1c2c([nH]c1C)nc([nH]c2=O)N(C)C)Cl',) ('c1cc(cc(c1Cl)Cl)N1CCN(CC1)Cc1c2c([nH]c1C)nc([nH]c2=O)N(C)C',) ('c1ccc(cc1F)CN1CCN(CC1)Cc1c2c(=O)[nH]c(nc2[nH]c1CN1CCN(CC1)Cc1cccc(c1)F)NC(=O)CCCCCCC',) ('[O-][N+](=O)c1ccc(cc1)N1CCN(CC1)Cc1[nH]c2c(c1CN1CCN(CC1)c1ccc(cc1)[N+](=O)[O-])c(=O)[nH]c(n2)NC(=O)CCCCCCC',) ('c1ccc(cc1)CN1CCN(CC1)Cc1[nH]c2c(c1CN1CCN(CC1)Cc1ccccc1)c(=O)[nH]c(n2)NC(=O)CCCCCCC',) ('Fc1ccc(cc1)CN1CCN(CC1)Cc1[nH]c2c(c1CN1CCN(CC1)Cc1ccc(cc1)F)c(=O)[nH]c(n2)NC(=O)CCCCCCC',) ('c1ccc(c(c1)F)CN1CCN(CC1)Cc1c2c(=O)[nH]c(nc2[nH]c1CN1CCN(CC1)Cc1ccccc1F)NC(=O)CCCCCCC',) ('Nc1nc2[nH]c(c(c2c(=O)[nH]1)CN1CCN(CC1)c1cc(Cl)cc(c1)Cl)CN1CCN(CC1)c1cc(Cl)cc(c1)Cl',) ('c1c(Cl)cc(cc1Cl)N1CCN(CC1)Cc1[nH]c2c(c1CN1CCN(CC1)c1cc(Cl)cc(c1)Cl)c(=O)[nH]c(n2)NC(=O)CCCCCCC',) ('Nc1nc2[nH]c(c(c2c(=O)[nH]1)CN1CCN(CC1)Cc1ccccc1F)CN1CCN(CC1)Cc1ccccc1F',) ('Nc1nc2[nH]c(c(c2c(=O)[nH]1)CN1CCN(CC1)C(c1ccccc1)c1ccccc1)CN1CCN(CC1)C(c1ccccc1)c1ccccc1',) ('Nc1nc2[nH]c(c(c2c(=O)[nH]1)CN1CCN(CC1)Cc1ccccc1)CN1CCN(CC1)Cc1ccccc1',) ('c1ccc(cc1)C(N1CCN(CC1)Cc1[nH]c2c(c1CN1CCN(CC1)C(c1ccccc1)c1ccccc1)c(=O)[nH]c(n2)NC(=O)CCCCCCC)c1ccccc1',) ('Nc1nc2[nH]c(c(c2c(=O)[nH]1)CN1CCN(CC1)c1ccc(cc1)[N+](=O)[O-])CN1CCN(CC1)c1ccc(cc1)[N+](=O)[O-]',) ('Nc1nc2[nH]c(c(c2c(=O)[nH]1)CN1CCN(CC1)Cc1ccc(cc1)F)CN1CCN(CC1)Cc1ccc(cc1)F',) ('Nc1nc2[nH]c(c(c2c(=O)[nH]1)CN1CCN(CC1)Cc1cccc(c1)F)CN1CCN(CC1)Cc1cccc(c1)F',) ('c1ccc(cc1I)CN1CCN(CC1)Cc1c2c(=O)[nH]c(nc2[nH]c1CN1CCN(CC1)Cc1cccc(c1)I)NC(=O)CCCCCCC',)
# Create an rdtree virtual table (id plus a 2048-bit fp column) to be filled with fingerprint data
connection.execute("CREATE VIRTUAL TABLE str_idx_pubchem_molecule " +
    "USING rdtree(id, fp bits(2048))")
<sqlite3.Cursor at 0x13cf0e540>
# Calculate and insert the pattern fingerprints; this will take a while.
# Rows whose molecule column is NULL are left out of the index.
fill_pattern_index_sql = (
    "INSERT INTO str_idx_pubchem_molecule(id, fp) "
    "SELECT ID, mol_pattern_bfp(molecule, 2048) FROM ID_DATA "
    "WHERE molecule IS NOT NULL"
)
with connection:
    connection.execute(fill_pattern_index_sql)
%%time
# Same substructure query, now joined to the rdtree table and restricted with
# idx.id MATCH rdtree_subset(<pattern fingerprint of the query>) in addition to mol_is_substruct()
res1=cursor.execute("SELECT SMILES FROM ID_DATA, str_idx_pubchem_molecule AS idx WHERE ID_DATA.ID = idx.id AND mol_is_substruct(ID_DATA.molecule, mol_from_smiles('O=C1NC=NC2=C1C(CN1CCNCC1)=CN2')) AND idx.id MATCH rdtree_subset(mol_pattern_bfp(mol_from_smiles('O=C1NC=NC2=C1C(CN1CCNCC1)=CN2'), 2048))").fetchall()
CPU times: user 300 ms, sys: 117 ms, total: 417 ms Wall time: 1.1 s
# Print every row of the indexed query; each row is a one-element tuple holding the SMILES.
for smiles in res1:
    print(smiles)
('c1c(Cl)cc(cc1Cl)N1CCN(CC1)Cc1[nH]c2c(c1CN1CCN(CC1)c1cc(Cl)cc(c1)Cl)c(=O)[nH]c(n2)NC(=O)CCCCCCC',) ('Nc1nc2[nH]c(c(c2c(=O)[nH]1)CN1CCN(CC1)c1cc(Cl)cc(c1)Cl)CN1CCN(CC1)c1cc(Cl)cc(c1)Cl',) ('c1cc(cc(c1Cl)Cl)N1CCN(CC1)Cc1c2c([nH]c1C)nc([nH]c2=O)N(C)C',) ('c1ccc(cc1)CN1CCN(CC1)Cc1[nH]c2c(c1CN1CCN(CC1)Cc1ccccc1)c(=O)[nH]c(n2)NC(=O)CCCCCCC',) ('Nc1nc2[nH]c(c(c2c(=O)[nH]1)CN1CCN(CC1)Cc1ccccc1F)CN1CCN(CC1)Cc1ccccc1F',) ('Nc1nc2[nH]c(c(c2c(=O)[nH]1)CN1CCN(CC1)Cc1ccc(cc1)F)CN1CCN(CC1)Cc1ccc(cc1)F',) ('Nc1nc2[nH]c(c(c2c(=O)[nH]1)CN1CCN(CC1)Cc1cccc(c1)F)CN1CCN(CC1)Cc1cccc(c1)F',) ('Fc1ccc(cc1)CN1CCN(CC1)Cc1[nH]c2c(c1CN1CCN(CC1)Cc1ccc(cc1)F)c(=O)[nH]c(n2)NC(=O)CCCCCCC',) ('c1ccc(c(c1)F)CN1CCN(CC1)Cc1c2c(=O)[nH]c(nc2[nH]c1CN1CCN(CC1)Cc1ccccc1F)NC(=O)CCCCCCC',) ('c1ccc(cc1F)CN1CCN(CC1)Cc1c2c(=O)[nH]c(nc2[nH]c1CN1CCN(CC1)Cc1cccc(c1)F)NC(=O)CCCCCCC',) ('c1ccc(cc1I)CN1CCN(CC1)Cc1c2c(=O)[nH]c(nc2[nH]c1CN1CCN(CC1)Cc1cccc(c1)I)NC(=O)CCCCCCC',) ('Nc1nc2[nH]c(c(c2c(=O)[nH]1)CN1CCN(CC1)c1ccc(cc1)[N+](=O)[O-])CN1CCN(CC1)c1ccc(cc1)[N+](=O)[O-]',) ('Nc1nc2[nH]c(c(c2c(=O)[nH]1)CN1CCN(CC1)C(c1ccccc1)c1ccccc1)CN1CCN(CC1)C(c1ccccc1)c1ccccc1',) ('c1ccc(cc1)C(N1CCN(CC1)Cc1[nH]c2c(c1CN1CCN(CC1)C(c1ccccc1)c1ccccc1)c(=O)[nH]c(n2)NC(=O)CCCCCCC)c1ccccc1',) ('[O-][N+](=O)c1ccc(cc1)N1CCN(CC1)Cc1[nH]c2c(c1CN1CCN(CC1)c1ccc(cc1)[N+](=O)[O-])c(=O)[nH]c(n2)NC(=O)CCCCCCC',) ('Nc1nc2[nH]c(c(c2c(=O)[nH]1)CN1CCN(CC1)Cc1ccccc1)CN1CCN(CC1)Cc1ccccc1',) ('Clc1ccc(cc1)N1CCN(CC1)Cc1c2c([nH]c1C)nc([nH]c2=O)N(C)C',) ('c1ccc(c(c1)N1CCN(CC1)Cc1c2c([nH]c1C)nc([nH]c2=O)N(C)C)Cl',) ('c1ccc(c(c1)N1CCN(CC1)Cc1c2c([nH]c1C)nc([nH]c2=O)N(C)C)OC',) ('c1ccc(cc1)N1CCN(CC1)Cc1c2c([nH]c1C)nc([nH]c2=O)N(C)C',)
# Bare expression so the notebook displays the fetched list of rows
res1
[('c1c(Cl)cc(cc1Cl)N1CCN(CC1)Cc1[nH]c2c(c1CN1CCN(CC1)c1cc(Cl)cc(c1)Cl)c(=O)[nH]c(n2)NC(=O)CCCCCCC',), ('Nc1nc2[nH]c(c(c2c(=O)[nH]1)CN1CCN(CC1)c1cc(Cl)cc(c1)Cl)CN1CCN(CC1)c1cc(Cl)cc(c1)Cl',), ('c1cc(cc(c1Cl)Cl)N1CCN(CC1)Cc1c2c([nH]c1C)nc([nH]c2=O)N(C)C',), ('c1ccc(cc1)CN1CCN(CC1)Cc1[nH]c2c(c1CN1CCN(CC1)Cc1ccccc1)c(=O)[nH]c(n2)NC(=O)CCCCCCC',), ('Nc1nc2[nH]c(c(c2c(=O)[nH]1)CN1CCN(CC1)Cc1ccccc1F)CN1CCN(CC1)Cc1ccccc1F',), ('Nc1nc2[nH]c(c(c2c(=O)[nH]1)CN1CCN(CC1)Cc1ccc(cc1)F)CN1CCN(CC1)Cc1ccc(cc1)F',), ('Nc1nc2[nH]c(c(c2c(=O)[nH]1)CN1CCN(CC1)Cc1cccc(c1)F)CN1CCN(CC1)Cc1cccc(c1)F',), ('Fc1ccc(cc1)CN1CCN(CC1)Cc1[nH]c2c(c1CN1CCN(CC1)Cc1ccc(cc1)F)c(=O)[nH]c(n2)NC(=O)CCCCCCC',), ('c1ccc(c(c1)F)CN1CCN(CC1)Cc1c2c(=O)[nH]c(nc2[nH]c1CN1CCN(CC1)Cc1ccccc1F)NC(=O)CCCCCCC',), ('c1ccc(cc1F)CN1CCN(CC1)Cc1c2c(=O)[nH]c(nc2[nH]c1CN1CCN(CC1)Cc1cccc(c1)F)NC(=O)CCCCCCC',), ('c1ccc(cc1I)CN1CCN(CC1)Cc1c2c(=O)[nH]c(nc2[nH]c1CN1CCN(CC1)Cc1cccc(c1)I)NC(=O)CCCCCCC',), ('Nc1nc2[nH]c(c(c2c(=O)[nH]1)CN1CCN(CC1)c1ccc(cc1)[N+](=O)[O-])CN1CCN(CC1)c1ccc(cc1)[N+](=O)[O-]',), ('Nc1nc2[nH]c(c(c2c(=O)[nH]1)CN1CCN(CC1)C(c1ccccc1)c1ccccc1)CN1CCN(CC1)C(c1ccccc1)c1ccccc1',), ('c1ccc(cc1)C(N1CCN(CC1)Cc1[nH]c2c(c1CN1CCN(CC1)C(c1ccccc1)c1ccccc1)c(=O)[nH]c(n2)NC(=O)CCCCCCC)c1ccccc1',), ('[O-][N+](=O)c1ccc(cc1)N1CCN(CC1)Cc1[nH]c2c(c1CN1CCN(CC1)c1ccc(cc1)[N+](=O)[O-])c(=O)[nH]c(n2)NC(=O)CCCCCCC',), ('Nc1nc2[nH]c(c(c2c(=O)[nH]1)CN1CCN(CC1)Cc1ccccc1)CN1CCN(CC1)Cc1ccccc1',), ('Clc1ccc(cc1)N1CCN(CC1)Cc1c2c([nH]c1C)nc([nH]c2=O)N(C)C',), ('c1ccc(c(c1)N1CCN(CC1)Cc1c2c([nH]c1C)nc([nH]c2=O)N(C)C)Cl',), ('c1ccc(c(c1)N1CCN(CC1)Cc1c2c([nH]c1C)nc([nH]c2=O)N(C)C)OC',), ('c1ccc(cc1)N1CCN(CC1)Cc1c2c([nH]c1C)nc([nH]c2=O)N(C)C',)]
def substructure_search(connection, substructure, limit):
    """Print the ID and SMILES of molecules that contain ``substructure``.

    The query joins ID_DATA with the ``str_idx_pubchem_molecule`` rdtree
    table on ID and keeps rows that satisfy both conditions:

    * ``idx.id MATCH rdtree_subset(...)`` against the 2048-bit pattern
      fingerprint of the query molecule, and
    * ``mol_is_substruct(molecule, query)`` on the stored molecule.

    Args:
        connection: SQLite connection with the ChemicaLite functions
            available and both tables populated.
        substructure: SMILES string of the query structure.
        limit: Maximum number of rows to return.

    Returns:
        None. The search banner, one ``ID SMILES`` line per hit and a summary
        line with the hit count and elapsed seconds are written to stdout.
    """
    print('searching for substructure:', substructure)
    # perf_counter() is monotonic, so the measured interval cannot be skewed
    # by system clock adjustments the way time.time() differences can.
    t1 = time.perf_counter()
    rs = connection.execute(
        "select ID_DATA.ID, mol_to_smiles(ID_DATA.molecule) from "
        "ID_DATA, str_idx_pubchem_molecule as idx where "
        "ID_DATA.ID = idx.id and "
        "mol_is_substruct(ID_DATA.molecule, mol_from_smiles(?1)) and "
        "idx.id match rdtree_subset(mol_pattern_bfp(mol_from_smiles(?1), 2048)) "
        "limit ?2",
        (substructure, limit)).fetchall()
    # Timing stops after fetchall(), so it covers retrieving all rows but not printing them.
    t2 = time.perf_counter()
    for ID, smiles in rs:
        print(ID, smiles)
    print('Found {0} matches in {1} seconds'.format(len(rs), t2-t1))
# Example run: search for the query substructure, returning at most 100 rows
smiles = 'O=C1NC=NC2=C1C(CN1CCNCC1)=CN2'
limit = 100
substructure_search(connection, smiles, limit)
searching for substructure: O=C1NC=NC2=C1C(CN1CCNCC1)=CN2 153408156 CCCCCCCC(=O)Nc1nc2[nH]c(CN3CCN(c4cc(Cl)cc(Cl)c4)CC3)c(CN3CCN(c4cc(Cl)cc(Cl)c4)CC3)c2c(=O)[nH]1 153408155 Nc1nc2[nH]c(CN3CCN(c4cc(Cl)cc(Cl)c4)CC3)c(CN3CCN(c4cc(Cl)cc(Cl)c4)CC3)c2c(=O)[nH]1 135524318 Cc1[nH]c2nc(N(C)C)[nH]c(=O)c2c1CN1CCN(c2ccc(Cl)c(Cl)c2)CC1 150856072 CCCCCCCC(=O)Nc1nc2[nH]c(CN3CCN(Cc4ccccc4)CC3)c(CN3CCN(Cc4ccccc4)CC3)c2c(=O)[nH]1 153408157 Nc1nc2[nH]c(CN3CCN(Cc4ccccc4F)CC3)c(CN3CCN(Cc4ccccc4F)CC3)c2c(=O)[nH]1 153408162 Nc1nc2[nH]c(CN3CCN(Cc4ccc(F)cc4)CC3)c(CN3CCN(Cc4ccc(F)cc4)CC3)c2c(=O)[nH]1 153408163 Nc1nc2[nH]c(CN3CCN(Cc4cccc(F)c4)CC3)c(CN3CCN(Cc4cccc(F)c4)CC3)c2c(=O)[nH]1 151901732 CCCCCCCC(=O)Nc1nc2[nH]c(CN3CCN(Cc4ccc(F)cc4)CC3)c(CN3CCN(Cc4ccc(F)cc4)CC3)c2c(=O)[nH]1 152264085 CCCCCCCC(=O)Nc1nc2[nH]c(CN3CCN(Cc4ccccc4F)CC3)c(CN3CCN(Cc4ccccc4F)CC3)c2c(=O)[nH]1 149878534 CCCCCCCC(=O)Nc1nc2[nH]c(CN3CCN(Cc4cccc(F)c4)CC3)c(CN3CCN(Cc4cccc(F)c4)CC3)c2c(=O)[nH]1 155805404 CCCCCCCC(=O)Nc1nc2[nH]c(CN3CCN(Cc4cccc(I)c4)CC3)c(CN3CCN(Cc4cccc(I)c4)CC3)c2c(=O)[nH]1 153408161 Nc1nc2[nH]c(CN3CCN(c4ccc([N+](=O)[O-])cc4)CC3)c(CN3CCN(c4ccc([N+](=O)[O-])cc4)CC3)c2c(=O)[nH]1 153408158 Nc1nc2[nH]c(CN3CCN(C(c4ccccc4)c4ccccc4)CC3)c(CN3CCN(C(c4ccccc4)c4ccccc4)CC3)c2c(=O)[nH]1 153408160 CCCCCCCC(=O)Nc1nc2[nH]c(CN3CCN(C(c4ccccc4)c4ccccc4)CC3)c(CN3CCN(C(c4ccccc4)c4ccccc4)CC3)c2c(=O)[nH]1 150504227 CCCCCCCC(=O)Nc1nc2[nH]c(CN3CCN(c4ccc([N+](=O)[O-])cc4)CC3)c(CN3CCN(c4ccc([N+](=O)[O-])cc4)CC3)c2c(=O)[nH]1 153408159 Nc1nc2[nH]c(CN3CCN(Cc4ccccc4)CC3)c(CN3CCN(Cc4ccccc4)CC3)c2c(=O)[nH]1 135524311 Cc1[nH]c2nc(N(C)C)[nH]c(=O)c2c1CN1CCN(c2ccc(Cl)cc2)CC1 135524317 Cc1[nH]c2nc(N(C)C)[nH]c(=O)c2c1CN1CCN(c2ccccc2Cl)CC1 135524312 COc1ccccc1N1CCN(Cc2c(C)[nH]c3nc(N(C)C)[nH]c(=O)c23)CC1 135524310 Cc1[nH]c2nc(N(C)C)[nH]c(=O)c2c1CN1CCN(c2ccccc2)CC1 Found 20 matches in 0.43364596366882324 seconds
# Similarity searching
# Create an rdtree virtual table (id plus a 1024-bit fp column) to be filled with Morgan binary fingerprint (bfp) data
connection.execute("CREATE VIRTUAL TABLE morgan_idx_pubchem_molecule " +
    "USING rdtree(id, fp bits(1024))");
# Fill the Morgan index with radius-2, 1024-bit fingerprints for every row
# of ID_DATA whose molecule column is not NULL.
fill_morgan_index_sql = (
    "INSERT INTO morgan_idx_pubchem_molecule(id, fp) "
    "SELECT ID, mol_morgan_bfp(molecule, 2, 1024) FROM ID_DATA "
    "WHERE molecule IS NOT NULL"
)
with connection:
    connection.execute(fill_morgan_index_sql)
def tanimoto_search(connection, target, threshold):
    """Print molecules similar to ``target``, most similar first.

    The query joins ID_DATA with the ``morgan_idx_pubchem_molecule`` rdtree
    table on id, keeps rows where
    ``idx.id MATCH rdtree_tanimoto(<target fingerprint>, threshold)`` holds,
    and computes ``bfp_tanimoto`` between the Morgan fingerprint (radius 2,
    1024 bits) of each stored molecule and that of the target. Rows are
    ordered by that similarity, descending.

    Args:
        connection: SQLite connection with the ChemicaLite functions
            available and both tables populated.
        target: SMILES string of the query molecule.
        threshold: Tanimoto threshold passed to ``rdtree_tanimoto``.

    Returns:
        None. The search banner, one ``ID SMILES similarity`` line per hit
        and a summary line with the hit count and elapsed seconds are
        written to stdout.
    """
    print('searching for target:', target)
    # perf_counter() is monotonic, so the measured interval cannot be skewed
    # by system clock adjustments the way time.time() differences can.
    t1 = time.perf_counter()
    rs = connection.execute(
        "SELECT c.ID, mol_to_smiles(c.molecule), "
        "bfp_tanimoto(mol_morgan_bfp(c.molecule, 2, 1024), "
        " mol_morgan_bfp(mol_from_smiles(?1), 2, 1024)) as t "
        "FROM "
        "ID_DATA as c JOIN morgan_idx_pubchem_molecule as idx USING(id) "
        "WHERE "
        "idx.id MATCH rdtree_tanimoto(mol_morgan_bfp(mol_from_smiles(?1), 2, 1024), ?2) "
        "ORDER BY t DESC",
        (target, threshold)).fetchall()
    # Timing stops after fetchall(), so it covers retrieving all rows but not printing them.
    t2 = time.perf_counter()
    for ID, smiles, sim in rs:
        print(ID, smiles, sim)
    print('Found {0} matches in {1} seconds'.format(len(rs), t2-t1))
# Example run: similarity search for the target SMILES with a Tanimoto threshold of 0.5
smiles = "O=c1ccc(-c2ccccc2)cn1O"
threshold = 0.5
tanimoto_search(connection, smiles, threshold)
searching for target: O=c1ccc(-c2ccccc2)cn1O 44543378 O=c1ccc(-c2ccccc2)cn1O 1.0 153887090 O=c1ccc(-c2ccncc2)cn1O 0.6551724137931034 70596262 O=C(O)n1cc(-c2ccccc2)ccc1=O 0.6 14595918 Cn1cc(-c2ccccc2)ccc1=O 0.5862068965517241 11988946 O=c1ccc(-c2ccccc2)cn1-c1ccccc1 0.5666666666666667 68247588 O=c1ccc(-c2ccccc2)cn1-c1ccccc1O 0.5454545454545454 100942537 CCn1cc(-c2ccccc2)ccc1=O 0.53125 68028637 O=c1ccc(-c2ccccc2)cn1-c1ccc(O)cc1 0.5294117647058824 82113594 NCCn1cc(-c2ccccc2)ccc1=O 0.5151515151515151 4258240 O=c1ccc(-c2ccc(-c3ccccc3)cc2)cn1Cc1ccccc1 0.5151515151515151 3550352 O=c1ccc(-c2ccccc2)cn1Cc1ccccc1 0.5151515151515151 3641101 O=c1ccc(-c2ccc(-c3ccccc3)cc2)cn1C(c1ccccc1)c1ccccc1 0.5151515151515151 3487768 O=c1ccc(-c2ccccc2)cn1C(c1ccccc1)c1ccccc1 0.5151515151515151 12671450 O=c1cnc(-c2ccccc2)cn1O 0.5151515151515151 3699479 C=CCn1cc(-c2ccccc2)ccc1=O 0.5 3682683 C=CCn1cc(-c2ccc(-c3ccccc3)cc2)ccc1=O 0.5 162015486 O=P(O)(O)O.c1ccc(-c2ccc(-c3ccc(-c4ccccc4)cc3)cc2)cc1 0.5 58016393 Cc1ccc(-n2cc(-c3ccccc3)ccc2=O)cc1 0.5 3897369 CC(C)=CCn1cc(-c2ccc(-c3ccccc3)cc2)ccc1=O 0.5 3376142 CC(C)=CCn1cc(-c2ccccc2)ccc1=O 0.5 58016549 O=c1ccc(-c2ccc(O)cc2)cn1-c1ccccc1 0.5 100942536 O=c1ccc(-c2ccccc2)cn1CCc1ccccc1 0.5 11299201 O=C(O)c1cc(-c2ccccc2)cn(O)c1=O 0.5 17913416 O=c1ccc2ccc(-c3ccccc3)cc2n1O 0.5 90188457 O=c1ccccn1O.[Zn] 0.5 87935139 O=c1ccccn1O.O=c1ccccn1O.[Zn] 0.5 10953512 O=c1ccc(O)cn1O 0.5 Found 27 matches in 52.773017168045044 seconds
# Run the similarity query directly with the current smiles and threshold values,
# keeping the (ID, SMILES, similarity) rows in rs instead of only printing them
rs = connection.execute(
    "SELECT c.ID, mol_to_smiles(c.molecule), "
    "bfp_tanimoto(mol_morgan_bfp(c.molecule, 2, 1024), "
    " mol_morgan_bfp(mol_from_smiles(?1), 2, 1024)) as t "
    "FROM "
    "ID_DATA as c JOIN morgan_idx_pubchem_molecule as idx USING(id) "
    "WHERE "
    "idx.id MATCH rdtree_tanimoto(mol_morgan_bfp(mol_from_smiles(?1), 2, 1024), ?2) "
    "ORDER BY t DESC",
    (smiles, threshold)).fetchall()
# Bare expression so the notebook displays the rows
rs
[(44543378, 'O=c1ccc(-c2ccccc2)cn1O', 1.0), (153887090, 'O=c1ccc(-c2ccncc2)cn1O', 0.6551724137931034), (70596262, 'O=C(O)n1cc(-c2ccccc2)ccc1=O', 0.6), (14595918, 'Cn1cc(-c2ccccc2)ccc1=O', 0.5862068965517241), (11988946, 'O=c1ccc(-c2ccccc2)cn1-c1ccccc1', 0.5666666666666667), (68247588, 'O=c1ccc(-c2ccccc2)cn1-c1ccccc1O', 0.5454545454545454), (100942537, 'CCn1cc(-c2ccccc2)ccc1=O', 0.53125), (68028637, 'O=c1ccc(-c2ccccc2)cn1-c1ccc(O)cc1', 0.5294117647058824), (82113594, 'NCCn1cc(-c2ccccc2)ccc1=O', 0.5151515151515151), (4258240, 'O=c1ccc(-c2ccc(-c3ccccc3)cc2)cn1Cc1ccccc1', 0.5151515151515151), (3550352, 'O=c1ccc(-c2ccccc2)cn1Cc1ccccc1', 0.5151515151515151), (3641101, 'O=c1ccc(-c2ccc(-c3ccccc3)cc2)cn1C(c1ccccc1)c1ccccc1', 0.5151515151515151), (3487768, 'O=c1ccc(-c2ccccc2)cn1C(c1ccccc1)c1ccccc1', 0.5151515151515151), (12671450, 'O=c1cnc(-c2ccccc2)cn1O', 0.5151515151515151), (3699479, 'C=CCn1cc(-c2ccccc2)ccc1=O', 0.5), (3682683, 'C=CCn1cc(-c2ccc(-c3ccccc3)cc2)ccc1=O', 0.5), (162015486, 'O=P(O)(O)O.c1ccc(-c2ccc(-c3ccc(-c4ccccc4)cc3)cc2)cc1', 0.5), (58016393, 'Cc1ccc(-n2cc(-c3ccccc3)ccc2=O)cc1', 0.5), (3897369, 'CC(C)=CCn1cc(-c2ccc(-c3ccccc3)cc2)ccc1=O', 0.5), (3376142, 'CC(C)=CCn1cc(-c2ccccc2)ccc1=O', 0.5), (58016549, 'O=c1ccc(-c2ccc(O)cc2)cn1-c1ccccc1', 0.5), (100942536, 'O=c1ccc(-c2ccccc2)cn1CCc1ccccc1', 0.5), (11299201, 'O=C(O)c1cc(-c2ccccc2)cn(O)c1=O', 0.5), (17913416, 'O=c1ccc2ccc(-c3ccccc3)cc2n1O', 0.5), (90188457, 'O=c1ccccn1O.[Zn]', 0.5), (87935139, 'O=c1ccccn1O.O=c1ccccn1O.[Zn]', 0.5), (10953512, 'O=c1ccc(O)cn1O', 0.5)]