import sqlite3
import csv
import time
import pandas as pd
from rdkit import Chem
from rdkit.Chem import PandasTools
connection = sqlite3.connect('chembldb.sqlite')
connection.enable_load_extension(True)
connection.load_extension('chemicalite')
connection.enable_load_extension(False)
connection.execute(
"CREATE TABLE chembl(id INTEGER PRIMARY KEY, chembl_id TEXT, SMILES TEXT, INCHI, INCHIKEY TEXT, molecule MOL)")
<sqlite3.Cursor at 0x14551c640>
def chembl(path):
with open(path, 'rt') as inputfile:
reader = csv.reader(inputfile, delimiter='\t')
next(reader) # skip header line
for chembl_id, smiles, inchi, inchikey, *_ in reader:
yield chembl_id, smiles, inchi, inchikey
with connection:
connection.executemany(
"INSERT INTO chembl(chembl_id, smiles, inchi, INCHIKEY, molecule) "
"VALUES(?1, ?2, ?3, ?4, mol_from_smiles(?2))", chembl('chembl_35_chemreps.txt'))
[13:24:04] WARNING: not removing hydrogen atom without neighbors [13:24:31] WARNING: not removing hydrogen atom without neighbors [13:24:31] WARNING: not removing hydrogen atom without neighbors [13:25:34] WARNING: not removing hydrogen atom without neighbors [13:25:59] WARNING: not removing hydrogen atom without neighbors [13:26:01] WARNING: not removing hydrogen atom without neighbors [13:26:03] WARNING: not removing hydrogen atom without neighbors [13:26:39] WARNING: not removing hydrogen atom without neighbors [13:26:39] WARNING: not removing hydrogen atom without neighbors [13:26:39] WARNING: not removing hydrogen atom without neighbors [13:26:42] Can't kekulize mol. Unkekulized atoms: 1 2 3 4 5 6 10 11 15 16 17 19 20 21 [13:26:53] WARNING: not removing hydrogen atom without neighbors [13:26:53] WARNING: not removing hydrogen atom without neighbors [13:26:53] WARNING: not removing hydrogen atom without neighbors [13:26:53] WARNING: not removing hydrogen atom without neighbors
mols = connection.execute("SELECT molecule FROM chembl WHERE chembl_id == ('CHEMBL501674')")
mols
<sqlite3.Cursor at 0x16e1cda40>
cursor = connection.cursor()
chemblid = "CHEMBL501674"
rows = cursor.execute("SELECT * FROM chembl WHERE chembl_id == ?", (chemblid,),).fetchall()
cursor = connection.cursor()
inchikey = "JFHNJVCGXRRDFC-GSRYTVDASA-N"
rows = cursor.execute("SELECT chembl_id FROM chembl WHERE INCHIKEY == ?", (inchikey,),).fetchall()
rows
[('CHEMBL501674',)]
%%time
resID=cursor.execute("SELECT chembl_id FROM chembl WHERE mol_is_substruct(molecule, mol_from_smiles('C1=CC=C(C=C1)C1=CC=CN=C1'))").fetchall()
CPU times: user 53.4 s, sys: 2.32 s, total: 55.7 s Wall time: 55.7 s
len(resID)
23785
#substructure searching
# create a virtual table to be filled with fp data
connection.execute("CREATE VIRTUAL TABLE str_idx_chembl_molecule " +
"USING rdtree(id, fp bits(2048))")
<sqlite3.Cursor at 0x14763c0c0>
with connection:
connection.execute(
"INSERT INTO str_idx_chembl_molecule(id, fp) " +
"SELECT id, mol_pattern_bfp(molecule, 2048) FROM chembl " +
"WHERE molecule IS NOT NULL")
count = connection.execute("SELECT COUNT(*) FROM chembl, str_idx_chembl_molecule AS idx " +
"WHERE chembl.id = idx.id AND mol_is_substruct(chembl.molecule, mol_from_smiles('C1=CC=C(C=C1)C1=CC=CN=C1')) " +
"AND idx.id MATCH rdtree_subset(mol_pattern_bfp(mol_from_smiles('C1=CC=C(C=C1)C1=CC=CN=C1'), 2048))").fetchall()[0][0]
count
23785
import argparse
def substructure_search(connection, substructure, limit):
print('searching for substructure:', substructure)
t1 = time.time()
rs = connection.execute(
"select chembl.chembl_id, mol_to_smiles(chembl.molecule) from "
"chembl, str_idx_chembl_molecule as idx where "
"chembl.id = idx.id and "
"mol_is_substruct(chembl.molecule, mol_from_smiles(?1)) and "
"idx.id match rdtree_subset(mol_pattern_bfp(mol_from_smiles(?1), 2048)) "
"limit ?2",
(substructure, limit)).fetchall()
t2 = time.time()
for chembl_id, smiles in rs:
print(chembl_id, smiles)
print('Found {0} matches in {1} seconds'.format(len(rs), t2-t1))
smiles = 'C1=CC=C(C=C1)C1=CC=CN=C1'
limit = 10
substructure_search(connection, smiles, limit)
searching for substructure: C1=CC=C(C=C1)C1=CC=CN=C1 CHEMBL574580 O=c1ccc(-c2ccccc2)cn1O CHEMBL569186 [O-][n+]1cccc(-c2ccccc2)c1 CHEMBL4452320 O=Cc1ccccc1-c1cccnc1 CHEMBL34657 c1ccc(-c2cccnc2)cc1 CHEMBL1605269 Nc1cc(-c2ccccc2)cnc1O CHEMBL1778131 Nc1cccc(-c2cccnc2)c1 CHEMBL3623821 N#Cc1ccc(-c2cccnc2)cc1 CHEMBL178946 NCc1ccc(-c2cccnc2)cc1 CHEMBL3402235 CNCc1cccc(-c2cccnc2)c1 CHEMBL4548726 NCc1cccc(-c2cccnc2)c1 Found 10 matches in 0.01191401481628418 seconds
#Similarity searching
# create a virtual table to be filled with morgan bfp data
connection.execute("CREATE VIRTUAL TABLE morgan_idx_chembl_molecule " +
"USING rdtree(id, fp bits(1024))");
with connection:
connection.execute(
"INSERT INTO morgan_idx_chembl_molecule(id, fp) " +
"SELECT id, mol_morgan_bfp(molecule, 2, 1024) FROM chembl " +
"WHERE molecule IS NOT NULL")
def tanimoto_count(connection, target, threshold):
print('Target structure:', target)
print('Minimum Tanimoto similarity:', threshold)
t1 = time.time()
count = connection.execute(
"SELECT count(*) FROM "
"morgan_idx_chembl_molecule as idx WHERE "
"idx.id match rdtree_tanimoto(mol_morgan_bfp(mol_from_smiles(?), 2, 1024), ?)",
(target, threshold)).fetchall()[0][0]
t2 = time.time()
print('Found {0} matching objects in {1} seconds'.format(count, t2-t1))
smiles = "O=c1ccc(-c2ccccc2)cn1O"
threshold = 0.1
tanimoto_count(connection, smiles, threshold)
Target structure: O=c1ccc(-c2ccccc2)cn1O Minimum Tanimoto similarity: 0.1 Found 1531142 matching objects in 1.1654012203216553 seconds
rs = connection.execute(
"SELECT c.chembl_id, mol_to_smiles(c.molecule), "
"bfp_tanimoto(mol_morgan_bfp(c.molecule, 2, 1024), "
" mol_morgan_bfp(mol_from_smiles(?1), 2, 1024)) as t "
"FROM "
"chembl as c JOIN morgan_idx_chembl_molecule as idx USING(id) "
"WHERE "
"idx.id MATCH rdtree_tanimoto(mol_morgan_bfp(mol_from_smiles(?1), 2, 1024), ?2) "
"ORDER BY t DESC",
(smiles, threshold)).fetchall()
len(rs)
1531142
def tanimoto_search(connection, target, threshold):
print('searching for target:', target)
t1 = time.time()
rs = connection.execute(
"SELECT c.chembl_id, mol_to_smiles(c.molecule), "
"bfp_tanimoto(mol_morgan_bfp(c.molecule, 2, 1024), "
" mol_morgan_bfp(mol_from_smiles(?1), 2, 1024)) as t "
"FROM "
"chembl as c JOIN morgan_idx_chembl_molecule as idx USING(id) "
"WHERE "
"idx.id MATCH rdtree_tanimoto(mol_morgan_bfp(mol_from_smiles(?1), 2, 1024), ?2) "
"ORDER BY t DESC",
(target, threshold)).fetchall()
t2 = time.time()
for chembl_id, smiles, sim in rs:
print(chembl_id, smiles, sim)
print('Found {0} matches in {1} seconds'.format(len(rs), t2-t1))
tanimoto_search(connection, smiles, threshold)
searching for target: O=c1ccc(-c2ccccc2)cn1O CHEMBL574580 O=c1ccc(-c2ccccc2)cn1O 1.0 CHEMBL2208176 O=c1ccc(-c2ccccc2)cn1-c1ccccc1 0.5666666666666667 CHEMBL3343424 O=c1ccc(CCc2ccccc2)cn1O 0.53125 CHEMBL5197649 O=c1ccc(Cc2ccccc2)cn1O 0.53125 CHEMBL3343426 O=c1ccc(Oc2ccccc2)cn1O 0.53125 CHEMBL5186716 O=c1cc(-c2ccccc2)ccn1O 0.5161290322580645 CHEMBL3343425 O=c1ccc(CCCc2ccccc2)cn1O 0.5 Found 7 matches in 0.3809819221496582 seconds