In [1]:
import sqlite3
import csv
import time
import pandas as pd
from rdkit import Chem
from rdkit.Chem import PandasTools
In [2]:
connection = sqlite3.connect('chembldb.sqlite')
connection.enable_load_extension(True)
connection.load_extension('chemicalite')
connection.enable_load_extension(False)
In [3]:
connection.execute(
    "CREATE TABLE chembl(id INTEGER PRIMARY KEY, chembl_id TEXT, SMILES TEXT, INCHI, INCHIKEY TEXT, molecule MOL)")
Out[3]:
<sqlite3.Cursor at 0x14551c640>
In [4]:
def chembl(path):
    with open(path, 'rt') as inputfile:
        reader = csv.reader(inputfile, delimiter='\t')
        next(reader) # skip header line
        for chembl_id, smiles, inchi, inchikey, *_ in reader:
            yield chembl_id, smiles, inchi, inchikey
In [6]:
with connection:
    connection.executemany(
        "INSERT INTO chembl(chembl_id, smiles, inchi, INCHIKEY, molecule) "
        "VALUES(?1, ?2, ?3, ?4, mol_from_smiles(?2))", chembl('chembl_35_chemreps.txt'))
[13:24:04] WARNING: not removing hydrogen atom without neighbors
[13:24:31] WARNING: not removing hydrogen atom without neighbors
[13:24:31] WARNING: not removing hydrogen atom without neighbors
[13:25:34] WARNING: not removing hydrogen atom without neighbors
[13:25:59] WARNING: not removing hydrogen atom without neighbors
[13:26:01] WARNING: not removing hydrogen atom without neighbors
[13:26:03] WARNING: not removing hydrogen atom without neighbors
[13:26:39] WARNING: not removing hydrogen atom without neighbors
[13:26:39] WARNING: not removing hydrogen atom without neighbors
[13:26:39] WARNING: not removing hydrogen atom without neighbors
[13:26:42] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 5 6 10 11 15 16 17 19 20 21
[13:26:53] WARNING: not removing hydrogen atom without neighbors
[13:26:53] WARNING: not removing hydrogen atom without neighbors
[13:26:53] WARNING: not removing hydrogen atom without neighbors
[13:26:53] WARNING: not removing hydrogen atom without neighbors
In [7]:
mols = connection.execute("SELECT molecule FROM chembl WHERE chembl_id == ('CHEMBL501674')")
mols
Out[7]:
<sqlite3.Cursor at 0x16e1cda40>
In [8]:
cursor = connection.cursor()
chemblid = "CHEMBL501674"
rows = cursor.execute("SELECT * FROM chembl  WHERE chembl_id == ?", (chemblid,),).fetchall()
In [3]:
cursor = connection.cursor()
inchikey = "JFHNJVCGXRRDFC-GSRYTVDASA-N"
rows = cursor.execute("SELECT chembl_id FROM chembl  WHERE INCHIKEY == ?", (inchikey,),).fetchall()
rows
Out[3]:
[('CHEMBL501674',)]
In [4]:
%%time
resID=cursor.execute("SELECT chembl_id FROM chembl WHERE mol_is_substruct(molecule, mol_from_smiles('C1=CC=C(C=C1)C1=CC=CN=C1'))").fetchall()
CPU times: user 53.4 s, sys: 2.32 s, total: 55.7 s
Wall time: 55.7 s
In [9]:
len(resID)
Out[9]:
23785
In [ ]:
 
In [12]:
#substructure searching
# create a virtual table to be filled with fp data
connection.execute("CREATE VIRTUAL TABLE str_idx_chembl_molecule " +
                   "USING rdtree(id, fp bits(2048))")
Out[12]:
<sqlite3.Cursor at 0x14763c0c0>
In [13]:
with connection:
    connection.execute(
        "INSERT INTO str_idx_chembl_molecule(id, fp) " +
        "SELECT id, mol_pattern_bfp(molecule, 2048) FROM chembl " +
        "WHERE molecule IS NOT NULL")
In [26]:
count = connection.execute("SELECT COUNT(*) FROM chembl, str_idx_chembl_molecule AS idx " + 
                "WHERE chembl.id = idx.id AND mol_is_substruct(chembl.molecule, mol_from_smiles('C1=CC=C(C=C1)C1=CC=CN=C1')) " +
                   "AND idx.id MATCH rdtree_subset(mol_pattern_bfp(mol_from_smiles('C1=CC=C(C=C1)C1=CC=CN=C1'), 2048))").fetchall()[0][0]

count
Out[26]:
23785
In [15]:
import argparse
In [6]:
def substructure_search(connection, substructure, limit):
    print('searching for substructure:', substructure)

    t1 = time.time()
    rs = connection.execute(
        "select chembl.chembl_id, mol_to_smiles(chembl.molecule) from "
        "chembl, str_idx_chembl_molecule as idx where "
        "chembl.id = idx.id and "
        "mol_is_substruct(chembl.molecule, mol_from_smiles(?1)) and "
        "idx.id match rdtree_subset(mol_pattern_bfp(mol_from_smiles(?1), 2048)) "
        "limit ?2",
        (substructure, limit)).fetchall()
    t2 = time.time()

    for chembl_id, smiles in rs:
        print(chembl_id, smiles)
    print('Found {0} matches in {1} seconds'.format(len(rs), t2-t1))
In [7]:
smiles = 'C1=CC=C(C=C1)C1=CC=CN=C1'
limit = 10
substructure_search(connection, smiles, limit)
searching for substructure: C1=CC=C(C=C1)C1=CC=CN=C1
CHEMBL574580 O=c1ccc(-c2ccccc2)cn1O
CHEMBL569186 [O-][n+]1cccc(-c2ccccc2)c1
CHEMBL4452320 O=Cc1ccccc1-c1cccnc1
CHEMBL34657 c1ccc(-c2cccnc2)cc1
CHEMBL1605269 Nc1cc(-c2ccccc2)cnc1O
CHEMBL1778131 Nc1cccc(-c2cccnc2)c1
CHEMBL3623821 N#Cc1ccc(-c2cccnc2)cc1
CHEMBL178946 NCc1ccc(-c2cccnc2)cc1
CHEMBL3402235 CNCc1cccc(-c2cccnc2)c1
CHEMBL4548726 NCc1cccc(-c2cccnc2)c1
Found 10 matches in 0.01191401481628418 seconds
In [18]:
#Similarity searching
# create a virtual table to be filled with morgan bfp data
connection.execute("CREATE VIRTUAL TABLE morgan_idx_chembl_molecule " +
            "USING rdtree(id, fp bits(1024))");
In [19]:
with connection:
    connection.execute(
        "INSERT INTO morgan_idx_chembl_molecule(id, fp) " +
        "SELECT id, mol_morgan_bfp(molecule, 2, 1024) FROM chembl " +
        "WHERE molecule IS NOT NULL")
In [20]:
def tanimoto_count(connection, target, threshold):
    print('Target structure:', target)
    print('Minimum Tanimoto similarity:', threshold)

    t1 = time.time()
    count = connection.execute(
        "SELECT count(*) FROM "
        "morgan_idx_chembl_molecule as idx WHERE "
        "idx.id match rdtree_tanimoto(mol_morgan_bfp(mol_from_smiles(?), 2, 1024), ?)",
        (target, threshold)).fetchall()[0][0]
    t2 = time.time()

    print('Found {0} matching objects in {1} seconds'.format(count, t2-t1))
In [27]:
smiles = "O=c1ccc(-c2ccccc2)cn1O"
threshold = 0.1
tanimoto_count(connection, smiles, threshold)
Target structure: O=c1ccc(-c2ccccc2)cn1O
Minimum Tanimoto similarity: 0.1
Found 1531142 matching objects in 1.1654012203216553 seconds
In [ ]:
 
In [28]:
rs = connection.execute(
    "SELECT c.chembl_id, mol_to_smiles(c.molecule), "
    "bfp_tanimoto(mol_morgan_bfp(c.molecule, 2, 1024), "
    "             mol_morgan_bfp(mol_from_smiles(?1), 2, 1024)) as t "
    "FROM "
    "chembl as c JOIN morgan_idx_chembl_molecule as idx USING(id) "
    "WHERE "
    "idx.id MATCH rdtree_tanimoto(mol_morgan_bfp(mol_from_smiles(?1), 2, 1024), ?2) "
    "ORDER BY t DESC",
    (smiles, threshold)).fetchall()
In [30]:
len(rs)
Out[30]:
1531142
In [24]:
def tanimoto_search(connection, target, threshold):
    print('searching for target:', target)

    t1 = time.time()
    rs = connection.execute(
        "SELECT c.chembl_id, mol_to_smiles(c.molecule), "
        "bfp_tanimoto(mol_morgan_bfp(c.molecule, 2, 1024), "
        "             mol_morgan_bfp(mol_from_smiles(?1), 2, 1024)) as t "
        "FROM "
        "chembl as c JOIN morgan_idx_chembl_molecule as idx USING(id) "
        "WHERE "
        "idx.id MATCH rdtree_tanimoto(mol_morgan_bfp(mol_from_smiles(?1), 2, 1024), ?2) "
        "ORDER BY t DESC",
        (target, threshold)).fetchall()
    t2 = time.time()

    for chembl_id, smiles, sim in rs:
        print(chembl_id, smiles, sim)

    print('Found {0} matches in {1} seconds'.format(len(rs), t2-t1))
In [25]:
tanimoto_search(connection, smiles, threshold)
searching for target: O=c1ccc(-c2ccccc2)cn1O
CHEMBL574580 O=c1ccc(-c2ccccc2)cn1O 1.0
CHEMBL2208176 O=c1ccc(-c2ccccc2)cn1-c1ccccc1 0.5666666666666667
CHEMBL3343424 O=c1ccc(CCc2ccccc2)cn1O 0.53125
CHEMBL5197649 O=c1ccc(Cc2ccccc2)cn1O 0.53125
CHEMBL3343426 O=c1ccc(Oc2ccccc2)cn1O 0.53125
CHEMBL5186716 O=c1cc(-c2ccccc2)ccn1O 0.5161290322580645
CHEMBL3343425 O=c1ccc(CCCc2ccccc2)cn1O 0.5
Found 7 matches in 0.3809819221496582 seconds
In [ ]: