import sqlite3
import csv
import time
import pandas as pd
from rdkit import Chem
from rdkit.Chem import PandasTools


connection = sqlite3.connect('chembldb.sqlite')
connection.enable_load_extension(True)
connection.load_extension('chemicalite')
connection.enable_load_extension(False)


connection.execute(
    "CREATE TABLE chembl(id INTEGER PRIMARY KEY, chembl_id TEXT, SMILES TEXT, INCHI, INCHIKEY TEXT, molecule MOL)")

<sqlite3.Cursor at 0x14551c640>


def chembl(path):
    with open(path, 'rt') as inputfile:
        reader = csv.reader(inputfile, delimiter='\t')
        next(reader) # skip header line
        for chembl_id, smiles, inchi, inchikey, *_ in reader:
            yield chembl_id, smiles, inchi, inchikey


with connection:
    connection.executemany(
        "INSERT INTO chembl(chembl_id, smiles, inchi, INCHIKEY, molecule) "
        "VALUES(?1, ?2, ?3, ?4, mol_from_smiles(?2))", chembl('chembl_35_chemreps.txt'))

[13:24:04] WARNING: not removing hydrogen atom without neighbors
[13:24:31] WARNING: not removing hydrogen atom without neighbors
[13:24:31] WARNING: not removing hydrogen atom without neighbors
[13:25:34] WARNING: not removing hydrogen atom without neighbors
[13:25:59] WARNING: not removing hydrogen atom without neighbors
[13:26:01] WARNING: not removing hydrogen atom without neighbors
[13:26:03] WARNING: not removing hydrogen atom without neighbors
[13:26:39] WARNING: not removing hydrogen atom without neighbors
[13:26:39] WARNING: not removing hydrogen atom without neighbors
[13:26:39] WARNING: not removing hydrogen atom without neighbors
[13:26:42] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 5 6 10 11 15 16 17 19 20 21
[13:26:53] WARNING: not removing hydrogen atom without neighbors
[13:26:53] WARNING: not removing hydrogen atom without neighbors
[13:26:53] WARNING: not removing hydrogen atom without neighbors
[13:26:53] WARNING: not removing hydrogen atom without neighbors


mols = connection.execute("SELECT molecule FROM chembl WHERE chembl_id == ('CHEMBL501674')")
mols

<sqlite3.Cursor at 0x16e1cda40>


cursor = connection.cursor()
chemblid = "CHEMBL501674"
rows = cursor.execute("SELECT * FROM chembl  WHERE chembl_id == ?", (chemblid,),).fetchall()


cursor = connection.cursor()
inchikey = "JFHNJVCGXRRDFC-GSRYTVDASA-N"
rows = cursor.execute("SELECT chembl_id FROM chembl  WHERE INCHIKEY == ?", (inchikey,),).fetchall()
rows

[('CHEMBL501674',)]


%%time
resID=cursor.execute("SELECT chembl_id FROM chembl WHERE mol_is_substruct(molecule, mol_from_smiles('C1=CC=C(C=C1)C1=CC=CN=C1'))").fetchall()

CPU times: user 53.4 s, sys: 2.32 s, total: 55.7 s
Wall time: 55.7 s


len(resID)

23785


#substructure searching
# create a virtual table to be filled with fp data
connection.execute("CREATE VIRTUAL TABLE str_idx_chembl_molecule " +
                   "USING rdtree(id, fp bits(2048))")

<sqlite3.Cursor at 0x14763c0c0>


with connection:
    connection.execute(
        "INSERT INTO str_idx_chembl_molecule(id, fp) " +
        "SELECT id, mol_pattern_bfp(molecule, 2048) FROM chembl " +
        "WHERE molecule IS NOT NULL")


count = connection.execute("SELECT COUNT(*) FROM chembl, str_idx_chembl_molecule AS idx " + 
                "WHERE chembl.id = idx.id AND mol_is_substruct(chembl.molecule, mol_from_smiles('C1=CC=C(C=C1)C1=CC=CN=C1')) " +
                   "AND idx.id MATCH rdtree_subset(mol_pattern_bfp(mol_from_smiles('C1=CC=C(C=C1)C1=CC=CN=C1'), 2048))").fetchall()[0][0]

count

23785


import argparse


def substructure_search(connection, substructure, limit):
    print('searching for substructure:', substructure)

    t1 = time.time()
    rs = connection.execute(
        "select chembl.chembl_id, mol_to_smiles(chembl.molecule) from "
        "chembl, str_idx_chembl_molecule as idx where "
        "chembl.id = idx.id and "
        "mol_is_substruct(chembl.molecule, mol_from_smiles(?1)) and "
        "idx.id match rdtree_subset(mol_pattern_bfp(mol_from_smiles(?1), 2048)) "
        "limit ?2",
        (substructure, limit)).fetchall()
    t2 = time.time()

    for chembl_id, smiles in rs:
        print(chembl_id, smiles)
    print('Found {0} matches in {1} seconds'.format(len(rs), t2-t1))


smiles = 'C1=CC=C(C=C1)C1=CC=CN=C1'
limit = 10
substructure_search(connection, smiles, limit)

searching for substructure: C1=CC=C(C=C1)C1=CC=CN=C1
CHEMBL574580 O=c1ccc(-c2ccccc2)cn1O
CHEMBL569186 [O-][n+]1cccc(-c2ccccc2)c1
CHEMBL4452320 O=Cc1ccccc1-c1cccnc1
CHEMBL34657 c1ccc(-c2cccnc2)cc1
CHEMBL1605269 Nc1cc(-c2ccccc2)cnc1O
CHEMBL1778131 Nc1cccc(-c2cccnc2)c1
CHEMBL3623821 N#Cc1ccc(-c2cccnc2)cc1
CHEMBL178946 NCc1ccc(-c2cccnc2)cc1
CHEMBL3402235 CNCc1cccc(-c2cccnc2)c1
CHEMBL4548726 NCc1cccc(-c2cccnc2)c1
Found 10 matches in 0.01191401481628418 seconds


#Similarity searching
# create a virtual table to be filled with morgan bfp data
connection.execute("CREATE VIRTUAL TABLE morgan_idx_chembl_molecule " +
            "USING rdtree(id, fp bits(1024))");


with connection:
    connection.execute(
        "INSERT INTO morgan_idx_chembl_molecule(id, fp) " +
        "SELECT id, mol_morgan_bfp(molecule, 2, 1024) FROM chembl " +
        "WHERE molecule IS NOT NULL")


def tanimoto_count(connection, target, threshold):
    print('Target structure:', target)
    print('Minimum Tanimoto similarity:', threshold)

    t1 = time.time()
    count = connection.execute(
        "SELECT count(*) FROM "
        "morgan_idx_chembl_molecule as idx WHERE "
        "idx.id match rdtree_tanimoto(mol_morgan_bfp(mol_from_smiles(?), 2, 1024), ?)",
        (target, threshold)).fetchall()[0][0]
    t2 = time.time()

    print('Found {0} matching objects in {1} seconds'.format(count, t2-t1))


smiles = "O=c1ccc(-c2ccccc2)cn1O"
threshold = 0.1
tanimoto_count(connection, smiles, threshold)

Target structure: O=c1ccc(-c2ccccc2)cn1O
Minimum Tanimoto similarity: 0.1
Found 1531142 matching objects in 1.1654012203216553 seconds


rs = connection.execute(
    "SELECT c.chembl_id, mol_to_smiles(c.molecule), "
    "bfp_tanimoto(mol_morgan_bfp(c.molecule, 2, 1024), "
    "             mol_morgan_bfp(mol_from_smiles(?1), 2, 1024)) as t "
    "FROM "
    "chembl as c JOIN morgan_idx_chembl_molecule as idx USING(id) "
    "WHERE "
    "idx.id MATCH rdtree_tanimoto(mol_morgan_bfp(mol_from_smiles(?1), 2, 1024), ?2) "
    "ORDER BY t DESC",
    (smiles, threshold)).fetchall()


len(rs)

1531142


def tanimoto_search(connection, target, threshold):
    print('searching for target:', target)

    t1 = time.time()
    rs = connection.execute(
        "SELECT c.chembl_id, mol_to_smiles(c.molecule), "
        "bfp_tanimoto(mol_morgan_bfp(c.molecule, 2, 1024), "
        "             mol_morgan_bfp(mol_from_smiles(?1), 2, 1024)) as t "
        "FROM "
        "chembl as c JOIN morgan_idx_chembl_molecule as idx USING(id) "
        "WHERE "
        "idx.id MATCH rdtree_tanimoto(mol_morgan_bfp(mol_from_smiles(?1), 2, 1024), ?2) "
        "ORDER BY t DESC",
        (target, threshold)).fetchall()
    t2 = time.time()

    for chembl_id, smiles, sim in rs:
        print(chembl_id, smiles, sim)

    print('Found {0} matches in {1} seconds'.format(len(rs), t2-t1))


tanimoto_search(connection, smiles, threshold)

searching for target: O=c1ccc(-c2ccccc2)cn1O
CHEMBL574580 O=c1ccc(-c2ccccc2)cn1O 1.0
CHEMBL2208176 O=c1ccc(-c2ccccc2)cn1-c1ccccc1 0.5666666666666667
CHEMBL3343424 O=c1ccc(CCc2ccccc2)cn1O 0.53125
CHEMBL5197649 O=c1ccc(Cc2ccccc2)cn1O 0.53125
CHEMBL3343426 O=c1ccc(Oc2ccccc2)cn1O 0.53125
CHEMBL5186716 O=c1cc(-c2ccccc2)ccn1O 0.5161290322580645
CHEMBL3343425 O=c1ccc(CCCc2ccccc2)cn1O 0.5
Found 7 matches in 0.3809819221496582 seconds