In [1]:
import sqlite3
import time
import csv
import pandas as pd
from rdkit import Chem
from rdkit.Chem import PandasTools
In [2]:
# Open (or create) the SQLite database file and load the ChemicaLite
# extension into this connection so its SQL functions (mol_from_smiles,
# mol_is_substruct, ...) can be used in the queries below.
connection = sqlite3.connect("Pubchem3.sqlite")
# Extension loading is off by default in sqlite3; enable it only for the
# duration of the load itself.
connection.enable_load_extension(True)
try:
    # NOTE(review): absolute, machine-specific path - adjust to the local
    # ChemicaLite build before running on another machine.
    connection.load_extension('/Users/chrisswain/miniconda3/envs/rdkitenv/lib/chemicalite.dylib')
finally:
    # Always switch loading back off, even if the load above raised, because
    # leaving it enabled is a potential security risk.
    connection.enable_load_extension(False)
In [3]:
# Create the main table: one row per compound with its SMILES text, integer
# ID (primary key), InChIKey, and a `molecule` column declared with the MOL
# type name used for ChemicaLite molecules.
# NOTE(review): a plain CREATE TABLE raises if ID_DATA already exists in
# Pubchem3.sqlite, so this cell cannot be re-run against an existing database.
connection.execute(
    "CREATE TABLE ID_DATA (SMILES TEXT NOT NULL,ID INT PRIMARY KEY,INCHIKEY TEXT NOT NULL, molecule MOL)")
Out[3]:
<sqlite3.Cursor at 0x13cf0c840>
In [4]:
def pubchem(path):
    """Yield ``(SMILES, ID, INCHIKEY)`` triples from a tab-separated file.

    The first row is treated as a header and skipped. Only the first three
    columns of every following row are used; any extra columns are ignored.
    Values are yielded as the raw strings read from the file (no type
    conversion is done here).

    Parameters
    ----------
    path : str or path-like
        Location of the tab-separated input file.

    Yields
    ------
    tuple of (str, str, str)
        The SMILES, ID and INCHIKEY fields of one data row.

    Raises
    ------
    ValueError
        If a non-empty row has fewer than three columns.
    """
    # newline='' lets the csv module do its own newline handling, as the csv
    # documentation recommends for files passed to csv.reader.
    with open(path, 'rt', newline='') as inputfile:
        reader = csv.reader(inputfile, delimiter='\t')
        # Skip the header. The None default keeps an empty file from raising
        # StopIteration inside the generator (which surfaces as RuntimeError).
        next(reader, None)
        for row in reader:
            if not row:
                # csv.reader returns [] for blank lines (e.g. a trailing
                # newline at the end of the file); there is nothing to load.
                continue
            SMILES, ID, INCHIKEY, *_ = row
            yield SMILES, ID, INCHIKEY
    
In [ ]:
 
In [5]:
# Bulk-load ID_DATA from the TSV file streamed by pubchem().
# `with connection:` wraps the insert in a transaction (commit on success,
# rollback if an exception is raised).
# ?1 is used twice: the SMILES is stored as text and also passed to
# ChemicaLite's mol_from_smiles() to fill the `molecule` column.
# NOTE(review): the input path is absolute and machine-specific.
# NOTE(review): SMILES that RDKit cannot parse presumably end up with a NULL
# molecule (later cells filter on `molecule IS NOT NULL`) - confirm.
with connection:
    connection.executemany(
    "INSERT INTO ID_DATA(SMILES, ID, INCHIKEY, molecule)"
    "VALUES(?1, ?2, ?3, mol_from_smiles(?1))", pubchem('/Users/chrisswain/Projects/Pubchem/InChiKey/AllInChiKey.tsv'))
[12:00:37] WARNING: not removing hydrogen atom without neighbors
[12:00:37] WARNING: not removing hydrogen atom without neighbors
[12:00:37] WARNING: not removing hydrogen atom without neighbors
[12:00:37] WARNING: not removing hydrogen atom without neighbors
[12:00:37] WARNING: not removing hydrogen atom without neighbors

[14:14:05] WARNING: not removing hydrogen atom without neighbors
[14:14:05] WARNING: not removing hydrogen atom without neighbors
[14:14:05] WARNING: not removing hydrogen atom without neighbors
[14:14:05] WARNING: not removing hydrogen atom without neighbors
[14:14:05] Explicit valence for atom # 1 Cl, 3, is greater than permitted
In [6]:
# Look up a compound by exact InChIKey. The key is passed as a bound
# parameter rather than formatted into the SQL string.
cursor = connection.cursor()
targetinchikey = "WPQAOGZTDKTBHI-UHFFFAOYSA-N"
rows = cursor.execute("SELECT * FROM ID_DATA  WHERE INCHIKEY == ?", (targetinchikey,),).fetchall()
rows
# SELECT * also returns the `molecule` column, which comes back as a raw
# binary (bytes) object.
Out[6]:
[('OC1(C(=CCC(C1)C(=C)C)C)C1(O)C(=CCC(C1)C(=C)C)C',
  500061,
  'WPQAOGZTDKTBHI-UHFFFAOYSA-N',
  b'MOL\x00\xef\xbe\xad\xde\x00\x00\x00\x00\x10\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x16\x00\x00\x00\x17\x00\x00\x00\x80\x01\x08\x00`\x00\x00\x00\x01\x01\x06\x00 \x00\x00\x00\x04\x06\x00(\x00\x00\x00\x03\x04\x06\x00h\x00\x00\x00\x03\x03\x01\x06\x00`\x00\x00\x00\x02\x02\x06\x00`\x00\x00\x00\x03\x01\x06\x00`\x00\x00\x00\x02\x02\x06\x00(\x00\x00\x00\x03\x04\x06\x00h\x00\x00\x00\x03\x02\x02\x06\x00`\x00\x00\x00\x01\x03\x06\x00`\x00\x00\x00\x01\x03\x06\x00 \x00\x00\x00\x04\x08\x00`\x00\x00\x00\x01\x01\x06\x00(\x00\x00\x00\x03\x04\x06\x00h\x00\x00\x00\x03\x03\x01\x06\x00`\x00\x00\x00\x02\x02\x06\x00`\x00\x00\x00\x03\x01\x06\x00`\x00\x00\x00\x02\x02\x06\x00(\x00\x00\x00\x03\x04\x06\x00h\x00\x00\x00\x03\x02\x02\x06\x00`\x00\x00\x00\x01\x03\x06\x00`\x00\x00\x00\x01\x03\x0b\x00\x01\x00\x01\x02\x00\x02\x03\x08\x02\x03\x04\x00\x04\x05\x00\x05\x06\x00\x05\x07\x00\x07\x08\x08\x02\x07\t\x00\x02\n\x00\x01\x0b\x00\x0b\x0c\x00\x0b\r\x00\r\x0e\x08\x02\x0e\x0f\x00\x0f\x10\x00\x10\x11\x00\x10\x12\x00\x12\x13\x08\x02\x12\x14\x00\r\x15\x00\x06\x01\x00\x11\x0b\x00B\x02\x00\x00\x00\x06\x02\x03\x04\x05\x06\x01\x06\r\x0e\x0f\x10\x11\x0b\x17\t\x00\x00\x00\x00\x00\x00\x00?\x00\x00\x00\x00\x12d\x00\x00\x00\x03\x00\x0f\x00\x00\x00__computedProps\x06\x02\x00\x00\x00\x00\x00\x00\x00\x07\x00\x00\x00numArom\x0f\x00\x00\x00_StereochemDone\x07\x00\x00\x00numArom\x01\x00\x00\x00\x00\x0f\x00\x00\x00_StereochemDone\x01\x01\x00\x00\x00\x13:0\x05\x00\x00\x02\x00\x0f\x00\x00\x00__computedProps\x06\x01\x00\x00\x00\x00\x00\x00\x00\x08\x00\x00\x00_CIPRank\x08\x00\x00\x00_CIPRank\x02\n\x00\x00\x00\x00\x02\x00\x0f\x00\x00\x00__computedProps\x06\x01\x00\x00\x00\x00\x00\x00\x00\x08\x00\x00\x00_CIPRank\x08\x00\x00\x00_CIPRank\x02\t\x00\x00\x00\x08\x01\x00\x02\x00\x0f\x00\x00\x00__computedProps\x06\x01\x00\x00\x00\x00\x00\x00\x00\x08\x00\x00\x00_CIPRank\x08\x00\x00\x00_CIPRank\x02\x08\x00\x00\x00\x00\x02\x00\x0f\x00\x00\x00__computedProps\x06\x01\x00\x00\x00\x00\x00\x00\x00\x08\x00\x00\x00_CIPRank\x08\x00\x00\
x00_CIPRank\x02\x06\x00\x00\x00\x00\x02\x00\x0f\x00\x00\x00__computedProps\x06\x01\x00\x00\x00\x00\x00\x00\x00\x08\x00\x00\x00_CIPRank\x08\x00\x00\x00_CIPRank\x02\x02\x00\x00\x00\x00\x02\x00\x0f\x00\x00\x00__computedProps\x06\x01\x00\x00\x00\x00\x00\x00\x00\x08\x00\x00\x00_CIPRank\x08\x00\x00\x00_CIPRank\x02\x05\x00\x00\x00\x08\x01\x00\x02\x00\x0f\x00\x00\x00__computedProps\x06\x01\x00\x00\x00\x00\x00\x00\x00\x08\x00\x00\x00_CIPRank\x08\x00\x00\x00_CIPRank\x02\x04\x00\x00\x00\x00\x02\x00\x0f\x00\x00\x00__computedProps\x06\x01\x00\x00\x00\x00\x00\x00\x00\x08\x00\x00\x00_CIPRank\x08\x00\x00\x00_CIPRank\x02\x07\x00\x00\x00\x00\x02\x00\x0f\x00\x00\x00__computedProps\x06\x01\x00\x00\x00\x00\x00\x00\x00\x08\x00\x00\x00_CIPRank\x08\x00\x00\x00_CIPRank\x02\x03\x00\x00\x00\x00\x02\x00\x0f\x00\x00\x00__computedProps\x06\x01\x00\x00\x00\x00\x00\x00\x00\x08\x00\x00\x00_CIPRank\x08\x00\x00\x00_CIPRank\x02\x00\x00\x00\x00\x00\x02\x00\x0f\x00\x00\x00__computedProps\x06\x01\x00\x00\x00\x00\x00\x00\x00\x08\x00\x00\x00_CIPRank\x08\x00\x00\x00_CIPRank\x02\x01\x00\x00\x00\x00\x02\x00\x0f\x00\x00\x00__computedProps\x06\x01\x00\x00\x00\x00\x00\x00\x00\x08\x00\x00\x00_CIPRank\x08\x00\x00\x00_CIPRank\x02\t\x00\x00\x00\x08\x01\x00\x02\x00\x0f\x00\x00\x00__computedProps\x06\x01\x00\x00\x00\x00\x00\x00\x00\x08\x00\x00\x00_CIPRank\x08\x00\x00\x00_CIPRank\x02\n\x00\x00\x00\x00\x02\x00\x0f\x00\x00\x00__computedProps\x06\x01\x00\x00\x00\x00\x00\x00\x00\x08\x00\x00\x00_CIPRank\x08\x00\x00\x00_CIPRank\x02\x08\x00\x00\x00\x00\x02\x00\x0f\x00\x00\x00__computedProps\x06\x01\x00\x00\x00\x00\x00\x00\x00\x08\x00\x00\x00_CIPRank\x08\x00\x00\x00_CIPRank\x02\x06\x00\x00\x00\x00\x02\x00\x0f\x00\x00\x00__computedProps\x06\x01\x00\x00\x00\x00\x00\x00\x00\x08\x00\x00\x00_CIPRank\x08\x00\x00\x00_CIPRank\x02\x02\x00\x00\x00\x00\x02\x00\x0f\x00\x00\x00__computedProps\x06\x01\x00\x00\x00\x00\x00\x00\x00\x08\x00\x00\x00_CIPRank\x08\x00\x00\x00_CIPRank\x02\x05\x00\x00\x00\x08\x01\x00\x02\x00\x0f\x00\x00\x00__computed
Props\x06\x01\x00\x00\x00\x00\x00\x00\x00\x08\x00\x00\x00_CIPRank\x08\x00\x00\x00_CIPRank\x02\x04\x00\x00\x00\x00\x02\x00\x0f\x00\x00\x00__computedProps\x06\x01\x00\x00\x00\x00\x00\x00\x00\x08\x00\x00\x00_CIPRank\x08\x00\x00\x00_CIPRank\x02\x07\x00\x00\x00\x00\x02\x00\x0f\x00\x00\x00__computedProps\x06\x01\x00\x00\x00\x00\x00\x00\x00\x08\x00\x00\x00_CIPRank\x08\x00\x00\x00_CIPRank\x02\x03\x00\x00\x00\x00\x02\x00\x0f\x00\x00\x00__computedProps\x06\x01\x00\x00\x00\x00\x00\x00\x00\x08\x00\x00\x00_CIPRank\x08\x00\x00\x00_CIPRank\x02\x00\x00\x00\x00\x00\x02\x00\x0f\x00\x00\x00__computedProps\x06\x01\x00\x00\x00\x00\x00\x00\x00\x08\x00\x00\x00_CIPRank\x08\x00\x00\x00_CIPRank\x02\x01\x00\x00\x00\x00\x13\x16')]
In [7]:
# Substructure search: collect the IDs of all rows for which
# mol_is_substruct(molecule, <query mol>) is true. The query molecule is
# built from SMILES inside SQL with mol_from_smiles().
resID=cursor.execute("SELECT ID FROM ID_DATA WHERE mol_is_substruct(molecule, mol_from_smiles('Nc1nc2[nH]c(CN3CCN(Cc4cccc(F)c4)CC3)c(CN3CCN(Cc4cccc(F)c4)CC3)c2c(=O)[nH]1'))").fetchall()

resID
Out[7]:
[(149878534,), (153408163,)]
In [8]:
# To return the SMILES of the matching rows instead of their IDs, select the
# SMILES column; each result row is a 1-tuple.
resSMILES=cursor.execute("SELECT SMILES FROM ID_DATA WHERE mol_is_substruct(molecule, mol_from_smiles('O=C1NC=NC2=C1C(CN1CCNCC1)=CN2'))").fetchall()

resSMILES
Out[8]:
[('c1ccc(cc1)N1CCN(CC1)Cc1c2c([nH]c1C)nc([nH]c2=O)N(C)C',),
 ('Clc1ccc(cc1)N1CCN(CC1)Cc1c2c([nH]c1C)nc([nH]c2=O)N(C)C',),
 ('c1ccc(c(c1)N1CCN(CC1)Cc1c2c([nH]c1C)nc([nH]c2=O)N(C)C)OC',),
 ('c1ccc(c(c1)N1CCN(CC1)Cc1c2c([nH]c1C)nc([nH]c2=O)N(C)C)Cl',),
 ('c1cc(cc(c1Cl)Cl)N1CCN(CC1)Cc1c2c([nH]c1C)nc([nH]c2=O)N(C)C',),
 ('c1ccc(cc1F)CN1CCN(CC1)Cc1c2c(=O)[nH]c(nc2[nH]c1CN1CCN(CC1)Cc1cccc(c1)F)NC(=O)CCCCCCC',),
 ('[O-][N+](=O)c1ccc(cc1)N1CCN(CC1)Cc1[nH]c2c(c1CN1CCN(CC1)c1ccc(cc1)[N+](=O)[O-])c(=O)[nH]c(n2)NC(=O)CCCCCCC',),
 ('c1ccc(cc1)CN1CCN(CC1)Cc1[nH]c2c(c1CN1CCN(CC1)Cc1ccccc1)c(=O)[nH]c(n2)NC(=O)CCCCCCC',),
 ('Fc1ccc(cc1)CN1CCN(CC1)Cc1[nH]c2c(c1CN1CCN(CC1)Cc1ccc(cc1)F)c(=O)[nH]c(n2)NC(=O)CCCCCCC',),
 ('c1ccc(c(c1)F)CN1CCN(CC1)Cc1c2c(=O)[nH]c(nc2[nH]c1CN1CCN(CC1)Cc1ccccc1F)NC(=O)CCCCCCC',),
 ('Nc1nc2[nH]c(c(c2c(=O)[nH]1)CN1CCN(CC1)c1cc(Cl)cc(c1)Cl)CN1CCN(CC1)c1cc(Cl)cc(c1)Cl',),
 ('c1c(Cl)cc(cc1Cl)N1CCN(CC1)Cc1[nH]c2c(c1CN1CCN(CC1)c1cc(Cl)cc(c1)Cl)c(=O)[nH]c(n2)NC(=O)CCCCCCC',),
 ('Nc1nc2[nH]c(c(c2c(=O)[nH]1)CN1CCN(CC1)Cc1ccccc1F)CN1CCN(CC1)Cc1ccccc1F',),
 ('Nc1nc2[nH]c(c(c2c(=O)[nH]1)CN1CCN(CC1)C(c1ccccc1)c1ccccc1)CN1CCN(CC1)C(c1ccccc1)c1ccccc1',),
 ('Nc1nc2[nH]c(c(c2c(=O)[nH]1)CN1CCN(CC1)Cc1ccccc1)CN1CCN(CC1)Cc1ccccc1',),
 ('c1ccc(cc1)C(N1CCN(CC1)Cc1[nH]c2c(c1CN1CCN(CC1)C(c1ccccc1)c1ccccc1)c(=O)[nH]c(n2)NC(=O)CCCCCCC)c1ccccc1',),
 ('Nc1nc2[nH]c(c(c2c(=O)[nH]1)CN1CCN(CC1)c1ccc(cc1)[N+](=O)[O-])CN1CCN(CC1)c1ccc(cc1)[N+](=O)[O-]',),
 ('Nc1nc2[nH]c(c(c2c(=O)[nH]1)CN1CCN(CC1)Cc1ccc(cc1)F)CN1CCN(CC1)Cc1ccc(cc1)F',),
 ('Nc1nc2[nH]c(c(c2c(=O)[nH]1)CN1CCN(CC1)Cc1cccc(c1)F)CN1CCN(CC1)Cc1cccc(c1)F',),
 ('c1ccc(cc1I)CN1CCN(CC1)Cc1c2c(=O)[nH]c(nc2[nH]c1CN1CCN(CC1)Cc1cccc(c1)I)NC(=O)CCCCCCC',)]
In [9]:
# To get RDKit mol objects back in Python, select
# mol_to_binary_mol(molecule): each result row then holds one binary blob
# that the next cell passes to Chem.Mol().
mols=cursor.execute("SELECT mol_to_binary_mol(molecule)  FROM ID_DATA WHERE mol_is_substruct(molecule, mol_from_smiles('Nc1nc2[nH]c(CN3CCN(Cc4cccc(F)c4)CC3)c(CN3CCN(Cc4cccc(F)c4)CC3)c2c(=O)[nH]1'))").fetchall()
In [10]:
# Rebuild an RDKit molecule from the binary blob in each 1-tuple row.
rdmols = [Chem.Mol(b[0]) for b in mols]
# Show the first hit; this raises IndexError if the query returned no rows.
mol = rdmols[0]
mol
Out[10]:

Speeding things up¶

In [11]:
# ChemicaLite uses a virtual-table mechanism to index binary fingerprints in an RD-tree data structure, which improves the performance of substructure and similarity queries.
In [12]:
%%time
resSMILES=cursor.execute("SELECT SMILES FROM ID_DATA WHERE mol_is_substruct(molecule, mol_from_smiles('O=C1NC=NC2=C1C(CN1CCNCC1)=CN2'))")
CPU times: user 19min 2s, sys: 1min 10s, total: 20min 12s
Wall time: 21min 7s
In [13]:
# Print every hit; each row is a 1-tuple holding the SMILES string.
for smiles in resSMILES:
        print(smiles)
('c1ccc(cc1)N1CCN(CC1)Cc1c2c([nH]c1C)nc([nH]c2=O)N(C)C',)
('Clc1ccc(cc1)N1CCN(CC1)Cc1c2c([nH]c1C)nc([nH]c2=O)N(C)C',)
('c1ccc(c(c1)N1CCN(CC1)Cc1c2c([nH]c1C)nc([nH]c2=O)N(C)C)OC',)
('c1ccc(c(c1)N1CCN(CC1)Cc1c2c([nH]c1C)nc([nH]c2=O)N(C)C)Cl',)
('c1cc(cc(c1Cl)Cl)N1CCN(CC1)Cc1c2c([nH]c1C)nc([nH]c2=O)N(C)C',)
('c1ccc(cc1F)CN1CCN(CC1)Cc1c2c(=O)[nH]c(nc2[nH]c1CN1CCN(CC1)Cc1cccc(c1)F)NC(=O)CCCCCCC',)
('[O-][N+](=O)c1ccc(cc1)N1CCN(CC1)Cc1[nH]c2c(c1CN1CCN(CC1)c1ccc(cc1)[N+](=O)[O-])c(=O)[nH]c(n2)NC(=O)CCCCCCC',)
('c1ccc(cc1)CN1CCN(CC1)Cc1[nH]c2c(c1CN1CCN(CC1)Cc1ccccc1)c(=O)[nH]c(n2)NC(=O)CCCCCCC',)
('Fc1ccc(cc1)CN1CCN(CC1)Cc1[nH]c2c(c1CN1CCN(CC1)Cc1ccc(cc1)F)c(=O)[nH]c(n2)NC(=O)CCCCCCC',)
('c1ccc(c(c1)F)CN1CCN(CC1)Cc1c2c(=O)[nH]c(nc2[nH]c1CN1CCN(CC1)Cc1ccccc1F)NC(=O)CCCCCCC',)
('Nc1nc2[nH]c(c(c2c(=O)[nH]1)CN1CCN(CC1)c1cc(Cl)cc(c1)Cl)CN1CCN(CC1)c1cc(Cl)cc(c1)Cl',)
('c1c(Cl)cc(cc1Cl)N1CCN(CC1)Cc1[nH]c2c(c1CN1CCN(CC1)c1cc(Cl)cc(c1)Cl)c(=O)[nH]c(n2)NC(=O)CCCCCCC',)
('Nc1nc2[nH]c(c(c2c(=O)[nH]1)CN1CCN(CC1)Cc1ccccc1F)CN1CCN(CC1)Cc1ccccc1F',)
('Nc1nc2[nH]c(c(c2c(=O)[nH]1)CN1CCN(CC1)C(c1ccccc1)c1ccccc1)CN1CCN(CC1)C(c1ccccc1)c1ccccc1',)
('Nc1nc2[nH]c(c(c2c(=O)[nH]1)CN1CCN(CC1)Cc1ccccc1)CN1CCN(CC1)Cc1ccccc1',)
('c1ccc(cc1)C(N1CCN(CC1)Cc1[nH]c2c(c1CN1CCN(CC1)C(c1ccccc1)c1ccccc1)c(=O)[nH]c(n2)NC(=O)CCCCCCC)c1ccccc1',)
('Nc1nc2[nH]c(c(c2c(=O)[nH]1)CN1CCN(CC1)c1ccc(cc1)[N+](=O)[O-])CN1CCN(CC1)c1ccc(cc1)[N+](=O)[O-]',)
('Nc1nc2[nH]c(c(c2c(=O)[nH]1)CN1CCN(CC1)Cc1ccc(cc1)F)CN1CCN(CC1)Cc1ccc(cc1)F',)
('Nc1nc2[nH]c(c(c2c(=O)[nH]1)CN1CCN(CC1)Cc1cccc(c1)F)CN1CCN(CC1)Cc1cccc(c1)F',)
('c1ccc(cc1I)CN1CCN(CC1)Cc1c2c(=O)[nH]c(nc2[nH]c1CN1CCN(CC1)Cc1cccc(c1)I)NC(=O)CCCCCCC',)
In [14]:
# Create an rdtree virtual table to hold one 2048-bit fingerprint per
# compound id; it is filled in the next cell and used as the substructure
# search index.
# NOTE(review): raises if the virtual table already exists, so this cell is
# not re-runnable against an existing database.
connection.execute("CREATE VIRTUAL TABLE str_idx_pubchem_molecule " + 
                   "USING rdtree(id, fp bits(2048))")
Out[14]:
<sqlite3.Cursor at 0x13cf0e540>
In [15]:
# Calculate a 2048-bit pattern fingerprint for every stored molecule and
# insert it into the index table, keyed by the compound ID. This will take a
# while.
# Rows with a NULL molecule are skipped. The insert runs in a single
# transaction via `with connection:`.
with connection:
    connection.execute(
        "INSERT INTO str_idx_pubchem_molecule(id, fp) " +
        "SELECT ID, mol_pattern_bfp(molecule, 2048) FROM ID_DATA " +
        "WHERE molecule IS NOT NULL")
In [ ]:
 
In [16]:
%%time
# Indexed substructure search: join ID_DATA to the rdtree index on ID, use
# `idx.id MATCH rdtree_subset(<query pattern fingerprint>)` as the
# fingerprint filter, and keep mol_is_substruct() as the exact check.
# The fingerprint length (2048) matches the one used to build the index.
res1=cursor.execute("SELECT SMILES FROM ID_DATA, str_idx_pubchem_molecule AS idx WHERE ID_DATA.ID = idx.id AND mol_is_substruct(ID_DATA.molecule,  mol_from_smiles('O=C1NC=NC2=C1C(CN1CCNCC1)=CN2')) AND idx.id MATCH rdtree_subset(mol_pattern_bfp(mol_from_smiles('O=C1NC=NC2=C1C(CN1CCNCC1)=CN2'), 2048))").fetchall()
CPU times: user 300 ms, sys: 117 ms, total: 417 ms
Wall time: 1.1 s
In [17]:
# Print the hits from the indexed search, one 1-tuple row per line.
for smiles in res1:
        print(smiles)
('c1c(Cl)cc(cc1Cl)N1CCN(CC1)Cc1[nH]c2c(c1CN1CCN(CC1)c1cc(Cl)cc(c1)Cl)c(=O)[nH]c(n2)NC(=O)CCCCCCC',)
('Nc1nc2[nH]c(c(c2c(=O)[nH]1)CN1CCN(CC1)c1cc(Cl)cc(c1)Cl)CN1CCN(CC1)c1cc(Cl)cc(c1)Cl',)
('c1cc(cc(c1Cl)Cl)N1CCN(CC1)Cc1c2c([nH]c1C)nc([nH]c2=O)N(C)C',)
('c1ccc(cc1)CN1CCN(CC1)Cc1[nH]c2c(c1CN1CCN(CC1)Cc1ccccc1)c(=O)[nH]c(n2)NC(=O)CCCCCCC',)
('Nc1nc2[nH]c(c(c2c(=O)[nH]1)CN1CCN(CC1)Cc1ccccc1F)CN1CCN(CC1)Cc1ccccc1F',)
('Nc1nc2[nH]c(c(c2c(=O)[nH]1)CN1CCN(CC1)Cc1ccc(cc1)F)CN1CCN(CC1)Cc1ccc(cc1)F',)
('Nc1nc2[nH]c(c(c2c(=O)[nH]1)CN1CCN(CC1)Cc1cccc(c1)F)CN1CCN(CC1)Cc1cccc(c1)F',)
('Fc1ccc(cc1)CN1CCN(CC1)Cc1[nH]c2c(c1CN1CCN(CC1)Cc1ccc(cc1)F)c(=O)[nH]c(n2)NC(=O)CCCCCCC',)
('c1ccc(c(c1)F)CN1CCN(CC1)Cc1c2c(=O)[nH]c(nc2[nH]c1CN1CCN(CC1)Cc1ccccc1F)NC(=O)CCCCCCC',)
('c1ccc(cc1F)CN1CCN(CC1)Cc1c2c(=O)[nH]c(nc2[nH]c1CN1CCN(CC1)Cc1cccc(c1)F)NC(=O)CCCCCCC',)
('c1ccc(cc1I)CN1CCN(CC1)Cc1c2c(=O)[nH]c(nc2[nH]c1CN1CCN(CC1)Cc1cccc(c1)I)NC(=O)CCCCCCC',)
('Nc1nc2[nH]c(c(c2c(=O)[nH]1)CN1CCN(CC1)c1ccc(cc1)[N+](=O)[O-])CN1CCN(CC1)c1ccc(cc1)[N+](=O)[O-]',)
('Nc1nc2[nH]c(c(c2c(=O)[nH]1)CN1CCN(CC1)C(c1ccccc1)c1ccccc1)CN1CCN(CC1)C(c1ccccc1)c1ccccc1',)
('c1ccc(cc1)C(N1CCN(CC1)Cc1[nH]c2c(c1CN1CCN(CC1)C(c1ccccc1)c1ccccc1)c(=O)[nH]c(n2)NC(=O)CCCCCCC)c1ccccc1',)
('[O-][N+](=O)c1ccc(cc1)N1CCN(CC1)Cc1[nH]c2c(c1CN1CCN(CC1)c1ccc(cc1)[N+](=O)[O-])c(=O)[nH]c(n2)NC(=O)CCCCCCC',)
('Nc1nc2[nH]c(c(c2c(=O)[nH]1)CN1CCN(CC1)Cc1ccccc1)CN1CCN(CC1)Cc1ccccc1',)
('Clc1ccc(cc1)N1CCN(CC1)Cc1c2c([nH]c1C)nc([nH]c2=O)N(C)C',)
('c1ccc(c(c1)N1CCN(CC1)Cc1c2c([nH]c1C)nc([nH]c2=O)N(C)C)Cl',)
('c1ccc(c(c1)N1CCN(CC1)Cc1c2c([nH]c1C)nc([nH]c2=O)N(C)C)OC',)
('c1ccc(cc1)N1CCN(CC1)Cc1c2c([nH]c1C)nc([nH]c2=O)N(C)C',)
In [18]:
# Display the indexed-search result list itself (list of 1-tuples).
res1
Out[18]:
[('c1c(Cl)cc(cc1Cl)N1CCN(CC1)Cc1[nH]c2c(c1CN1CCN(CC1)c1cc(Cl)cc(c1)Cl)c(=O)[nH]c(n2)NC(=O)CCCCCCC',),
 ('Nc1nc2[nH]c(c(c2c(=O)[nH]1)CN1CCN(CC1)c1cc(Cl)cc(c1)Cl)CN1CCN(CC1)c1cc(Cl)cc(c1)Cl',),
 ('c1cc(cc(c1Cl)Cl)N1CCN(CC1)Cc1c2c([nH]c1C)nc([nH]c2=O)N(C)C',),
 ('c1ccc(cc1)CN1CCN(CC1)Cc1[nH]c2c(c1CN1CCN(CC1)Cc1ccccc1)c(=O)[nH]c(n2)NC(=O)CCCCCCC',),
 ('Nc1nc2[nH]c(c(c2c(=O)[nH]1)CN1CCN(CC1)Cc1ccccc1F)CN1CCN(CC1)Cc1ccccc1F',),
 ('Nc1nc2[nH]c(c(c2c(=O)[nH]1)CN1CCN(CC1)Cc1ccc(cc1)F)CN1CCN(CC1)Cc1ccc(cc1)F',),
 ('Nc1nc2[nH]c(c(c2c(=O)[nH]1)CN1CCN(CC1)Cc1cccc(c1)F)CN1CCN(CC1)Cc1cccc(c1)F',),
 ('Fc1ccc(cc1)CN1CCN(CC1)Cc1[nH]c2c(c1CN1CCN(CC1)Cc1ccc(cc1)F)c(=O)[nH]c(n2)NC(=O)CCCCCCC',),
 ('c1ccc(c(c1)F)CN1CCN(CC1)Cc1c2c(=O)[nH]c(nc2[nH]c1CN1CCN(CC1)Cc1ccccc1F)NC(=O)CCCCCCC',),
 ('c1ccc(cc1F)CN1CCN(CC1)Cc1c2c(=O)[nH]c(nc2[nH]c1CN1CCN(CC1)Cc1cccc(c1)F)NC(=O)CCCCCCC',),
 ('c1ccc(cc1I)CN1CCN(CC1)Cc1c2c(=O)[nH]c(nc2[nH]c1CN1CCN(CC1)Cc1cccc(c1)I)NC(=O)CCCCCCC',),
 ('Nc1nc2[nH]c(c(c2c(=O)[nH]1)CN1CCN(CC1)c1ccc(cc1)[N+](=O)[O-])CN1CCN(CC1)c1ccc(cc1)[N+](=O)[O-]',),
 ('Nc1nc2[nH]c(c(c2c(=O)[nH]1)CN1CCN(CC1)C(c1ccccc1)c1ccccc1)CN1CCN(CC1)C(c1ccccc1)c1ccccc1',),
 ('c1ccc(cc1)C(N1CCN(CC1)Cc1[nH]c2c(c1CN1CCN(CC1)C(c1ccccc1)c1ccccc1)c(=O)[nH]c(n2)NC(=O)CCCCCCC)c1ccccc1',),
 ('[O-][N+](=O)c1ccc(cc1)N1CCN(CC1)Cc1[nH]c2c(c1CN1CCN(CC1)c1ccc(cc1)[N+](=O)[O-])c(=O)[nH]c(n2)NC(=O)CCCCCCC',),
 ('Nc1nc2[nH]c(c(c2c(=O)[nH]1)CN1CCN(CC1)Cc1ccccc1)CN1CCN(CC1)Cc1ccccc1',),
 ('Clc1ccc(cc1)N1CCN(CC1)Cc1c2c([nH]c1C)nc([nH]c2=O)N(C)C',),
 ('c1ccc(c(c1)N1CCN(CC1)Cc1c2c([nH]c1C)nc([nH]c2=O)N(C)C)Cl',),
 ('c1ccc(c(c1)N1CCN(CC1)Cc1c2c([nH]c1C)nc([nH]c2=O)N(C)C)OC',),
 ('c1ccc(cc1)N1CCN(CC1)Cc1c2c([nH]c1C)nc([nH]c2=O)N(C)C',)]
In [19]:
def substructure_search(connection, substructure, limit):
    """Print the compounds in ID_DATA that match a substructure query.

    Runs the index-assisted query (join of ID_DATA with the
    ``str_idx_pubchem_molecule`` rdtree table, filtered with
    ``rdtree_subset`` on the query's 2048-bit pattern fingerprint and
    checked with ``mol_is_substruct``), then prints each hit followed by a
    summary line with the hit count and the elapsed query time.

    Parameters
    ----------
    connection
        Object exposing ``execute(sql, params)`` whose result supports
        ``fetchall()``; in this notebook, the sqlite3 connection with the
        ChemicaLite extension loaded.
    substructure : str
        Query structure as SMILES, bound to ``?1`` in the SQL.
    limit : int
        Maximum number of rows to return, bound to ``?2``.

    Returns
    -------
    None
        Results are only printed.
    """
    print('searching for substructure:', substructure)

    # perf_counter() is a monotonic high-resolution clock intended for
    # measuring intervals; time.time() can jump if the system clock changes.
    t1 = time.perf_counter()
    rs = connection.execute(
        "select ID_DATA.ID, mol_to_smiles(ID_DATA.molecule) from "
        "ID_DATA, str_idx_pubchem_molecule as idx where "
        "ID_DATA.ID = idx.id and "
        "mol_is_substruct(ID_DATA.molecule, mol_from_smiles(?1)) and "
        # 2048 must match the fingerprint length the index was built with.
        "idx.id match rdtree_subset(mol_pattern_bfp(mol_from_smiles(?1), 2048)) "
        "limit ?2",
        (substructure, limit)).fetchall()
    t2 = time.perf_counter()

    for ID, smiles in rs:
        print(ID, smiles)
    print('Found {0} matches in {1} seconds'.format(len(rs), t2-t1))
In [20]:
# Run the indexed substructure search for the query SMILES, returning at
# most `limit` hits.
smiles = 'O=C1NC=NC2=C1C(CN1CCNCC1)=CN2'
limit = 100
substructure_search(connection, smiles, limit)
searching for substructure: O=C1NC=NC2=C1C(CN1CCNCC1)=CN2
153408156 CCCCCCCC(=O)Nc1nc2[nH]c(CN3CCN(c4cc(Cl)cc(Cl)c4)CC3)c(CN3CCN(c4cc(Cl)cc(Cl)c4)CC3)c2c(=O)[nH]1
153408155 Nc1nc2[nH]c(CN3CCN(c4cc(Cl)cc(Cl)c4)CC3)c(CN3CCN(c4cc(Cl)cc(Cl)c4)CC3)c2c(=O)[nH]1
135524318 Cc1[nH]c2nc(N(C)C)[nH]c(=O)c2c1CN1CCN(c2ccc(Cl)c(Cl)c2)CC1
150856072 CCCCCCCC(=O)Nc1nc2[nH]c(CN3CCN(Cc4ccccc4)CC3)c(CN3CCN(Cc4ccccc4)CC3)c2c(=O)[nH]1
153408157 Nc1nc2[nH]c(CN3CCN(Cc4ccccc4F)CC3)c(CN3CCN(Cc4ccccc4F)CC3)c2c(=O)[nH]1
153408162 Nc1nc2[nH]c(CN3CCN(Cc4ccc(F)cc4)CC3)c(CN3CCN(Cc4ccc(F)cc4)CC3)c2c(=O)[nH]1
153408163 Nc1nc2[nH]c(CN3CCN(Cc4cccc(F)c4)CC3)c(CN3CCN(Cc4cccc(F)c4)CC3)c2c(=O)[nH]1
151901732 CCCCCCCC(=O)Nc1nc2[nH]c(CN3CCN(Cc4ccc(F)cc4)CC3)c(CN3CCN(Cc4ccc(F)cc4)CC3)c2c(=O)[nH]1
152264085 CCCCCCCC(=O)Nc1nc2[nH]c(CN3CCN(Cc4ccccc4F)CC3)c(CN3CCN(Cc4ccccc4F)CC3)c2c(=O)[nH]1
149878534 CCCCCCCC(=O)Nc1nc2[nH]c(CN3CCN(Cc4cccc(F)c4)CC3)c(CN3CCN(Cc4cccc(F)c4)CC3)c2c(=O)[nH]1
155805404 CCCCCCCC(=O)Nc1nc2[nH]c(CN3CCN(Cc4cccc(I)c4)CC3)c(CN3CCN(Cc4cccc(I)c4)CC3)c2c(=O)[nH]1
153408161 Nc1nc2[nH]c(CN3CCN(c4ccc([N+](=O)[O-])cc4)CC3)c(CN3CCN(c4ccc([N+](=O)[O-])cc4)CC3)c2c(=O)[nH]1
153408158 Nc1nc2[nH]c(CN3CCN(C(c4ccccc4)c4ccccc4)CC3)c(CN3CCN(C(c4ccccc4)c4ccccc4)CC3)c2c(=O)[nH]1
153408160 CCCCCCCC(=O)Nc1nc2[nH]c(CN3CCN(C(c4ccccc4)c4ccccc4)CC3)c(CN3CCN(C(c4ccccc4)c4ccccc4)CC3)c2c(=O)[nH]1
150504227 CCCCCCCC(=O)Nc1nc2[nH]c(CN3CCN(c4ccc([N+](=O)[O-])cc4)CC3)c(CN3CCN(c4ccc([N+](=O)[O-])cc4)CC3)c2c(=O)[nH]1
153408159 Nc1nc2[nH]c(CN3CCN(Cc4ccccc4)CC3)c(CN3CCN(Cc4ccccc4)CC3)c2c(=O)[nH]1
135524311 Cc1[nH]c2nc(N(C)C)[nH]c(=O)c2c1CN1CCN(c2ccc(Cl)cc2)CC1
135524317 Cc1[nH]c2nc(N(C)C)[nH]c(=O)c2c1CN1CCN(c2ccccc2Cl)CC1
135524312 COc1ccccc1N1CCN(Cc2c(C)[nH]c3nc(N(C)C)[nH]c(=O)c23)CC1
135524310 Cc1[nH]c2nc(N(C)C)[nH]c(=O)c2c1CN1CCN(c2ccccc2)CC1
Found 20 matches in 0.43364596366882324 seconds
In [ ]:
 
In [21]:
# Similarity searching
# Create an rdtree virtual table to be filled with 1024-bit Morgan binary
# fingerprints (bfp), one per compound id.
# NOTE(review): raises if the virtual table already exists, so this cell is
# not re-runnable against an existing database.
connection.execute("CREATE VIRTUAL TABLE morgan_idx_pubchem_molecule " +
            "USING rdtree(id, fp bits(1024))");
In [23]:
# Fill the similarity index: a Morgan fingerprint (arguments 2 and 1024,
# matching the 1024-bit index column) for every row with a non-NULL
# molecule, keyed by the compound ID. Runs in a single transaction.
with connection:
    connection.execute(
        "INSERT INTO morgan_idx_pubchem_molecule(id, fp) " +
        "SELECT ID, mol_morgan_bfp(molecule, 2, 1024) FROM ID_DATA " +
        "WHERE molecule IS NOT NULL")
In [24]:
def tanimoto_search(connection, target, threshold):
    """Print the compounds in ID_DATA similar to ``target`` by Tanimoto score.

    Runs a query that joins ID_DATA with the ``morgan_idx_pubchem_molecule``
    rdtree table, filters with ``rdtree_tanimoto`` on the target's Morgan
    fingerprint (arguments 2 and 1024) and ``threshold``, and orders the
    hits by the ``bfp_tanimoto`` value, highest first. Each hit is printed
    as ``ID smiles similarity``, followed by a summary line with the hit
    count and the elapsed query time.

    Parameters
    ----------
    connection
        Object exposing ``execute(sql, params)`` whose result supports
        ``fetchall()``; in this notebook, the sqlite3 connection with the
        ChemicaLite extension loaded.
    target : str
        Query structure as SMILES, bound to ``?1`` in the SQL.
    threshold : float
        Similarity threshold passed to ``rdtree_tanimoto`` as ``?2``.

    Returns
    -------
    None
        Results are only printed.
    """
    print('searching for target:', target)

    # perf_counter() is a monotonic high-resolution clock intended for
    # measuring intervals; time.time() can jump if the system clock changes.
    t1 = time.perf_counter()
    rs = connection.execute(
        "SELECT c.ID, mol_to_smiles(c.molecule), "
        "bfp_tanimoto(mol_morgan_bfp(c.molecule, 2, 1024), "
        "             mol_morgan_bfp(mol_from_smiles(?1), 2, 1024)) as t "
        "FROM "
        "ID_DATA as c JOIN morgan_idx_pubchem_molecule as idx USING(id) "
        "WHERE "
        # Fingerprint arguments must match those used to build the index.
        "idx.id MATCH rdtree_tanimoto(mol_morgan_bfp(mol_from_smiles(?1), 2, 1024), ?2) "
        "ORDER BY t DESC",
        (target, threshold)).fetchall()
    t2 = time.perf_counter()

    for ID, smiles, sim in rs:
        print(ID, smiles, sim)

    print('Found {0} matches in {1} seconds'.format(len(rs), t2-t1))
In [25]:
# Query structure and similarity threshold for the similarity search.
# Note that `smiles` is reassigned here; earlier cells used the same name.
smiles = "O=c1ccc(-c2ccccc2)cn1O"
threshold = 0.5
In [26]:
# Print all compounds matching the similarity threshold against `smiles`.
tanimoto_search(connection, smiles, threshold)
searching for target: O=c1ccc(-c2ccccc2)cn1O
44543378 O=c1ccc(-c2ccccc2)cn1O 1.0
153887090 O=c1ccc(-c2ccncc2)cn1O 0.6551724137931034
70596262 O=C(O)n1cc(-c2ccccc2)ccc1=O 0.6
14595918 Cn1cc(-c2ccccc2)ccc1=O 0.5862068965517241
11988946 O=c1ccc(-c2ccccc2)cn1-c1ccccc1 0.5666666666666667
68247588 O=c1ccc(-c2ccccc2)cn1-c1ccccc1O 0.5454545454545454
100942537 CCn1cc(-c2ccccc2)ccc1=O 0.53125
68028637 O=c1ccc(-c2ccccc2)cn1-c1ccc(O)cc1 0.5294117647058824
82113594 NCCn1cc(-c2ccccc2)ccc1=O 0.5151515151515151
4258240 O=c1ccc(-c2ccc(-c3ccccc3)cc2)cn1Cc1ccccc1 0.5151515151515151
3550352 O=c1ccc(-c2ccccc2)cn1Cc1ccccc1 0.5151515151515151
3641101 O=c1ccc(-c2ccc(-c3ccccc3)cc2)cn1C(c1ccccc1)c1ccccc1 0.5151515151515151
3487768 O=c1ccc(-c2ccccc2)cn1C(c1ccccc1)c1ccccc1 0.5151515151515151
12671450 O=c1cnc(-c2ccccc2)cn1O 0.5151515151515151
3699479 C=CCn1cc(-c2ccccc2)ccc1=O 0.5
3682683 C=CCn1cc(-c2ccc(-c3ccccc3)cc2)ccc1=O 0.5
162015486 O=P(O)(O)O.c1ccc(-c2ccc(-c3ccc(-c4ccccc4)cc3)cc2)cc1 0.5
58016393 Cc1ccc(-n2cc(-c3ccccc3)ccc2=O)cc1 0.5
3897369 CC(C)=CCn1cc(-c2ccc(-c3ccccc3)cc2)ccc1=O 0.5
3376142 CC(C)=CCn1cc(-c2ccccc2)ccc1=O 0.5
58016549 O=c1ccc(-c2ccc(O)cc2)cn1-c1ccccc1 0.5
100942536 O=c1ccc(-c2ccccc2)cn1CCc1ccccc1 0.5
11299201 O=C(O)c1cc(-c2ccccc2)cn(O)c1=O 0.5
17913416 O=c1ccc2ccc(-c3ccccc3)cc2n1O 0.5
90188457 O=c1ccccn1O.[Zn] 0.5
87935139 O=c1ccccn1O.O=c1ccccn1O.[Zn] 0.5
10953512 O=c1ccc(O)cn1O 0.5
Found 27 matches in 52.773017168045044 seconds
In [28]:
# Run the similarity query directly to keep the rows as a Python list of
# (ID, SMILES, similarity) tuples, since tanimoto_search() only prints them.
# NOTE(review): this SQL is a copy of the statement inside
# tanimoto_search(); keep the two in sync if either is edited.
rs = connection.execute(
    "SELECT c.ID, mol_to_smiles(c.molecule), "
    "bfp_tanimoto(mol_morgan_bfp(c.molecule, 2, 1024), "
    "             mol_morgan_bfp(mol_from_smiles(?1), 2, 1024)) as t "
    "FROM "
    "ID_DATA as c JOIN morgan_idx_pubchem_molecule as idx USING(id) "
    "WHERE "
    "idx.id MATCH rdtree_tanimoto(mol_morgan_bfp(mol_from_smiles(?1), 2, 1024), ?2) "
    "ORDER BY t DESC",
    (smiles, threshold)).fetchall()
In [29]:
# Display the similarity hits, ordered by similarity (highest first).
rs
Out[29]:
[(44543378, 'O=c1ccc(-c2ccccc2)cn1O', 1.0),
 (153887090, 'O=c1ccc(-c2ccncc2)cn1O', 0.6551724137931034),
 (70596262, 'O=C(O)n1cc(-c2ccccc2)ccc1=O', 0.6),
 (14595918, 'Cn1cc(-c2ccccc2)ccc1=O', 0.5862068965517241),
 (11988946, 'O=c1ccc(-c2ccccc2)cn1-c1ccccc1', 0.5666666666666667),
 (68247588, 'O=c1ccc(-c2ccccc2)cn1-c1ccccc1O', 0.5454545454545454),
 (100942537, 'CCn1cc(-c2ccccc2)ccc1=O', 0.53125),
 (68028637, 'O=c1ccc(-c2ccccc2)cn1-c1ccc(O)cc1', 0.5294117647058824),
 (82113594, 'NCCn1cc(-c2ccccc2)ccc1=O', 0.5151515151515151),
 (4258240, 'O=c1ccc(-c2ccc(-c3ccccc3)cc2)cn1Cc1ccccc1', 0.5151515151515151),
 (3550352, 'O=c1ccc(-c2ccccc2)cn1Cc1ccccc1', 0.5151515151515151),
 (3641101,
  'O=c1ccc(-c2ccc(-c3ccccc3)cc2)cn1C(c1ccccc1)c1ccccc1',
  0.5151515151515151),
 (3487768, 'O=c1ccc(-c2ccccc2)cn1C(c1ccccc1)c1ccccc1', 0.5151515151515151),
 (12671450, 'O=c1cnc(-c2ccccc2)cn1O', 0.5151515151515151),
 (3699479, 'C=CCn1cc(-c2ccccc2)ccc1=O', 0.5),
 (3682683, 'C=CCn1cc(-c2ccc(-c3ccccc3)cc2)ccc1=O', 0.5),
 (162015486, 'O=P(O)(O)O.c1ccc(-c2ccc(-c3ccc(-c4ccccc4)cc3)cc2)cc1', 0.5),
 (58016393, 'Cc1ccc(-n2cc(-c3ccccc3)ccc2=O)cc1', 0.5),
 (3897369, 'CC(C)=CCn1cc(-c2ccc(-c3ccccc3)cc2)ccc1=O', 0.5),
 (3376142, 'CC(C)=CCn1cc(-c2ccccc2)ccc1=O', 0.5),
 (58016549, 'O=c1ccc(-c2ccc(O)cc2)cn1-c1ccccc1', 0.5),
 (100942536, 'O=c1ccc(-c2ccccc2)cn1CCc1ccccc1', 0.5),
 (11299201, 'O=C(O)c1cc(-c2ccccc2)cn(O)c1=O', 0.5),
 (17913416, 'O=c1ccc2ccc(-c3ccccc3)cc2n1O', 0.5),
 (90188457, 'O=c1ccccn1O.[Zn]', 0.5),
 (87935139, 'O=c1ccccn1O.O=c1ccccn1O.[Zn]', 0.5),
 (10953512, 'O=c1ccc(O)cn1O', 0.5)]
In [ ]: