Download multiple PDB files using a Jupyter notebook

The RCSB Protein Data Bank is an absolutely invaluable resource that provides archive-information about the 3D shapes of proteins, nucleic acids, and complex assemblies that helps scientists understand all aspects of biomedicine and agriculture, from protein synthesis to health and disease. Currently the PDB contains over 134,000 data files containing structural information on 42547 distinct protein sequences of which 37600 are human sequences. They also provide a series of tools to search, view and analyse the data.

Downloading an individual PDB file is pretty trivial and can be done from the web page as shown in the image below.

They also provide a Download Tool launched as a stand-alone application using the Java Web Start protocol. The tool is downloaded locally and must then be opened. I’ve found this a little temperamental and had issues with Java versions and security settings.

Since I’ve been making extensive use of the web services to interact with RCSB I decided to explore the use of Python to download multiple files. This turned out to be very successful and I’ve used it to download a batch of 30,000 files.

Jupyter Notebook

I’ve become a great fan of Jupyter notebooks; I use them extensively not only to record work I’m doing but also as a workflow tool. They are also a great way to share code.

You will need to edit the path to the file containing the PDB codes, and the folder where you want to download the PDB files to. We then read in the PDB codes (either as comma-separated values or as one code per line). Then download the files (gzipped if requested). The try/except block is needed in cases where files are unavailable, and a list of unavailable files is printed out.

# Use PDB ID to download PDB files
# Authored by Chris Swain (http://www.macinchem.org)
# Copyright CC-BY

import csv
import os
import sys

# Python 2 and 3 compatibility
if sys.version_info[0] == 2: 
    from urllib import urlretrieve
else:
    from urllib.request import urlretrieve

# Use PDB ID to download PDB files

# Authored by Chris Swain (http://www.macinchem.org)

# Copyright CC-BY

import csv

import os

import sys

# Python 2 and 3 compatibility: urlretrieve lives in urllib on Python 2
# and in urllib.request on Python 3
if sys.version_info[0] == 2:
    from urllib import urlretrieve
else:
    from urllib.request import urlretrieve

# --- User-adjustable settings ---

# Set to False to fetch plain .pdb files instead of gzip-compressed .pdb.gz
compressed = True

# Directory the downloaded files are written into
download_folder = 'PDB2/'

# Path of the text file listing the desired PDB IDs, separated by commas
pdb_codes_file = '/Users/username/Desktop/SAH_PDB.txt'

# --- User-adjustable settings ---

# Set to False to fetch plain .pdb files instead of gzip-compressed .pdb.gz
compressed = True

# Directory the downloaded files are written into
download_folder = 'PDB2/'

# Path of the text file listing the desired PDB IDs, separated by commas
pdb_codes_file = '/Users/username/Desktop/SAH_PDB.txt'

# Read the PDB IDs from the input file.
# Accepted formats: comma-separated (e.g. 6G2,MES,50D,50E,50F,9FV,W38),
# one ID per line, or any mix of the two.
with open(pdb_codes_file) as f:
    # Turn commas into spaces, then split on whitespace. This removes stray
    # spaces/newlines around each ID and drops empty entries (e.g. from a
    # trailing comma or the final newline of the file).
    pdb_codes = f.read().replace(',', ' ').split()

# Alternatively, hard code the PDB IDs:
# pdb_codes = ['1LS6', '1Z28', '2D06', '3QVU', '3QVV', '3U3J', '3U3K']

# For testing
# print(pdb_codes)

# Read the PDB IDs from the input file.
# Accepted formats: comma-separated (e.g. 6G2,MES,50D,50E,50F,9FV,W38),
# one ID per line, or any mix of the two.
with open(pdb_codes_file) as f:
    # Turn commas into spaces, then split on whitespace. This removes stray
    # spaces/newlines around each ID and drops empty entries (e.g. from a
    # trailing comma or the final newline of the file).
    pdb_codes = f.read().replace(',', ' ').split()

# Alternatively, hard code the PDB IDs:
# pdb_codes = ['1LS6', '1Z28', '2D06', '3QVU', '3QVV', '3U3J', '3U3K']

# For testing
# print(pdb_codes)

# Ensure download folder exists
try:
    os.makedirs(download_folder)
except OSError:
    # makedirs raises OSError when the folder already exists, which is fine.
    # Any other failure (e.g. missing permissions) leaves no folder to write
    # into, so re-raise it instead of hiding it.
    if not os.path.isdir(download_folder):
        raise

# Ensure download folder exists
try:
    os.makedirs(download_folder)
except OSError:
    # makedirs raises OSError when the folder already exists, which is fine.
    # Any other failure (e.g. missing permissions) leaves no folder to write
    # into, so re-raise it instead of hiding it.
    if not os.path.isdir(download_folder):
        raise

# Download one file per PDB ID, remembering the IDs that could not be fetched
unavailable = []
for pdb_code in pdb_codes:
    # Keep only the first 4 characters of the ID: drops surrounding
    # whitespace and the ':1' suffix if entities
    pdb_id = pdb_code.strip()[:4]
    if not pdb_id:
        # Blank entry (e.g. trailing comma or newline in the input file)
        continue
    # Add .pdb extension
    filename = '%s.pdb' % pdb_id
    # Add .gz extension if compressed
    if compressed:
        filename = '%s.gz' % filename
    url = 'https://files.rcsb.org/download/%s' % filename
    destination_file = os.path.join(download_folder, filename)

    # Download the file; a failure for one ID must not stop the whole batch,
    # so record it and carry on
    try:
        urlretrieve(url, destination_file)
    except Exception as e:
        unavailable.append('%s (%s)' % (pdb_id, e))

# Report the IDs that could not be downloaded
if unavailable:
    print('Unavailable files: %s' % ', '.join(unavailable))

# Download one file per PDB ID, remembering the IDs that could not be fetched
unavailable = []
for pdb_code in pdb_codes:
    # Keep only the first 4 characters of the ID: drops surrounding
    # whitespace and the ':1' suffix if entities
    pdb_id = pdb_code.strip()[:4]
    if not pdb_id:
        # Blank entry (e.g. trailing comma or newline in the input file)
        continue
    # Add .pdb extension
    filename = '%s.pdb' % pdb_id
    # Add .gz extension if compressed
    if compressed:
        filename = '%s.gz' % filename
    url = 'https://files.rcsb.org/download/%s' % filename
    destination_file = os.path.join(download_folder, filename)

    # Download the file; a failure for one ID must not stop the whole batch,
    # so record it and carry on
    try:
        urlretrieve(url, destination_file)
    except Exception as e:
        unavailable.append('%s (%s)' % (pdb_id, e))

# Report the IDs that could not be downloaded
if unavailable:
    print('Unavailable files: %s' % ', '.join(unavailable))

You can download the jupyter notebook here

download_pdb.ipynb 2 Download

Download multiple PDB files using a Jupyter notebook

Jupyter Notebook

Related Posts

Selecting random clusters from a large dataset in Vortex

Using ChemDraw as input for Boltz docking