Curating Adaptive Sampling input files for MinKNOW

The following short workflow will prepare and download the necessary files to perform an Adaptive Sampling sequencing experiment, selecting for reads that span genes, transcripts, exons, and other features stored within Ensembl.

To begin to prepare the files, simply select a species from the dropdown below.

In [3]:
# Code installation
from ftplib import FTP
import os
import time

import ipywidgets as widgets
import pandas as pd
import pyranges as pr
import pysam
import requests


class EnsemblRestClient(object):
    """Locate Ensembl reference files via the REST API and FTP site.

    Combines the Ensembl REST service (species metadata) with directory
    listings of the Ensembl FTP site (release 100) to build download URLs
    for genome assembly (FASTA) and annotation (GTF) files.
    """

    def __init__(self, server='http://rest.ensembl.org'):
        """Initialise the client.

        :param server: base URL of the Ensembl REST service.
        """
        # Honour the caller-supplied server; the original hard-coded the
        # default here, silently ignoring this argument.
        self.server = server
        self.ftp = 'ftp.ensembl.org'
        # FTP directory templates; `{}` is the (lower-case) species name.
        self.ftp_dna_path = '/pub/release-100/fasta/{}/dna/'
        self.ftp_dna_suff = {
            'primary':'dna.primary_assembly.fa.gz',
            'toplevel':'dna.toplevel.fa.gz'}
        self.ftp_gtf_path = '/pub/release-100/gtf/{}/'
        self.ftp_gtf_suff = {
            'gtf':"100.gtf.gz"}

        self.dna_template = \
            "ftp://" + self.ftp + self.ftp_dna_path + "/{}.{}.{}"
        self.gtf_template = \
            "ftp://" + self.ftp + self.ftp_gtf_path + \
            "/{}.{}." + self.ftp_gtf_suff['gtf']

    def get(self, endpoint, params=dict(), **kwargs):
        """GET a REST endpoint and return the decoded JSON payload.

        Honours an HTTP 429 (rate-limit) response by sleeping for the
        advertised ``Retry-After`` period and retrying once.

        :param endpoint: path under the REST server, e.g. ``/info/species``.
        :param params: payload sent as the request's JSON body unless an
            explicit ``json`` keyword is supplied. (Never mutated, so the
            shared default dict is safe.)
        :returns: decoded JSON data, or an empty dict on failure.
        """
        if 'json' not in kwargs:
            kwargs['json'] = params
        data = dict()
        response = None
        try:
            response = requests.get(self.server + endpoint, **kwargs)
            if response.status_code == 429 and 'Retry-After' in response.headers:
                # Rate limited: wait as instructed, then retry once.
                # (Fixed: the original read the header from an undefined
                # name `e` rather than `response`.)
                time.sleep(float(response.headers['Retry-After']))
                response = requests.get(self.server + endpoint, **kwargs)
        except Exception:
            print(' - Request failed for {0}'.format(endpoint))
            # `response` is None when the request itself raised, in which
            # case there is no status code to report.
            if response is not None:
                print(response.status_code)
        else:
            data = response.json()
            if "error" in data:
                print(" - ERROR:\n   {}".format(data["error"]))
        return data

    def species_list(self):
        """Return the REST ``/info/species`` payload (dict)."""
        return self.get("/info/species")

    def assembly_name(self, species):
        """Derive the assembly name for `species` from FTP file names.

        The REST ``/info/assembly`` endpoint would be more direct, but
        parsing the FTP listing guarantees the name matches the files
        that will actually be downloaded.
        """
        paths = self._ftp_list(
            self.ftp_dna_path.format(species), self.ftp_dna_suff)
        # e.g. "Homo_sapiens.GRCh38.dna.toplevel.fa.gz"
        #   -> "GRCh38.dna.toplevel.fa.gz" -> "GRCh38"
        stem = paths["toplevel"].split('.', 1)[1]
        assm = stem.replace("." + self.ftp_dna_suff["toplevel"], "")
        return assm

    def _ftp_list(self, path, filt):
        """List an FTP directory, keeping files matching known suffixes.

        :param path: directory on the Ensembl FTP site to list.
        :param filt: dict mapping a short key to a file-name suffix.
        :returns: dict mapping each key to the matching file name; keys
            with no matching file are absent.
        """
        ftpdata = dict()
        with FTP(self.ftp) as ftp:
            ftp.login()

            def grab(line):
                # `ftp.dir` yields ls-style listing lines; the file name
                # is the final whitespace-separated field.
                fname = line.split()[-1]
                for key, suffix in filt.items():
                    if fname.endswith(suffix):
                        ftpdata[key] = fname

            ftp.dir(path, grab)
        return ftpdata

    def dna_url(self, species, toplevel=True, assembly_name=None):
        """Return the FTP URL of the genome FASTA for `species`.

        Prefers the primary assembly (no ALT/patch contigs) when one is
        published, falling back to the top-level assembly otherwise.
        """
        paths = self._ftp_list(
            self.ftp_dna_path.format(species), self.ftp_dna_suff)
        fname = paths.get('primary', paths['toplevel'])
        return "ftp://" + self.ftp + self.ftp_dna_path.format(species) + fname

    def gtf_url(self, species, assembly_name=None):
        """Return the FTP URL of the GTF annotation for `species`."""
        paths = self._ftp_list(
            self.ftp_gtf_path.format(species), self.ftp_gtf_suff)
        fname = paths['gtf']
        return "ftp://" + self.ftp + self.ftp_gtf_path.format(species) + fname


# Query Ensembl for the catalogue of available species and build the sorted
# list of names shown in the species drop-down. "--" is prepended as the
# placeholder entry meaning "nothing selected yet".
print(" * Querying ensembl species")
client = EnsemblRestClient()
response = client.species_list()
names = [entry['name'] for entry in response['species']]
names.sort()
print(" - Found {} species".format(len(names)))
species_list = ["--"] + names
urls = (None, None)
 * Querying ensembl species
 - Found 310 species

To efficiently produce reasonable target regions, please provide an average read length. This should be an arithmetic mean, not an N50 length. After pressing play here you will be given the opportunity to select your genome of interest from a drop-down box.

In [4]:
def species_change(inputs):
    """Download reference files for the chosen species and build a BED file.

    Callback run when the user presses the form's play button. Fetches the
    genome FASTA and GTF for ``inputs.species`` from Ensembl (skipping files
    already on disk), merges the GTF intervals, pads them by half of
    ``inputs.read_length`` on each side, and writes the result as a BED file
    for MinKNOW adaptive sampling.

    NOTE(review): uses IPython ``!wget`` shell magics and the notebook
    ``display`` helper, so it only runs inside a Jupyter session.
    """
    global urls
    print(" * Finding files, please wait...", end="")
    # Resolve the assembly name once and reuse it for both URL lookups.
    assm = client.assembly_name(inputs.species)
    dna_url = client.dna_url(inputs.species, assembly_name=assm)
    gtf_url = client.gtf_url(inputs.species, assembly_name=assm)
    urls = (dna_url, gtf_url)
    
    
    try:
        print(" * Retrieving files...")
        print(" - {}".format(dna_url))
        print(" - {}".format(gtf_url))
        # wget writes to the basename of the URL in the working directory.
        dna_path = os.path.basename(dna_url)
        gtf_path = os.path.basename(gtf_url)
        if not os.path.isfile(dna_path):
            !wget $dna_url || printf "\n * Failed to download assembly\n"
            # wget failures are reported via the shell; re-check the file
            # so a failed download raises rather than continuing silently.
            if not os.path.isfile(dna_path):
                raise FileNotFoundError(' - Assembly could not be downloaded.')
        else:
            print(" - Skipping genome download")
        if not os.path.isfile(gtf_path):
            !wget $gtf_url || printf "\n * Failed to download gtf\n"
            if not os.path.isfile(gtf_path):
                raise FileNotFoundError(' - GTF could not be downloaded.')
        else:
            print(" - Skipping gtf download")
    except Exception as e:
        print(" * Failed to retrieve files")
        print("{}".format(e))
    else:
        print(" * Finished download")
        #print(" * Calculating total assembly length")
        #glength = 0
        #with pysam.FastxFile(dna_path) as fh:
        #    for r in fh:
        #        glength += len(r.sequence)
        #    print(" - Assembly length: {}".format(glength))
        print(" * Reading gtf")
        ranges = pr.read_gtf(gtf_path)
        print(" - Merging and expanding intervals (this may take a while)...", end="")
        # Collapse overlapping features (ignoring strand), then pad each
        # interval by half a read length so reads starting near a feature
        # boundary still span it, and merge again to remove new overlaps.
        merged = ranges.merge(strand=False)
        sloppy = merged.slack(inputs.read_length // 2).merge(strand=False)
        print("done")
        # Summary table: interval counts at each processing stage.
        df = pd.DataFrame({
            'Source GTF':[len(ranges)],
            'Filtered':[len(merged)],
            'Padded':[len(sloppy)]},
            index=['Intervals'])
        bed_path = "{}.read_until.bed".format(dna_path)
        sloppy.to_bed(bed_path)

        print()
        display(df)
        print(" * Input files:")
        print("   - Genome: {}".format(dna_url))
        print("   - GTF   : {}".format(gtf_url))
        print(" * Output files:")
        print("   - Genome: {}".format(os.path.abspath(dna_path)))
        print("   - GTF   : {}".format(os.path.abspath(gtf_path)))
        print("   - BED   : {}".format(os.path.abspath(bed_path)))
    

# Build the input form: a read-length slider plus the species drop-down,
# wired to run `species_change` when the user presses the play button.
from epi2melabs.notebook import InputForm, InputSpec

species_dropdown = widgets.Dropdown(
    options=species_list, value='--', description='species:')
form_fields = [
    InputSpec('read_length', 'Read length', (100, 10000, 100)),
    InputSpec('species', 'Species', species_dropdown),
]
input_form = InputForm(*form_fields)
input_form.add_process_button(species_change)
input_form.display()
VBox(children=(HBox(children=(Label(value='Read length', layout=Layout(width='150px')), interactive(children=(…

When the above code has finished executing a table will be displayed detailing the number of regions of interest.

Also shown are paths to the output files:

  1. A reference genome (to provide to MinKNOW)
  2. The source .gtf file from which target regions were produced.
  3. A .bed file containing target regions to provide to MinKNOW.

These can be downloaded through your web browser by using the file browser on the left-hand side of this page.