The following short workflow will prepare and download the necessary files to perform an Adaptive Sampling sequence experiment selecting for reads that span genes, transcripts, exons, etc. stored within ensembl.
To begin to prepare the files, simply select a species from the dropdown below.
# Code installation
from ftplib import FTP
import os
import ipywidgets as widgets
import pandas as pd
import pyranges as pr
import pysam
import requests
class EnsemblRestClient(object):
def __init__(self, server='http://rest.ensembl.org'):
self.server = 'http://rest.ensembl.org'
self.ftp = 'ftp.ensembl.org'
self.ftp_dna_path = '/pub/release-100/fasta/{}/dna/'
self.ftp_dna_suff = {
'primary':'dna.primary_assembly.fa.gz',
'toplevel':'dna.toplevel.fa.gz'}
self.ftp_gtf_path = '/pub/release-100/gtf/{}/'
self.ftp_gtf_suff = {
'gtf':"100.gtf.gz"}
self.dna_template = \
"ftp://" + self.ftp + self.ftp_dna_path + "/{}.{}.{}"
self.gtf_template = \
"ftp://" + self.ftp + self.ftp_gtf_path + \
"/{}.{}." + self.ftp_gtf_suff['gtf']
def get(self, endpoint, params=dict(), **kwargs):
if 'json' not in kwargs:
kwargs['json'] = params
data = dict()
try:
response = requests.get(self.server + endpoint, **kwargs)
if response.status_code == 429:
if 'Retry-After' in response.headers:
retry = e.headers['Retry-After']
time.sleep(float(retry))
response = requests.get(self.server + endpoint, **kwargs)
except:
print(' - Request failed for {0}'.format(endpoint))
print(response.status_code)
else:
data = response.json()
if "error" in data:
print(" - ERROR:\n {}".format(data["error"]))
return data
def species_list(self):
return self.get("/info/species")
def assembly_name(self, species):
#assembly = self.get('/info/assembly/{}'.format(species))
#if 'assembly_name' in assembly:
# return assembly['assembly_name']
# this is a bit circular...
paths = self._ftp_list(
self.ftp_dna_path.format(species), self.ftp_dna_suff)
stem = paths["toplevel"].split('.', 1)[1]
assm = stem.replace("." + self.ftp_dna_suff["toplevel"], "")
return assm
def _ftp_list(self, path, filt):
ftpdata = dict()
with FTP('ftp.ensembl.org') as ftp:
ftp.login()
def grab(x):
fname = x.split()[-1]
for key, value in filt.items():
if fname.endswith(value):
ftpdata[key] = fname
ftp.dir(path, grab)
return ftpdata
def dna_url(self, species, toplevel=True, assembly_name=None):
#if assembly_name is None:
# assembly_name = client.assembly_name(species)
#return self.dna_template.format(
# species, species.capitalize(), assembly_name)
paths = self._ftp_list(
self.ftp_dna_path.format(species), self.ftp_dna_suff)
fname = paths['toplevel']
if 'primary' in paths.keys():
fname = paths['primary']
return "ftp://" + self.ftp + self.ftp_dna_path.format(species) + fname
def gtf_url(self, species, assembly_name=None):
#if assembly_name is None:
# assembly_name = client.assembly_name(species)
#return self.gtf_template.format(
# species, species.capitalize(), assembly_name)
paths = self._ftp_list(
self.ftp_gtf_path.format(species), self.ftp_gtf_suff)
fname = paths['gtf']
return "ftp://" + self.ftp + self.ftp_gtf_path.format(species) + fname
print(" * Querying ensembl species")
client = EnsemblRestClient()
species_list = client.species_list()
species_list = sorted(s['name'] for s in species_list['species'])
print(" - Found {} species".format(len(species_list)))
species_list.insert(0, "--")
urls = (None, None)
* Querying ensembl species - Found 310 species
To produce efficiently reasonable target regions please provide an average read length. This should be an arithmetic mean not an N50 length. After pressing play here you will be given the opportunity to select your genome of interest from a drop-down box.
def species_change(inputs):
global urls
print(" * Finding files, please wait...", end="")
assm = client.assembly_name(inputs.species)
dna_url = client.dna_url(inputs.species, assembly_name=assm)
gtf_url = client.gtf_url(inputs.species, assembly_name=assm)
urls = (dna_url, gtf_url)
try:
print(" * Retrieving files...")
print(" - {}".format(dna_url))
print(" - {}".format(gtf_url))
dna_path = os.path.basename(dna_url)
gtf_path = os.path.basename(gtf_url)
if not os.path.isfile(dna_path):
!wget $dna_url || printf "\n * Failed to download assembly\n"
if not os.path.isfile(dna_path):
raise FileNotFoundError(' - Assembly could not be downloaded.')
else:
print(" - Skipping genome download")
if not os.path.isfile(gtf_path):
!wget $gtf_url || printf "\n * Failed to download gtf\n"
if not os.path.isfile(gtf_path):
raise FileNotFoundError(' - GTF could not be downloaded.')
else:
print(" - Skipping gtf download")
except Exception as e:
print(" * Failed to retrieve files")
print("{}".format(e))
else:
print(" * Finished download")
#print(" * Calculating total assembly length")
#glength = 0
#with pysam.FastxFile(dna_path) as fh:
# for r in fh:
# glength += len(r.sequence)
# print(" - Assembly length: {}".format(glength))
print(" * Reading gtf")
ranges = pr.read_gtf(gtf_path)
print(" - Merging and expanding intervals (this may take a while)...", end="")
merged = ranges.merge(strand=False)
sloppy = merged.slack(inputs.read_length // 2).merge(strand=False)
print("done")
df = pd.DataFrame({
'Source GTF':[len(ranges)],
'Filtered':[len(merged)],
'Padded':[len(sloppy)]},
index=['Intervals'])
bed_path = "{}.read_until.bed".format(dna_path)
sloppy.to_bed(bed_path)
print()
display(df)
print(" * Input files:")
print(" - Genome: {}".format(dna_url))
print(" - GTF : {}".format(gtf_url))
print(" * Output files:")
print(" - Genome: {}".format(os.path.abspath(dna_path)))
print(" - GTF : {}".format(os.path.abspath(gtf_path)))
print(" - BED : {}".format(os.path.abspath(bed_path)))
from epi2melabs.notebook import InputForm, InputSpec
species_dropdown = widgets.Dropdown(
options=species_list, value='--', description='species:')
input_form = InputForm(
InputSpec('read_length', 'Read length', (100, 10000, 100)),
InputSpec('species', 'Species', species_dropdown))
input_form.add_process_button(species_change)
input_form.display()
VBox(children=(HBox(children=(Label(value='Read length', layout=Layout(width='150px')), interactive(children=(…
When the above code has finished executing a table will be displayed detailing the number of regions of interest.
Also shown are paths to the output files:
.gtf
file from which target regions were produced..bed
file containing target regions to provide to MinKNOW.These can be downloaded through your web browser by using the filebrowser to the left-hand side of this page.