from epi2melabs import ping
tutorial_name = "clone_validation"
pinger = ping.Pingu()
pinger.send_notebook_ping('start', tutorial_name)

# create a work directory and move into it
working_dir = '/epi2melabs/{}/'.format(tutorial_name)
!mkdir -p "$working_dir"
%cd "$working_dir"


!mamba install -y -q -c bioconda last diamond blast --quiet > /dev/null


bucket = "ont-exd-int-s3-euwst1-epi2me-labs"
domain = "s3-eu-west-1.amazonaws.com"
site = "https://{}.{}".format(bucket, domain)

!rm -rf clone_sample_data clone_sample_data.tar.gz
!wget -O clone_sample_data.tar.gz \
    "$site/$tutorial_name/clone_sample_data.tar.gz"
!tar -xzvf clone_sample_data.tar.gz


import os
import multiprocessing
import pandas as pd
import pysam

import aplanat
from aplanat.graphics import InfoGraphItems
from aplanat.report import HTMLReport
from epi2melabs.notebook import InputForm, InputSpec

import ipywidgets as widgets

report = None
exec_summary = None

def process_form(inputs):
    global report
    global exec_summary
    
    report = HTMLReport(
        "Nanopore Clone Validation workflow", "EPI2ME Labs Summary",
        require_keys=True)
    report.markdown('### Experiment Summary', 'exp-summ-head')
    report.placeholder(key='exec-plot')  # placeholder
    exec_summary = InfoGraphItems()
    
    input_reads = inputs._input_reads
    
    all_good = True
    !cecho ok "Checking read inputs..."
    fail_msg = "The input read file is not a valid file or folder."
    if os.path.isdir(input_reads):
        print(" - Found input folder.")
    elif os.path.isfile(input_reads):
        print(" - Found input file, moving to a folder")
        # guppy demultiplexer takes only an input folder
        fname = os.path.basename(input_reads)
        !mkdir -p reads \
            && mv $input_reads reads
        input_reads = os.path.abspath("reads")
    elif os.path.isdir('reads'):
        moved_file = os.path.join('reads', os.path.basename(input_reads))
        if os.path.isfile(moved_file):
            input_reads = os.path.abspath("reads")
            print(" - Found previously moved file.")
        else:
            !cecho error "$fail_msg"
            all_good = False
    else:
        !cecho error "$fail_msg"
        all_good = False
    print(" - Input reads is: {}.".format(input_reads))
    # set input_path on inputs form for others to access
    inputs.input_reads = input_reads

    !cecho ok "Making output folder..."
    outdir = inputs.output_folder
    !mkdir -p "$outdir"

    !cecho ok "Checking reference inputs..."
    if not os.path.isfile(inputs.target_dna_sequence):
        !cecho error "Target sequence file does not exist."
        all_good = False
    else:
        with pysam.FastaFile(inputs.target_dna_sequence) as fh:
            if fh.nreferences != 1 or fh.lengths[0] < 1:
                !cecho error " - Vector .fasta does not contain a single sequence."
                all_good = False
            else:
                print(" - Found sequence (target): {}".format(fh.references[0]))
    if not os.path.isfile(inputs.target_peptide):
        !cecho error "Target peptide file does not exist."
        all_good = False
    else:
        with pysam.FastaFile(inputs.target_peptide) as fh:
            if fh.nreferences != 1 or fh.lengths[0] < 1:
                !cecho error " - Target .fasta does not contain a single sequence."
                all_good = False
            else:
                print(" - Found sequence (peptide): {}".format(fh.references[0]))

    if all_good:
        !cecho success "Inputs validated."
    else:
        !cecho error "One or more errors detected while checking inputs."

inputs = InputForm(
    InputSpec('_input_reads', 'Select .fastq dataset', 'clone_sample_data/flongle_clone_data.fastq.gz'),
    InputSpec('output_folder', 'Output folder', 'analysis'),
    InputSpec('target_dna_sequence', 'Vector .fasta', 'clone_sample_data/vector.fasta'),
    InputSpec('target_peptide', 'Target peptide', 'clone_sample_data/peptide1.fasta'),
    InputSpec('threads', 'Processing threads', (1, multiprocessing.cpu_count(), 1)),
    InputSpec('demultiplex', 'Demultiplex', True),
    InputSpec('barcode_kit', 'Barcoding kit', 'SQK-RBK004'))
inputs._fname = os.path.abspath("sample_data.fastq.gz")
inputs.add_process_button(process_form)
inputs.display()

VBox(children=(HBox(children=(Label(value='Select .fastq dataset', layout=Layout(width='150px')), interactive(…


# Running Guppy demultiplexing (click play)

input_reads = inputs.input_reads
output_folder = inputs.output_folder
barcode_kit = inputs.barcode_kit
threads = inputs.threads

!rm -rf $output_folder/demultiplex
if inputs.demultiplex:
    !guppy_barcoder \
        --barcode_kits $barcode_kit \
        --compress_fastq --records_per_fastq 0 --recursive --worker_threads $threads \
        --save_path $output_folder/demultiplex \
        --input_path $input_reads \
        && cecho success "Guppy finished successfully" \
        || cecho error "Guppy failed"
else:
    print("Skipping demultiplexing as requested.")

ONT Guppy barcoding software version 4.5.4+66c1a77
input path:         /epi2melabs/clone_validation/reads
save path:          analysis/demultiplex
arrangement files:  barcode_arrs_rbk4.cfg 
lamp arr. files:    barcode_arrs_ncov8.cfg barcode_arrs_ncov96.cfg barcode_arrs_multivirus1.cfg barcode_arrs_multivirus8.cfg
min. score front:   60
min. score rear:    60

Found 1 input files.

0%   10   20   30   40   50   60   70   80   90   100%
|----|----|----|----|----|----|----|----|----|----|
***************************************************
Done in 2574 ms.
Guppy finished successfully


# Review demultiplex barcode assignments (click play)

import ipywidgets as widgets
from IPython.display import display

import aplanat
from aplanat import bars, hist, annot
from bokeh.models import Span
from bokeh.layouts import gridplot

minimum_reads = 150
read_source = None
complete_fastq = None
dataset_name = None
dataset_folder = None
barcodes_found = 'NA'

if not inputs.demultiplex:
    read_source = inputs.input_reads
    dataset_name = 'all_data'
    dataset_folder = os.path.join(inputs.output_folder, dataset_name)
    complete_fastq = os.path.join(dataset_folder, "all_data.fastq")
    barcodes_found = 'NA'
else:
    barcode_summary_file = os.path.join(
        inputs.output_folder, "demultiplex", "barcoding_summary.txt")
    if not os.path.exists(barcode_summary_file):
        print("Could not find barcode summary: {}.".format(barcode_summary_file))
    else:
        try:
            bc_summary = pd.read_csv(barcode_summary_file, sep='\t')
        except Exception as e:
            print("Could not read barcode summary: {}".format(barcode_summary))
    
    # barcode count plot
    barcode_counts = pd.DataFrame(bc_summary['barcode_arrangement'].value_counts()) \
        .sort_index().reset_index().rename(
            columns={'index':'barcode', 'barcode_arrangement':'count'})
    bc_counts = bars.simple_bar(
        barcode_counts['barcode'], barcode_counts['count'], colors='#0084A9',
        title='Number of reads per barcode')
    bc_counts = annot.subtitle(
        bc_counts,
        '>{} reads are recommended for further analysis.'.format(
            minimum_reads))
    bc_counts.xaxis.major_label_orientation = 3.14 / 4
    bc_counts = annot.marker_hline(bc_counts, minimum_reads)
    aplanat.show(bc_counts, background="#f4f4f4")
    report.plot(bc_counts, key='bc-plot')

    barcode_counts['sufficient data'] = barcode_counts['count'] > minimum_reads
    valid_barcodes = set(
        barcode_counts.loc[barcode_counts['sufficient data']]['barcode'])
    barcodes_found = len(valid_barcodes)
    valid_barcodes.discard("unclassified")
    valid_barcodes = sorted(valid_barcodes)

    def barcode_change(change):
        global dataset_name
        global dataset_folder
        global read_source
        global complete_fastq
        if change['type'] == 'change' and change['name'] == 'value':
            dataset_name = change['new']
            dataset_folder = os.path.join(output_folder, dataset_name)
            read_source = os.path.join(
                output_folder, "demultiplex", dataset_name)
            complete_fastq = os.path.join(dataset_folder, "{}.fastq".format(dataset_name))

    barcode_dropdown = widgets.Dropdown(
        options=valid_barcodes, value=dataset_name, description='Barcode:')
    barcode_dropdown.observe(barcode_change)
    print("Select a barcode to use for the remainder")
    display(barcode_dropdown)
exec_summary.append('Barcodes found', str(barcodes_found), 'balance-scale')

Select a barcode to use for the remainder

Dropdown(description='Barcode:', options=('barcode01', 'barcode02', 'barcode03', 'barcode04', 'barcode05'), va…


# FASTQ assessment (click show)
from aplanat.hist import histogram
from bokeh.layouts import gridplot
import numpy as np
from pysam import FastxFile

if read_source is None or complete_fastq is None:
    raise RuntimeError("Please run the previous code cell.")

!mkdir -p $dataset_folder
!rationalize_fastq -i $read_source -o $complete_fastq

mean_qualities = list()
lengths = list()

# open the file and iterate through its records
with FastxFile(complete_fastq) as fq:
    for rec in fq:
        # ONT calculation for "mean Q score"
        quals = np.fromiter(
            (ord(x) - 33 for x in rec.quality),
            dtype=int, count=len(rec.quality))
        mean_p = np.mean(np.power(10, quals/-10))
        mean_qualities.append(-10*np.log10(mean_p))
        lengths.append(len(quals))

p1 = histogram(
    [np.array(mean_qualities)], title="Read quality scores",
    x_axis_label="quality", y_axis_label="count", colors=['#0084A9'],
    height=250, width=300)
p2 = histogram(
    [lengths], colors=['#0084A9'], title="Read lengths",
    x_axis_label="read length / bases", y_axis_label="count",
    height=250, width=300, bins=100, xlim=(0, 7000))
stats_plots = gridplot((p1, p2), ncols=2)
aplanat.show(stats_plots, background="#f4f4f4")

# Setup report for barcode 
section = report.add_section(dataset_name)
bc_summary = InfoGraphItems()
bc_summary.extend((
    ('Median read length', str(int(np.median(lengths))), 'align-center', 'b'),
    ('Median read quality', str(int(np.median(mean_qualities))), 'thumbs-up', '')))
section.markdown(
    "### {} Analysis".format(dataset_name.capitalize()),
    key='{}-heading'.format(dataset_name))
section.placeholder('exec-plot')
section.plot(stats_plots, key='statsplots')

Input: /epi2melabs/clone_validation/analysis/demultiplex/barcode01
Output: /epi2melabs/clone_validation/analysis/barcode01/barcode01.fastq
Directory detected, concatenating found files.
 - Found 1 files.
 - Concatenating files...done.


import ipywidgets as widgets

data_prefix = None
fastq_source = None
ave_length = None

def process_form(inputs):
    global data_prefix
    global fastq_source
    global ave_length
    
    print("Calculating stats. for complete dataset...")
    print(" - Input: {}.".format(complete_fastq))
    !seqkit stats --tabular "$complete_fastq" > "$complete_fastq".stats
    complete_stats = pd.read_csv("{}.stats".format(complete_fastq), delimiter='\t')

    print("Filtering dataset by length and quality...")
    filtered_fastq = complete_fastq.replace('.fastq', '.filtered.fastq')
    short, long = inputs.shortest_read, inputs.longest_read
    qual = inputs.quality_filter
    !seqkit seq --remove-gaps --min-len "$short" --max-len "$long" \
        --min-qual "$qual" "$complete_fastq" > "$filtered_fastq"
    !seqkit stats --tabular "$filtered_fastq" > "$filtered_fastq".stats
    filtered_stats = pd.read_csv("{}.stats".format(filtered_fastq), delimiter='\t')

    print("Subsampling filtered dataset...")
    needed_bases = inputs.expected_size * inputs.assemble_x
    reqd_reads = int((inputs.expected_size * inputs.assemble_x) // filtered_stats['avg_len'][0])
    print("Require {} bases ({} total), approximately {} reads".format(
        needed_bases, filtered_stats['sum_len'][0], reqd_reads))

    sampled_fastq = filtered_fastq.replace(
        '.filtered.fastq', '.filtered.sampled.fastq')
    !seqkit sample --quiet -n "$reqd_reads" --two-pass "$filtered_fastq" > "$sampled_fastq" 
    !seqkit stats --tabular "$sampled_fastq" > "$sampled_fastq".stats
    sampled_stats = pd.read_csv("{}.stats".format(sampled_fastq), delimiter='\t')

    print("Creating statistics table...")
    display_stats = pd.concat(
        (complete_stats, filtered_stats, sampled_stats)).reset_index(drop=True)
    display_stats['file'] = display_stats['file'].apply(os.path.basename)
    display(display_stats)
    section = report.sections[dataset_name]
    section.markdown(
        "The table below details read counts and lengths after filtering.",
        key="countlead".format(dataset_name))
    section.table(display_stats, key='read-stats')

    fastqs = dict(zip(
        ('complete', 'filtered', 'sampled'),
        ((complete_fastq, complete_stats),
         (filtered_fastq, filtered_stats),
         (sampled_fastq, sampled_stats))))
    fastq_source, stats_source = fastqs[inputs.dataset]
    data_prefix = os.path.splitext(fastq_source)[0]
    ave_length = int(stats_source['avg_len'][0])
    print("Selecting `{}` for onward use, with average read length {}.".format(
        fastq_source, ave_length))
    bc_summary.append('Reads used', str(int(stats_source['num_seqs'][0])), 'bookmark')

filter_form = InputForm(
    InputSpec('shortest_read', 'Shortest read', widgets.IntText(4000)),
    InputSpec('longest_read', 'Longest read', widgets.IntText(5000)),
    InputSpec('quality_filter', 'Min. Quality', widgets.IntText(13)),
    InputSpec('expected_size', 'Expected assembly size', widgets.IntText(4700)),
    InputSpec('assemble_x', 'Read coverage to use', widgets.IntText(150)),
    InputSpec('dataset', 'Read sampling to use', ['sampled', 'complete', 'filtered']))
filter_form.add_process_button(process_form)
filter_form.display()

VBox(children=(HBox(children=(Label(value='Shortest read', layout=Layout(width='150px')), interactive(children…


# run alignments
target_dna_sequence = inputs.target_dna_sequence
!cecho ok "Running alignments for: $data_prefix"

!rm -f "$data_prefix".bam*
!rm -rf "$target_dna_sequence".*
!mini_align \
    -r "$target_dna_sequence" -i "$fastq_source" \
    -p "$data_prefix" \
    -t 4 \
    || cecho error "Alignment failed"
# calculate read stats
!stats_from_bam "$data_prefix".bam > "$data_prefix".bam.stats \
    || cecho "Failed to create alignment statistics"
# calculate depth
!mosdepth -t 2 --flag 1812 "$data_prefix".fwd "$data_prefix".bam 
!mosdepth -t 2 --flag 1796 --include-flag 16 "$data_prefix".rev "$data_prefix".bam


# Alignment visualization (click play)
from aplanat import annot, lines, hist, util
from bokeh.layouts import gridplot

depth_fwd = pd.read_csv(
    '{}.fwd.per-base.bed.gz'.format(data_prefix), sep='\t',
    names=['ref', 'start', 'end', 'depth'])
depth_rev = pd.read_csv(
    '{}.rev.per-base.bed.gz'.format(data_prefix), sep='\t',
    names=['ref', 'start', 'end', 'depth'])
depth_plot = lines.steps(
    [depth_fwd['start'], depth_rev['start']],
    [depth_fwd['depth'], depth_rev['depth']],
    colors=['darkolivegreen', 'maroon'],
    names=['Forward', 'Reverse'],
    x_axis_label='Position along reference',
    y_axis_label='sequencing depth / bases')
stats = pd.read_csv("{}.bam.stats".format(data_prefix), sep='\t')

x_grid, pdf = util.kernel_density_estimate(stats['acc'])
mode = x_grid[np.argmax(pdf)]
acc_plot = lines.line(
    [x_grid], [pdf],
    x_axis_label="read accuracy", y_axis_label="read density")
acc_plot = annot.marker_vline(acc_plot, mode, label='mode: {:.1f}'.format(mode))

x_grid, pdf = util.kernel_density_estimate(stats['coverage'])
mode = x_grid[np.argmax(pdf)]
cover_plot = lines.line(
    [x_grid], [pdf],
    x_axis_label="read coverage", y_axis_label="read density")
cover_plot = annot.marker_vline(cover_plot, mode, label='mode: {:.1f}'.format(mode))

plot = gridplot([acc_plot, cover_plot, depth_plot], ncols=2)
aplanat.show(plot, background="#f4f4f4")
section = report.sections[dataset_name]
section.markdown(
    "Read metrics with respect to the provided reference are shown below. "
    "The read coverage is expected to be low for reads of a similar length "
    "to the reference size. Similarly the coverage graph may show a large "
    "bias toward the middle of the sequence.",
    key="countlead".format(dataset_name))
section.plot(plot, key='read-plots')


# De novo assembly input average read length
assembly_out = None
def process_form(flye_inputs):
    global assembly_out
    assembly_out = os.path.join('epi2melabs','analysis',dataset_folder, "flye")
    min_overlap = int(flye_inputs.overlap_fraction * min(filter_form.expected_size, ave_length))
    !rm -rf $assembly_out
    !mkdir -p "$assembly_out"
    threads = inputs.threads
    expected_size = filter_form.expected_size
    !flye --out-dir $assembly_out --nano-raw $fastq_source \
        --genome-size $expected_size --meta --threads $threads \
        --min-overlap $min_overlap

flye_form = InputForm(
    InputSpec('overlap_fraction', 'Flye "overlap_fraction"', (0.05, 1, 0.05),
              long_desc="The minumum overlap fraction is multiplied by the average read length to determine the minimum read overlap size used within the assembly."))
flye_form.add_process_button(process_form)
flye_form.display()


assembly = os.path.join(assembly_out, "assembly.fasta")
flye_info = pd.read_csv(os.path.join(assembly_out, "assembly_info.txt"), delimiter='\t')
display(flye_info)
section = report.sections[dataset_name]
section.markdown('Summary of initial assembly:', key='assm-stats-head')
section.table(flye_info, index=False, key='assm-stats')


# Creating a dotplot (click play)
import tempfile
from aplanat import base
assm_dir = os.path.join(inputs.output_folder, "assm_check")
assm2assm = os.path.join(assm_dir, "aln.paf")
!mkdir -p $assm_dir

def last_dot(fasta, ref=None):
    tmp = tempfile.mkdtemp()
    if ref is None:
        ref = fasta
    !lastdb $tmp/lastdb $ref
    !lastal $tmp/lastdb $fasta > $tmp/alns.maf
    records = list()
    with open(os.path.join(tmp, "alns.maf")) as maf:
        while True:
            line = maf.readline()
            if line.startswith('#'):
                continue
            elif line.startswith('a'):
                r1 = maf.readline().split()[1:5]
                r2 = maf.readline().split()[1:5]
                blank = maf.readline()
                records.append(r1 + r2)
            elif line == "":
                break
            else:
                print(line)
                raise IOError("Cannot read alignment file")
    names = ['ref', 'rstart', 'rlen', 'rorient', 'query', 'qstart', 'qlen', 'qorient']
    df = pd.DataFrame(records, columns=names)
    df = df.loc[df['qorient'] == '+']
    for f in ['qstart', 'qlen', 'rstart', 'rlen']:
        df[f] = df[f].astype(int)
    df['qend'] = df['qstart'] + df['qlen']
    df['rend'] = df['rstart'] + df['rlen']
    !rm -rf $tmp
    return df

df = last_dot(assembly)    
dotplot = base.simple(
    [], [], xlim=(0, max(df['rend'])), ylim=(0, max(df['qend'])),
    width=440, height=400, x_axis_label='position', y_axis_label='position')
dotplot.segment(df['rstart'], df['qstart'], df['rend'], df['qend'])
aplanat.show(dotplot, background="#f4f4f4")


# Circularising assembly by overlap-based trimming (click play)
from aplanat.report import bokeh_table
import pysam

def split_and_align(sequence_in, name):

    print("Aligning two halves of assembly...")
    with pysam.FastaFile(sequence_in) as fasta:
        if len(fasta.references) > 1:
            raise IOError("More than one contig found in assembly.")
        rec = fasta.fetch(fasta.references[0])
        half = len(rec) // 2
        first, second = rec[0:half], rec[half:]
        lengths = [len(x) for x in (first, second)]
    
    split_assm = os.path.join(assm_dir, 'split.{}.fasta')
    paf = os.path.join(assm_dir, 'overlap.paf')
    for name, seq in enumerate((first, second)):
        with open(split_assm.format(name), 'w') as fh:
            fh.write(">seq{}\n{}\n".format(name, seq))
    s0, s1 = (split_assm.format(x) for x in range(2))
    !minimap2 -cx asm5 $s0 $s1 > $paf 2> /dev/null
    return paf, first, second

# flye can give lots of repeats, find overlaps and trime iteratively
finished = False
iteration = 0
trimmed_assm = assembly
plots = list()
while not finished:
    iteration += 1
    print("Iteration: {}".format(iteration))
    print(" - Trimming sequence according to overlap...")
    paf, first, second = split_and_align(trimmed_assm, 'iter_{}'.format(iteration))
    try:
        df = pd.read_csv(paf, sep='\t', header=None)
    except ValueError:
        print(" - No alignments found, leaving assembly untrimmed.")
        finished = True
    else:
        if len(df) != 1:
            print(" - Found multiple alignments, selecting best.")
            df = df[df[2] < 5]
            if len(df) == 0:
                print("- Could not find suitable alignment, leaving assembly untrimmed.")
                finished = True
        else:
            finished = True
        start1, end1 = df[2][0], df[3][0]
        start0, end0 = df[7][0], df[8][0]
        new_seq = second[:end1] + first[end0:]
        trimmed_assm = assembly.replace(".fasta", ".trimmed.fasta")
        print(" - Writing trimmed sequence to {}.".format(trimmed_assm))
        with open(trimmed_assm, 'w') as fh:
            fh.write(">contig_trim\n{}\n".format(new_seq))
        print("Redoing dotplot...")
        df = last_dot(trimmed_assm)
        dotplot = base.simple(
            [], [], xlim=(0, max(df['rend'])), ylim=(0, max(df['qend'])),
            width=440, height=400, title='After trim iteration {}'.format(iteration),
            x_axis_label='position', y_axis_label='position')
        dotplot.segment(df['rstart'], df['qstart'], df['rend'], df['qend'])
        plots.append(dotplot)
        plots.append(bokeh_table(df, index=False))

aplanat.show(gridplot(plots, ncols=2), background="#f4f4f4")

Iteration: 1
 - Trimming sequence according to overlap...
Aligning two halves of assembly...
 - Writing trimmed sequence to epi2melabs/analysis/analysis/barcode01/flye/assembly.trimmed.fasta.
Redoing dotplot...


# Manual edit of assembly (click play)
from Bio import SeqIO

def process_form(inputs):
    global trimmed_assm
    fasta = SeqIO.read(trimmed_assm, "fasta")
    manual_assm = assembly.replace(".fasta", ".manualtrim.fasta")
    if inputs.end_position > inputs.start_position:
        edited = fasta[inputs.start_position:inputs.end_position]
        SeqIO.write(edited, manual_assm, "fasta")
        trimmed_assm = manual_assm
    else:
        print("Skipping manual trim.")

edit_form = InputForm(
    InputSpec('start_position', 'Start position', widgets.IntText()),
    InputSpec('end_position', 'End position', widgets.IntText()))
edit_form.add_process_button(process_form)
edit_form.display()

VBox(children=(HBox(children=(Label(value='Start position', layout=Layout(width='150px')), interactive(childre…


# Running medaka to polish the assembly (click play)
from Bio import SeqIO
medaka_dir = os.path.join(dataset_folder, "medaka")
medaka_consensus = os.path.join(medaka_dir, "consensus.fasta")
!rm -rf $medaka_dir
!mkdir -p $medaka_dir/
threads =inputs.threads
!mini_assemble -t $threads -i $fastq_source -r $trimmed_assm -o $medaka_dir/racon/ -p assm 2>/dev/null 
!run medaka_consensus -i $fastq_source -d $trimmed_assm -o $medaka_dir
medaka_seq = SeqIO.read(medaka_consensus, "fasta")
text = "The final assembly after medaka correction contains {} bases".format(len(medaka_seq))
bc_summary.append("Assembly length", str(len(medaka_seq)), 'ruler', 'b')
print(text)
section = report.sections[dataset_name];
section.markdown(text, "medaka-length");


start_tolerance = 20

def process_form(inputs):
    global start_tolerance
    start_tolerance = inputs.start_tolerance

start_tol_form = InputForm(
    InputSpec('start_tolerance', 'Start tolerance', 20))
start_tol_form.add_process_button(process_form)
start_tol_form.display()

VBox(children=(HBox(children=(Label(value='Start tolerance', layout=Layout(width='150px')), interactive(childr…


# Synchronising assembled plasmid and reference plasmid (click play)

from Bio import SearchIO, SeqIO
corrected_sequence = medaka_consensus
curation_folder = os.path.join(dataset_folder, "curation")
curated_sequence = os.path.join(curation_folder, "assembly.fasta")

blast_db = os.path.join(
    curation_folder, "{}.blastdb".format(os.path.basename(target_dna_sequence)))
blast_result = blast_db.replace(".blastdb", "blastxml")
!makeblastdb -in $target_dna_sequence -dbtype nucl -out $blast_db
!blastn -db $blast_db -query $medaka_consensus -outfmt 5 -out $blast_result

blast_parse = SearchIO.read(blast_result, 'blast-xml')
if len(blast_parse) > 0:
    qresult = blast_parse[0]
    if qresult[0].hit_frame < 0:
        print("Aligned sequence on the reverse strand - reverse complementing")
        corrected_sequence = os.path.join(medaka_dir, "consensus_rev_comp.fasta")
        !seqkit seq -p -r -t DNA -v $medaka_consensus > $corrected_sequence
        !blastn -db $blast_db -query $corrected_sequence -outfmt 5 -out $blast_result
else:
    print("No blast results")

blast_parse = SearchIO.read(blast_result, 'blast-xml')
if len(blast_parse) > 0:
    qresult = blast_parse[0]
    qres = sorted(qresult, key=lambda x: x.hit_start)
    hsp = qres[0]
    if hsp.hit_start <= start_tolerance:
        cut_pos = hsp.query_start
        print(
            "Cutting consensus at {}. Ref. position {}.".format(
                cut_pos, hsp.hit_start))
        fasta = SeqIO.read(corrected_sequence, "fasta")
        # write the new fasta sequence
        edited = fasta[cut_pos:] + fasta[:cut_pos]
        SeqIO.write(edited, curated_sequence, "fasta")
    else:
        print(
            "Could not find a valid match. The closest match to the "
            "start was at position {}. The sequence will be left " 
            "unchanged".format(hsp.hit_start))
        curated_sequence = corrected_sequence
        !cp $corrected_sequence $curated_sequence
else:
    curated_sequence = corrected_sequence
    !cp $corrected_sequence $curated_sequence

print("\nFinal curated sequence: {}.".format(curated_sequence))
print("Creating dotplot with rotated assembly against reference...")
df = last_dot(curated_sequence, ref=target_dna_sequence)
dotplot = base.simple(
    [], [], xlim=(0, max(df['rend'])), ylim=(0, max(df['qend'])),
    width=440, height=400, x_axis_label='ref position', y_axis_label='query position')
dotplot.segment(df['rstart'], df['qstart'], df['rend'], df['qend'])
aplanat.show(dotplot, background="#f4f4f4")


# Alignment of reads to assembly (click play)

eval_path = os.path.join(dataset_folder, 'align_eval')
prefix = os.path.join(eval_path, 'reads2assm')
!rm -rf $eval_path
!mkdir -p $eval_path
# double sequence
doubled_seq = os.path.join(eval_path, "doubled.fasta")
seq = SeqIO.read(curated_sequence, "fasta")
half_len = len(seq) // 2
edited = seq[half_len:] + seq + seq[:half_len]
SeqIO.write(edited, doubled_seq, "fasta")
# align
print("Aligning reads...", end='')
!mini_align \
    -r "$doubled_seq" -i "$complete_fastq" \
    -p "$prefix" \
    -t 4 2> /dev/null
print("done.")
# calculate read stats
!stats_from_bam "$prefix".bam > "$prefix".bam.stats 2>/dev/null
# calculate depth
!mosdepth -t 2 --flag 1812 "$prefix".fwd "$prefix".bam 
!mosdepth -t 2 --flag 1796 --include-flag 16 "$prefix".rev "$prefix".bam

def unroll_depth(df, size):
    hsize = size // 2
    start = df['start'] < hsize
    mid = (df['start'] >= hsize) & (df['start'] < hsize + size)
    end = df['start'] >= size + hsize
    for item in ('start', 'end'):
        df.loc[start, item] += hsize
        df.loc[mid, item] -= hsize
        df.loc[end, item] -= size + hsize
    result = np.zeros(size, dtype=[('pos', int), ('depth', int)])
    result['pos'] = np.arange(size)
    for _, row in df.iterrows():
        result['depth'][row['start']:row['end']] += row['depth']
    return result

fwd = pd.read_csv(
    '{}.fwd.per-base.bed.gz'.format(prefix), sep='\t',
    names=['ref', 'start', 'end', 'depth'])
depth_fwd = unroll_depth(fwd, len(seq))
rev = pd.read_csv(
    '{}.rev.per-base.bed.gz'.format(prefix), sep='\t',
    names=['ref', 'start', 'end', 'depth'])
depth_rev = unroll_depth(rev, len(seq))

depth_plot = lines.steps(
    [depth_fwd['pos'], depth_rev['pos']],
    [depth_fwd['depth'], depth_rev['depth']],
    colors=['darkolivegreen', 'maroon'],
    names=['Forward', 'Reverse'],
    x_axis_label='Position along reference',
    y_axis_label='sequencing depth / bases',
    title='All reads aligned to circularise assembly')
stats = pd.read_csv("{}.bam.stats".format(prefix), sep='\t')

x_grid, pdf = util.kernel_density_estimate(stats['coverage'])
mode = x_grid[np.argmax(pdf)]
cover_plot = lines.line(
    [x_grid], [pdf],
    x_axis_label="read coverage", y_axis_label="read density",
    title='Alignment read coverage against circularised assembly',
    xlim=(90,100))
cover_plot = annot.marker_vline(cover_plot, mode, label='mode: {:.1f}'.format(mode))

plot = gridplot([cover_plot, depth_plot], ncols=2)
aplanat.show(plot, background="#f4f4f4")
section.plot(plot, key='assm-eval')

Aligning reads...done.


# Identifying target peptide in assembly (click play)
from aplanat.graphics import infographic
print("Assembly: {}".format(curated_sequence))
print("Target peptide: {}".format(inputs.target_peptide))
tp = inputs.target_peptide
!cat $tp

diamond_db = os.path.join(medaka_dir, f"{os.path.basename(inputs.target_peptide)}.dmnd")
medaka_diamond = os.path.join(medaka_dir, "consensus.diamond")
!rm -f $medaka_diamond
!diamond makedb --in "$working_dir"/"$tp" --db "$diamond_db" --quiet
!diamond blastx --db "$diamond_db" -q $curated_sequence \
    --outfmt 0 -o "$medaka_diamond" --quiet \
    --threads 1 --more-sensitive --frameshift 15
!cat $medaka_diamond
peptide_found = True
with open(medaka_diamond) as fh:
    data = ''.join(("    {}".format(x) for x in fh.readlines()))
    if "No hits found" in data:
        peptide_found = False
    section = report.sections[dataset_name]
    section.markdown(
        "Alignment of target peptide to assembly:", key="pep-align-lead")
    section.markdown(data, key="pep-align")
icon = ['times', 'check'][peptide_found]
bc_summary.append("Peptide found", "", icon)
plot = infographic(bc_summary.values())
aplanat.show(plot, background="#f4f4f4")
section = report.sections[dataset_name]
section['exec-plot'] = plot

# update all barcodes report
report['exec-plot'] = infographic(exec_summary.values())
fname = os.path.join(output_folder, 'report.html')
report.write(fname)
print("Report written to: {}.".format(os.path.abspath(fname)))

Assembly: analysis/barcode01/curation/assembly.fasta
Target peptide: clone_sample_data/peptide1.fasta
>peptide1
MSAWSHPQFEKGGGSGGGSGGSAWSHPQFEKSGGGGGENLYFQGMSAKAAEGYEQIEVDV
VAVWKEGYVYENRGSTSVDQKITITKGMKNVNSETRTVTATHSIGSTISTGDAFEIGSVE
VSYSHSHEESQVSMTETEVYESKVIEHTITIPPTSKFTRWQLNADVGGADIEYMYLIDEV
TPIGGTQSIPQVITSRAKIIVGRQIILGKTEIRIKHAERKEYMTVVSRKSWPAATLGHSK
LFKFVLYEDWGGFRIKTLNTMYSGYEYAYSSDQGGIYFDQGTDNPKQRWAINKSLPLRHG
DVVTFMNKYFTRSGLCYDDGPATNVYCLDKREDKWILEVVG
BLASTP 2.3.0+


Query= contig_trim

Length=4681

>peptide1
Length=341

 Score = 662 bits (1708),  Expect = 2.90e-231
 Identities = 341/341 (100%), Positives = 341/341 (100%), Gaps = 0/341 (0%)
 Frame = 2

Query  3659  MSAWSHPQFEKGGGSGGGSGGSAXXXXXXXXXXXXXXENLYFQGMSAKAAEGYEQIEVDV 3838
             MSAWSHPQFEKGGGSGGGSGGSAXXXXXXXXXXXXXXENLYFQGMSAKAAEGYEQIEVDV
Sbjct     1  MSAWSHPQFEKGGGSGGGSGGSAXXXXXXXXXXXXXXENLYFQGMSAKAAEGYEQIEVDV 60

Query  3839  VAVWKEGYVYENRGSTSVDQKITITKGMKNVNSETRTVTATHSIGSTISTGDAFEIGSVE 4018
             VAVWKEGYVYENRGSTSVDQKITITKGMKNVNSETRTVTATHSIGSTISTGDAFEIGSVE
Sbjct    61  VAVWKEGYVYENRGSTSVDQKITITKGMKNVNSETRTVTATHSIGSTISTGDAFEIGSVE 120

Query  4019  VSYSHSHEESQVSMTETEVYESKVIEHTITIPPTSKFTRWQLNADVGGADIEYMYLIDEV 4198
             VSYSHSHEESQVSMTETEVYESKVIEHTITIPPTSKFTRWQLNADVGGADIEYMYLIDEV
Sbjct   121  VSYSHSHEESQVSMTETEVYESKVIEHTITIPPTSKFTRWQLNADVGGADIEYMYLIDEV 180

Query  4199  TPIGGTQSIPQVITSRAKIIVGRQIILGKTEIRIKHAERKEYMTVVSRKSWPAATLGHSK 4378
             TPIGGTQSIPQVITSRAKIIVGRQIILGKTEIRIKHAERKEYMTVVSRKSWPAATLGHSK
Sbjct   181  TPIGGTQSIPQVITSRAKIIVGRQIILGKTEIRIKHAERKEYMTVVSRKSWPAATLGHSK 240

Query  4379  LFKFVLYEDWGGFRIKTLNTMYSGYEYAYSSDQGGIYFDQGTDNPKQRWAINKSLPLRHG 4558
             LFKFVLYEDWGGFRIKTLNTMYSGYEYAYSSDQGGIYFDQGTDNPKQRWAINKSLPLRHG
Sbjct   241  LFKFVLYEDWGGFRIKTLNTMYSGYEYAYSSDQGGIYFDQGTDNPKQRWAINKSLPLRHG 300

Query  4559  DVVTFMNKYFTRSGLCYDDGPATNVYCLDKREDKWILEVVG 4681
             DVVTFMNKYFTRSGLCYDDGPATNVYCLDKREDKWILEVVG
Sbjct   301  DVVTFMNKYFTRSGLCYDDGPATNVYCLDKREDKWILEVVG 341

Report written to: /epi2melabs/clone_validation/analysis/report.html.

Validation of cloned DNA material using Nanopore sequencing

Expected Duration: 45 minutes

Introduction¶

Getting started¶

Install additional software¶

Sample Data¶

Using your own data¶

Data entry¶

Preliminary Analysis Section¶

Analysis of a single barcoded analyte¶

De novo assembly of the plasmid sequence¶

Medaka polishing of assembly¶

Rotating assembly to reference and identifying target peptide¶

Summary¶