This tutorial implements the best-practices bioinformatics workflow for the assembly of SARS-CoV-2 viral genomes. The workflow in this document implements the ARTIC Nanopore bioinformatics SOP.
Computational requirements for this tutorial include:
Before anything else we will create and set a working directory:
from epi2melabs import ping
tutorial_name = 'ncov_tutorial'
pinger = ping.Pingu()
pinger.send_notebook_ping('start', tutorial_name)
# create a work directory and move into it
working_dir = '/epi2melabs/{}/'.format(tutorial_name)
!mkdir -p "$working_dir"
%cd "$working_dir"
The default EPI2ME Labs environment does not contain the ARTIC software. In this section we will prepare the environment with the necessary ARTIC software installation.
Please note that the software installed is not persistent: this step will need to be re-run if you stop and restart the EPI2ME Labs server.
Having connected to the EPI2ME Labs server, we will install the necessary software. Press the play button to the left-hand side of the cell below (it may appear as [ ]):
#ARTIC Bioinformatics installation (click the play button to install)
import os
%cd "$working_dir"
if os.path.exists("artic-ncov2019"):
print("Found existing install, removing.")
!rm -rf artic-ncov2019
print("Downloading artic-ncov")
# we need this just for primerset data
!git clone https://github.com/artic-network/artic-ncov2019.git
print("Downloading V1200 primer set")
# windows doesn't like that SARS-CoV-2 is a symlink, delete it and move ncov2019
!cd artic-ncov2019/primer_schemes/nCoV-2019 \
&& wget -O 1200bp_amplicon_bed.tar.gz https://zenodo.org/record/3897530/files/1200bp_amplicon_bed.tar.gz?download=1 \
&& tar -xf 1200bp_amplicon_bed.tar.gz
# install artic analysis pipeline
if os.path.exists("fieldbioinformatics"):
print("Found existing install, removing.")
!rm -rf fieldbioinformatics
!git clone https://github.com/artic-network/fieldbioinformatics
%cd fieldbioinformatics
!git checkout 1.2.1
!conda env remove -n artic
!mamba env create -q -f environment.yml
!. /opt/conda/etc/profile.d/conda.sh \
&& conda activate artic \
&& pip install -q . && pip install -q ipykernel
!echo
!echo "Testing install (version should be displayed below)"
!run artic --version
%cd "$working_dir"
This tutorial is provided with a sample dataset. Samples included in the demonstration dataset were obtained from European Nucleotide Archive project PRJNA650037. This project has the title Johns Hopkins Viral Genomics of Severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2) and describes 210 virus samples that have been sequenced according to the ARTIC protocol on a GridION device. Ten samples with unique barcodes in the range 1..12 were picked from this project for this demonstration dataset.
To download the sample file we run the Linux command `wget`. To execute the command click on the cell and then press Command/Ctrl-Enter, or click the Play symbol to the left-hand side.
bucket = "ont-exd-int-s3-euwst1-epi2me-labs"
domain = "s3-eu-west-1.amazonaws.com"
site = "https://{}.{}".format(bucket, domain)
!wget -O sars-samples.tar.gz "$site/ncov_tutorial/sars-samples.tar.gz"
print("Extracting data...")
!rm -rf sars-samples
!tar -xzf sars-samples.tar.gz
print("Done.")
# a second larger sample dataset is also available for testing
#!wget -O 96samples.tar.bz2 "$site/ncov_tutorial/96samples.tar.bz2"
#print("Extracting data...")
#!rm -rf 180min
#!tar -xjf 96samples.tar.bz2
#!mv 180min 96samples
#print("Done.")
If you wish to analyse your own data rather than the sample data, you can edit the value of the `input_folder` variable below. To find the correct full path of a file you can navigate to it in the Files browser to the left-hand side, right-click on the file and select Copy path:
The location shared with the EPI2ME Labs server from your computer will show as `/epi2melabs`. For example, a file located at `/data/my_gridion_run/fastq_pass` on your computer will appear as `/epi2melabs/my_gridion_run/fastq_pass` when it is the `/data` folder that is shared.
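To make this mapping concrete, the short sketch below translates a path on your computer into the path seen inside EPI2ME Labs; the folder names are purely illustrative and it assumes `/data` is the shared folder.
# Illustrative only: map a path on your computer to its EPI2ME Labs equivalent
shared_folder = "/data"                          # the folder shared with EPI2ME Labs
host_path = "/data/my_gridion_run/fastq_pass"    # path as seen on your computer
labs_path = host_path.replace(shared_folder, "/epi2melabs", 1)
print(labs_path)  # prints: /epi2melabs/my_gridion_run/fastq_pass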
The workflow requires `.fastq` files from an Oxford Nanopore Technologies' sequencing device. The workflow will, by default, demultiplex the reads; this step can be skipped via the options below if the reads have already been demultiplexed by MinKNOW or Guppy.
The input folder should be a folder containing one or more `.fastq` files, such as the `fastq_pass` folder output by MinKNOW (or Guppy).
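Before filling in the form it can be useful to confirm that your chosen folder actually contains `.fastq` data. The optional check below simply counts the files; the path assumes the demonstration dataset downloaded above, so adjust it for your own data.
# Optional sanity check: count .fastq/.fastq.gz files under a candidate input folder
import glob
import os

candidate_input = "/epi2melabs/ncov_tutorial/sars-samples"  # change to your own data folder
fastqs = glob.glob(os.path.join(candidate_input, "**", "*.fastq*"), recursive=True)
print("Found {} .fastq file(s) under {}".format(len(fastqs), candidate_input))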
import os
import shutil
import ipywidgets as widgets
from epi2melabs import notebook
input_folder = None
recursive = None
output_folder = None
run_name = None
overwrite = None
def process_form(inputs):
    global input_folder, recursive, output_folder, run_name, overwrite
input_folder = inputs.input_folder
recursive = inputs.recursive
output_folder = inputs.output_folder
run_name = inputs.run_name
overwrite = inputs.overwrite
success = True
msg_lines = []
# check input folder
!cecho ok "Checking input folder"
if not os.path.exists(input_folder):
print(notebook.error(" - `{}` does not exist.".format(input_folder)))
return
else:
msg = "Input folder: {}".format(input_folder)
print(notebook.success(msg))
# check output
!cecho ok "Checking output folder"
if os.path.exists(output_folder) and not overwrite:
msg = " - `{}` exists, overwrite not selected".format(output_folder)
print(notebook.error(msg))
return
else:
if os.path.exists(output_folder):
print(notebook.ok(" - Previous output folder exists, removing"))
try:
shutil.rmtree(output_folder)
except:
print(notebook.error(' - Error while deleting directory'))
return
os.mkdir(output_folder)
msg = "Output folder: {}".format(output_folder)
print(notebook.success(msg))
input_form = notebook.InputForm(
notebook.InputSpec('input_folder', 'Input folder', "/epi2melabs/ncov_tutorial/sars-samples"),
notebook.InputSpec('recursive', 'Recursive input search', False),
notebook.InputSpec('output_folder', 'Output folder', "/epi2melabs/ncov_tutorial/analysis/"),
notebook.InputSpec('run_name', 'Run name', 'run0'),
notebook.InputSpec('overwrite', 'Overwrite existing', False),
description_width='200px'
)
input_form.add_process_button(process_form)
input_form.display()
The form below can be used to change the analysis parameters from their defaults.
If your input directory contains reads which have already been demultiplexed, select the "Skip demultiplexing" option. You should, however, still select an appropriate barcode arrangements setting (either 12/24- or 96-barcodes).
The default minimum and maximum read lengths are appropriate for the ARTIC 400mer amplicon sets; experiments using an alternative amplicon scheme (e.g. the V1200 scheme) should change these values.
import glob
import multiprocessing
import os
import shutil
import ipywidgets as widgets
import aplanat.report
from epi2melabs import notebook
debug = True
ref_name = "MN908947.3"
ref_len = 29903
threads = None
primer_set = None
minimum_reads = None
min_read_length = None
max_read_length = None
barcode_arrangements = None
require_both_ends = None
skip_demultiplexing = None
seq_summary_file = None
medaka_model = None
report = None
!run medaka tools list_models > medaka_models.txt
model_keys = {'Available', 'Default consensus'}
models = dict()
with open("medaka_models.txt") as fh:
for line in fh.readlines():
for model_key in model_keys:
key, items = line.split(":")
mods = [m.strip() for m in items.split(",")]
models[key] = [m for m in mods if (('snp' not in m) and ('variant' not in m))]
def process_form(inputs):
global threads, \
primer_set, minimum_reads, min_read_length, max_read_length, \
barcode_arrangements, require_both_ends, skip_demultiplexing, \
seq_summary_file, medaka_model, report
threads = inputs.threads
primer_set = inputs.primer_set
minimum_reads = inputs.minimum_reads
#min_read_length = inputs.min_read_length
#max_read_length = inputs.max_read_length
barcode_arrangements = inputs.barcode_arrangements
skip_demultiplexing = inputs.skip_demultiplexing
medaka_model = inputs.medaka_model
if "V1200" in primer_set:
min_read_length, max_read_length = 150, 1200
else:
min_read_length, max_read_length = 400, 700
if "barcode_arrs_nb" in barcode_arrangements:
require_both_ends = "--require_barcodes_both_ends"
else:
require_both_ends = ""
text = """
###Analysis Parameters
input: {}
primer set: {}
minimum reads: {}
min. read length: {}
max. read length: {}
barcode arrangements: {}
""".format(
input_folder, primer_set, minimum_reads, min_read_length, max_read_length,
barcode_arrangements)
print(text.replace('###', ''))
report = aplanat.report.HTMLReport(
"SARS-Cov-2 Analysis",
"EPI2MELabs summary report for: {}.".format(run_name),
require_keys = True)
report.markdown(text, key="simple preamble")
primers = ["nCoV-2019/{}".format(v) for v in ("V1", "V2", "V3", "V1200")]
input_form = notebook.InputForm(
notebook.InputSpec('threads', 'Compute threads', (1, multiprocessing.cpu_count())),
notebook.InputSpec('primer_set', 'Primer set', widgets.Dropdown(options=primers, value="nCoV-2019/V3")),
notebook.InputSpec('minimum_reads', 'Minimum reads', widgets.IntText(10000)),
#notebook.InputSpec('min_read_length', 'Minimum read length', widgets.IntText(400)),
#notebook.InputSpec('max_read_length', 'Maximum read length', widgets.IntText(700)),
notebook.InputSpec(
'barcode_arrangements', 'Barcode arrangement file',
widgets.Dropdown(
options=[
'barcode_arrs_nb12.cfg',
'barcode_arrs_nb12.cfg barcode_arrs_nb24.cfg',
'barcode_arrs_nb96.cfg',
'barcode_arrs_rbk096.cfg'])),
notebook.InputSpec('skip_demultiplexing', 'Skip demultiplexing', False),
notebook.InputSpec(
'medaka_model', 'Medaka model name',
widgets.Dropdown(value=models['Default consensus'][0], options=models['Available'])),
description_width='200px'
)
input_form.add_process_button(process_form)
input_form.display()
With our software environment set up and our inputs specified, it is time to move on to the analysis. The following workflow begins with demultiplexing the reads using strict settings to avoid misidentification, followed by running of the ARTIC analysis, and finally creation of quality control plots to determine if the analysis is valid.
In this section we will run sample demultiplexing using the `guppy_barcoder` software. The results of this will appear in the output folder under the `demultiplex` folder. After demultiplexing, a report is produced from the demultiplexed `.fastq` data.
In the case of pre-demultiplexed reads, `guppy_barcoder` is skipped and the input folder is searched for outputs of previous demultiplexing.
# **Running demultiplexing** *(press play)*
!rm -rf $output_folder/demultiplex
if skip_demultiplexing:
!cecho ok "Attempting to find previous demultiplexing results"
found = 0
    for folder in glob.glob(os.path.join(input_folder, '**', 'barcode*'), recursive=True):
if os.path.isdir(folder):
found += 1
!mkdir -p $output_folder/demultiplex
!cp -r $folder $output_folder/demultiplex
if found > 0:
!cecho success "Found $found barcode folders"
else:
!cecho error "Did not find any previous demultiplexing results. The input folder sub contain subfolders of named `barcodeXX`."
else:
rec = "--recursive" if recursive else ""
!guppy_barcoder \
        $require_both_ends --arrangements_files "$barcode_arrangements" \
--compress_fastq --records_per_fastq 0 $rec --worker_threads $threads \
--save_path $output_folder/demultiplex \
--input_path $input_folder \
&& cecho success "Guppy finished successfully" \
|| cecho error "Guppy failed"
The demultiplexing produces a summary file recording the barcode found in each read, or "unclassified" if the barcode could not be confidently identified.
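If demultiplexing was run above (rather than skipped), `guppy_barcoder` writes this summary as `barcoding_summary.txt` in the demultiplex folder. The optional sketch below previews its first few rows; the exact columns can vary between Guppy versions.
# Optional: preview guppy_barcoder's barcoding summary, if it was produced
import os
import pandas as pd

summary_path = os.path.join(output_folder, "demultiplex", "barcoding_summary.txt")
if os.path.exists(summary_path):
    display(pd.read_csv(summary_path, sep="\t", nrows=5))
else:
    print("No barcoding_summary.txt found (demultiplexing may have been skipped).")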
# Merging data files (press play)
import pandas as pd
import pysam
from tqdm.notebook import tqdm
print(notebook.ok("Creating read summary from .fastq data."))
def get_read_lengths(fname):
data = []
barcode = os.path.basename(os.path.dirname(fname))
if not (barcode.startswith('barcode') or barcode == 'unclassified'):
        raise ValueError('.fastq data found in a folder not named "barcode*" or "unclassified".')
for read in pysam.FastxFile(fname):
data.append("{}\t{}\t{}\n".format(read.name, barcode, len(read.sequence)))
data = ''.join(data)
return data
fastqs = glob.glob(os.path.join(output_folder, "demultiplex/**", "*.fastq*"))
print(notebook.ok(" - Found {} files.".format(len(fastqs))))
seq_summary_file = os.path.join(output_folder, "read_lengths.txt")
try:
with open(seq_summary_file, "w") as fh:
fh.write("read_id\tbarcode_arrangement\tsequence_length_template\n")
#for results in map(get_read_lengths, fastqs):
# fh.write(results)
with tqdm(total=len(fastqs)) as progress:
for results in map(get_read_lengths, fastqs):
fh.write(results)
progress.update(1)
except Exception as e:
print(notebook.error(" - Failed to create sequence summary."))
else:
print(notebook.success(" - Created sequence summary."))
seq_summary = pd.read_csv(seq_summary_file, sep="\t")
Creating read summary from .fastq data.
 - Found 11 files.
 - Created sequence summary.
To generate a QC report, run the cell below:
# Reads QC Report (press play)
import aplanat
from aplanat import bars, hist, annot
from bokeh.models import Span
from bokeh.layouts import gridplot
import numpy as np
plots = list()
total_bases = seq_summary['sequence_length_template'].sum()
mean_length = total_bases / len(seq_summary)
median_length = np.median(seq_summary['sequence_length_template'])
# read length plot
datas = [seq_summary['sequence_length_template']]
length_hist = hist.histogram(
datas, bins=400,
title="Read length distribution.",
x_axis_label='Read Length / bases',
y_axis_label='Number of reads',
xlim=(0, 2000))
length_hist = annot.marker_vline(
length_hist, min_read_length,
label="Min: {}".format(min_read_length), text_baseline='bottom', color='grey')
length_hist = annot.marker_vline(
length_hist, max_read_length,
label="Max: {}".format(max_read_length), text_baseline='top')
length_hist = annot.subtitle(
length_hist,
"Mean: {:.0f}. Median: {:.0f}".format(
mean_length, median_length))
plots.append(length_hist)
# barcode count plot
good_reads = seq_summary.loc[
    (seq_summary['sequence_length_template'] > min_read_length)
    & (seq_summary['sequence_length_template'] < max_read_length)]
barcode_counts = (
pd.DataFrame(good_reads['barcode_arrangement'].value_counts())
.sort_index()
.reset_index()
.rename(
columns={'index':'barcode', 'barcode_arrangement':'count'})
)
bc_counts = bars.simple_bar(
barcode_counts['barcode'], barcode_counts['count'],
title='Number of reads per barcode (filtered by {} < length < {})'.format(min_read_length, max_read_length))
bc_counts = annot.subtitle(
bc_counts,
'Barcodes with fewer than {} reads will not be analysed further.'.format(
minimum_reads))
bc_counts.xaxis.major_label_orientation = 3.14/2
bc_counts = annot.marker_hline(bc_counts, minimum_reads)
plots.append(bc_counts)
# determine which barcode datasets are good
barcode_counts['sufficient data'] = barcode_counts['count'] > minimum_reads
valid_barcodes = set(
barcode_counts.loc[barcode_counts['sufficient data']]['barcode'])
plots = gridplot(plots, ncols=2)
aplanat.show(plots, background="#f4f4f4")
report.markdown("""
### Read Quality Control
The following depicts reads remaining after barcode demultiplexing.
""", key="qc_header")
report.plot(plots, key="qc_plots")
With demultiplexed reads, we are in a position to analyse each dataset independently using the ARTIC workflow.
# Run ARTIC analysis for each barcode (press play)
def run_artic(
barcode, directory, artic_output,
min_len=400, max_len=700, run_name="run0", threads=8,
scheme="V3"):
!mkdir -p $artic_output/$barcode
prefix = os.path.join(artic_output, barcode, run_name)
log_file = "{}_artic.log".format(prefix)
read_file = "{}_{}.fastq".format(prefix, barcode)
print("Writing log file to: {}.".format(log_file))
# read filtering
!cecho ok "Running artic guppyplex to filter reads"
!echo "=== Start of 'artic guppyplex' log" >> $log_file
!run artic guppyplex --skip-quality-check \
--min-length $min_len --max-length $max_len \
--directory $directory --prefix $prefix >>$log_file 2>&1 \
&& echo " - artic guppyplex finished"
# run everything else
!cecho ok "Running artic minion --medaka to call variants"
!echo "=== Start of 'artic minion' log" >> $log_file
!run artic minion --medaka --normalise 200 --threads $threads \
--scheme-directory $working_dir/artic-ncov2019/primer_schemes \
--read-file $read_file $scheme \
--medaka-model $medaka_model \
$prefix >>$log_file 2>&1 \
&& echo " - artic minion finished"
!cecho ok "Running alignment QC"
!echo "=== Start of 'alignment QC' log" >> $log_file
for i in (1,2):
input_bam = "{}.primertrimmed.rg.sorted.bam".format(prefix)
tag = "nCoV-2019_{}".format(i)
bam = "{}.{}".format(input_bam, tag)
if os.path.isfile(input_bam):
!samtools view -r $tag -b $input_bam > $bam
!samtools index $bam
!stats_from_bam $bam > $bam".stats"
!coverage_from_bam -s 50 -p $bam $bam
!rm -rf $bam $bam.bai
else:
!echo error " - No Artic output bam file found for primer set $i, Artic failed."
return prefix
# setup artic output
print(output_folder)
artic_output = os.path.join(output_folder, "artic")
# run analysis for each barcode
valid_barcodes = [
x for x in barcode_counts.loc[barcode_counts['sufficient data']]['barcode']
if x != "unclassified"]
print(notebook.warning("Running Artic for:"))
for barcode in valid_barcodes:
print(" {}".format(barcode))
print()
for barcode in valid_barcodes:
msg = "Running ARTIC analysis for: {}".format(barcode)
print(notebook.warning(msg))
print(notebook.warning("-" * len(msg)))
print()
artic_input = os.path.join(output_folder, "demultiplex", barcode)
!rm -rf $artic_output/$barcode
out_prefix = run_artic(
barcode, artic_input, artic_output,
min_len=min_read_length, max_len=max_read_length, run_name=run_name,
threads=threads, scheme=primer_set)
print("Results for {} can be found at {}".format(barcode, out_prefix))
msg = "ARTIC finished for: {}".format(barcode)
print(msg)
print("=" * len(msg))
print()
The ARTIC workflow produces the following files for each barcode:

- `<run_name>.rg.primertrimmed.bam` - BAM file for visualisation after primer-binding site trimming
- `<run_name>.trimmed.bam` - BAM file with the primers left on (used in variant calling)
- `<run_name>.merged.vcf` - all detected variants in VCF format
- `<run_name>.pass.vcf.gz` - detected variants in VCF format passing quality filter
- `<run_name>.fail.vcf` - detected variants in VCF format failing quality filter
- `<run_name>.primers.vcf` - detected variants falling in primer-binding regions
- `<run_name>.consensus.fasta` - consensus sequence

These will be present in folders named as `<output_folder>/artic/<barcode>/`, where `<output_folder>` is the value given at the top of this page and `<barcode>` is the identified barcode for each dataset.
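As a quick way to confirm these outputs exist, the optional sketch below lists the files produced for one barcode; `barcode01` is only an example, so substitute any barcode that was analysed in your run.
# Optional: list the ARTIC output files for an example barcode
import glob
import os

example_barcode = "barcode01"  # substitute a barcode analysed above
for path in sorted(glob.glob(os.path.join(artic_output, example_barcode, "*"))):
    print(os.path.basename(path))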
Running the cell below will produce a simple tabular summary for each barcoded dataset.
# ARTIC Summary Table (click play)
from collections import Counter, defaultdict
import gzip
from pysam import FastxFile
from Bio import SeqIO
if "96" in barcode_arrangements:
max_bc = 96
elif "24" in barcode_arrangements:
max_bc = 24
else:
max_bc = 12
!cecho ok "The tables below assume you are using up to $max_bc barcodes (detected from barcoding arrangement files)."
results = defaultdict(dict)
for barcode in ('barcode{:02}'.format(x) for x in range(1, max_bc + 1)):
results[barcode]["Ns"] = 'NA'
results[barcode]['variants'] = 'NA'
if not barcode in valid_barcodes:
continue
consensus_file = os.path.join(
artic_output, barcode, "{}.consensus.fasta".format(run_name))
vcf_file = os.path.join(
artic_output, barcode, "{}.pass.vcf.gz".format(run_name))
recs = 0
try:
for record in SeqIO.parse(consensus_file, "fasta"):
recs += 1
results[barcode]["Ns"] = record.seq.count("N")
except FileNotFoundError as e:
!cecho error "ARTIC pipeline failed to produce consensus.fasta file for $barcode"
continue
else:
if recs != 1:
!cecho error "ARTIC pipeline produced one than one consensus contig for $barcode"
continue
variants = 0
with gzip.open(vcf_file) as vcf:
for line in (x.decode() for x in vcf):
if line.startswith('#'):
continue
variants += 1
results[barcode]['variants'] = variants
df = pd.DataFrame.from_dict(results)
df.columns = ['bc{:02}'.format(x) for x in range(1, max_bc + 1)]
parts = [
['bc{:02}'.format(x) for x in range(start, start + 12)]
for start in range(1, max_bc + 1, 12)]
report.markdown("""
### Results
The variants discovered per barcode were as follows. `NA` indicates the barcode
was not present, or that too few reads were obtained for analysis to be performed.
""", key="results_header")
for i, p in enumerate(parts):
display(df[p])
report.table(df[p], key="summary_pt{}".format(i))
The tables below assume you are using up to 12 barcodes (detected from barcoding arrangement files).
| | bc01 | bc02 | bc03 | bc04 | bc05 | bc06 | bc07 | bc08 | bc09 | bc10 | bc11 | bc12 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Ns | 121 | 121 | 121 | NA | 339 | 123 | 510 | 121 | 121 | 121 | NA | 121 |
| variants | 7 | 7 | 6 | NA | 8 | 7 | 8 | 6 | 9 | 8 | NA | 8 |
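The `Ns` row can be converted into an approximate genome completeness using the reference length defined earlier. A small illustrative calculation, using the value shown for bc01 above, is given below.
# Illustrative only: approximate genome completeness from the consensus N count
example_ns = 121  # the Ns value reported for bc01 in the table above
completeness = 100 * (ref_len - example_ns) / ref_len  # ref_len = 29903 for MN908947.3
print("Approximate completeness: {:.1f}%".format(completeness))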
The results of the ARTIC pipeline include alignment of the reads to a reference genome. A summary of these alignments is produced by the section below. Things to look for here include even coverage of amplicons and that the negative control sample shows little to no data.
# Collate ARTIC alignment statistics (press play)
from aplanat import lines
from aplanat import util
bam_name = ".primertrimmed.rg.sorted.bam.nCoV-2019_{}"
cover_suffix = "_{}_0_{}.depth.txt".format(ref_name, ref_len)
stats_suffix = ".stats"
dfs_cover = list()
dfs_stats = list()
for barcode in valid_barcodes:
stats_stem = os.path.join(artic_output, barcode, run_name)
for i in (1,2):
df = pd.read_csv(stats_stem + bam_name.format(i) + cover_suffix, sep='\t')
df['primer_set'] = i
df['barcode'] = barcode
dfs_cover.append(df)
df = pd.read_csv(stats_stem + bam_name.format(i) + stats_suffix, sep='\t')
df['primer_set'] = i
df['barcode'] = barcode
dfs_stats.append(df)
cover_summary = pd.concat(dfs_cover)
stats_summary = pd.concat(dfs_stats)
dfs_cover, dfs_stats = None, None
print("Finished collating data")
Finished collating data
With the summary data collated we can plot coverage histograms for all barcoded samples, and across primer pools:
# Show coverage histograms (press play)
plots = list()
df = cover_summary
depth_lim = 100
for barcode in valid_barcodes:
pset = df['primer_set']
bc = df['barcode'] == barcode
xs = [df.loc[(pset == i) & bc]['pos'] for i in (1,2)]
ys = [df.loc[(pset == i) & bc]['depth'] for i in (1,2)]
depth = df[bc].groupby('pos')['depth'].sum()
depth_thresh = 100*(depth >= depth_lim).sum() / len(depth)
plot = lines.line(
xs, ys, colors=['blue', 'red'],
title="{}: {:.0f}X, {:.1f}% > {}X".format(
barcode, depth.mean(), depth_thresh, depth_lim),
height=200, width=400,
x_axis_label='position', y_axis_label='depth')
plots.append(plot)
plots = gridplot(plots, ncols=3)
aplanat.show(plots, background="#f4f4f4")
report.markdown("#### Genome coverage", key="coverage_header")
report.plot(plots, "coverage_plots")
It can also be interesting to examine the basecall quality for the different samples and primer pools. This can indicate potential problems in the sequencing library preparation.
# Show basecall accuracies (press play)
plots = list()
df = stats_summary
for barcode in valid_barcodes:
pset = df['primer_set']
bc = df['barcode'] == barcode
try:
xs, ys = list(), list()
for i in (1, 2):
d = df.loc[(pset == i) & bc]
if len(d) == 0:
!cecho warning "Warning: No basecalls for $barcode and primer set $i"
# aplanat can't currently plot empty data
x, y = [99.9, 100], [0, 0.0001]
else:
x, y = util.kernel_density_estimate(d['acc'])
xs.append(x)
ys.append(y)
except:
pass
plot = lines.line(
list(xs), list(ys), colors=['blue', 'red'],
title=barcode,
height=200, width=400,
x_axis_label='read accuracy', y_axis_label='density')
plots.append(plot)
plots = gridplot(plots, ncols=3)
aplanat.show(plots, background="#f4f4f4")
report.markdown("#### Read accuracy", key="accuracy_header")
report.plot(plots, "accuracy_plots")