This tutorial is a short demonstration of how to perform small variant calling in viral and bacterial samples (or, more generally, any haploid, ploidy-1 sample).
Methods used in this tutorial include:

- medaka for variant calling and annotation.
- pomoxis for basic data QC.
- pysam for iterating through VCF files.
- pandas for manipulating VCF files as a table.
- bcftools for filtering VCF files on the command-line.
⚠️ Warning: This notebook has been saved with its outputs for demonstration purposes. It is recommended to select Edit > Clear all outputs before using the notebook to analyse your own data.
This tutorial aims to demonstrate how medaka can be used to detect small variants in a haploid sample. Medaka was not explicitly developed for this task, but variant calling is so closely related to calculating a consensus that medaka is ideally suited to it.
The goals of this tutorial include:

- Using medaka to call small variants in a haploid sample.
- Using medaka's ancillary tools to annotate variants with evidence statistics.

The tutorial includes a small sample dataset from a B. subtilis sample. The workflow demonstrated below has been used successfully in the ARTIC bioinformatics pipeline for calling variants in SARS-CoV-2 samples.
Before anything else we will create and set a working directory:
from epi2melabs import ping
tutorial_name = "bacterial-snp"
pinger = ping.Pingu()
pinger.send_notebook_ping('start', tutorial_name)
# create a work directory and move into it
working_dir = '/epi2melabs/{}/'.format(tutorial_name)
!mkdir -p "$working_dir"
%cd "$working_dir"
In order to demonstrate the abilities of this notebook, a small sample dataset from a B. subtilis isolate is included together with a reference sequence.

To download the sample files we run the Linux command wget. To execute the command, click on the cell and then press Command/Ctrl-Enter, or click the Play symbol to the left-hand side.
bucket = "ont-exd-int-s3-euwst1-epi2me-labs"
domain = "s3-eu-west-1.amazonaws.com"
site = "https://{}.{}".format(bucket, domain)
!mkdir -p sample_data
!cd sample_data && wget $site/bacterial_snp_tutorial/bsubtilis.reads.fasta.gz
!cd sample_data && wget $site/bacterial_snp_tutorial/bsubtilis.ref.fasta
--2021-10-12 11:39:15--  https://ont-exd-int-s3-euwst1-epi2me-labs.s3-eu-west-1.amazonaws.com/bacterial_snp_tutorial/bsubtilis.reads.fasta.gz
Resolving ont-exd-int-s3-euwst1-epi2me-labs.s3-eu-west-1.amazonaws.com (ont-exd-int-s3-euwst1-epi2me-labs.s3-eu-west-1.amazonaws.com)... 52.218.97.195
Connecting to ont-exd-int-s3-euwst1-epi2me-labs.s3-eu-west-1.amazonaws.com (ont-exd-int-s3-euwst1-epi2me-labs.s3-eu-west-1.amazonaws.com)|52.218.97.195|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 388972323 (371M) [binary/octet-stream]
Saving to: ‘bsubtilis.reads.fasta.gz’
bsubtilis.reads.fas  79%[==============>     ]  294.97M  12.3MB/s  eta 8s
If you wish to analyse your own data rather than the sample data, you can change the values entered in the input form below. To find the correct full path of a directory, navigate to it in the Files browser on the left-hand side, right-click on the file or folder and select Copy path.

The location shared with the EPI2ME Labs server from your computer will show as /epi2melabs. For example, when it is the /data folder that is shared, a file located at /data/my_gridion_run/fastq_pass on your computer will appear as /epi2melabs/my_gridion_run/fastq_pass within the notebook server.
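The mapping is mechanical; as a purely illustrative sketch (the helper below is hypothetical and not used elsewhere in this notebook), it can be written as a small Python function, assuming /data is the shared folder:

import os

def to_server_path(local_path, shared_root="/data", mount_point="/epi2melabs"):
    # Hypothetical helper, for illustration only: everything below the
    # folder shared with EPI2ME Labs appears under the server mount point.
    rel = os.path.relpath(local_path, shared_root)
    return os.path.join(mount_point, rel)

print(to_server_path("/data/my_gridion_run/fastq_pass"))
# prints: /epi2melabs/my_gridion_run/fastq_pass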
Having downloaded the sample data, or located your own data in the file browser, we need to provide the filepaths as input to the notebook. Enter the paths to your input files and output folder here. The input data may be either a single fastq file (compressed or otherwise) or a directory containing multiple such files:
import aplanat
import aplanat.graphics
import aplanat.report
import multiprocessing
import pandas as pd
import os
import ipywidgets as widgets
from epi2melabs.notebook import InputForm, InputSpec
input_fastq = None
output_folder = None
reference_genome = None
medaka_model = None
threads = None
exec_summary = None
report = None
!run medaka tools list_models > medaka_models.txt
model_keys = {'Available', 'Default variant'}
models = dict()
with open("medaka_models.txt") as fh:
    for line in fh:
        # lines have the form "<key>: model1, model2, ..."
        key, _, items = line.partition(":")
        key = key.strip()
        if key in model_keys:
            mods = [m.strip() for m in items.split(",")]
            models[key] = [m for m in mods if 'variant' in m]
def process_inputs(inputs):
global input_fastq, output_folder, reference_genome, medaka_model, \
threads, exec_summary, report
output_folder = inputs.output_folder
reference_genome = inputs.reference_genome
medaka_model = inputs.medaka_model
threads = inputs.threads
input_data = os.path.abspath(inputs.input_data)
input_fastq = os.path.join(output_folder, os.path.basename(input_data))
# run a command to concatenate all the files together
!echo "Making output folder"
!mkdir -p "$output_folder"
    !test -f "$reference_genome" || echo "WARNING: Reference file does not exist"
!rationalize_fastq -i "$input_data" -o "$input_fastq"
exec_summary = aplanat.graphics.InfoGraphItems()
report = aplanat.report.HTMLReport(
"Haploid Small Variant Calling", "EPI2ME Labs Summary")
report.plot(None, key='exec-plot') # placeholder
input_form = InputForm(
InputSpec('input_data', 'Input reads', 'sample_data/bsubtilis.reads.fasta.gz'),
InputSpec('output_folder', 'Output folder', 'analysis'),
InputSpec('reference_genome', 'Reference .fasta', 'sample_data/bsubtilis.ref.fasta'),
InputSpec(
'medaka_model', 'Medaka model name',
widgets.Dropdown(value=models['Default variant'][0], options=models['Available'])),
InputSpec('threads', 'Compute threads', (1, multiprocessing.cpu_count()))
)
input_form.add_process_button(process_inputs)
input_form.display()
To verify that the data is suitable for performing variant calling, we can align the reads to the reference sequence to check the accuracy of the reads and their coverage of the genome:
# Calculating read accuracy and genome coverage (click play)
# run alignments
!cecho ok "Running alignments"
data_prefix = os.path.join(output_folder, "reads2ref")
!mini_align -t $threads \
-r $reference_genome -i $input_fastq \
-p "$data_prefix"
# calculate read stats
!cecho ok "Creating alignment statistics"
!stats_from_bam "$data_prefix".bam > "$data_prefix".bam.stats
# calculate depth
!cecho pl "Calculating depth"
!mosdepth -t 2 --flag 1812 "$data_prefix".fwd "$data_prefix".bam
!mosdepth -t 2 --flag 1796 --include-flag 16 "$data_prefix".rev "$data_prefix".bam
# show regions in bam
print("Available regions:")
with open(reference_genome + '.fai') as fh:
for line in (x.split() for x in fh.readlines()):
print(' {}:0-{}'.format(*line[:2]))
Running alignments
Constructing minimap index.
[M::mm_idx_gen::0.151*1.14] collected minimizers
[M::mm_idx_gen::0.199*1.54] sorted minimizers
[M::main::0.248*1.44] loaded/built the index for 2 target sequence(s)
[M::mm_idx_stat] kmer size: 15; skip: 10; is_hpc: 0; #seq: 2
[M::mm_idx_stat::0.256*1.42] distinct minimizers: 737336 (98.05% are singletons); average occurrences: 1.038; average spacing: 5.345
[M::main] Version: 2.17-r941
[M::main] CMD: minimap2 -I 16G -x map-ont -d sample_data/bsubtilis.ref.fasta.mmi sample_data/bsubtilis.ref.fasta
[M::main] Real time: 0.262 sec; CPU: 0.370 sec; Peak RSS: 0.041 GB
[M::main::0.054*1.08] loaded/built the index for 2 target sequence(s)
[M::mm_mapopt_update::0.064*1.07] mid_occ = 11
[M::mm_idx_stat] kmer size: 15; skip: 10; is_hpc: 0; #seq: 2
[M::mm_idx_stat::0.071*1.06] distinct minimizers: 737336 (98.05% are singletons); average occurrences: 1.038; average spacing: 5.345
[M::worker_pipeline::17.705*15.42] mapped 150251 sequences
[M::main] Version: 2.17-r941
[M::main] CMD: minimap2 -x map-ont -t 40 -a -A 2 -B 4 -O 4,24 -E 2,1 sample_data/bsubtilis.ref.fasta.mmi analysis/basecalls.fastq.gz
[M::main] Real time: 17.726 sec; CPU: 272.960 sec; Peak RSS: 1.763 GB
[bam_sort_core] merging from 0 files and 40 in-memory blocks...
Creating alignment statistics
Mapped/Unmapped/Short/Masked/Skipped(all matches masked): 150250/0/0/0/0
Calculating depth
Available regions:
  bsubtilis1:0-4065191
  bsubtilis2:0-25698
The results of the above alignments and analysis can be visualised using the code cell below.
# Plotting read alignment statistics (click play)
from aplanat import annot, hist, lines, points, util
from bokeh.layouts import gridplot
import numpy as np
def down_sample(df, plot_limit=10000):
# if we have a lot of points, remove some to avoid bokeh dying.
if len(df) > plot_limit:
display("Warning: Downsampling points to {} entries. Select a smaller region to show all points.".format(plot_limit))
df = df[::len(df) // plot_limit]
else:
display("Showing all sites.")
return df
def read_depth(fname, chrom, start, stop):
df = pd.read_csv(
fname, sep='\t',
names=['ref', 'start', 'end', 'depth'])
df = df.loc[
(df['ref'] == chrom) & (df['start'] > start) & (df['end'] < stop)]
df = down_sample(df).reset_index()
return df
region = None
def set_region(inputs):
global region
region = inputs.region
    try:
        chrom, coords = region.split(":")
        start, stop = (int(x) for x in coords.split("-"))
    except Exception:
        print('Cannot parse region as "chrom:start-stop".')
        return
depth_fwd = read_depth('{}.fwd.per-base.bed.gz'.format(data_prefix), chrom, start, stop)
depth_rev = read_depth('{}.rev.per-base.bed.gz'.format(data_prefix), chrom, start, stop)
depth_plot = lines.steps(
[depth_fwd['start'], depth_rev['start']],
[depth_fwd['depth'], depth_rev['depth']],
colors=['darkolivegreen', 'maroon'],
names=['Forward', 'Reverse'],
x_axis_label='Position along reference',
y_axis_label='sequencing depth / bases')
depth_plot.xaxis.formatter.use_scientific = False
stats = pd.read_csv("{}.bam.stats".format(data_prefix), sep='\t')
x_grid, pdf = util.kernel_density_estimate(stats['acc'])
mode = x_grid[np.argmax(pdf)]
acc_plot = lines.line(
[x_grid], [pdf],
xlim=(90,100),
x_axis_label="read accuracy", y_axis_label="read density")
acc_plot = annot.marker_vline(acc_plot, mode, label='mode: {:.1f}'.format(mode))
plot = gridplot([acc_plot, depth_plot], ncols=2)
aplanat.show(plot, background="#f4f4f4")
report.markdown(
"Read metrics with respect to the provided reference are shown below.",
key="read-stats-lead")
report.plot(plot, key='read-plots')
region_select = InputForm(
InputSpec('region', 'Region', 'bsubtilis1:0-4065191'))
region_select.add_process_button(set_region)
region_select.display()
The left-hand plot depicts read accuracy and should contain a peak at or above 93%. The right-hand coverage plot will reveal regions of low coverage. In such regions variant calling may prove more difficult (but not impossible); this should be taken into account when evaluating variants in the next section.
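To attach a number to the coverage plot, the following sketch (an illustrative addition rather than part of the original workflow, reusing the pandas import and data_prefix variable from above, with an arbitrary 20x threshold) estimates the fraction of forward-strand positions falling below a chosen depth:

# Illustrative only: fraction of reference positions below a depth
# threshold, from the mosdepth forward-strand per-base BED created above.
min_depth = 20  # arbitrary threshold for this example
bed_cols = ['ref', 'start', 'end', 'depth']
fwd = pd.read_csv(
    '{}.fwd.per-base.bed.gz'.format(data_prefix), sep='\t', names=bed_cols)
low = fwd.loc[fwd['depth'] < min_depth]
low_bases = (low['end'] - low['start']).sum()
total_bases = (fwd['end'] - fwd['start']).sum()
print("Forward-strand positions below {}x: {:.1%}".format(
    min_depth, low_bases / total_bases))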
The section above provides a basic QC of the input data: if the reads appear to be of good quality with respect to the reference sequence, and there is good coverage of reads across the reference, it is time to perform variant calling.
Medaka contains a specialised program to perform variant calling for ploidy-1 samples: medaka_haploid_variant. This program uses a combination of racon and medaka's RNN consensus network to generate a consensus sequence. The output sequence is then compared to the reference sequence in order to generate small variant calls in the form of a VCF file. To further annotate the called variants with read support information, the medaka tools annotate program can be used.
!cecho ok "Running variant calling"
!rm -rf $output_folder/variant/
!run medaka_haploid_variant -m $medaka_model -t $threads \
-o $output_folder/variant/ -i $input_fastq -r $reference_genome
Calling variants is as simple as that! The above code block will have ultimately produced a VCF file annotated with read support information. These annotations are calculated by realigning all reads in the vicinity of a variant locus and determining which alternative sequence each read best supports.
The outcome of the read support annotations may prove counter-intuitive. The neural network used within Medaka can be considered as encoding knowledge (prior beliefs) on the expected base distributions for alternative sequences. In the presence of errorful reads the neural network provides additional power over naively counting bases. Users are advised not to trivially dismiss calls based on incongruent read counts.
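To make this concrete, the minimal sketch below (an illustration, not part of the original workflow) reads the annotated VCF produced above and compares medaka's quality score with a naive support fraction derived from the SR annotation (spanning reads by strand best supporting the reference and alternative alleles):

import itertools
import os
from pysam import VariantFile

# Illustrative only: naive alternative-allele support from the SR field.
# SR is ordered (ref fwd, ref rev, alt1 fwd, alt1 rev, ...).
vcf_path = os.path.join(output_folder, "variant", "medaka.annotated.vcf")
for rec in itertools.islice(VariantFile(vcf_path).fetch(), 5):
    sr = rec.info.get('SR')
    if not sr or sum(sr) == 0:
        continue
    alt_support = sum(sr[2:]) / sum(sr)
    print("{}:{} qual={:.1f} alt read support={:.0%}".format(
        rec.chrom, rec.pos, rec.qual, alt_support))

A call with modest raw read support but a high quality score is exactly the situation described above, where the network's learned expectations outweigh naive counting.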
In the next section we will review the produced VCF to highlight aspects of it a user may wish to explore further.

The section above has produced a set of (small) variants, annotated them, and output a VCF file. This section aims to provide a gentle introduction to the VCF format, show how to read and manipulate such files in Python, and introduce the standard bcftools suite of programs.
There are a variety of Python libraries that aim to provide reasonably intuitive access to the data contained within a VCF file. Here we will briefly introduce the specialist pysam.VariantFile parser before using pandas to manipulate the file as a basic tabular format.

To iterate through the first few records in a VCF using pysam and print the reference name, the reference position and the mutation implied by each variant record:
import itertools
from pysam import VariantFile
vcf_file = os.path.join(output_folder, "variant", "medaka.annotated.vcf")
vcf = VariantFile(vcf_file)
for variant in itertools.islice(vcf.fetch(), 10):
print(variant.chrom, variant.pos, variant.ref, ">", variant.alts[0])
bsubtilis1 465 T > C
bsubtilis1 481 T > TC
bsubtilis1 483 G > GA
bsubtilis1 486 T > C
bsubtilis1 499 G > GCA
bsubtilis1 502 T > A
bsubtilis1 516 C > A
bsubtilis1 524 AT > CA
bsubtilis1 528 C > A
bsubtilis1 532 AC > TT
Note how the non-reference, alternative sequence includes surrounding bases in the case of insertion and deletion variants.
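To make the convention explicit, this short sketch (illustrative only, reusing the vcf_file path from the cell above) classifies each record by comparing allele lengths:

from pysam import VariantFile
import itertools

# Indel records carry a shared leading (padding) base, so comparing
# allele lengths distinguishes substitutions, insertions and deletions.
for variant in itertools.islice(VariantFile(vcf_file).fetch(), 10):
    ref, alt = variant.ref, variant.alts[0]
    if len(ref) == len(alt):
        vtype = 'sub'
    elif len(ref) > len(alt):
        vtype = 'del'
    else:
        vtype = 'ins'
    print(variant.chrom, variant.pos, ref, ">", alt, vtype)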
In order to filter variants to those for which the calling algorithm (in this case medaka) is more confident, it is useful to examine the QUAL and INFO fields of the variant records. The INFO field is a set of key=value pairs, with a description of each key contained within the header section of the file:
NOTE: The information contained within the INFO section is freely defined by the program producing the VCF file, and so can vary.
# show the header information for the info fields
!grep "##INFO" $vcf_file
# show the data
for variant in itertools.islice(vcf.fetch(), 10):
    info = ' '.join(f'{k}={v}' for k, v in variant.info.items())
print(variant.chrom, variant.pos, variant.qual, info)
##INFO=<ID=DP,Number=1,Type=Integer,Description="Depth of reads at pos">
##INFO=<ID=DPS,Number=2,Type=Integer,Description="Depth of reads at pos by strand (fwd, rev)">
##INFO=<ID=DPSP,Number=1,Type=Integer,Description="Depth of reads spanning pos +-25">
##INFO=<ID=SR,Number=.,Type=Integer,Description="Depth of spanning reads by strand which best align to each allele (ref fwd, ref rev, alt1 fwd, alt1 rev, etc.)">
##INFO=<ID=AR,Number=2,Type=Integer,Description="Depth of ambiguous spanning reads by strand which align equally well to all alleles (fwd, rev)">
##INFO=<ID=SC,Number=.,Type=Integer,Description="Total alignment score to each allele of spanning reads by strand (ref fwd, ref rev, alt1 fwd, alt1 rev, etc.) aligned with parasail: match 5, mismatch -4, open 5, extend 3">
bsubtilis1 465 0.9470000267028809 DP=1 DPS=(0, 1)
bsubtilis1 481 21.972999572753906 DP=1 DPS=(0, 1)
bsubtilis1 483 22.60700035095215 DP=1 DPS=(0, 1)
bsubtilis1 486 27.981000900268555 DP=1 DPS=(0, 1)
bsubtilis1 499 49.77799987792969 DP=1 DPS=(0, 1)
bsubtilis1 502 2.2079999446868896 DP=1 DPS=(0, 1)
bsubtilis1 516 23.27899932861328 DP=1 DPS=(0, 1)
bsubtilis1 524 40.37699890136719 DP=1 DPS=(0, 1)
bsubtilis1 528 0.515999972820282 DP=1 DPS=(0, 1)
bsubtilis1 532 3.3929998874664307 DP=1 DPS=(0, 1)
We could continue to manipulate the medaka VCF file using pysam, but for interaction with the wider Python ecosystem it can be easier to simply treat the VCF file as standard tabular data using pandas. For example, the code below will parse the VCF in a way that allows easy plotting:
# Parsing and plotting VCF data with pandas (click play)
import aplanat
from aplanat import hist
def parse_vcf(fname, info_cols=None):
"""Parse a VCF file. The INFO column is parsed to a dictionary.
:param info_cols: dict of field:dtype for INFO fields to store
as distinct column.
"""
header = "CHROM POS ID REF ALT QUAL FILTER INFO FORMAT GT".split()
vcf = pd.read_csv(fname, delimiter='\t', comment='#', names=header)
vcf['INFO'] = vcf['INFO'].str.split(";").apply(lambda x: dict([y.split("=") for y in x]))
rlen = vcf['REF'].apply(len)
alen = vcf['ALT'].apply(len)
vcf['type'] = 'sub'
vcf.loc[rlen > alen, 'type'] = 'del'
vcf.loc[rlen < alen, 'type'] = 'ins'
if info_cols is not None:
for field, dtype in info_cols.items():
vcf[field] = vcf['INFO'].apply(lambda x: x.get(field, None))
vcf[field] = vcf[field].astype(dtype)
return vcf
vcf = parse_vcf(vcf_file, info_cols={'DP':int})
types = ['sub', 'ins', 'del']
plot = hist.histogram(
    [vcf.loc[(vcf['type'] == x) & (vcf['DP'] > 10), 'DP'] for x in types],
    colors=['red', 'green', 'blue'], names=types,
    x_axis_label='sequencing depth', y_axis_label='number of variants',
    title='Sequencing depth by variant type')
aplanat.show(plot, background="#f4f4f4")
Above we have shown two Python-based methods for interacting with VCF files. The first of these is useful for inspecting each variant in turn, whereas the second is useful for generic programming and creating views across variants.
Finally we will introduce bcftools, a set of command-line tools for manipulating VCF files and their binary cousins, BCF files. Perhaps the most useful part of bcftools with which to get started is the bcftools query command. The query command can be used to format a VCF in a user-defined fashion, whilst performing filtering.
To create a tab-separated output for variants with sequencing depth greater than 75 reads:
!bcftools query \
--format '%CHROM\t%REF\t%FIRST_ALT\t%QUAL\t%TYPE\t%DP\n' \
-i 'DP>75' \
"$vcf_file" | head
bsubtilis1 A AG 13.307 INDEL 101
bsubtilis1 GT G 3.674 INDEL 100
bsubtilis1 TA T 7.577 INDEL 100
bsubtilis1 TA T 12.622 INDEL 100
bsubtilis1 CT C 5.725 INDEL 100
bsubtilis1 GA G 2.551 INDEL 100
bsubtilis1 GA G 10.188 INDEL 103
bsubtilis1 TA T 14.793 INDEL 100
bsubtilis1 GA G 10.679 INDEL 104
bsubtilis1 TA T 2.51 INDEL 100
This is just one small example. The expressions section of the bcftools documentation is a useful resource for understanding how to query and filter VCF files.
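As one further example (the thresholds here are illustrative choices, not recommendations), the same expression syntax works with bcftools view to write a filtered, compressed VCF for downstream use:

# Keep only calls passing example quality and depth thresholds, writing
# a bgzip-compressed VCF and an index for downstream tools.
!bcftools view -i 'QUAL>20 && DP>50' -O z -o "$output_folder/medaka.filtered.vcf.gz" "$vcf_file"
!bcftools index "$output_folder/medaka.filtered.vcf.gz"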
In this notebook we have demonstrated some key methods for performing variant calling and inspection for haploid samples. Medaka was used to perform the variant calling whilst we illustrated several tools for manipulating the resulting VCF file.
The analysis presented can be run on any dataset from an Oxford Nanopore Technologies' device. The code will run within the EPI2ME Labs notebook server environment.