Human Variant Calling with Medaka

The tutorial is intended as an introduction to single nucleotide polymorphism (SNP) calling with medaka for targeted sequencing. For calling variants with whole genome sequencing please refer to the medaka documentation.

Computational requirements for this tutorial include:

⚠️ Warning: This notebook has been saved with its outputs for demonstration purposes. It is recommended to select Edit > Clear all outputs before using the notebook to analyse your own data.

Introduction

Medaka was designed to call small variants (substitution, deletion, and insertion events) in whole genome sequencing. This tutorial aims to demonstrate how to use Medaka to call substitution variants in human targeted sequencing. For completeness we will also attempt to call insertion and deletion (indel) variants, although as we will see this stretches the methodology employed by Medaka when used with targeted, and particularly amplicon, data.

The goals of this tutorial include:

The tutorial is supplied with a sample dataset containing reads spanning multiple targets from a human cell line.

Getting started

The workflow below requires a single folder containing .fastq files from an Oxford Nanopore Technologies' sequencing device, or a single such file. Compressed or uncompressed files may be used. In addition, a reference sequence .fasta file and a .bed file describing target regions within the reference sequence are required.

Before anything else we will create and set a working directory:
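
A minimal sketch of such a cell is given below; the directory name is an arbitrary choice:

```python
# create a working directory and move into it
# (the folder name "medaka_tutorial" is an arbitrary choice)
!mkdir -p medaka_tutorial
%cd medaka_tutorial
```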

Sample Data

To demonstrate the workflow below, a sample dataset is included with this tutorial. The dataset comprises a manageable set of reads spanning various targets from an experiment using the Oxford Nanopore Technologies' targeted sequencing protocol. We will also download a .bed file describing the locations of the targets in a tabular text format.

To download the sample files we run the Linux command wget. To execute the command click on the cell and then press Command/Ctrl-Enter, or click the Play symbol to the left-hand side.
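
A sketch of the download cell is shown below. The URLs are placeholders standing in for the links supplied with the tutorial:

```python
# download the sample reads and the target regions .bed file
# (placeholder URLs: substitute the links supplied with the tutorial)
!wget -O sample_reads.fastq.gz https://example.com/sample_reads.fastq.gz
!wget -O targets.bed https://example.com/targets.bed
```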

To view the outcome of the download we can use the tree command to show the contents of the working directory:
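
A one-line cell suffices:

```python
# display the contents of the working directory
!tree .
```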

The files should also appear in the File Browser to the left-hand side of the screen.

For our analysis we will also need the human genome reference:
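
As a sketch, the file can be fetched from NCBI (the URL below reflects the path at the time of writing; verify it is still current):

```python
# download the GRCh38 no-alt analysis set from NCBI
# (URL assumed current at the time of writing)
!wget ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.15_GRCh38/seqs_for_alignment_pipelines.ucsc_ids/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gz
```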

The reference contained within the above download is the same as that which can be obtained here; the reasons for using this particular file are presented in Heng Li's blog post.

Using your own data

If you wish to analyse your own data rather than the sample data, you can edit the value of the input_file variable below. To find the correct full path of a file you can navigate to it in the Files browser to the left-hand side, right-click on the file and select Copy path:

The location shared from your computer with the EPI2ME Labs server will show as /epi2melabs; for example, when the /data folder is shared, a file located at /data/my_gridion_run/fastq_pass on your computer will appear as /epi2melabs/my_gridion_run/fastq_pass.

Data entry

Having downloaded the sample data, or located your own data in the file browser, we need to provide the filepaths as input to the notebook in the form below:
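
A sketch of the corresponding inputs is below; input_file is named in the text above, while the other two variable names are illustrative:

```python
# filepaths consumed by the rest of the notebook
input_file = "sample_reads.fastq.gz"  # a .fastq file, or a folder of .fastq files
# the following two variable names are illustrative
reference_file = "GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gz"
target_bed = "targets.bed"            # target regions within the reference
```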

Alignment of reads

Our first task in analysing our data is to align the sequence reads to the reference sequence. We do this with the mini_align program from the pomoxis package, which is preinstalled in the EPI2ME Labs notebook server. Note that this command may take a while to run depending on the number of reads in your dataset. With the sample data (8000 reads) and using 4 compute threads (-t 4 in the code), the alignment will take around 5 minutes.
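
A sketch of the alignment command, assuming the filepaths set above (see mini_align -h for the full option list):

```python
# align reads to the reference; writes aligned.bam and its index
!mini_align -i $input_file -r $reference_file -p aligned -t 4
```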

While we are here, we will also calculate a table summarising the alignment data, again using a program from pomoxis:
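
The program in question is stats_from_bam; a minimal invocation might be:

```python
# tabulate per-read alignment statistics from the .bam file
!stats_from_bam aligned.bam > alignment_stats.txt
```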

The summary file gives useful information on the alignment of each read to the reference sequence, including: chromosome, start and end coordinates, and the accuracy of the read with respect to the reference. We can plot a histogram of the latter quantity:
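
A sketch of such a plot, assuming the accuracy column of the summary is named acc:

```python
import pandas as pd
import matplotlib.pyplot as plt

# stats_from_bam writes a tab-separated table; the accuracy column
# is assumed here to be named 'acc'
stats = pd.read_csv("alignment_stats.txt", sep="\t")
stats["acc"].plot.hist(bins=50)
plt.xlabel("read accuracy (%)")
plt.ylabel("number of reads")
plt.show()
```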

The alignment summary can also be used to count reads overlapping the target regions in the input .bed file. To do this we will use the pyranges Python package. This analysis provides a quick QC step before proceeding to SNP calling.
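
A sketch of the counting step; the ref, rstart, and rend column names are assumptions about the summary table:

```python
import pandas as pd
import pyranges as pr

# load the target regions and build a PyRanges of read alignments
targets = pr.read_bed("targets.bed")
stats = pd.read_csv("alignment_stats.txt", sep="\t")
reads = pr.PyRanges(pd.DataFrame({
    "Chromosome": stats["ref"],    # column names assumed from
    "Start": stats["rstart"],      # the stats_from_bam output
    "End": stats["rend"]}))
# annotate each target with the number of overlapping reads
counts = targets.count_overlaps(reads)
print(counts.df)
```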

SNP Calling

Medaka calculates variants from read alignments stored within a .bam file. The workflow internal to Medaka was designed with whole genome sequencing in mind; it can, however, be coerced into effectively calling SNP variants from targeted and amplicon datasets.

Medaka's workflow

The workflow used within Medaka to call variants in human whole genome sequencing is outlined in the documentation. Variant calling is a three-step process:

  1. call homozygous and heterozygous SNPs,
  2. use the information contained within the heterozygous SNPs to haplotype the reads,
  3. create haplotype consensus sequences for the two haplotypes and recombine the data into a single set of SNP and INDEL variants.

Examining the workflow we identify a possible problem for the calling of generic variants: the second and third steps rely on finding a sufficient number of heterozygous SNPs within each target region. For whole genome sequencing with long reads this is not an issue, but for short targets or amplicons containing few or no heterozygous SNPs the reads cannot be partitioned into their corresponding haplotypes. For reference, the average distance between heterozygous variants is ~630 bases in the GIAB HG002 sample:
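
The figure can be reproduced along the following lines from the GIAB truth set; the VCF filename here is hypothetical, and pysam is assumed to be available:

```python
import numpy as np
import pysam

# hypothetical local copy of the GIAB HG002 truth-set VCF
vcf = pysam.VariantFile("HG002_GRCh38_benchmark.vcf.gz")
last, dists = {}, []
for rec in vcf:
    gt = rec.samples[0]["GT"]
    # keep heterozygous calls only
    if gt is None or len(set(gt)) < 2:
        continue
    if rec.chrom in last:
        dists.append(rec.pos - last[rec.chrom])
    last[rec.chrom] = rec.pos
print("mean distance between heterozygous variants: {:.0f} bases".format(np.mean(dists)))
```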

Coverage requirements

An additional consideration is coverage: since Medaka is designed to work with whole genome sequencing datasets, the genome coverage it expects is rather low compared to what might be obtained in a targeted sequencing experiment. Medaka's variant calling is optimised to work with reference coverage in the range of 30-100 fold. If a dataset contains more data than this it is recommended to first downsample the data to 80X.

Various tools can downsample a dataset; pomoxis includes a tool for doing just this whilst attempting to normalise coverage:
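
The tool in question is subsample_bam; a minimal sketch is below (the flags are from memory, so check subsample_bam --help for your version):

```python
# downsample to ~80-fold coverage while attempting to even out depth
# (flags may differ between pomoxis versions)
!subsample_bam aligned.bam 80 -o downsampled -t 4
```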

A simpler approach would be to use samtools view with its -s option.
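
For example, the cell below keeps a fixed fraction of reads; the fraction is illustrative and should be chosen to reach your target depth:

```python
# -s takes SEED.FRACTION: seed 42, keep ~10% of reads
!samtools view -b -s 42.1 -o downsampled.bam aligned.bam
!samtools index downsampled.bam
```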

Calling variants

With the above considerations in mind we will now use Medaka to call variants.

Medaka contains a single program to perform all the necessary steps to produce a Variant Call Format (.vcf) file from a .bam file. To calculate variants efficiently we will explicitly give the regions of interest to medaka:
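
A sketch of this step, looping over the entries of the .bed file (the medaka_variant flags follow medaka ~1.x; check medaka_variant -h for your version):

```python
import pandas as pd

# run medaka_variant once per target region
bed = pd.read_csv("targets.bed", sep="\t", header=None,
                  usecols=[0, 1, 2], names=["chrom", "start", "end"])
for _, row in bed.iterrows():
    region = "{}:{}-{}".format(row.chrom, row.start, row.end)
    outdir = "medaka_{}_{}_{}".format(row.chrom, row.start, row.end)
    # flags assumed: -i alignments, -f reference, -r region, -o output folder
    !medaka_variant -i downsampled.bam -f $reference_file -r $region -o $outdir -t 4
```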

The above code will have created a separate output folder for each entry in the input .bed file. Each folder will contain something like:

Many of these files are intermediate files that are not useful to most users (they can be automatically removed by using the -d option to medaka_variant). The most interesting outputs are:

For longer target regions and amplicons the round_1.vcf file will be populated. For smaller regions, or regions of low heterozygosity, the haplotype partitioning step may have failed and only round_0_hap_mixed_unphased.vcf will contain useful data.

Annotating the variant calls

The step above calculated a .vcf file independently for each region of interest. In this step we will aggregate and annotate these files.

A usual step in analysing putative variants is to consider each variant in terms of the number of reads supporting the alternative and reference alleles. To enable this analysis Medaka has a dedicated tool which performs realignment of reads to determine which allele each read best supports.

The outcome of this analysis may prove counter-intuitive. The neural network used within Medaka can be considered as encoding knowledge (prior beliefs) on the expected base distributions for alternative sequences. In the presence of errorful reads the neural network provides additional power over naively counting bases. Users are advised not to trivially dismiss calls based on incongruent read counts.

To aggregate the independent .vcf files run the code cell below:
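
A sketch using bcftools together with medaka's annotation tool (the medaka tools annotate sub-command and its argument order follow medaka ~1.x; check medaka tools --help):

```python
# concatenate the per-region VCFs into one file
!ls medaka_*/round_1.vcf > vcf_list.txt
!bcftools concat -f vcf_list.txt -o combined.vcf
# annotate each record with read depth and allele support
# (argument order assumed: input VCF, reference, alignments, output VCF)
!medaka tools annotate combined.vcf $reference_file downsampled.bam annotated.vcf
```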

Examining the annotations

Having created our combined and annotated .vcf we will now highlight a few of its features in order to provide pointers for further filtering and analysis.

VCF files generically comprise a header section and a set of mandatory fields, together with an INFO field which may contain additional useful information. In common with many other tools, Medaka uses the INFO field to embed an additional data table. The annotation program above added the following to the INFO field (definitions of these can be found in the .vcf header information):

There are a variety of VCF parsers available in Python, but since the file is essentially a tab-separated table, here we simply use the pandas library to extract the INFO fields above into their own columns and then count variants according to the depth (DP) column and Medaka's in-built filtering:
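
A sketch of this manipulation; the depth threshold of 20 is illustrative:

```python
import pandas as pd

# read the VCF body, skipping header lines beginning with '#'
vcf = pd.read_csv("annotated.vcf", sep="\t", comment="#", header=None)
vcf = vcf.iloc[:, :8]
vcf.columns = ["chrom", "pos", "id", "ref", "alt", "qual", "filter", "info"]
# expand the key=value pairs of the INFO column into their own columns
info = vcf["info"].str.split(";").apply(
    lambda kvs: dict(kv.split("=", 1) for kv in kvs if "=" in kv))
vcf = vcf.join(pd.DataFrame(list(info)))
vcf["DP"] = pd.to_numeric(vcf["DP"], errors="coerce")
# count variants by medaka's filter status and an illustrative depth cut
print(vcf.groupby(["filter", vcf["DP"] >= 20]).size())
```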

This gives just a flavour of the information available in the INFO field. For more advanced filtering of variants, users are encouraged to read the bcftools query documentation and examples. For example, to perform a manipulation similar to the above Python snippet, outputting only variants with high quality and depth to a tab-separated text file:
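
A sketch of such a command; the quality and depth thresholds are illustrative:

```python
# select PASS variants above illustrative quality and depth thresholds,
# writing selected fields to a tab-separated file
!bcftools query -i 'FILTER="PASS" && QUAL>=20 && INFO/DP>=20' -f '%CHROM\t%POS\t%REF\t%ALT\t%QUAL\t%INFO/DP\n' annotated.vcf > filtered_variants.txt
```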

Headline numbers

As a final summary of the experiment the infographic below displays a small set of headline numbers captured through the course of the analysis:

Summary

This tutorial has stepped through the use of Medaka's variant calling functionality as applied to targeted sequencing experiments. We have discussed how Medaka performs variant calling and how that may affect the results. With the aid of an example dataset we have highlighted how to interpret the key outputs of the workflow.

The analysis presented can be run on any dataset from an Oxford Nanopore Technologies' device. The code will run within the EPI2ME Labs notebook server environment.