This tutorial provides an introduction to the detection and analysis of structural variation (SV) from Oxford Nanopore Technologies whole human genome sequencing experiments. The tutorial walks through the steps involved in SV detection, filtering and refinement. The workflow introduced uses a human reads dataset (.fastq file) as its starting point.
Computational requirements for this tutorial include:
⚠️ Warning: This notebook has been saved with its outputs for demonstration purposes. It is recommended to select Edit > Clear all outputs before using the notebook to analyse your own data.
Long read DNA sequences, when mapped to a reference genome, can be used for the discovery and characterisation of structural variation. This is achieved by investigating discordant mapping where regions of a single sequenced DNA molecule map to different regions of the genome. Software packages such as cuteSV identify these discordant regions and can be used to call insertions, deletions, duplications, and more.
This tutorial is based on the Oxford Nanopore Technologies wf-human-sv software, an automated end-to-end workflow available on GitHub. The analysis pipeline utilises the cuteSV software and may be used to identify high-confidence genomic insertion and deletion events in human datasets.
The tutorial is packaged with example data from the Genome in a Bottle project (GM24385). The analysis considers a collection of sequence reads that map to a 50 Mb region from Chromosome 20.
The workflow covered in this tutorial will:

- review the input reads in .fastq format in terms of read length and quality
- map the reads to the reference genome using LRA
- assess the resulting .bam file for qualitative characteristics including mapping quality, number of mapped reads and depth-of-coverage
- call structural variants using cuteSV
- filter and report the variants in a .vcf format file

The tutorial workflow will answer questions such as:
The structural variation tutorial requires long read whole genome human sequencing data in .fastq format. An example dataset is provided with this workflow.
Tip: To execute the commands click on the cell and then press Command/Ctrl-Enter, or click the Play symbol to the left-hand side.
Before anything else we will create and set a working directory:
from epi2melabs import ping
pinger = ping.Pingu()
pinger.send_notebook_ping('start', 'sv_tutorial')
# create a work directory and move into it
tutorial_name = "sv_tutorial"
working_dir = "/epi2melabs/{}/".format(tutorial_name)
!mkdir -p "$working_dir"
%cd "$working_dir"
/epi2melabs/sv_tutorial
This tutorial uses software packages that are not included in the default EPI2ME Labs server. Please note that the installed software is not persistent: this step will need to be re-run if you stop and restart the EPI2ME Labs server.
# Install additional packages via mamba (click play)
!mamba install lra cuteSV seqtk fastcat -yq
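If you would like to confirm the installation succeeded before continuing, the short optional cell below (a minimal sketch, not part of the original workflow) simply checks that the four executables are now on the PATH:
# Check the installed tools are available (optional, click play)
import shutil
for tool in ("lra", "cuteSV", "seqtk", "fastcat"):
    # any tool reported as MISSING means the mamba install cell above needs re-running
    print(f"{tool}: {'found' if shutil.which(tool) else 'MISSING'}")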
To demonstrate this structural variation workflow a small dataset is provided. This dataset corresponds to sequence reads from the GM24385 human reference material that map to the hg19 reference genome. To facilitate a quick analysis and speedy download, the dataset has been downsampled to a 50 Mb region of chromosome 20.
The code below will download the sample .fastq reads, the reference genome in .fasta format and a tabular .bed file for targeted calling. The wget command is used to download the files.
# Download sample data (click play)
import os
# download the data
location='s3://ont-exd-int-s3-euwst1-epi2me-labs/sv_tutorial'
s3_bucket = "ont-exd-int-s3-euwst1-epi2me-labs"
s3_prefix = "sv_tutorial/GM24385.nf7.chr20_af_minimap2.fastq.gz"
s3_reference = "sv_tutorial/chr20_human_g1k_v37.fasta.gz"
s3_bedfile = "sv_tutorial/target.bed"
s3_alignments = "sv_tutorial/GM24385.nf7.chr20_af_minimap2.bam"
reads = os.path.basename(s3_prefix)
bam = os.path.basename(s3_alignments)
reference = os.path.basename(s3_reference)
!echo "Downloading sample data"
!wget -O "$reads" https://"$s3_bucket".s3.amazonaws.com/"$s3_prefix"
!wget -O "$reference" https://"$s3_bucket".s3.amazonaws.com/"$s3_reference"
!wget -O "target.bed" https://"$s3_bucket".s3.amazonaws.com/"$s3_bedfile"
!wget -O "$bam" https://"$s3_bucket".s3.amazonaws.com/"$s3_alignments"
If you wish to analyse your own data rather than the sample data, you can edit the value of the fastq input variable below. To find the correct full path of a directory, navigate to it in the Files browser on the left-hand side, right-click on the file and select Copy path:
The location shared with the EPI2ME Labs server from your computer will show as /epi2melabs. For example, if the /data folder is shared, a file located at /data/my_gridion_run/fastq_pass on your computer will appear as /epi2melabs/my_gridion_run/fastq_pass.
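To confirm that the location you intend to use is visible to the notebook server, you can list the shared directory with a simple optional cell like the one below:
# List the shared data directory (optional, click play)
!ls -lh /epi2melabs/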
Having downloaded the sample data, or located your own data in the file browser, we need to provide the appropriate file locations as input to the notebook.
The main inputs are:

- fastq is the set of reads in .fastq format (gzip allowed).
- reference is the reference genome file in .fasta format (gzip allowed).
- target_bed is a .bed coordinate file that defines the genomic regions of interest. If you do not wish to filter by a region leave this field empty. See the appendix for more details.
- threads defines the number of compute threads that will be used by the processes that can multi-thread. Please do not use more threads than are available on your computer.

Finally, we can also tune the characteristics of the SV analysis using the following inputs:

- min_sv_length and max_sv_length define the lengths of the shortest and longest allowable structural variants respectively.
- min_read_length sets the minimum length of reads to be considered when discovering SVs.
- min_read_support determines the minimum number of reads that must support a potential SV for it to be called.
- min_read_mapping_quality sets the minimum average mapping quality of reads to be considered when discovering SVs.

The form below can also be edited to reflect different values for the above options, but sensible defaults are provided. Ensure that you click the form Enter button to set the variables for the rest of the workflow.
# Re-generate input form and global variables (click play)
import multiprocessing
import os
import aplanat
from aplanat import report, graphics
import ipywidgets as widgets
from epi2melabs.notebook import InputForm, InputSpec, cecho
# Inputs
fastq = None
reference = None
target_bed = None
# Parameters
threads = None
min_sv_length = None
max_sv_length = None
min_read_support = None
min_read_length = None
min_read_mapping_quality = None
# Intermediate results
aligned_bam = "aligned.bam"
calls_vcf = "calls.vcf"
filtered_vcf = "filtered.vcf"
sorted_filtered_vcf = "sorted.filtered.vcf"
sorted_filtered_vcf_gz = "sorted.filtered.vcf.gz"
depth_bedfile = 'depth.regions.bed.gz'
alignment_stats = "alignments.seqkit.tsv"
# Outputs
html_report = None
exec_summary = None
def process_form(inputs):
global fastq, reference, target_bed, threads, \
min_sv_length, max_sv_length, min_read_support, \
min_read_length, min_read_mapping_quality, \
html_report, exec_summary
fastq = inputs.fastq
reference = inputs.reference
target_bed = inputs.target_bed
threads = inputs.threads
min_sv_length = inputs.min_sv_length
max_sv_length = inputs.max_sv_length
min_read_support = inputs.min_read_support
min_read_length = inputs.min_read_length
min_read_mapping_quality = inputs.min_read_mapping_quality
if not os.path.isfile(fastq):
!cecho error "Fastq '{}' does not exist.".format(fastq)
if not os.path.isfile(reference):
!cecho error "Reference '{}' does not exist.".format(reference)
    if target_bed != "" and not os.path.isfile(target_bed):
!cecho error "Target bed file '{}' does not exist.".format(target_bed)
if min_sv_length < 0 or max_sv_length <= 0 or min_read_mapping_quality < 0:
        !cecho error "Invalid value(s) given for min_sv_length, max_sv_length, or min_read_mapping_quality."
html_report = report.HTMLReport(
"Structural Variation", "Summary of read alignments and SV discovery.",
require_keys=True)
html_report.markdown("## Executive Summary", "exec-title")
html_report.placeholder("exec-plot")
exec_summary = graphics.InfoGraphItems()
!cecho success "Input validation complete"
input_form = InputForm(
InputSpec('fastq', '.fastq reads file', 'GM24385.nf7.chr20_af_minimap2.fastq.gz'),
InputSpec('reference', '.fasta reference file', 'chr20_human_g1k_v37.fasta.gz'),
InputSpec('target_bed', '.bed target file', 'target.bed'),
InputSpec('threads', 'Compute threads', (1, multiprocessing.cpu_count(), 1)),
InputSpec('min_sv_length', 'Minimum SV length to report', widgets.IntText(30)),
InputSpec('max_sv_length', 'Maximum SV length to report', widgets.IntText(1000000)),
InputSpec('min_read_length', 'Minimum read length', widgets.IntText(1000)),
InputSpec('min_read_support', 'Minimum reads to support variant', widgets.IntText(3)),
InputSpec('min_read_mapping_quality', 'Minimum read mapping quality', widgets.IntText(20)),
description_width="200px")
input_form.add_process_button(process_form)
input_form.display()
Prior to beginning the analysis proper, it is always prudent to inspect the data we are going to use in our workflow, specifically casting our eye over read lengths and quality scores. We want to ascertain that these distributions look as we expect them to: specifically, that the read lengths are sufficient for SV calling (too short and the SV calling will be suboptimal) and that the quality of the reads is not significantly lower than expected (lower quality means a higher error rate within the reads, leading to a drop in call accuracy).
To perform these checks we will run seqkit stats and fastcat to generate a range of summary information, which can then be displayed in tabular format or plotted as distributions.
# Run seqkit stats and fastcat to summarise read stats and plot results (click play)
import pandas as pd
from bokeh.layouts import gridplot
from aplanat.components import fastcat
from bokeh.models import BasicTickFormatter, Range1d
# Seqkit
read_stats = "reads.seqkit.tsv"
cmd = f"seqkit stats -T -a -b -j {threads} {fastq} > {read_stats}"
!cecho ok "Running $cmd"
!$cmd
df_reads = pd.read_csv(read_stats, sep='\t')
display(df_reads)
# Fastcat
per_read_stats = 'per_read_stats.txt'
cmd = f"fastcat --read {per_read_stats} {fastq} > /dev/null"
!cecho ok "Running $cmd"
!$cmd
df_per_read = pd.read_csv(per_read_stats, sep='\t')[['read_id','filename','read_length','mean_quality']]
display(df_per_read.head(10))
# Plotting
read_qual = fastcat.read_quality_plot(df_per_read)
read_qual_median = df_per_read['mean_quality'].median()
read_length = fastcat.read_length_plot(df_per_read)
read_len_median = df_per_read['read_length'].median()
read_length.x_range = Range1d(0, 100000)
read_length.xaxis.formatter = BasicTickFormatter(use_scientific=False)
exec_summary.append(
"Median. read quality", read_qual_median, "thumbs-up")
exec_summary.append(
"Median. read length", read_len_median, "align-center", "b")
qc_plots = gridplot([read_length, read_qual], ncols=2)
aplanat.show(qc_plots, background="#f4f4f4")
html_report.markdown("""
#### Read quality control
The following plots give an overview of the dataset we are working with. For SV calling, a right-shifted
read-length distribution is preferable, allowing us to unpick larger variants. Quality scores are
mean-per-read and the peak should not be below 10.""",
"align-stat-head")
html_report.plot(qc_plots, "align-plot")
Running seqkit stats -T -a -b -j 2 GM24385.nf7.chr20_af_minimap2.fastq.gz > reads.seqkit.tsv
| | file | format | type | num_seqs | sum_len | min_len | avg_len | max_len | Q1 | Q2 | Q3 | sum_gap | N50 | Q20(%) | Q30(%) |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | GM24385.nf7.chr20_af_minimap2.fastq.gz | FASTQ | DNA | 88893 | 1668011592 | 152 | 18764.3 | 302406 | 2563 | 6922 | 21750 | 0 | 52493 | 63.68 | 45.43 |
Running fastcat --read per_read_stats.txt GM24385.nf7.chr20_af_minimap2.fastq.gz > /dev/null
| | read_id | filename | read_length | mean_quality |
|---|---|---|---|---|
| 0 | 0175f4dc-e60a-45fc-9160-52120d6acdec | GM24385.nf7.chr20_af_minimap2.fastq.gz | 109226 | 11.09 |
| 1 | 48e6444a-5b38-4d1e-923f-9c50db41db7c | GM24385.nf7.chr20_af_minimap2.fastq.gz | 49387 | 12.76 |
| 2 | 2d34865e-10bc-4043-bbc7-267be5861b4e | GM24385.nf7.chr20_af_minimap2.fastq.gz | 8513 | 12.37 |
| 3 | a9176777-5819-4eee-b426-8b210557c013 | GM24385.nf7.chr20_af_minimap2.fastq.gz | 36824 | 9.81 |
| 4 | 9270a16f-1325-43bb-a9ab-8633a5240718 | GM24385.nf7.chr20_af_minimap2.fastq.gz | 83950 | 12.77 |
| 5 | 566195ce-9f45-42ca-9eb8-6967c02a1a43 | GM24385.nf7.chr20_af_minimap2.fastq.gz | 22715 | 10.53 |
| 6 | 2b5fb7bc-c545-439c-a818-1a229c2d5e02 | GM24385.nf7.chr20_af_minimap2.fastq.gz | 72372 | 11.19 |
| 7 | b3b57ce7-7e99-4c2a-a96e-cf220168305e | GM24385.nf7.chr20_af_minimap2.fastq.gz | 108802 | 11.86 |
| 8 | afa6bbdc-6f3c-4d01-83c9-dbb0157bf6d7 | GM24385.nf7.chr20_af_minimap2.fastq.gz | 96831 | 7.75 |
| 9 | 647d69ea-fc98-4467-a2be-aa6decbb1e3b | GM24385.nf7.chr20_af_minimap2.fastq.gz | 110170 | 10.32 |
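As an optional back-of-the-envelope check (a sketch, not part of the original workflow), dividing the total sequenced yield reported by seqkit by the approximate size of the targeted region gives an expected mean depth of coverage; this number becomes relevant later when a depth-derived minimum read support is chosen.
# Rough coverage estimate from the seqkit summary (optional, click play)
# assumes the reads derive almost entirely from the ~50 Mb chromosome 20 region used in
# this tutorial; substitute the genome (or target region) size for your own data
total_bases = int(df_reads['sum_len'].iloc[0])
region_size = 50_000_000
print(f"Approximate mean depth: {total_bases / region_size:.1f}x")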
CuteSV is a reference-alignment-based SV caller, meaning the first step in using it to call SVs is to align the reads to the reference genome using an aligner such as minimap2 or LRA. LRA is the aligner used in the wf-human-sv workflow because it has been found to provide better results for human SV detection than minimap2, so we'll use the same approach here, although minimap2 remains a viable option.
When aligning reads to a reference, we tend to follow these steps:

- index the reference and align the reads against it
- convert and sort the .sam file from alignment to create a .bam file
- index the resulting .bam file

For a more complete understanding of what these terms mean, see the Introduction to SAM/BAM tutorial provided within EPI2ME Labs.
# Align reads to reference genome using LRA. Creates .bam file (click play)
cmd = f"""
lra index -ONT {reference} \
&& seqtk seq -A {fastq} \
| lra align -ONT -p s -t {threads} {reference} - \
| samtools calmd -u - {reference} \
| samtools sort -@ {threads} -o {aligned_bam} - \
&& samtools index -@ {threads} {aligned_bam}
""".strip('\n').replace(' ', '')
!cecho ok "Running $cmd"
!$cmd
Running lra index -ONT chr20_human_g1k_v37.fasta.gz && seqtk seq -A GM24385.nf7.chr20_af_minimap2.fastq.gz | lra align -ONT -p s -t 2 chr20_human_g1k_v37.fasta.gz - | samtools calmd -u - chr20_human_g1k_v37.fasta.gz | samtools sort -@ 2 -o aligned.bam - && samtools index -@ 2 aligned.bam
Before calling structural variants in our dataset it is worthwhile to review the key quality characteristics from our .bam file.

The seqkit bam method is used to prepare a collection of per read-alignment summary observations. These observations will be presented as figures and tables that describe the distribution of mapping qualities, depths of coverage and summarise the numbers of reads and sequence bases that map to the reference genome.
# Run seqkit bam to get alignment stats (click play)
cmd = f"seqkit bam {aligned_bam} 2> {alignment_stats}"
!cecho ok "Running $cmd"
!$cmd
df = pd.read_csv(alignment_stats, sep='\t')[[
'Read','Ref','Pos','EndPos','MapQual','Acc','ReadLen','RefAln',
'RefCov','ReadAln','ReadCov','Strand','LeftClip','RightClip',
'Flags','IsSec','IsSup']]
df.head()
Running seqkit bam aligned.bam 2> alignments.seqkit.tsv
| | Read | Ref | Pos | EndPos | MapQual | Acc | ReadLen | RefAln | RefCov | ReadAln | ReadCov | Strand | LeftClip | RightClip | Flags | IsSec | IsSup |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | be97f83b-3d51-4779-8e2b-bc92a878174e | 20 | 60001 | 64041 | 60 | 93.402 | 11785 | 4040 | 0.006 | 3940 | 33.432 | -1 | 7805 | 40 | 16 | 0 | 0 |
| 1 | 53ab8f9f-4767-474c-bb37-85cdc87d2d0b | 20 | 60001 | 90949 | 60 | 81.816 | 44454 | 30948 | 0.049 | 28407 | 63.902 | -1 | 16006 | 41 | 16 | 0 | 0 |
| 2 | 2d34865e-10bc-4043-bbc7-267be5861b4e | 20 | 60005 | 61073 | 60 | 92.216 | 8513 | 1068 | 0.002 | 1048 | 12.311 | 1 | 7445 | 20 | 0 | 0 | 0 |
| 3 | a9176777-5819-4eee-b426-8b210557c013 | 20 | 60005 | 85933 | 60 | 87.272 | 36824 | 25928 | 0.041 | 25278 | 68.645 | 1 | 11540 | 6 | 0 | 0 | 0 |
| 4 | 2b5fb7bc-c545-439c-a818-1a229c2d5e02 | 20 | 60005 | 130667 | 60 | 91.335 | 72372 | 70662 | 0.112 | 68723 | 94.958 | 1 | 3617 | 32 | 0 | 0 | 0 |
Using the output from seqkit we can derive some basic statistics concerning the alignments of reads to the reference sequence:
# Read summary plot code (click play)
import aplanat
from aplanat import hist, annot
from bokeh.layouts import gridplot
from aplanat.util import Colors
import numpy as np
import pandas as pd
import sys
pd.options.display.float_format = '{:.2f}'.format
def parse_seqkit(fname):
cols = {
'Read':str, 'MapQual':int, 'Acc':float, 'ReadLen':int,
'ReadAln':int, 'ReadCov':float, 'IsSec':bool, 'IsSup':bool}
df = pd.read_csv(fname, sep="\t", dtype=cols, usecols=cols.keys())
df['Clipped'] = df['ReadLen'] - df['ReadAln']
df['Type'] = 'Primary'
df.loc[df['IsSec'], 'Type'] = 'Secondary'
df.loc[df['IsSup'], 'Type'] = 'Supplementary'
return df
def summarize_seqkit(df):
grouped = df.groupby('Type').agg(**{
'Alignments': ('Read', 'count'),
'Unique Reads': ('Read', 'nunique'),
'Ave. read length': ('ReadLen', 'median'),
'Ave. mapping quality': ('MapQual', 'median'),
'Ave. align length': ('ReadAln', 'median'),
'Total aligned bases': ('ReadAln', 'sum'),
'Total clipped bases': ('Clipped', 'sum')})
for atype in ['Supplementary']:
if atype not in df['Type'].values:
atype_row = {
'Alignments': 0,
'Unique Reads': 0,
'Ave. read length': 0,
'Ave. mapping quality': 0,
'Ave. align length': 0,
'Total aligned bases': 0,
'Total clipped bases': 0
}
grouped = grouped.append(pd.Series(atype_row, name=atype))
grouped.loc[['Supplementary'], ['Ave. read length']] = "-"
return grouped.transpose()
seqkit_df = parse_seqkit(alignment_stats)
summary_table = summarize_seqkit(seqkit_df)
primarys = seqkit_df.loc[seqkit_df['Type'] == 'Primary']
p1 = hist.histogram(
[primarys['Acc']], xlim=(70, 101), binwidth=0.5,
colors=[Colors.cerulean], title="Alignment accuracy distribution",
x_axis_label='% alignment accuracy', y_axis_label='read count')
p2 = hist.histogram(
[primarys['ReadCov']], xlim=(90, 101), binwidth=0.2,
colors=[Colors.cerulean], title="Read containment distribution",
x_axis_label='% of read contained in alignment',
y_axis_label='read count')
alignment_plots = gridplot([p1, p2], ncols=2)
display(summary_table)
aplanat.show(alignment_plots, background="#f4f4f4")
prim = summary_table["Primary"]
exec_summary.append(
"Primary alignments", prim.loc["Alignments"], "signal")
# Add plot to the html report
html_report.markdown("""
### Read alignments.
Summary of read alignments to the reference genome. Note that read-based
statistics (e.g `Ave. read length`) are not reported for secondary and
supplementary alignments where they would otherwise overcount.""",
"align-head")
html_report.table(summary_table, index=True, key="align-table")
html_report.markdown("""
#### Primary alignment statistics
The following plots give an overall impression of alignment and thereby read quality. Accuracy should show a strong peak
above 93% with alignment containment above 98%.""",
"align-stat-head")
html_report.plot(alignment_plots, "align-plot")
| Type | Primary | Supplementary |
|---|---|---|
| Alignments | 87780 | 4011 |
| Unique Reads | 87780 | 2694 |
| Ave. read length | 7069 | - |
| Ave. mapping quality | 60 | 60 |
| Ave. align length | 6774 | 5744 |
| Total aligned bases | 1576230507 | 65875987 |
| Total clipped bases | 90629408 | 166734614 |
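To relate the table above to the guidance given in the report text (accuracy peaking above 93% and containment above 98%), an optional check such as the sketch below can be run; it simply reuses the primarys dataframe from the plotting cell and is not part of the original workflow.
# Fraction of primary alignments meeting the suggested thresholds (optional, click play)
frac_acc = (primarys['Acc'] > 93).mean()
frac_cov = (primarys['ReadCov'] > 98).mean()
print(f"Primary alignments with accuracy > 93%: {frac_acc:.1%}")
print(f"Primary alignments with read containment > 98%: {frac_cov:.1%}")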
We will use cuteSV to call structural variants. In the code block below, cuteSV is called with parameters that define the acceptable size range for a call, minimum read length, minimum read support, and minimum mapping quality.

- min_sv_length is a minimum length threshold a call must have, set to 30 nucleotides by default.
- max_sv_length is a maximum length threshold above which we will not consider calls, set to 1,000,000 by default.
- min_read_length is a minimum threshold for read lengths. This is used to reject the shortest mapping sequences and is by default defined as 1000 nucleotides.
- min_read_support is a minimum threshold for the number of required supporting reads. The default value requires that three or more reads cover an SV.
- min_read_mapping_quality was defined in the configuration form at the top of the analysis section (a value of 20 by default) and is a minimum threshold used to reject reads with lower quality mapping scores.

These parameters are permissive and will produce a large number of candidate SVs. This is intentional and subsequent steps in the workflow will qualitatively filter and refine the SV collection.
# Finding candidate variants with cuteSV (click play)
# cuteSV requires an uncompressed reference, so decompress the .gz file if necessary
renamed = reference
if reference.endswith('.gz'):
    renamed = reference.split('.gz')[0]
    unzipper = f"bgzip -d -c {reference} > {renamed}"
    !$unzipper
# clear any cuteSV working directory left over from a previous run
!touch ./signatures
!rm -r ./signatures
cmd = f"""
cuteSV \
--threads {threads} \
--genotype \
-l {min_sv_length} \
-L {max_sv_length} \
-r {min_read_length} \
-q {min_read_mapping_quality} \
-s {min_read_support} \
--max_cluster_bias_INS 100 \
--diff_ratio_merging_INS 0.3 \
--max_cluster_bias_DEL 100 \
--diff_ratio_merging_DEL 0.3 \
{aligned_bam} \
{renamed} \
{calls_vcf} .
""".strip('\n').replace(' ', '')
!cecho ok "Running $cmd"
!$cmd
Running cuteSV --threads 2 --genotype -l 30 -L 1000000 -r 1000 -q 20 -s 3 --max_cluster_bias_INS 100 --diff_ratio_merging_INS 0.3 --max_cluster_bias_DEL 100 --diff_ratio_merging_DEL 0.3 aligned.bam chr20_human_g1k_v37.fasta calls.vcf .
2021-07-21 18:04:19,493 [INFO] Running /opt/conda/bin/cuteSV --threads 2 --genotype -l 30 -L 1000000 -r 1000 -q 20 -s 3 --max_cluster_bias_INS 100 --diff_ratio_merging_INS 0.3 --max_cluster_bias_DEL 100 --diff_ratio_merging_DEL 0.3 aligned.bam chr20_human_g1k_v37.fasta calls.vcf .
2021-07-21 18:12:47,186 [INFO] Finished in 507.69 seconds.
The complete set of SVs from cuteSV can be filtered to regions of interest using the .bed file specified in the sections above, allowing for targeted downstream analysis.

To avoid false positive calls due to low read coverage we will now assess the read depth across the regions in the dataset using the mosdepth tool:
# Running mosdepth to calculate read coverage (click play)
if target_bed != "":
bed_opt = "-b {}".format(target_bed)
else:
bed_opt = ""
cmd = f"mosdepth -n -x {bed_opt} -t {threads} depth {aligned_bam}"
!cecho ok "Running $cmd"
!$cmd
Running mosdepth -n -x -b target.bed -t 2 depth aligned.bam
Using the average read depth we can compute a minimum read depth required to call variants. The code below first attempts to establish a read support threshold from the average sequencing depth, using a small lookup table. If this is lower than the minimum bound given in the form (min_read_support), then the higher value is chosen. For this example dataset the average depth falls in the 20-40x band of the lookup table, so a minimum support of 5 reads is applied, as can be seen in the filtering command output below.
The original .vcf file produced by cuteSV can now be filtered by the depth threshold reported above, as well as by the region given with the regions bedfile, and by type of SV (insertions and deletions by default).

Ensure that you click the Enter button on the form below to perform the filtering step.
# Run VCF filtering step (click play)
import pandas as pd
# minimum read support indexed by rounded average depth:
# 0x -> 0, 1-10x -> 2, 11-19x -> 3, 20-39x -> 5, 40x and above -> 8
threshold_lookup = ['0'] + ['2'] * 10 + ['3'] * 9 + ['5'] * 20 + ['8'] * 100
def calculate_average_depth(path):
"""Get the average read depth."""
depth_table = pd.read_csv(
path,
compression="gzip",
sep="\t",
header=None,
names=("chr", "start", "end", "avg_depth"))
depth_table["length"] = depth_table["end"] - depth_table["start"]
depth_table["sum_depth"] = depth_table["length"] * depth_table["avg_depth"]
total_bases = sum(depth_table["length"])
sum_depth = sum(depth_table["sum_depth"])
avg_depth = sum_depth / total_bases
return avg_depth
# Get SV type filters
sv_types = ['INS', 'DEL']
sv_type_filters = []
for svtype in sv_types:
sv_type_filters.append(f'SVTYPE = \"{svtype}\"')
filter_sv_types = f"( {(' || ').join(sv_type_filters)} )"
# Get length filters
filter_min_len = f'ABS(SVLEN) >= {min_sv_length}'
filter_max_len = f'ABS(SVLEN) <= {max_sv_length}'
# Get min read support filter
avg_depth = calculate_average_depth(depth_bedfile)
avg_depth = min(avg_depth, len(threshold_lookup) - 1)
detected_read_support = int(threshold_lookup[round(avg_depth)])
read_support = min_read_support
if detected_read_support > min_read_support:
read_support = detected_read_support
filter_min_read_support = f'INFO/RE >= {read_support}'
# Build filter string
filters = [
filter_sv_types,
filter_min_len,
filter_max_len,
filter_min_read_support
]
filter_string = f"-i '{' && '.join(filters)}'"
# Add target_bed filter (optional)
if target_bed != "":
filter_string = f"-T {target_bed} " + filter_string
# Print command to stdout
cmd = f"bcftools view {filter_string} {calls_vcf} > {filtered_vcf}"
!cecho ok "Running $cmd"
!$cmd
Running bcftools view -T target.bed -i '( SVTYPE = INS || SVTYPE = DEL ) && ABS(SVLEN) >= 30 && ABS(SVLEN) <= 1000000 && INFO/RE >= 5' calls.vcf > filtered.vcf
# Run VCF sort and indexing step (click play)
print("Sorting VCF...")
cmd = f"bcftools sort {filtered_vcf} > {sorted_filtered_vcf}"
!cecho ok "Running $cmd"
!$cmd
print("Compressing and indexing VCF...")
cmd = f"bgzip -f {sorted_filtered_vcf} && bcftools index {sorted_filtered_vcf_gz}"
!cecho ok "Running $cmd"
!$cmd
Sorting VCF...
Running bcftools sort filtered.vcf > sorted.filtered.vcf
Writing to /tmp/bcftools-sort.sNPEug
Merging 1 temporary files
Cleaning
Done
Compressing and indexing VCF...
Running bgzip -f sorted.filtered.vcf && bcftools index sorted.filtered.vcf.gz
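As an optional sanity check (a sketch, not part of the original workflow), the filtered and indexed VCF can be summarised directly with bcftools to count how many records of each SV type survived filtering:
# Count filtered records by SV type (optional, click play)
!bcftools query -f '%INFO/SVTYPE\n' "$sorted_filtered_vcf_gz" | sort | uniq -c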
In this section we will analyse the called and filtered variants. There are many more interesting things that can be learnt from the variant calls, including answers to specific biological questions of interest; here, however, we focus on simple, generic properties of the discovered variants.

To start we can generate plots indicating the occurrence and density of SVs throughout the genome:
# Plotting SV density karyograms (click play)
import aplanat
from aplanat import bio
from bokeh.layouts import gridplot
import pandas as pd
import numpy as np
def parse_vcf(fname):
header = "CHROM POS ID REF ALT QUAL FILTER INFO FORMAT GT".split()
vcf = pd.read_csv(fname, delimiter='\t', comment='#', names=header, dtype={'CHROM':str})
#print(vcf.head())
# The INFO field isn't quite a table, so this is cumbersome
info = vcf['INFO'].str.split(pat=";",expand=True, n=1)
vcf['STATUS'] = info[0]
vcf['details'] = info[1].str.split(";").apply(lambda x: dict([y.split("=") for y in x]))
for field in ['SVTYPE', 'SVLEN']:
vcf[field] = vcf['details'].apply(lambda x: x[field])
# filter to chromosome
allowed = [str(x) for x in range(1,23)] + ['X','Y']
vcf = vcf.loc[vcf['CHROM'].isin(allowed)]
for orig, rename in (('INS', 'Insertion'), ('DEL', 'Deletion')):
vcf.loc[vcf['SVTYPE'] == orig, 'SVTYPE'] = rename
vcf['SVLEN'] = vcf['SVLEN'].astype(int)
return vcf
vcf = parse_vcf(sorted_filtered_vcf_gz)
sv_types = ('Insertion', 'Deletion')
sv_colours = ['red', 'green', 'blue']
karyograms = list()
for sv, col in zip(sv_types, sv_colours):
data = vcf.loc[vcf['SVTYPE'] == sv]
plot = bio.karyotype(
[data['POS']], [data['CHROM']], [sv], [col], alpha=0.2,
height=300, width=400)
karyograms.append(plot)
karyograms = gridplot(karyograms, ncols=3)
aplanat.show(karyograms, background="#f4f4f4")
html_report["Karyogram plots:"] = karyograms
For a more quantitative view of the data, let us examine the length distributions of the called variants separated by type:
# SV length distributions (click play)
vcf_summary = vcf.groupby('SVTYPE').agg(**{
'Count': ('POS', 'count'),
'Min. Length': ('SVLEN', lambda x: np.min(x.abs())),
'Ave. Length': ('SVLEN', lambda x: np.median(x.abs())),
'Max. Length': ('SVLEN', lambda x: np.max(x.abs()))}).transpose()
html_report.markdown("""
### Structural Variants
The structural variants found by the analysis and reported
in the output `.vcf` are summarized below.""",
'sv-summary-head')
html_report.table(vcf_summary, index=False, key='sv-summary-table')
exec_summary.append('Variants found', int(sum(vcf_summary.loc['Count'])), 'balance-scale')
length_plots = list()
for sv, col in zip(sv_types, sv_colours):
data = np.log10(vcf.loc[vcf['SVTYPE'] == sv, 'SVLEN'].abs())
plot = hist.histogram(
[data], colors=[col],
names=[sv], bins=200, xlim=(1, None),
height=250,
title="{} SV lengths".format(sv),
x_axis_label='log10(SV length / bases)',
y_axis_label='count')
length_plots.append(plot)
length_plots = gridplot(length_plots, ncols=2)
html_report.plot(length_plots, key='sv-len-plots')
display(vcf_summary)
aplanat.show(length_plots, background="#f4f4f4")
| SVTYPE | Deletion | Insertion |
|---|---|---|
| Count | 407 | 286 |
| Min. Length | 30 | 30 |
| Ave. Length | 41 | 151 |
| Max. Length | 140015 | 11299 |
As a final summary of the experiment, the infographic below displays a small set of headline numbers captured through the course of the analysis. Running the code cell will also produce a standalone HTML report document in the output folder.
# Executive summary (click play)
pinger.send_notebook_ping('stop', 'sv_tutorial')
exec_plot = graphics.infographic(exec_summary.values())
aplanat.show(exec_plot, background="#f4f4f4")
html_report['exec-plot'] = exec_plot
html_report_path = "report.html"
html_report.write(html_report_path)
print("Report written to {}".format(os.path.abspath(html_report_path)))
Report written to /epi2melabs/sv_tutorial/report.html
This tutorial has stepped through the calling and summary analysis of structural variants implied by an Oxford Nanopore long read dataset. Using the cuteSV package to call variants, the types, lengths, and locations of the variants have been explored.
The analysis presented can also be run on any .bam alignment file generated by lra which retains supplementary alignments.
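For example, if such a .bam file already exists, the alignment step can be skipped by pointing the aligned_bam variable at the existing file before running the SV calling cells. The sketch below illustrates the idea; the path shown is hypothetical and the file must be sorted and indexed.
# Reuse an existing LRA-generated BAM instead of re-aligning (optional sketch)
aligned_bam = "/epi2melabs/my_run/my_alignments.bam"  # hypothetical path; replace with your own
!test -f "$aligned_bam" || cecho error "BAM file not found"
!test -f "$aligned_bam".bai || samtools index "$aligned_bam"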
Example code for the preparation of a bed file
In the workflow form at the head of this tutorial a requirement for a BED coordinate file was introduced. An appropriate .bed file may be prepared from a .fasta format reference genome with the following commands.
$ pip install pyfaidx
$ faidx --transform bed test.fasta > test.bed
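An equivalent .bed file can also be produced without installing an extra package, using the samtools already available in this environment (a sketch: the .fai index lists each sequence name and length, from which whole-sequence BED intervals are written):
$ samtools faidx test.fasta
$ awk 'BEGIN{OFS="\t"}{print $1, 0, $2}' test.fasta.fai > test.bed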