# create a work directory and move into it
tutorial_name = 'modbase_tutorial'
working_dir = '/epi2melabs/{}'.format(tutorial_name)
# IPython shell escape: create the directory (and any missing parents)
!mkdir -p "$working_dir"
# IPython magic: make the new directory the notebook's working directory, so
# the relative paths used by later cells resolve inside it
%cd "$working_dir"

# send a 'start' ping for this tutorial; `pinger` is kept so that the final
# cell can send the matching 'stop' ping
from epi2melabs import ping
pinger = ping.Pingu()
response = pinger.send_notebook_ping('start', 'modified_base_tutorial')

/epi2melabs/modbase_tutorial


# install modbam2bed for manipulating BAM files with modified base information.
# (-q: quiet, -y: do not prompt for confirmation)
!mamba install -q -y modbam2bed


# base URL of the S3 bucket holding the tutorial data
bucket = "ont-exd-int-s3-euwst1-epi2me-labs"
domain = "s3-eu-west-1.amazonaws.com"
site = "https://{}.{}".format(bucket, domain)

# fetch the chr20 BAM and its index, then move both into sample_data/
# (the default "Input folder" of the data-entry form)
!echo 'Downloading `.bam` data...'
!wget "$site"/modbase_tutorial/chr20.bam
!wget "$site"/modbase_tutorial/chr20.bam.bai
!mkdir -p "sample_data" && mv chr20.bam* sample_data


# fetch and unpack the reference archive
# NOTE(review): presumably this creates grch38/grch38.fasta.gz, the default
# reference of the data-entry form - confirm against the archive contents
!wget "$site"/grch38.tar.gz
!tar -xzvf grch38.tar.gz


# show the resulting layout of the working directory
!tree .


from epi2melabs.notebook import InputForm, InputSpec

# Module-level settings populated by process_form() when the data-entry form
# is submitted; they remain None until then. Later cells read them directly.
data_folder = None
output_folder = None
reference_genome = None

def process_form(inputs):
    global data_folder
    global output_folder
    global reference_genome
    # make sure all bams are indexed
    data_folder = inputs.data_folder
    if inputs.force_bam_index:
        !echo "Ensuring .bams are indexed"
        !find $data_folder -name "*.bam" | xargs -I {} samtools index {}
    !echo "Making output folder"
    output_folder = inputs.output_folder
    !mkdir -p "$output_folder"
    reference_genome = inputs.reference_genome
    !test -f "$reference_genome" \
        && echo "Reference file found" \
        || cecho error "WARNING: Reference file does not exist"


# Describe each field of the data-entry form (name, label, default value and
# help text), then build the form, wire its button to process_form and show it.
form_fields = [
    InputSpec(
        'data_folder', 'Input folder', 'sample_data',
        long_desc="Guppy/MinKNOW output directory, will be searched for BAM files:"),
    InputSpec(
        'force_bam_index', 'Index BAMs', True,
        long_desc="Some versions of Guppy do not write required BAM indexes tick this box to force indexing:"),
    InputSpec(
        'output_folder', 'Output folder', 'analysis',
        long_desc="Output location:"),
    InputSpec(
        'reference_genome', 'Reference .fasta', 'grch38/grch38.fasta.gz',
        long_desc="Location of reference genome:"),
]
input_form = InputForm(*form_fields)
input_form.add_process_button(process_form)
input_form.display()

VBox(children=(HTML(value='Guppy/MinKNOW output directory, will be searched for BAM files:'), HBox(children=(L…


# Summarise the modified-base calls of every BAM in the data folder.
# Requires the data-entry form to have been processed first: output_folder,
# reference_genome and data_folder are None until process_form() has run.
import os
prefix = os.path.join(output_folder, "mod-counts")
# modbam2bed's standard output is redirected to this file; the parsing cell
# below reads it back
bedmethyl = prefix + ".cpg.bed"
# see the `modbam2bed --help` output below for the meaning of each option
!modbam2bed --aggregate -p $prefix -e -m 5mC --cpg -t 4 -r chr20 $reference_genome $data_folder/*.bam > $bedmethyl


# print the program's usage text for reference
!modbam2bed --help


# Modified base summary parsing (click play)
import os
import pandas as pd

# The file written by modbam2bed is tab-separated and has no header row, so
# column labels are supplied here.
bedmethyl_columns = [
    "chrom", "start", "end", "name", "score", "strand",
    "tstart", "tend", "color",
    "coverage", "freq", "canon", "mod", "filt",
]
methdata = pd.read_csv(
    bedmethyl, names=bedmethyl_columns, header=None, sep='\t')
# show the first few rows as the cell output
methdata.head()


# Coverage plot code (click play)
import aplanat
from aplanat import hist

# Split the per-site table by strand; fwdmeth/revmeth are reused by the
# methylation summary cell.
names = ('fwd', 'rev')
fwdmeth = methdata[methdata['strand'] == "+"]
revmeth = methdata[methdata['strand'] == "-"]

# One coverage histogram line per strand, unit-width bins.
strand_coverage = [fwdmeth["coverage"], revmeth["coverage"]]
plot = hist.histogram(
    strand_coverage,
    names=names, colors=['maroon', 'darkolivegreen'],
    style='line', binwidth=1, xlim=(0,100),
    title='Coverage distribution')
plot.yaxis.axis_label = 'frequency'
plot.xaxis.axis_label = 'coverage'
aplanat.show(plot, background='#F4F4F4')


# Methylation summary plot code (click play)
# NOTE(review): coverage_mask is assigned here but not referenced by the rest
# of this cell - confirm whether the plots were meant to be filtered by it.
coverage_mask =  10

from bokeh.layouts import gridplot
from aplanat import annot

# Pair each forward-strand site with the reverse-strand site one base
# downstream: shift the reverse coordinates back by one so that both strands
# share the same (chrom, start) key.
# NOTE: we could just use the aggregated results from modbam2bed
print("Joining forward and reverse strand joins assuming 1-base offset.")
tmp = revmeth.copy()
tmp['end'] -= 1
tmp['start'] -= 1
methjoin = pd.merge(
    fwdmeth, tmp,
    suffixes=(".fwd", ".rev"), on=("chrom", "start"))
# total coverage of the paired site across both strands
methjoin["coverage"] = methjoin["coverage.fwd"] + methjoin["coverage.rev"]

# Left panel: distribution of per-site methylation percentage (all rows of
# methdata, i.e. both strands unpaired).
p1 = hist.histogram(
    [methdata['freq']],
    title='Methylation proportion by site',
    bins=200, xlim=(0, 100), colors=['steelblue'])
p1.yaxis.axis_label = 'frequency'
p1.xaxis.axis_label = 'methylation proportion'

# Right panel: strand bias - remove the trivial case by keeping only
# differences strictly between -50 and +50.
bias = methjoin['freq.fwd'] - methjoin['freq.rev']
within_range = (bias > -50) & (bias < 50)
bias = bias.loc[within_range]
p2 = hist.histogram(
    [bias],
    title="Methylation strand bias by site.",
    bins=50, colors=['steelblue'])
p2.yaxis.axis_label = 'frequency'
p2.xaxis.axis_label = '(fwd. meth. prop.) - (rev. meth. prop.)'

# show both panels side by side
summary_grid = gridplot([[p1, p2]])
aplanat.show(summary_grid, background='#F4F4F4')

Joining forward and reverse strand joins assuming 1-base offset.


# Coverage plot code
from aplanat import points
import ipywidgets as widgets

def plot_callback(inputs):
    """Plot per-site read counts along a region of the joined strand table.

    Reads the module-level ``methjoin`` table, keeps sites inside the
    requested region whose combined coverage is at least
    ``inputs.coverage_mask``, and draws a scatter plot of the counts.
    Forward-strand (or modified) counts are drawn above zero and
    reverse-strand (or canonical) counts below zero, depending on
    ``inputs.colour_by``.

    :param inputs: form result object exposing ``region``
        (``"chrom:start-stop"``), ``colour_by`` (one of ``"orientation"``,
        ``"mod. status"`` or ``"both"``) and ``coverage_mask`` attributes.

    :returns: None. If the region cannot be parsed a message is printed and
        nothing is plotted.
    :raises ValueError: if ``inputs.colour_by`` is not a recognised option.
    """
    try:
        chrom, coords = inputs.region.split(":")
        start, stop = (int(x) for x in coords.split("-"))
    except (AttributeError, ValueError):
        print('Cannot parse region as "chrom:start-stop".')
        # Without a region there is nothing to select; carrying on would
        # only fail later with an unrelated NameError.
        return

    # filter data by inputs (region bounds are exclusive)
    select = (
        (methjoin['coverage'] >= inputs.coverage_mask) &
        (methjoin['chrom'] == chrom) &
        (methjoin['start'] > start) &
        (methjoin['start'] < stop))
    d = methjoin.loc[select]

    def down_sample(df, plot_limit=350000):
        # if we have a lot of points, remove some to avoid bokeh dying.
        if len(df) > plot_limit:
            display("Warning: Downsampling points to {} entries. Select a smaller region to show all points.".format(plot_limit))
            df = df.sample(n=plot_limit)
        else:
            display("Showing all sites.")
        return df

    # create a plot
    title = '{} coverage'.format(inputs.region)

    if inputs.colour_by == "orientation":
        d = down_sample(d, plot_limit=350000)
        ys = [d['coverage.fwd'], -d['coverage.rev']]
        colors = ['maroon', 'orange']
        names = ['fwd', 'rev']
    elif inputs.colour_by == "mod. status":
        d = down_sample(d, plot_limit=350000)
        # totals over both strands: modified above zero, canonical below
        ys = [
            +d['mod.fwd'] + d['mod.rev'],
            -d['canon.fwd'] - d['canon.rev']]
        colors = ['blue', 'green']
        names = ['modified', 'canonical']
    elif inputs.colour_by == "both":
        # four series are drawn, so use a lower point limit per series
        d = down_sample(d, plot_limit=125000)
        title += ' Positive: fwd strand, Negative: rev strand'
        ys = [
            +d['mod.fwd'], +d['canon.fwd'],
            -d['mod.rev'], -d['canon.rev']]
        colors = ['blue', 'green', 'blue', 'green']
        # label each colour only once in the legend
        names = ['modified', 'canonical', None, None]
    else:
        raise ValueError("Unrecognised 'colour_by'.")

    # every series shares the same x coordinates
    xs = [d['start']] * len(ys)

    # NOTE(review): `title` was previously built but never used; it is passed
    # through here on the assumption that points.points forwards extra keyword
    # arguments to the figure, as hist.histogram does elsewhere in this file.
    plot = points.points(
        xs, ys, colors=colors, names=names, height=300, width=1200,
        title=title)
    plot.xaxis.formatter.use_scientific = False
    plot.xaxis.axis_label = 'position'
    plot.yaxis.axis_label = 'frequency'
    aplanat.show(plot, background='#F4F4F4')


# Controls for the locality plot: region string, colouring mode (drop-down
# of the three options) and a minimum-coverage integer box defaulting to 10.
region_field = InputSpec("region", "Region", "chr20:0-65000000")
colour_field = InputSpec(
    "colour_by", "Colour by", ["mod. status", "orientation", "both"])
mask_field = InputSpec(
    "coverage_mask", "Coverage mask", widgets.IntText(10))

plot_form = InputForm(region_field, colour_field, mask_field)
# redraw through plot_callback whenever the form's button is pressed
plot_form.add_process_button(plot_callback)
plot_form.display()

VBox(children=(HBox(children=(Label(value='Region', layout=Layout(width='150px')), interactive(children=(Text(…


# base URL of the S3 bucket holding the tutorial data (same values as in the
# sample-data cell; repeated so this cell can be run on its own)
bucket = "ont-exd-int-s3-euwst1-epi2me-labs"
domain = "s3-eu-west-1.amazonaws.com"
site = "https://{}.{}".format(bucket, domain)

# fetch the bisulfite calls for chr20 into the current working directory;
# the next cell reads the file by its relative name
!echo 'Downloading bisulfite data...'
!wget "$site"/modbase_tutorial/chr20.bismark.bed.gz

Downloading bisulfite data...
--2022-08-03 16:02:03--  https://ont-exd-int-s3-euwst1-epi2me-labs.s3-eu-west-1.amazonaws.com/modbase_tutorial/chr20.bismark.bed.gz
Resolving ont-exd-int-s3-euwst1-epi2me-labs.s3-eu-west-1.amazonaws.com (ont-exd-int-s3-euwst1-epi2me-labs.s3-eu-west-1.amazonaws.com)... 52.218.37.3
Connecting to ont-exd-int-s3-euwst1-epi2me-labs.s3-eu-west-1.amazonaws.com (ont-exd-int-s3-euwst1-epi2me-labs.s3-eu-west-1.amazonaws.com)|52.218.37.3|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 14979702 (14M) [binary/octet-stream]
Saving to: ‘chr20.bismark.bed.gz’

chr20.bismark.bed.g 100%[===================>]  14.29M  2.26MB/s    in 6.6s    

2022-08-03 16:02:10 (2.17 MB/s) - ‘chr20.bismark.bed.gz’ saved [14979702/14979702]


# Reading bisulfite data from bismark
# The downloaded table is tab-separated with no header row, so the column
# labels are supplied here; the .gz file is passed to pandas as-is.
bismark_columns = ['chrom', 'start', 'end', 'freq', 'mod', 'canon']
bis = pd.read_csv(
    "chr20.bismark.bed.gz",
    names=bismark_columns, header=None, sep="\t")
# show the first few rows as the cell output
bis.head()


# Methylation Comparison plot code (click play)
import numpy as np
from aplanat import spatial

# Line up bisulfite and nanopore calls by site; clashing column names get a
# ".bis" / ".nano" suffix.
combined = pd.merge(
    bis, methdata,
    on=["chrom", "start", "end"], how="outer",
    suffixes=[".bis", ".nano"])

# Keep sites with more than 10 reads in both datasets, then drop any row
# that still has a missing value (e.g. sites present in only one dataset).
deep_enough = (
    (combined['mod.bis'] + combined['canon.bis'] > 10) &
    (combined['coverage'] > 10))
plot_data = combined.loc[deep_enough].dropna(axis=0)

# Pearson correlation between the two methylation frequencies: the
# off-diagonal entry of the 2x2 correlation matrix.
r_coeff = np.corrcoef(plot_data['freq.bis'], plot_data['freq.nano'])[0, 1]

p = spatial.heatmap2(
    plot_data['freq.bis'], plot_data['freq.nano'],
    title="Methylation comparison. R={:.3f}".format(r_coeff),
    xlim=(0,100), ylim=(0,100), zlim=(100, 100000),
    x_bins=50, y_bins=50, log=True,
    tools = "pan,wheel_zoom,box_zoom,reset")

# match_aspect doesn't seem to work as described...
p.match_aspect = True
p.aspect_ratio = 1.2
p.xaxis.axis_label = 'Bisulphite Methylation Frequency'
p.yaxis.axis_label = 'Nanopore Methylation Frequency'
p.toolbar.logo = None
aplanat.show(p, background='#F4F4F4')

# counterpart of the 'start' ping sent by the first cell
_ = pinger.send_notebook_ping('stop', 'modified_base_tutorial')

	chrom	start	end	name	score	strand	tstart	tend	color	coverage	freq	canon	mod
0	chr20	60008	60009	5mC	1000	+	60008	60009	0,0,0	4	25.0	3	1
1	chr20	60009	60010	5mC	1000	-	60009	60010	0,0,0	1	0.0	1	0
2	chr20	60119	60120	5mC	1000	+	60119	60120	0,0,0	4	0.0	4	0
3	chr20	60120	60121	5mC	1000	-	60120	60121	0,0,0	1	0.0	1	0
4	chr20	60578	60579	5mC	833	+	60578	60579	0,0,0	12	30.0	7	3

	chrom	start	end	freq	mod	canon
0	chr20	60008	60009	14.285714	1	6
1	chr20	60009	60010	0.000000	0	9
2	chr20	60119	60120	0.000000	0	37
3	chr20	60120	60121	0.000000	0	57
4	chr20	60578	60579	47.368421	9	10

Modified Base Tutorial

Getting started¶

Install additional software¶

Sample Data¶

Modified base base-calling¶

Using your own data¶

Data entry¶

Summarising the data¶

Creating a tabular summary¶

Analysis of the summary data¶

Aggregated methylation status¶

Assessing methylation locality¶

Comparison to Bisulphite sequencing data¶

Next steps¶