from epi2melabs import ping
tutorial_name = "vcf-tutorial"
# send a 'start' ping tagged with the tutorial name; `pinger` stays
# available as a module-level name after this cell
pinger = ping.Pingu()
pinger.send_notebook_ping('start', tutorial_name)

# create a work directory and move into it
# (mkdir -p does not fail if the directory already exists; %cd changes
# the notebook's working directory)
working_dir = '/epi2melabs/{}/'.format(tutorial_name)
!mkdir -p "$working_dir"
%cd "$working_dir"

/epi2melabs/vcf-tutorial


bucket = "ont-exd-int-s3-euwst1-epi2me-labs"
domain = "s3-eu-west-1.amazonaws.com"
site = "https://{}.{}".format(bucket, domain)
site = "https://ont-exd-int-s3-euwst1-epi2me-labs.s3-eu-west-1.amazonaws.com"

!mkdir -p sample_data
!cd sample_data && wget -O medaka.vcf.gz.tbi $site/vcf_tutorial/ont_hg002/medaka.vcf.gz.tbi
!cd sample_data && wget -O medaka.vcf.gz $site/vcf_tutorial/ont_hg002/medaka.vcf.gz


input_file = None
output_folder = None

def process_form(inputs):
    global input_file
    global output_folder
    input_file = inputs.input_file
    output_folder = inputs.output_folder
    # run a command to concatenate all the files together
    !cecho ok "Making output folder"
    !mkdir -p "$output_folder"
    !test -f "$input_file" \
        && cecho success " - Found input file." \
        || cecho error " - Input file does not exist."

    # create index file
    !cecho ok "Creating index file"
    !tabix -f /epi2melabs/vcf-tutorial/sample_data/medaka.vcf.gz \
        && cecho success " - Successfully created index." \
        || cecho error " - Failed to create index file."

from epi2melabs.notebook import InputForm, InputSpec
# Build the form: one field per value consumed by process_form().
# The first field holds a path to a VCF *file*, so it is labelled as such.
input_form = InputForm(
    InputSpec('input_file', 'Input file', '/epi2melabs/vcf-tutorial/sample_data/medaka.vcf.gz'),
    InputSpec('output_folder', 'Output folder', 'analysis'))
# run process_form when the form's button is pressed
input_form.add_process_button(process_form)
input_form.display()

VBox(children=(HBox(children=(Label(value='Input folder', layout=Layout(width='150px')), interactive(children=…


# use zcat to decompress the file and head to read the first 20 lines
# (anything written to stderr is discarded)
!zcat $input_file 2>/dev/null | head -n 20

##fileformat=VCFv4.1
##medaka_version=1.0.3
##contig=<ID=chr1>
##INFO=<ID=pos1,Number=.,Type=Integer,Description="POS of incorporated variants from haplotype 1">
##INFO=<ID=q1,Number=1,Type=Float,Description="Combined qual score for haplotype 1">
##INFO=<ID=pos2,Number=.,Type=Integer,Description="POS of incorporated variants from haplotype 2">
##INFO=<ID=q2,Number=1,Type=Float,Description="Combined qual score for haplotype 2">
##FORMAT=<ID=GT,Number=G,Type=String,Description="Genotype">
##FORMAT=<ID=GQ,Number=G,Type=Integer,Description="Genotype quality score">
##CL=medaka_variant -U -o chr1 -m r941_prom_variant_g360 -s r941_prom_snp_g360 -i PAD65442_3.6.1_pass.bam -f GCA_000001405.15_GRCh38_no_alt_analysis_set.fna -r chr1:0-10000000 -t 4; Fri  3 Jul 21:15:23 BST 2020
#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	SAMPLE
chr1	10108	.	C	CT	14.91	PASS	pos1=10108;pos2=10108;q1=10.99;q2=18.83	GT:GQ	1|1:15
chr1	10177	.	A	AC	4.852	PASS	pos2=10177;q2=4.852	GT:GQ	0|1:5
chr1	10257	.	A	C	0.799	PASS	pos1=10257;q1=0.799	GT:GQ	1|0:1
chr1	10291	.	C	T	8.544	PASS	pos2=10291;q2=8.544	GT:GQ	0|1:9
chr1	10297	.	C	T	8.215	PASS	pos2=10297;q2=8.215	GT:GQ	0|1:8
chr1	10303	.	C	T	0.246	PASS	pos2=10303;q2=0.246	GT:GQ	0|1:0
chr1	10309	.	C	T	2.7155	PASS	pos1=10309;pos2=10309;q1=1.046;q2=4.385	GT:GQ	1|1:3
chr1	10315	.	C	T	4.8525	PASS	pos1=10315;pos2=10315;q1=3.083;q2=6.622	GT:GQ	1|1:5
chr1	10321	.	C	T	0.562	PASS	pos2=10321;q2=0.562	GT:GQ	0|1:1


# print the header line and the first three variants:
# grep -A 3 shows the matching '#CHROM' line plus the three lines after it
!zcat $input_file 2>/dev/null | head -n 20 | grep -A 3 '#CHROM'

#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	SAMPLE
chr1	10108	.	C	CT	14.91	PASS	pos1=10108;pos2=10108;q1=10.99;q2=18.83	GT:GQ	1|1:15
chr1	10177	.	A	AC	4.852	PASS	pos2=10177;q2=4.852	GT:GQ	0|1:5
chr1	10257	.	A	C	0.799	PASS	pos1=10257;q1=0.799	GT:GQ	1|0:1


# write the first 10000 lines of the decompressed VCF to snippet.vcf
!zcat $input_file 2>/dev/null | head -n 10000 > snippet.vcf


# compress the snippet with bgzip, redirecting the result to snippet.vcf.gz
!bgzip -f -c snippet.vcf > snippet.vcf.gz


# index the bgzip-compressed snippet (produces snippet.vcf.gz.tbi)
!tabix snippet.vcf.gz


# list the snippet files, printing only the size and name columns
!ls -lh snippet.vcf* | awk '{print $5, $9}'

715K snippet.vcf
199K snippet.vcf.gz
1.4K snippet.vcf.gz.tbi


# compress the same snippet with plain gzip instead of bgzip, then try to
# index it: tabix rejects the file because its compression is not BGZF
!gzip -f -c snippet.vcf > snippet_gzip.vcf.gz
!tabix snippet_gzip.vcf.gz

[tabix] the compression of 'snippet_gzip.vcf.gz' is not BGZF


import itertools
from pysam import VariantFile

# Open the VCF, ask for its records, and let islice stop the iteration
# after the first three.
vcf = VariantFile(input_file)
all_variants = vcf.fetch()
variants = itertools.islice(all_variants, 3)

# One line per record: contig, position, and "REF > first ALT allele".
for variant in variants:
    first_alt = variant.alts[0]
    summary = (variant.chrom, variant.pos, variant.ref, ">", first_alt)
    print(*summary)

chr1 10108 C > CT
chr1 10177 A > AC
chr1 10257 A > C


# Walk over the first 10 records and report each one according to QUAL.
for variant in itertools.islice(vcf.fetch(), 10):
    # anything that does not exceed the QUAL threshold is only flagged
    if not variant.qual > 7:
        print("  Bad variant at position:", variant.pos)
        continue
    # high-QUAL record: show its location, score and INFO key/value pairs
    print(variant.chrom, variant.pos, variant.qual, dict(variant.info))

chr1 10108 14.90999984741211 {'pos1': (10108,), 'pos2': (10108,), 'q1': 10.989999771118164, 'q2': 18.829999923706055}
  Bad variant at position: 10177
  Bad variant at position: 10257
chr1 10291 8.543999671936035 {'pos2': (10291,), 'q2': 8.543999671936035}
chr1 10297 8.21500015258789 {'pos2': (10297,), 'q2': 8.21500015258789}
  Bad variant at position: 10303
  Bad variant at position: 10309
  Bad variant at position: 10315
  Bad variant at position: 10321
  Bad variant at position: 10381


# Summarise the header: each label is printed next to the names stored
# in the corresponding header collection.
vcf_header = vcf.header
header_sections = (
    ("contigs:", vcf_header.contigs),
    ("filter values:", vcf_header.filters),
    ("INFO subfields:", vcf_header.info),
    ("samples:", vcf_header.samples),
)
for section_label, section_entries in header_sections:
    print(section_label, list(section_entries))

contigs: ['chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8', 'chr9', 'chr10', 'chr11', 'chr12', 'chr13', 'chr14', 'chr15', 'chr16', 'chr17', 'chr18', 'chr19', 'chr20', 'chr21', 'chr22', 'chrX', 'chrY', 'chrM']
filter values: ['PASS']
INFO subfields: ['pos1', 'q1', 'pos2', 'q2']
samples: ['SAMPLE']


import pandas as pd

def _parse_info(info):
    """Convert one raw INFO string into a dictionary.

    ``key=value`` entries map the key to its (string) value; only the
    first ``=`` separates key from value. Entries without ``=`` are
    flags and map to ``True``. A missing INFO field (``.``, an empty
    string, or a non-string value such as NaN) gives an empty dict.
    """
    if not isinstance(info, str) or info in ('', '.'):
        return {}
    parsed = {}
    for item in info.split(';'):
        if not item:
            # tolerate stray/trailing semicolons
            continue
        key, sep, value = item.partition('=')
        parsed[key] = value if sep else True
    return parsed


def parse_vcf(fname, info_cols=None, nrows=None):
    """Parse a VCF file. The INFO column is parsed to a dictionary.

    A ``type`` column is added classifying each record as ``'sub'``,
    ``'del'`` or ``'ins'`` by comparing the length of REF with the
    length of the first ALT allele.

    :param fname: path (or file-like object) handed to ``pandas.read_csv``.
    :param info_cols: dict of field:dtype for INFO fields to store
        as distinct column. Records lacking the field get a missing
        value; if the column cannot be converted to the requested dtype
        it is left unconverted.
    :param nrows: number of rows to read, passed to ``pandas.read_csv``.
    :returns: a ``pandas.DataFrame`` with one row per record.
    """
    # the last column holds the sample's values; it is named 'GT' here
    header = "CHROM POS ID REF ALT QUAL FILTER INFO FORMAT GT".split()
    vcf = pd.read_csv(
        fname, delimiter='\t', comment='#', names=header, nrows=nrows)
    # create a dictionary out of INFO
    vcf['INFO'] = vcf['INFO'].apply(_parse_info)
    # add a column defining the type of the variant; for multi-allelic
    # records only the first ALT allele is measured, so the separating
    # commas do not inflate the length
    rlen = vcf['REF'].apply(len)
    alen = vcf['ALT'].apply(lambda alt: len(alt.split(',')[0]))
    vcf['type'] = 'sub'
    vcf.loc[rlen > alen, 'type'] = 'del'
    vcf.loc[rlen < alen, 'type'] = 'ins'
    # add requested INFO subfields as columns
    if info_cols is not None:
        for field, dtype in info_cols.items():
            vcf[field] = vcf['INFO'].apply(lambda info: info.get(field))
            try:
                vcf[field] = vcf[field].astype(dtype)
            except (TypeError, ValueError):
                # best effort: keep the raw values when they cannot be
                # cast (e.g. the field is missing from some records)
                pass
    return vcf


# Parsing and plotting VCF data with pandas (click play)
from bokeh.layouts import gridplot
import aplanat
from aplanat import bars

# read the file using the function described above
# NOTE(review): 'DP' is not among the INFO subfields listed in this file's
# header (pos1, q1, pos2, q2), so the extra column will only contain
# missing values here -- confirm whether another field was intended.
vcf = parse_vcf(input_file, info_cols={'DP':int}, nrows=1000)

# put the categories in a fixed order; reindex (instead of label indexing)
# gives a zero count, rather than a KeyError, for a type that is absent
counts = vcf['type'].value_counts().reindex(
    ['ins', 'sub', 'del'], fill_value=0)
# left panel: number of variants of each type
count_plot = bars.simple_bar(
    counts.index.tolist(), counts,
    x_axis_label='Variant type', y_axis_label='Count',
    title='Counts of variant types')
# right panel: QUAL distribution per variant type
qual_plot = bars.boxplot_series(
    vcf['type'], vcf['QUAL'],
    x_axis_label='Variant type', y_axis_label='QUAL',
    title='Variant quality by type',
    ylim=(0,100))
aplanat.show(gridplot([count_plot, qual_plot], ncols=2), background="#f4f4f4")


# Query the VCF with bcftools:
#   --format  prints tab-separated CHROM, POS, REF, first ALT allele,
#             QUAL, variant TYPE and the sample genotype for each record
#   -i        keeps only records matching the expression: on chr1,
#             POS between 100000 and 120000, QUAL above 30,
#             genotype "1|0" and a TYPE other than "snp"
# stderr is discarded and head truncates the listing.
!bcftools query \
    --format '%CHROM\t%POS\t%REF\t%FIRST_ALT\t%QUAL\t%TYPE\t[%GT]\n' \
    -i 'CHROM=="chr1" && POS>100000 && POS <120000 && QUAL>30 && GT="1|0" && TYPE!="snp"' \
    $input_file 2>/dev/null | head

chr1	114544	AAC	A	30.279	INDEL	1|0
chr1	115733	A	ACT	71.246	INDEL	1|0
chr1	118597	A	ATAAT	101.844	INDEL	1|0

	Name	Brief description (see the specification for details).
1	CHROM	The name of the sequence (typically a chromosome) on which the variation is being called (`contig` from the meta-information). This sequence is usually known as 'the reference sequence'.
2	POS	The 1-based position of the variation on the given sequence.
3	ID	The identifier of the variation, e.g. a dbSNP rs identifier, or if unknown a ".". Multiple identifiers should be separated by semi-colons without white-space. Medaka will output a "." here.
4	REF	The reference base (or bases in the case of an indel) at the given position on the given reference sequence.
5	ALT	The list of alternative alleles at this position, i.e. the sequence of "the variant".
6	QUAL	A quality score associated with the inference of the given alleles.
7	FILTER	A flag indicating which of a given set of filters the variation has passed.
8	INFO	An extensible list of key-value pairs describing the variation. Multiple fields are separated by semicolons, with optional values in the format: <key>=<data>[,data]. The description of each field is given in the meta-information section.
9	FORMAT	An (optional) extensible list of fields for describing the samples. The description of each field is given in the meta-information section.
+	SAMPLEs	For each (optional) sample described in the file, values are given for the fields listed in FORMAT.

Introduction to Variant Call Format (.vcf) files

Introduction¶

Getting started¶

Sample Data¶

Data entry¶

VCF files¶

The VCF data table¶

File compression and indexing¶

Manipulating VCF files¶

Reading VCF files with Python¶

Using pysam¶

Using pandas¶

Using bcftools¶

Summary¶