The metagenomic classification tutorial enables the analysis of a sample containing DNA fragments of unknown origin. The tutorial is intended to address important questions:
Methods used in this tutorial include:
Computational requirements for this tutorial include:
This tutorial does not cover the construction of centrifuge indices. Building a custom database requires a large amount of compute time and memory: for example, building an index covering the bacterial domain takes on the order of one to three days on a multi-core server and requires >500 GB of RAM.
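For reference only, constructing such an index typically involves the centrifuge-download and centrifuge-build programs. The sketch below follows the centrifuge documentation and is illustrative; the domain, paths, thread count and index name are examples, and these commands are not run as part of this tutorial:

# illustrative only: download the NCBI taxonomy and bacterial RefSeq sequences
!centrifuge-download -o taxonomy taxonomy
!centrifuge-download -o library -m -d "bacteria" refseq > seqid2taxid.map
!cat library/*/*.fna > input-sequences.fna
# build the index (this is the step requiring days of compute and >500 GB RAM)
!centrifuge-build -p 16 --conversion-table seqid2taxid.map \
    --taxonomy-tree taxonomy/nodes.dmp --name-table taxonomy/names.dmp \
    input-sequences.fna bacteria_index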
⚠️ Warning: This notebook has been saved with its outputs for demonstration purposes. It is recommended to select
Edit > Clear all outputs
before using the notebook to analyse your own data.
This tutorial aims to demonstrate use of the centrifuge and pavian software packages for the analysis of metagenomic datasets from Oxford Nanopore Technologies' sequencing platforms.
The learning outcomes from this tutorial include:
A sample dataset is provided which can be analysed against a pre-made metagenomic index in under 10 minutes on a GridION device.
The workflow below requires a single folder containing .fastq files from an Oxford Nanopore Technologies' sequencing device, or a single such file. Compressed or uncompressed files may be used. In addition the workflow will download a selected metagenomic database.
Before anything else we will create and set a working directory:
from epi2melabs import ping
pinger = ping.Pingu()
pinger.send_notebook_ping('start', 'metagenome')
# create a work directory and move into it
tutorial_name = "metagenome_tutorial"
working_dir = '/epi2melabs/{}/'.format(tutorial_name)
!mkdir -p "$working_dir"
%cd "$working_dir"
/epi2melabs/metagenome_tutorial
This tutorial uses a couple of software packages that are not included in the default EPI2ME Labs server. Below we will install the centrifuge classifier into its own environment using the mamba package manager.
Please note that the software installed is not persistent and this step will need to be re-run if you stop and restart the EPI2ME Labs server.
!mamba create -n centrifuge centrifuge --quiet --yes
Package Version Build Channel Size ──────────────────────────────────────────────────────────────────────────────────────── Install: ──────────────────────────────────────────────────────────────────────────────────────── _libgcc_mutex 0.1 conda_forge conda-forge/linux-64 Cached _openmp_mutex 4.5 1_gnu conda-forge/linux-64 Cached ca-certificates 2020.12.5 ha878542_0 conda-forge/linux-64 Cached centrifuge 1.0.4_beta h9a82719_6 bioconda/linux-64 Cached certifi 2019.11.28 py27h8c360ce_1 conda-forge/linux-64 Cached gettext 0.19.8.1 hf34092f_1004 conda-forge/linux-64 Cached ld_impl_linux-64 2.35.1 hea4e1c9_2 conda-forge/linux-64 Cached libffi 3.2.1 he1b5a44_1007 conda-forge/linux-64 Cached libgcc-ng 9.3.0 h2828fa1_19 conda-forge/linux-64 Cached libgomp 9.3.0 h2828fa1_19 conda-forge/linux-64 Cached libiconv 1.16 h516909a_0 conda-forge/linux-64 Cached libidn2 2.3.0 h516909a_0 conda-forge/linux-64 Cached libstdcxx-ng 9.3.0 h6de172a_19 conda-forge/linux-64 Cached libunistring 0.9.10 h14c3975_0 conda-forge/linux-64 Cached ncurses 6.2 h58526e2_4 conda-forge/linux-64 Cached openssl 1.1.1k h7f98852_0 conda-forge/linux-64 Cached perl 5.32.0 h36c2ea0_0 conda-forge/linux-64 Cached pip 20.1.1 pyh9f0ad1d_0 conda-forge/noarch Cached python 2.7.15 h5a48372_1011_cpython conda-forge/linux-64 Cached python_abi 2.7 1_cp27mu conda-forge/linux-64 Cached readline 8.1 h46c0cb4_0 conda-forge/linux-64 Cached setuptools 44.0.0 py27_0 conda-forge/linux-64 Cached sqlite 3.35.5 h74cdb3f_0 conda-forge/linux-64 Cached tar 1.34 ha1f6473_0 conda-forge/linux-64 Cached tk 8.6.10 hed695b0_1 conda-forge/linux-64 Cached wget 1.20.1 h22169c7_0 conda-forge/linux-64 Cached wheel 0.36.2 pyhd3deb0d_0 conda-forge/noarch Cached zlib 1.2.11 h516909a_1010 conda-forge/linux-64 Cached Summary: Install: 28 packages Total download: 0 B ──────────────────────────────────────────────────────────────────────────────────────── Preparing transaction: ...working... done Verifying transaction: ...working... done Executing transaction: ...working... done
We will also use a Python package for translating taxonomic data. The package also includes an up-to-date database of the NCBI taxonomic records. Downloading the database takes a few minutes.
!mamba install -c epi2melabs ncbitaxonomy==3.1.2 --quiet --yes
import ncbitaxonomy
NCBI = ncbitaxonomy.NCBITaxa(silent=True)
NCBI.update_taxonomy_database()
Updating taxdump.tar.gz from NCBI FTP site (via HTTP)... Done. Parsing...
Loading node names... 2326238 names loaded. 245520 synonyms loaded. Loading nodes... 2326238 nodes loaded. Linking nodes... Tree is loaded. Updating database: /home/jovyan/.etetoolkit/taxa.sqlite ... 2326000 generating entries... generating entries... Uploading to /home/jovyan/.etetoolkit/taxa.sqlite
Inserting synonyms: 25000
Inserting taxid merges: 60000
Inserting taxids: 25000
Inserting taxids: 2325000
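As a quick check that the taxonomy database has loaded correctly, we can translate a couple of well-known NCBI taxids into scientific names using the NCBI object created above. This is a minimal sketch; 562 and 1639 are the taxids for Escherichia coli and Listeria monocytogenes respectively.

# translate example NCBI taxids into scientific names as a sanity check
print(NCBI.translate_to_names([562, 1639]))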
To demonstrate the workflow below a sample dataset is included with this tutorial. The data comprise an extract of a MinION run using the ZymoBIOMICS microbial mock community.
To download the sample dataset we run the Linux command wget. To execute the command click on the cell and then press Command/Ctrl-Enter, or click the Play symbol to the left-hand side.
bucket = "ont-exd-int-s3-euwst1-epi2me-labs"
domain = "s3-eu-west-1.amazonaws.com"
site = "https://{}.{}".format(bucket, domain)
!wget -O small_sample.fastq.gz "$site/metagenomic_tutorial/small_sample.fastq.gz"
--2021-05-07 15:49:45-- https://ont-exd-int-s3-euwst1-epi2me-labs.s3-eu-west-1.amazonaws.com/metagenomic_tutorial/small_sample.fastq.gz Resolving ont-exd-int-s3-euwst1-epi2me-labs.s3-eu-west-1.amazonaws.com (ont-exd-int-s3-euwst1-epi2me-labs.s3-eu-west-1.amazonaws.com)... 52.218.56.232 Connecting to ont-exd-int-s3-euwst1-epi2me-labs.s3-eu-west-1.amazonaws.com (ont-exd-int-s3-euwst1-epi2me-labs.s3-eu-west-1.amazonaws.com)|52.218.56.232|:443... connected. HTTP request sent, awaiting response... 200 OK Length: 1300075 (1.2M) [binary/octet-stream] Saving to: ‘small_sample.fastq.gz’ small_sample.fastq. 100%[===================>] 1.24M --.-KB/s in 0.1s 2021-05-07 15:49:45 (10.8 MB/s) - ‘small_sample.fastq.gz’ saved [1300075/1300075]
If you wish to analyse your own data rather than the sample data, you can edit the value of the input .fastq field in the form below. To find the correct full path of a directory you can navigate to it in the Files browser to the left-hand side, right-click on the file and select Copy path:
The location shared with the EPI2ME Labs server from your computer will show as /epi2melabs. For example, if the /data folder is the one shared, a file located at /data/my_gridion_run/fastq_pass on your computer will appear as /epi2melabs/my_gridion_run/fastq_pass inside the server.
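As a minimal sketch of this mapping, the hypothetical labs_path helper below simply rewrites a path on your computer into the equivalent path inside the server; it is plain path manipulation, not part of EPI2ME Labs, and assumes /data is the shared folder (adjust shared_folder to your setup):

import os

def labs_path(local_path, shared_folder="/data", mount_point="/epi2melabs"):
    # translate a path on your computer into the equivalent path inside the server
    return os.path.join(mount_point, os.path.relpath(local_path, shared_folder))

print(labs_path("/data/my_gridion_run/fastq_pass"))
# /epi2melabs/my_gridion_run/fastq_pass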
Having downloaded the sample sequencing data, or located your own data in the file browser, we need to provide the filepaths as input to the notebook. We must also select or download a metagenomic index.
The form can be used to enter the filenames of your inputs.
#@markdown **Select sequencing inputs:**
import os
import multiprocessing
from epi2melabs.notebook import InputForm, InputSpec

input_file = None
output_folder = None
threads = None

def process_form(inputs):
    global input_file
    global output_folder
    global threads
    threads = inputs.threads
    # create the output folder and check the input data exists
    !echo "Making output folder"
    output_folder = inputs.output_folder
    !mkdir -p "$output_folder"
    input_data = inputs.input_data
    !test -e "$input_data" && echo "Found input file." || echo "WARNING: $input_data does not exist"
    # concatenate all input files into a single .fastq for downstream steps
    input_file = os.path.join(output_folder, "input.fastq")
    !rationalize_fastq -i "$input_data" -o "$input_file"

input_form = InputForm(
    InputSpec('input_data', 'Input fastq', '/epi2melabs/metagenome_tutorial/small_sample.fastq.gz'),
    InputSpec('output_folder', 'Output folder', 'analysis'),
    InputSpec('threads', 'Processing threads', (1, multiprocessing.cpu_count())))
input_form.add_process_button(process_form)
input_form.display()
To prepare a database use the form below. First select from the first dropdown whether to (download and) use a pre-made index or use an index present on your computer. Having made this selection, fill in the requisite portion of the form before pressing the Enter button.
from glob import glob

indices = {
    "Zymo" : "zymo",
    "Human+viruses+prokaryotes (inc. SARS-CoV-2)": "hpvc",
    "Human+viruses (inc. SARS-CoV-2)": "hvc",
}

dbpath = None

def process_db_form(inputs):
    global dbpath
    bucket = "ont-exd-int-s3-euwst1-epi2me-labs"
    domain = "s3-eu-west-1.amazonaws.com"
    site = "https://{}.{}".format(bucket, domain)
    db_store = "db_store"
    !mkdir -p "$db_store"
    selected_database = None
    if inputs.index_option == 'Custom':
        try:
            # look for the first component (*.1.cf) of a centrifuge index
            index1 = glob(os.path.join(inputs.custom_location, "*.1.cf"))
            if len(index1) == 1:
                index1, _, _ = index1[0].rpartition(".1.cf")
                print("* Found database {}.".format(index1))
                selected_database = os.path.abspath(index1)
        except Exception:
            print("* Could not determine database under {}.".format(inputs.custom_location))
    elif inputs.index_option == 'Download':
        print("* Downloading/checking index.")
        prefix = indices[inputs.download_index]
        if not os.path.exists(
                os.path.join(db_store, prefix, "{}.1.cf".format(prefix))):
            print("+ Downloading {} index.".format(inputs.download_index))
            !cd "$db_store" && wget -O "$prefix".tar.gz "$site"/metagenomic_tutorial/"$prefix".tar.gz
            !cd "$db_store" && mkdir -p "$prefix" && cd "$prefix" && tar --strip-components=1 -xvf ../"$prefix".tar.gz
        else:
            print("+ Found existing {} index.".format(inputs.download_index))
        selected_database = os.path.join(db_store, prefix, prefix)
    else:
        print("* Invalid value for `index_option`.")
    if selected_database is not None:
        dbpath = os.path.abspath(selected_database)
        print("Selected database is: {}:".format(dbpath))
        !ls "$dbpath".*.cf

db_form = InputForm(
    InputSpec("index_option", "Index choice", ["Download", "Custom"]),
    InputSpec("download_index", "Download", list(indices.keys())),
    InputSpec("custom_location", "Custom Location", "/path/to/my-database"))
db_form.add_process_button(process_db_form)
db_form.display()
After running the above, a set of .cf files constituting the metagenomic database will be present in the location indicated. These files can be used to perform single-read classifications using centrifuge.
In order to perform metagenomic classification of reads, the section below will use the centrifuge program together with the index selected or downloaded above. We will then view the results of the classification using the pavian viewer.
centrifuge
The first step in our analysis is to run centrifuge to classify all reads according to the selected index. Running the command below may take a fair amount of time depending on the compute resources available:
!run centrifuge -p $threads --met 5 --time --ignore-quals \
-S analysis/read_classifications.tsv --report-file analysis/centrifuge_report.tsv \
-x $dbpath -U $input_file
Time loading forward index: 00:00:00 Time loading reference: 00:00:00 Multiseed full-index search: 00:00:09 Time searching: 00:00:09 report file analysis/centrifuge_report.tsv Number of iterations in EM algorithm: 0 Probability diff. (P - P_prev) in the last iteration: 0 Calculating abundance: 00:00:00 Overall time: 00:00:09
The centrifuge program provides two primary outputs:

read_classifications.tsv: a classification for each input read in terms of its origin.
centrifuge_report.tsv: counts of reads for identified species.

The second of these can be used to identify the most common genera in the sample:
# Counting genera present (press play)
import pandas as pd
from ncbitaxonomy import ncbi_taxonomy

ncbi2 = ncbi_taxonomy.ncbiquery2.NCBITaxa()

# read the per-species report produced by centrifuge
d = pd.read_csv(os.path.join("analysis", "centrifuge_report.tsv"), sep='\t')
taxIDs = d['taxID'].tolist()

# resolve the genus for each reported taxID, falling back to 'NA'
# where no genus can be determined
genus = []
for i in taxIDs:
    try:
        allranks = ncbi2.get_ranked_lineage(i)
        lineageGenus = allranks['genus']
        genusName = NCBI.translate_to_names([int(lineageGenus)])
        genus.append(genusName[0])
    except Exception:
        genus.append('NA')
d['genus'] = genus

# sum read counts per genus and sort from most to least abundant
genus_id = d.groupby('genus') \
    .agg(numReads=('numReads', 'sum')) \
    .sort_values('numReads', ascending=False) \
    .reset_index()

read_filter = 1000
common_genera = sum(genus_id['numReads'] > read_filter)
print("{} genera identified with >{} reads.".format(common_genera, read_filter))
display(genus_id.head(8))
10 genera identified with >1000 reads.
| | genus | numReads |
|---|---|---|
| 0 | Bacillus | 14663 |
| 1 | Listeria | 14074 |
| 2 | Enterococcus | 13469 |
| 3 | Staphylococcus | 13420 |
| 4 | Salmonella | 12735 |
| 5 | Escherichia | 12620 |
| 6 | Limosilactobacillus | 5602 |
| 7 | Pseudomonas | 4305 |
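The per-read classifications can also be inspected directly. The sketch below assumes the standard centrifuge column names (readID, seqID, taxID, score and so on), which may vary slightly between versions:

# peek at the per-read classification table produced by centrifuge
import pandas as pd
reads = pd.read_csv(os.path.join("analysis", "read_classifications.tsv"), sep='\t')
display(reads.head())
print("{} classification records across {} distinct taxIDs.".format(
    len(reads), reads['taxID'].nunique()))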
pavian
The pavian application can be used to visualise the results of a metagenomics classifier, including amongst other things producing Sankey diagrams.
In order to use pavian we must first convert the centrifuge report into a different format; this is done with the centrifuge-kreport program:
!run centrifuge-kreport -x $dbpath analysis/read_classifications.tsv \
> analysis/read_classifications.tsv.kraken
Loading taxonomy ... Loading names file ... Loading nodes file ...
The .kraken file is what we must load into the pavian browser. The browser runs a webserver which can be started by first selecting the network port from the cell below (it should match that specified in the EPI2ME Labs Launcher as the Aux. Port),
import ipywidgets as widgets
from epi2melabs.notebook import InputForm, InputSpec
pavian_form = InputForm(
    InputSpec('port', 'Aux. EPI2ME Labs port', widgets.IntText(8889)))
pavian_form.display()
and then running the code cell below:
# running Pavian web server
!echo "Checking pavian install..."
!mamba install --yes --quiet r-remotes bioconductor-rsamtools
script = """
ncpus=parallel::detectCores()
options(Ncpus=ncpus)
remotes::install_github("fbreitwieser/pavian", upgrade=T, quiet=T)"""
_script = os.path.expanduser("~/.pavian_install.R")
with open(_script, "w") as fh:
    fh.write(script)
!Rscript $_script
!echo "Done."
!echo "Running pavian..."
port = pavian_form.port
!R -e "pavian::runApp(host='0.0.0.0', port="$port")"
Checking pavian install... Done. Running pavian... R version 4.0.3 (2020-10-10) -- "Bunny-Wunnies Freak Out" Copyright (C) 2020 The R Foundation for Statistical Computing Platform: x86_64-conda-linux-gnu (64-bit) R is free software and comes with ABSOLUTELY NO WARRANTY. You are welcome to redistribute it under certain conditions. Type 'license()' or 'licence()' for distribution details. Natural language support but running in an English locale R is a collaborative project with many contributors. Type 'contributors()' for more information and 'citation()' on how to cite R or R packages in publications. Type 'demo()' for some demos, 'help()' for on-line help, or 'help.start()' for an HTML browser interface to help. Type 'q()' to quit R. > pavian::runApp(host='0.0.0.0', port=8833) Loading required package: shiny Listening on http://0.0.0.0:8833
In order to use Pavian click the link in the output above in the message: Listening on http://0.0.0.0:8889. To stop Pavian running click stop on the code cell above.
Once Pavian is running and you have navigated to it in your web browser, click on the Use data on server tab and then type /epi2melabs in the text entry box:
Then use the file browser to navigate to and select the .kraken report file produced above, and finally click Read selected directories:
After selecting the dataset, Pavian can be used to explore it. For example, navigating to the Sample tab in the left-hand menu will display a Sankey plot visualising the classifications of reads; analysis of the sample dataset gives:
To generate a standalone report click the Generate HTML report... link in the left-hand menu. This will download a self-contained HTML file to your computer which summarises results of the analysis.
When you have finished using Pavian remember to press stop on the code cell above to stop the Pavian webserver from running.
This tutorial has stepped through the processes involved in classifying reads from a metagenomic sample and identifying the genera present. We have used a pre-built metagenomic index, constructed with centrifuge-download and centrifuge-build from data available in the NCBI databases, before using centrifuge to perform the classification. The results of the classification were inspected with the pavian viewer.
The analysis presented can be run on any dataset from an Oxford Nanopore Technologies' device. The code will run within the EPI2ME Labs notebook server environment.