from epi2melabs import ping
tutorial_name = "differential-expression"
pinger = ping.Pingu()
pinger.send_notebook_ping('start', tutorial_name)

# create a work directory and move into it
working_dir = '/epi2melabs/{}/'.format(tutorial_name)
!mkdir -p "$working_dir"
%cd "$working_dir"


import os

!rm -rf pipeline-transcriptome-de
!git clone --branch v0.5.0 --depth 1  https://github.com/nanoporetech/pipeline-transcriptome-de.git \
    2> /dev/null \
    && cecho success "Download complete" \
    || cecho error "Download failed"

if not os.path.exists(os.path.join(working_dir, "pipeline-transcriptome-de")):
    raise Exception("Repository was not cloned successfully")


!mamba install -y -q \
    bioconductor-genomicfeatures bioconductor-drimseq bioconductor-edger \
    bioconductor-dexseq bioconductor-stager \
    "snakemake<5.27" salmon \
    2> /dev/null


!Rscript -e 'install.packages("BiocManager", repos="https://ftp.fau.de/cran/")'  > /dev/null
!Rscript -e 'BiocManager::install(c("GenomeInfoDbData", "DRIMSeq"))'  > /dev/null


# Download sample data (click play)
experiment_name = "sample_data"
!mkdir -p $experiment_name

path = "https://ont-exd-int-s3-euwst1-epi2me-labs.s3-eu-west-1.amazonaws.com/dge_tutorial"

for condition in [1, 2]:
  condition_dir = "{}/condition{}".format(experiment_name, condition)
  !mkdir -p "$condition_dir"
  names = ["condition{}/condition{}_rep{}.fastq.gz".format(condition, condition, i) for i in [1, 2, 3]]
  names = ["{}/{}".format(path, name) for name in names]
  for address in names:
    !wget -P $condition_dir --no-clobber $address

ref = "http://ftp.ensembl.org/pub/release-95/fasta/homo_sapiens/cdna/Homo_sapiens.GRCh38.cdna.all.fa.gz"
annotation = "http://ftp.ensembl.org/pub/release-95/gtf/homo_sapiens/Homo_sapiens.GRCh38.95.gtf.gz"
!wget -P $experiment_name --no-clobber $ref
!wget -P $experiment_name --no-clobber $annotation


import errno
from glob import glob
import os
import yaml

import aplanat
from aplanat import graphics
from aplanat import report
from epi2melabs import notebook

show_errors = False  # TODO: import function

experiment_name = None
transcriptome = None
annotation = None
control_name = None
treated_name = None
experiment_dir = None
results_dir = None
exec_summary = None
report = None
control_samples = None
treated_samples = None

def ensure_path(path, verbose=False):
    """Raise error is `path` doesn't exist"""
    if not os.path.exists(path):
        raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), path)
    elif verbose:
        print(notebook.success(f"✓ {path} exists"))
    return True

def build_sample_dict(experiment_dir, sample_name, extension=".fastq.gz"):
    wildcard = "*{}".format(extension)
    samples = glob(os.path.join(experiment_dir, sample_name, wildcard))
    return {os.path.basename(f.rstrip(extension)): f for f in samples}

def process_inputs(inputs):
    global experiment_name, experiment_name, transcriptome, annotation, \
        control_name, treated_name, experiment_dir, exec_summary, report, \
        results_dir, control_samples, treated_samples
    
    experiment_name = inputs.experiment_name
    transcriptome = inputs.transcriptome
    annotation = inputs.annotation
    control_name = inputs.control_name
    treated_name = inputs.treated_name
    experiment_dir = os.path.abspath("{}/{}".format(working_dir, experiment_name))
        
    ensure_path(transcriptome)
    ensure_path(annotation)

    # setup report
    exec_summary = aplanat.graphics.InfoGraphItems()
    report = aplanat.report.HTMLReport(
        "Differential gene expression", "EPI2ME Labs Summary")
    report.markdown("## Excutive summary", key='exec-head')
    report.plot(None, 'exec-plot')

    # prepare a snakemake config
    results_dir = "results"
    extension = ".fastq.gz"
    wildcard = "*{}".format(extension)
    control_samples = build_sample_dict(experiment_dir, control_name)
    treated_samples = build_sample_dict(experiment_dir, treated_name)

    xconfig = {
        "pipeline": results_dir,
        "workdir_top": experiment_name,
        "resdir": "results",
        "repo": "https://github.com/nanoporetech/pipeline-transcriptome-de",
        "transcriptome": transcriptome,
        "annotation": annotation,
        "control_samples": control_samples,
        "treated_samples": treated_samples,
        "minimap_index_opts": "",
        "minimap2_opts": "",
        "maximum_secondary": 100,
        "secondary_score_ratio": 1.0,
        "salmon_libtype": "U",
        "min_samps_gene_expr": 3,
        "min_samps_feature_expr": 1,
        "min_gene_expr": 10,
        "min_feature_expr": 3,
        "threads": 50} 

    all_good = True
    print("Checking control samples exist...")
    for path in xconfig["control_samples"].values():
        all_good &= ensure_path(path, verbose=True)
  
    print("Checking treated samples exist...")
    for path in xconfig["treated_samples"].values():
        all_good &= ensure_path(path, verbose=True)

    if all_good:
        print()
        print("Writing new config\n")
        with open(r'config.yaml', 'w') as file:
            documents = yaml.dump(xconfig, file)
        if show_errors:
            !cat "config.yaml"
        print(notebook.success("Inputs found successfully"))
    else:
        print(notebook.error("One or more inputs were invalid."))

    exec_summary.append("Control replicates", len(control_samples), "calculator")
    exec_summary.append("Treated replicates", len(control_samples), "calculator")



input_form = notebook.InputForm(
    notebook.InputSpec('experiment_name', 'Experiment name', 'sample_data', ),
    notebook.InputSpec('transcriptome', 'Transcriptome .fasta.gz', "/epi2melabs/differential-expression/sample_data/Homo_sapiens.GRCh38.cdna.all.fa.gz"),
    notebook.InputSpec('annotation', 'Annotation .gtf.gz', "/epi2melabs/differential-expression/sample_data/Homo_sapiens.GRCh38.95.gtf.gz"),
    notebook.InputSpec('control_name', 'Control experiment prefix', 'condition1'),
    notebook.InputSpec('treated_name', 'Treatment experiment prefix', 'condition2'),
    description_width='200px'
)
input_form.add_process_button(process_inputs)
input_form.display()

VBox(children=(HBox(children=(Label(value='Experiment name', layout=Layout(width='200px')), interactive(childr…


!snakemake --snakefile pipeline-transcriptome-de/Snakefile all \
    --configfile config.yaml -j 4 \
    --until "build_minimap_index" > /dev/null


!snakemake --snakefile pipeline-transcriptome-de/Snakefile all \
    --configfile config.yaml -j 4 \
    --until "map_reads"


# Generate alignment statistics (click play)
align_dir = os.path.join(experiment_dir, results_dir, "sorted_alignments")
bams = sorted((f for f in os.listdir(align_dir) if f.endswith(".bam")))

for bam in (os.path.join(align_dir, f) for f in bams):
    seqkit_stats = "{}.seqkit.tsv".format(bam)
    !seqkit bam $bam 2> "$seqkit_stats" || cecho error "seqkit stats failed"
    !cecho success "✔ Generated tsv for $bam"

!cecho success "Step complete"


# Preparing raw read and alignment summaries for each condition (click play)
from glob import glob
import pandas as pd

pd.options.display.float_format = '{:.2f}'.format

def parse_seqkit(fname):
    cols = {
        'Read':str, 'Ref': str, 'MapQual':int, 'Acc':float, 'ReadLen':int,
        'ReadAln':int, 'ReadCov':float, 'MeanQual':float,
        'IsSec':bool, 'IsSup':bool}
    df = pd.read_csv(fname, sep="\t", dtype=cols, usecols=cols.keys())
    df['Clipped'] = df['ReadLen'] - df['ReadAln']
    df['Type'] = 'Primary'
    df.loc[df['IsSec'], 'Type'] = 'Secondary'
    df.loc[df['IsSup'], 'Type'] = 'Supplementary'
    df["fname"] = os.path.basename(fname).rstrip(".bam.seqkit.tsv")
    return df

def create_summary_table(df):
    def number_of_alignments(df, field_name):
        grouped = df.groupby('fname').agg(**{
            field_name: ('Read', 'size'),
        })
        return grouped.transpose()

    all = number_of_alignments(df, "Read mappings")
    total_alignment_count = int(all.transpose().sum())
    exec_summary.append("Total alignments", total_alignment_count, "calculator")
    primary = number_of_alignments(df.loc[df['Type'] == 'Primary'], "Primary")
    secondary = number_of_alignments(df.loc[df['Type'] == 'Secondary'], "Secondary")
    supplementary = number_of_alignments(df.loc[df['Type'] == 'Supplementary'], "Supplementary")
    avg_acc = df.loc[df['Type'] == 'Primary'].groupby('fname').agg(**{"Median Qscore": ('MeanQual', 'median'),}).transpose()
    avg_mapq = df.loc[df['Type'] == 'Primary'].groupby('fname').agg(**{"Median MAPQ": ('MapQual', 'median'),}).transpose()
    return pd.concat([
        all, primary, secondary, supplementary,
        avg_acc, avg_mapq])

def pool_csvs(folder):
    files = glob(folder + "/*bam.seqkit.tsv")
    dfs = [parse_seqkit(f) for f in files]
    return pd.concat(dfs)

alignment_stats = pool_csvs(align_dir)
alignment_summary_df = create_summary_table(alignment_stats)
report.table(alignment_summary_df, key='alignment-stats')
display(alignment_summary_df)


!snakemake --snakefile pipeline-transcriptome-de/Snakefile all \
    --configfile config.yaml -j 4 \
    --until "merge_counts"


!snakemake --snakefile pipeline-transcriptome-de/Snakefile \
    --configfile config.yaml -p -j 4 \
    --until "de_analysis" > /dev/null


# Displaying results (click play)
import pandas as pd
import numpy as np
from IPython.display import Markdown

salmon_merged_file = filtered_count_file = f"{experiment_name}/{results_dir}/merged/all_counts.tsv"
salmon_counts = pd.read_csv(salmon_merged_file, sep='\t')
salmon_counts.set_index("Reference", drop=True, append=False, inplace=True)
salmon_size_top = salmon_counts.sum(axis=1).sort_values(ascending=False)

salmon_counts = salmon_counts.applymap(np.int64)
display(salmon_counts.loc[salmon_size_top.index].head(n=10))
salmon_count_caption = '''*Table showing the 10 annotated transcript features
identified by Minimap2 mapping and Salmon transcript detection with the highest
number of mapped reads*'''

display(Markdown(salmon_count_caption))
report.markdown(salmon_count_caption, "salmon-head-caption")
report.table(salmon_counts.loc[salmon_size_top.index].head(n=10), "salmon-head")

unique_transcripts = int(salmon_counts.reset_index()["Reference"].size)
exec_summary.append("Unique transcripts found", unique_transcripts, "calculator")


# Reviewing the number of transcripts per gene (click play)
filtered_count_file = f"{experiment_name}/{results_dir}/merged/all_counts_filtered.tsv"
# note that filtered_counts only includes genes with > 1 transcript
transcripts_per_gene = pd.read_csv(filtered_count_file, \
                                   sep='\t', \
                                   usecols=['gene_id', 'feature_id']).groupby(['gene_id']).agg(['count'])
transcripts_per_gene.columns = transcripts_per_gene.columns.droplevel()
# now load the complete genes counts
gene_count_file = f"{experiment_name}/{results_dir}/merged/all_gene_counts.tsv"
gene_ids = pd.read_csv(gene_count_file, \
                                   sep='\t', \
                                   usecols=[0]).index.values.tolist()
# identify the singletons
singletons = [ gene_id for gene_id in gene_ids if not gene_id in transcripts_per_gene.index.values.tolist()] 
singletons = pd.DataFrame(index=singletons, columns=['count']).fillna(1)
# and add the singletons to transcripts_per_gene
transcripts_per_gene = pd.concat([singletons, transcripts_per_gene])

### PLEASE DELETE ME BEFORE RELEASE ###
gene_transcript_counts = transcripts_per_gene.apply(pd.value_counts)
groups = gene_transcript_counts.index.to_numpy()
# print(groups)
counts = gene_transcript_counts['count'].to_numpy()
# print(counts)
### PLEASE DELETE ME BEFORE RELEASE ###

import aplanat
from aplanat import hist
plot = hist.histogram([transcripts_per_gene['count'].tolist()], binwidth=1)
plot.xaxis.axis_label = "Number of transcripts per gene (n)"
plot.yaxis.axis_label = "Number of occurences"
aplanat.show(plot, background="#f4f4f4")
transcripts_per_gene_caption = """*Histogram showing the abundance of 
transcript counts for genes identified in the analysis.*
"""
report.markdown(transcripts_per_gene_caption, "transcripts-per-gene-caption")
report.plot(plot, "transcripts-per-gene")
display(Markdown(transcripts_per_gene_caption))


# Top 15 genes ranked by adjusted p-value from the edgeR analysis (click play)
import pandas as pd
import numpy as np

pd.reset_option('display.float_format')

dge_file = filtered_count_file = f"{experiment_name}/{results_dir}/de_analysis/results_dge.tsv"
dge_results = pd.read_csv(dge_file, sep='\t')

dge_pvals = dge_results.sort_values(by='FDR', ascending=True)

dge_results[['logFC', 'logCPM', 'F']] = dge_results[['logFC', 'logCPM', 'F']].round(2)
display(dge_results.loc[dge_pvals.index].head(n=15))

# include a legend
dge_caption = '''*Table showing the top 15 genes, ranked by adjusted p-value, 
from the edgeR analysis. Information shown includes the log2 fold change between
experimental conditions, the log-scaled counts per million measure of abundance
and the false discovery corrected p-value (FDR). This table has not been
filtered for genes that satisfy statistical or magnitudinal thresholds*'''
display(Markdown(dge_caption))
report.table(dge_results.loc[dge_pvals.index].head(n=15), "dge-table")
report.markdown(dge_caption, "dge-caption")


# Calculating differential counts (click play)
#TODO: Needs a key and the limit y lines and also a lower weight for the non-significant points

import aplanat
import pandas as pd
from aplanat import points

dge = "/epi2melabs/{}/{}/{}/" \
    "de_analysis/results_dge.tsv".format(
        tutorial_name, experiment_name, results_dir)
dge = pd.read_csv(dge, sep="\t")
pval_limit = 0.01
up = dge.loc[(dge["logFC"] > 0) & (dge['PValue'] < pval_limit)]
down = dge.loc[(dge["logFC"] <= 0) & (dge['PValue'] < pval_limit)]
not_sig = dge.loc[(dge["PValue"] >= pval_limit)]
logcpm_vs_logfc = points.points(
    x_datas=[
             up["logCPM"], 
             down["logCPM"], 
             not_sig["logCPM"],
             ], 
    y_datas=[
             up["logFC"], 
             down["logFC"],
             not_sig["logFC"],
             ],
    title="Average copy per million (CPM) vs Log-fold change (LFC)",
    colors=["blue", "red", "black"],
    names=["Up", "Down", "NotSig"]
    )

logcpm_vs_logfc.xaxis.axis_label = "Average log CPM"
logcpm_vs_logfc.yaxis.axis_label = "Log-fold change"
aplanat.show(logcpm_vs_logfc, background="#f4f4f4")
logcpm_caption = """*Results of the edgeR Analysis.*"""
report.plot(logcpm_vs_logfc, "logcpm-vs-log-fc")
report.markdown(logcpm_caption)
display(Markdown(logcpm_caption))


# Top 15 genes ranked by adjusted p-value from the DEXSeq analysis (click play)
import pandas as pd
import numpy as np

dexseq_file = filtered_count_file = f"{experiment_name}/{results_dir}/de_analysis/results_dexseq.tsv"
dexseq_results = pd.read_csv(dexseq_file, sep='\t')
dexseq_pvals = dexseq_results.sort_values(by='pvalue', ascending=True)

display(dexseq_results.loc[dexseq_pvals.index].head(n=15))
report.table(dexseq_results.loc[dexseq_pvals.index].head(n=15), "dexseq-head")
dexseq_caption = '''*Table showing the top 15 gene isoforms, ranked by adjusted 
p-value, from the DEXSeq analysis. Information shown includes the log2 fold 
change between experimental conditions, the log-scaled transcript abundance and 
the false discovery corrected p-value (FDR). This table has not been filtered 
for genes that satisfy statistical or magnitudinal thresholds*'''
display(Markdown(dexseq_caption))
report.markdown(dexseq_caption, "dexseq-head-caption")


# Further dexseq results (click play)
import aplanat
import pandas as pd
from aplanat import points

pval_limit = 0.01
up = dexseq_results.loc[(dexseq_results["Log2FC"] > 0) & (dexseq_results['pvalue'] < pval_limit)]
down = dexseq_results.loc[(dexseq_results["Log2FC"] <= 0) & (dexseq_results['pvalue'] < pval_limit)]
not_sig = dexseq_results.loc[(dexseq_results["pvalue"] >= pval_limit)]

plot = points.points(
    x_datas=[
             up["Log2MeanExon"], 
             down["Log2MeanExon"], 
             not_sig["Log2MeanExon"],
             ], 
    y_datas=[
             up["Log2FC"], 
             down["Log2FC"],
             not_sig["Log2FC"],
             ],
    title="Average copy per million (CPM) vs Log-fold change (LFC)",
    colors=["red", "blue", "black"],
    xlim = [0, 5],
    names=["Up", "Down", "NotSig"]
    )

# plot = points.points(x_datas=[dexseq_results["Log2MeanExon"]], y_datas=[dexseq_results["Log2FC"]], xlim=[0, 5])
plot.xaxis.axis_label = "A (log2 transformed mean exon read counts)"
plot.yaxis.axis_label = "M (log2 transformed differential abundance)"
aplanat.show(plot, background="#f4f4f4")
dexseq_results_caption = "dexseq results"
display(Markdown(dexseq_results_caption))
report.markdown(dexseq_results_caption)
report.plot(plot)


# Display stageR Results (click play)
import pandas as pd
import numpy as np

dtu_file = filtered_count_file = f"{experiment_name}/{results_dir}/de_analysis/results_dtu_stageR.tsv"
dtu_results = pd.read_csv(dtu_file, sep='\t')
dtu_pvals = dtu_results.sort_values(by='gene', ascending=True)
# dtu_pvals.astype({"gene": "int"})

display(dtu_results.loc[dtu_pvals.index].head(n=10))

# include a legend
dtu_caption = '''Table showing gene and transcript identifiers and their FDR 
corrected probabilities for the genes and their isoforms that have been 
identified as showing DTU using the R packages DEXSeq and StageR. This list has 
been shortened requiring that both gene and transcript must satisfy the p-value 
threshold'''
display(Markdown(dtu_caption))
report.table(dtu_results.loc[dtu_pvals.index].head(n=10), "dtu-head")
report.markdown(dtu_caption, "dtu-caption")


# Please select a gene name to see detailed gene specific information (click play)
# gene_id = "ENSG00000142937" #@  {"type": "string"}
gene_id = "ENSG00000132849" #@param {"type": "string"}

table = dtu_results.loc[(dtu_results["geneID"] == gene_id)]
msg = "Gene ID \"{}\" does not exist in the dataset, please select another"
assert not table.empty, msg.format(gene_id)

alignment_stats[alignment_stats["Ref"].isin([gene_id])]
alignment_stats["Transcript"] = alignment_stats["Ref"].apply(lambda x: x.split(".")[0])
gene_alignments = alignment_stats[alignment_stats["Transcript"].isin(table["txID"])]
gene_alignments = gene_alignments.loc[(gene_alignments["Type"] == "Primary")]
gene_alignments["condition"] = gene_alignments.apply(lambda x: control_name if x["fname"] in control_samples else treated_name, axis=1)
groups = gene_alignments.groupby(
    ["Transcript", "fname", "condition"]).agg(**{"Transcript_count": ("Read", "size")})

df = None
for gene_id, group in gene_alignments.groupby("Transcript"):
  temp = group.groupby("fname").agg(**{
      gene_id: ("Read", "size")
  }).transpose()
  if df is None:
    df = temp
  else:
    df = pd.concat([df, temp])

df = df.reset_index().rename(columns={"index": "Transcript_ID"})
table = table.rename(columns={
    "txID": "Transcript_ID",
    "gene": "p_gene",
    "transcript": "p_transcript"
    })
gene_table = pd.merge(df, table)
gene_table = gene_table.set_index(["geneID","Transcript_ID"])
file_names = set(control_samples.keys())
file_names.update(treated_samples.keys())
gene_table = gene_table.fillna(0)
gene_table = gene_table.astype({f: "int64" for f in file_names})


display(gene_table)
report.table(gene_table, "transcript-counts-by-condition")

candidate_gene_caption = '''Table showing a selected annotated
gene and its isoform transcripts. The number of reads mapped 
to each transcript is shown along withthe FDR corrected p-values 
for DGE and DTU. The read counts are the raw counts. '''
display(Markdown(candidate_gene_caption))


table = groups.reset_index()[["condition", "Transcript", "Transcript_count"]]
table['transcript, condition'] = \
    table['Transcript'].astype(str) + ', ' + table['condition'].astype(str)
repeats = groups.groupby(level=['Transcript', 'condition']).size()
min_rep, max_rep = min(repeats), max(repeats)

from aplanat.bars import boxplot_series
plot = boxplot_series(
    table['transcript, condition'], table['Transcript_count'],
    x_axis_label='Transcript, condition',
    y_axis_label='Transcript count',
    height=400, title="Transcript counts (from {}-{} replicates)".format(min_rep, max_rep))
plot.xaxis.major_label_orientation = 3.1452/2
if min_rep < 7:
    for renderer in plot.renderers:
        renderer.glyph.line_alpha = 0.2
        try:
            renderer.glyph.fill_alpha = 0.2
        except:
            pass
    plot.circle(
        table['transcript, condition'], table['Transcript_count'],
        fill_color='black', line_color='black')
aplanat.show(plot, background="#f4f4f4")
report.plot(plot, "Transcript-count-boxplot")


# Create a summary (click play)
pinger.send_notebook_ping('stop', tutorial_name)
import aplanat.graphics

exec_plot = aplanat.graphics.infographic(exec_summary.values(), ncols=3)
aplanat.show(exec_plot, background="#f4f4f4")
report.plot(exec_plot, key='exec-plot')

fname = os.path.join(experiment_dir, 'report.html')
report.write(fname)
print("Report written to: {}.".format(fname))

Report written to: /epi2melabs/differential-expression/sample_data/report.html.

	condition1_rep1	condition1_rep2	condition1_rep3	condition2_rep1	condition2_rep2	condition2_rep3
Reference
ENST00000367816.5	1677	1674	1690	3879	3872	3902
ENST00000321358.11	881	914	923	527	543	505
ENST00000290541.6	368	348	333	209	192	194
ENST00000368926.5	410	397	385	126	121	122
ENST00000648638.1	238	218	253	292	255	276
ENST00000263168.3	227	229	221	161	193	176
ENST00000436427.1	196	205	180	194	206	185
ENST00000545882.5	231	226	218	123	95	145
ENST00000629042.2	241	253	261	81	75	80
ENST00000458455.2	145	157	139	146	173	149

	logFC	logCPM	F	PValue	FDR
ENSG00000143153	-1.32	17.77	9503.62	9.848433e-11	1.018328e-07
ENSG00000168679	-6.39	12.21	2661.96	4.270058e-09	2.207620e-06
ENSG00000143443	1.59	14.36	1877.68	1.199131e-08	4.133004e-06
ENSG00000143401	1.50	13.87	1462.06	2.512218e-08	6.179591e-06
ENSG00000213625	-2.43	13.19	1378.67	2.988197e-08	6.179591e-06
ENSG00000143320	3.57	11.54	988.46	7.979872e-08	1.375198e-05
ENSG00000117523	11.60	13.99	8163.22	1.141308e-07	1.685875e-05
ENSG00000169905	10.54	12.95	5245.72	2.720549e-07	3.516309e-05
ENSG00000160679	2.30	10.58	424.15	9.608743e-07	1.103938e-04
ENSG00000126709	-1.64	11.56	404.59	1.103304e-06	1.140817e-04
ENSG00000162613	2.62	10.46	300.30	2.636129e-06	1.603387e-04
ENSG00000130770	-2.88	10.38	305.14	2.515952e-06	1.603387e-04
ENSG00000065978	0.50	16.15	308.56	2.435673e-06	1.603387e-04
ENSG00000158161	-1.35	12.02	303.64	2.552365e-06	1.603387e-04
ENSG00000132849	-2.44	11.94	311.68	2.365164e-06	1.603387e-04

	Log2MeanExon	Log2FC	pvalue	padj
ENSG00000143442:ENST00000271715	2.813474	-17.709932	3.145616e-23	1.198480e-20
ENSG00000085832:ENST00000371730	3.785937	-16.286305	1.474326e-22	2.808590e-20
ENSG00000085832:ENST00000371733	2.821409	14.871998	2.147509e-19	2.635157e-17
ENSG00000132849:ENST00000484937	2.383466	1.330886	2.766569e-19	2.635157e-17
ENSG00000126709:ENST00000339145	4.764516	-12.158246	9.475282e-17	7.220165e-15
ENSG00000126709:ENST00000361157	3.109135	21.176774	1.503898e-16	9.549751e-15
ENSG00000160752:ENST00000368356	2.967874	15.238805	6.321788e-16	3.440859e-14
ENSG00000143442:ENST00000368863	3.801362	13.925323	1.813391e-15	8.636275e-14
ENSG00000160752:ENST00000467076	2.402474	-16.955405	3.294748e-15	1.394777e-13
ENSG00000127125:ENST00000372561	4.061966	0.265387	1.302628e-13	4.526926e-12
ENSG00000127125:ENST00000372556	3.902294	-15.154522	1.306987e-13	4.526926e-12
ENSG00000204138:ENST00000632964	2.578330	-16.131548	4.956701e-13	1.573753e-11
ENSG00000132849:ENST00000613764	4.413907	-19.514454	6.157462e-13	1.804610e-11
ENSG00000122406:ENST00000644549	2.714571	14.983026	1.317396e-12	3.585200e-11
ENSG00000065978:ENST00000321358	9.440856	0.244505	6.127984e-12	1.461268e-10

	geneID	txID	transcript
231	ENSG00000132849	ENST00000613764	2.894007e-12
83	ENSG00000143442	ENST00000450842	8.902381e-02
84	ENSG00000143442	ENST00000271715	9.856265e-23
228	ENSG00000132849	ENST00000484937	1.300288e-18
229	ENSG00000132849	ENST00000459752	4.302434e-08
230	ENSG00000132849	ENST00000484562	7.108051e-04
82	ENSG00000143442	ENST00000485040	1.687110e-01
150	ENSG00000085832	ENST00000371733	3.364430e-19
151	ENSG00000085832	ENST00000371727	5.143824e-02
152	ENSG00000085832	ENST00000371730	2.309777e-22

		condition1_rep1	condition1_rep2	condition1_rep3	condition2_rep2	condition2_rep3	condition2_rep1	p_gene	p_transcript
geneID	Transcript_ID
ENSG00000132849	ENST00000459752	3	4	3	1	3	0	0.0	4.302434e-08
	ENST00000484562	3	2	3	5	3	4	0.0	7.108051e-04
	ENST00000484937	7	8	7	51	40	52	0.0	1.300288e-18
	ENST00000613764	0	0	0	2	2	3	0.0	2.894007e-12
	ENST00000646453	0	0	0	9	9	15	0.0	2.798294e-08

Differential Gene Expression Tutorial¶

Expected Duration: 30 minutes

Introduction¶

Getting Started¶

Install additional software¶

Sample Data¶

Using your own data¶

Data entry¶

Running the analysis pipeline¶

Aligning reads to reference sequence¶

Transcript quantification¶

Calculating differential expression¶

Pre-filtering of quantitative data using DRIMSeq¶

edgeR based differential expression analysis¶

Differential transcript usage using DEXSeq¶

StageR stage-wise analysis of DGE and DTU¶

Running the analysis¶

Data analysis¶

Reviewing results from the Salmon analysis¶

Differential Gene Expression (DGE)¶

Differential Transcript Usage¶

StageR analysis¶

Review of candidate genes¶

Headline numbers¶

Summary¶

References¶