Basic QC Tutorial

Expected duration: 15 minutes

The Summary statistics and QC tutorial is intended as a functional guide to help assess the quality characteristics of a single Nanopore sequencing run. It aims to enable an objective assessment of the performance of a Nanopore flowcell run and of the sequence characteristics used to benchmark quality.

Sufficient information is provided in the tutorial that the workflow can be tested, validated and replicated. The tutorial is provided with an example dataset from a barcoded sequence library. The tutorial is intended to address important questions:

  • How many reads (and how many gigabases) were sequenced?
  • What fraction of my sequence collection is good quality?
  • How are longer sequence reads represented in my sample?

Methods used in this tutorial include:

  • Python for statistical analysis and reporting, including use of pandas, numpy, and bokeh,
  • the sequencing_summary.txt file from MinKNOW or Guppy as the data source.

Computational requirements for this tutorial include:

  • Computer running the EPI2ME Labs notebook Server
  • At least 8 GB RAM
  • Runtime with provided example data - approximately 10 minutes

⚠️ Warning: This notebook has been saved with its outputs for demonstration purposes. It is recommended to select Edit > Clear all outputs before using the notebook to analyse your own data.

Introduction¶

This tutorial aims to summarise the data characteristics from an Oxford Nanopore Technologies sequencing run. Observations from basecalled reads and their quality characteristics, temporal performance, and barcoded content are presented. The information presented is derived solely from the sequencing_summary.txt file produced during basecalling with the Guppy software.

The goals from this tutorial include:

  • To introduce a literate framework for analysing base-calling summary statistics to evaluate the relative performance of runs.

  • To provide basic QC metrics so that a review and consideration of experimental data can be undertaken.
  • To provide training as to which QC metrics are of most interest and to encourage an understanding of how different aspects of sequence data quality can be attributed to sample characteristics from DNA isolation and library preparation.

The steps of this tutorial will be:

  • Basic QC analysis - including number of reads, length of reads, yield and quality scores.
  • Performance of a sequencing run through time - including sequencing throughput, speed and fragment length.
  • Sequencing channel activity - a way to visualise nanopore channel productivity.

The sequencing_summary.txt file¶

The sequencing_summary.txt file is automatically produced during base-calling with the Guppy software. This summary file contains rich metadata for each sequence read produced during a run. These data include timestamp, pore duration, read quality, and channel information, in addition to the characteristics of the resulting DNA sequence. The workflow presented here uses this summary file rather than the raw FAST5 format data for performance reasons.

Tools such as pomoxis utilise the fastq files for quality metrics, and other tools make extensive use of the fast5 files. Parsing the fast5 files provides additional analytical context but is much more demanding in terms of compute resource and time. This tutorial is lightweight and is intended to run within a few minutes on a desktop computer.
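If you are unsure which metadata columns a particular summary file contains (they can vary between MinKNOW and Guppy versions), a lightweight check is to parse just the first few rows. This is a minimal sketch assuming the sample file downloaded in the next section; substitute your own filepath as required:

In [ ]:
import pandas as pd

# peek at the first rows only; pandas decompresses .bz2 automatically
preview = pd.read_csv(
    'lambda_sequencing_summary.txt.bz2', sep='\t', nrows=5)
print(preview.columns.tolist())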

Basic QC analysis and results¶

To start analysing our experiment we must first collate our data. The workflow below expects to be given a single sequencing summary file as output by the Oxford Nanopore Technologies' sequencing device.

Before anything else we will create and set a working directory:

In [9]:
from epi2melabs import ping
pinger = ping.Pingu()
pinger.send_notebook_ping('start', 'basic_qc_tutorial')

# create a work directory and move into it
tutorial_name = "basicqc_tutorial"
working_dir = '/epi2melabs/{}/'.format(tutorial_name)
!mkdir -p "$working_dir"
%cd "$working_dir"
/epi2melabs/basicqc_tutorial

Sample Data¶

To start analysing our sequencing_summary.txt we must first "read in" the data. For this tutorial we will download a sample file of a Lambda phage sequencing run.

To download the sample file we run the Linux command wget. To execute the command click on the cell and then press Command/Ctrl-Enter, or click the Play symbol to the left-hand side.

In [ ]:
bucket = "ont-exd-int-s3-euwst1-epi2me-labs"
domain = "s3-eu-west-1.amazonaws.com"
site = "https://{}.{}".format(bucket, domain)

!wget -O lambda_sequencing_summary.txt.bz2 \
    "$site/basicqc_tutorial/lambda_sequencing_summary.txt.bz2"

Using your own data¶

If you wish to analyse your own data rather than the sample data, you can edit the input filepath in the form below. To find the correct full path of a file, navigate to it in the Files browser on the left-hand side, right-click on the file and select Copy path.


The location shared with the EPI2ME Labs server from your computer is shown as /epi2melabs. For example, if the /data folder is shared, a file located at /data/my_gridion_run/fastq_pass on your computer will appear as /epi2melabs/my_gridion_run/fastq_pass.
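Before entering a path into the form below, it can be worth confirming that the notebook server can actually see it. A minimal sketch, using the hypothetical mapped path from the example above:

In [ ]:
import os

# hypothetical example path; substitute the mapped path to your own data
path = '/epi2melabs/my_gridion_run/fastq_pass'
print(path, 'exists' if os.path.exists(path) else 'not found')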

Data entry¶

Having downloaded the sample data, or located your own data in the file browser, we need to provide the filepath as input to the notebook.

The form below reads the sequencing summary file using the pandas library. The form can be used to enter the filename of your sequencing summary file.

If you want simply to plot all the graphs in this tutorial for your dataset, rather than working through the tutorial, select Run after from the Runtime menu above after entering your filepath.

In [1]:
import os
import pandas as pd

import aplanat
import aplanat.report
import aplanat.graphics
from epi2melabs.notebook import InputForm, InputSpec

import ipywidgets as widgets

seq_summary = None
exec_summary = None
report = None


def process_form(inputs):
    summary_file = os.path.abspath(inputs.summary_file)
    global seq_summary, exec_summary, report
    # Read the file, use a tab ('\t') as the separator between columns
    print("Reading input file...")
    seq_summary = pd.read_csv(
        summary_file, delimiter='\t',
        usecols=[
            'channel', 'start_time', 'duration', 'passes_filtering',
            'sequence_length_template', 'mean_qscore_template'])
    # ensure the file is sorted by time and reset its
    # indexing after doing so
    seq_summary.sort_values('start_time', inplace=True)
    seq_summary.reset_index(drop=True, inplace=True)
    print("Done.")

    exec_summary = aplanat.graphics.InfoGraphItems()
    report = aplanat.report.HTMLReport(
        "Basic QC Tutorial", "EPI2ME Labs Summary")
    report.markdown("## Excutive summary", key='exec-head')
    report.plot(None, 'exec-plot')  # placeholder


inputs = InputForm(
    InputSpec('summary_file', 'Sequencing summary', 'lambda_sequencing_summary.txt.bz2'),)
inputs.add_process_button(process_form)
inputs.display()

The variable seq_summary is now a pandas dataframe object. Having read the file we can take a peek at the start of the data using the .head() method of the dataframe object:

In [11]:
# Use the .head() method to check the contents of the dataframe 
seq_summary.head()
Out[11]:
channel start_time duration passes_filtering sequence_length_template mean_qscore_template
0 154 8.71150 10.82900 True 3704 11.429653
1 323 9.33300 10.20775 True 3437 10.364430
2 216 9.34650 10.19425 True 3635 11.148887
3 403 9.41975 10.12100 True 3479 10.644968
4 412 9.54400 9.99675 True 3613 10.794808

Numbers of reads¶

In order to count the number of reads sequenced in the experiment, we can ask the dataframe how many rows it contains:

In [12]:
# Show the total number of reads
print("The total number of reads is:", len(seq_summary))

# Group the rows of the dataframe by the value of the `passes_filtering` column
# then ask the size of each group.
pass_fail = seq_summary.groupby('passes_filtering').size()

print("The number of pass reads is:", pass_fail[True])
print("The number of fail reads is:", pass_fail[False])

exec_summary.append('Total reads', pass_fail.sum(), 'angle-up', '')
The total number of reads is: 975715
The number of pass reads is: 814554
The number of fail reads is: 161161

As a more interesting way of summarising these values we can use the aplanat library to create a stacked bar chart. The aplanat library provides templates to create common plots using the bokeh library:

In [13]:
# Plotting pass and fail reads (click play)
import aplanat
from aplanat import bars

pass_percentage = 100 * pass_fail[True] / len(seq_summary)
values = [pass_percentage, 100 - pass_percentage]
classes = ['Pass', 'Fail']
colors = ['#54B8B1', '#EF4135']

plot = bars.single_hbar(
    values, classes, colors,
    title="Pass / Fail reads")
plot.xaxis.axis_label = 'Percentage of reads'
aplanat.show(plot, background="#f4f4f4")

# add item to report
report.markdown("""
## Pass/fail classification

The run contained {} pass and {} fail reads for a total of {} reads.
""".format(pass_fail[True], pass_fail[False], len(seq_summary)),
    key="pass-fail-head")
report.plot(plot)

Read Lengths and Yield¶

Moving on from counting reads, let us examine the length of reads in this experiment.

We will first calculate the mean read length and the N50 read length:

In [14]:
# Calculating mean and N50 read length

# take the `sequence_length_template` column (the read lengths)
# and sort them in descending order
sorted_lengths = seq_summary.sequence_length_template.sort_values(ascending=False).reset_index(drop=True)
# calculate the cumulative sum of lengths
cumulative_length = sorted_lengths.cumsum()
# find the total number of bases and the mean length
total_bases = cumulative_length.iloc[-1]
mean_length = total_bases / len(seq_summary)
# find the N50 length: the read length such that reads of this
# length or longer contain half the total bases
n50_index = cumulative_length.searchsorted(total_bases / 2)
n50_length = sorted_lengths.iloc[n50_index]

exec_summary.append('Total yield', total_bases, 'signal', 'b')
exec_summary.append('Mean read length', mean_length, 'align-center', 'b')
exec_summary.append('N50 read length', n50_length, 'align-left', 'b')

report.markdown("""
## Read Lengths and yield

The following graphics depict the read-length distribution in a variety
of ways, see the tutorial for additional details.
""",
    key='length-yield-head')
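To make the N50 logic above concrete, here is a toy verification on five reads (values chosen purely for illustration):

In [ ]:
import pandas as pd

# five toy read lengths, already sorted in descending order
toy = pd.Series([10, 8, 5, 4, 3])
csum = toy.cumsum()                          # 10, 18, 23, 27, 30
idx = csum.searchsorted(csum.iloc[-1] / 2)   # first index reaching 15 of 30 bases
print('Toy N50:', toy.iloc[idx])             # 8: reads of >= 8 bases hold half the yield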

These values will be used below to annotate the plots. We can depict the read length distribution in several ways; the simplest is a histogram:

In [15]:
# Plotting the read length distribution (click play)
# calculate counts for a histogram using numpy
from aplanat import hist, annot

datas = [seq_summary.sequence_length_template]
plot = hist.histogram(datas, bins=400, title="Read length distribution", colors=['#0084A9'])
# add vertical lines for mean and N50 read length
annot.marker_vline(plot, mean_length, 'Mean: {:.0f}'.format(mean_length))
annot.marker_vline(plot, n50_length, 'N50: {}'.format(n50_length), text_baseline='top')
# add axis labels
plot.xaxis.axis_label = 'Read Length / bases'
plot.yaxis.axis_label = 'Number of reads'
# show the plot
aplanat.show(plot, background="#f4f4f4")
report.plot(plot, key='length-plain')

The distribution of sequence lengths will be dependent on the protocols that have been used to extract and/or prepare the starting DNA. Sequences from amplicon DNA will have a tight distribution of read lengths, while sequences from genomic DNA will have a broader distribution. The distribution will be further influenced if a size-selection step has been used, and will also be dependent on the choice of sequencing library preparation kits. The read-length distribution should be assessed to see if the distribution is concordant with that expected. For the sample dataset, the graph depicts a spread of read lengths with a strong peak in the distribution at around 48 kbases, the length of the genome under examination.

A second way to visualise the read length distribution is as a mass distribution, or a weighted histogram. This method most similarly portrays the intensities that would be seen in an electrophoresis experiment.

In [16]:
# Plotting read-length mass distribution (click play)
from aplanat import hist, annot

datas = [seq_summary.sequence_length_template]
weights = [seq_summary.sequence_length_template.astype(float)]
plot = hist.histogram(
    datas, weights=weights,
    normalize=True,
    bins=400, title="Read length distribution",
    colors = ["#0084A9"])
# add vertical lines for mean and N50 read length
annot.marker_vline(plot, mean_length, 'Mean: {:.0f}'.format(mean_length))
annot.marker_vline(plot, n50_length, 'N50: {}'.format(n50_length), text_baseline='top')
# add axis labels
plot.xaxis.axis_label = 'Read Length / bases'
plot.yaxis.axis_label = 'Base density'
# show the plot
aplanat.show(plot, background="#f4f4f4")
report.plot(plot, key='length-mass')

*Note that a normalised density has been plotted; plotting simply the total bases per bin is easily misinterpreted.*

The weighted read length histogram above shows the binned distribution of sequence length against the number of nucleotides contained within each bin. This plot will show clear peaks if, for example, amplicons are sequenced or if size selection has been performed.
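The effect of weighting can be seen in miniature with numpy directly; a toy sketch with three reads of lengths 1, 1 and 8:

In [ ]:
import numpy as np

lengths = np.array([1, 1, 8])
bins = [0, 5, 10]
# plain histogram: most *reads* fall in the short bin
counts, _ = np.histogram(lengths, bins=bins)
# weighted histogram: most *bases* fall in the long bin
mass, _ = np.histogram(lengths, bins=bins, weights=lengths)
print(counts, mass)  # [2 1] versus [2 8]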

A final way of viewing this data is to take the cumulative sum of the number of bases and plot it against the read lengths, to obtain a curve expressing the number of bases contained within reads above a given length:

In [17]:
# Plotting cumulative yield by read length (click play)
from aplanat import lines

x = sorted_lengths[0:-1:10000]
y = cumulative_length[0:-1:10000] / 1e9
plot = lines.line([x], [y], xlim=(0, None), ylim=(0, None),
    title='Base yield above given read length',
    colors = ["#0084A9"])
plot.xaxis.axis_label = 'Read Length / bases'
plot.yaxis.axis_label = 'Yield above length / Gbases'
# show the plot
aplanat.show(plot, background="#f4f4f4")
report.plot(plot, 'yield-length')

This graph shows how much of the total yield is contained in reads above any given length; many short fragments can contribute as much to the total yield as a smaller number of longer fragments.

From the graph we can read off the total yield of the experiment at the left-most point of the blue line.
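We can confirm the same value directly from the total_bases variable computed in the N50 cell earlier:

In [ ]:
# total experiment yield, in Gbases
print('Total yield: {:.2f} Gbases'.format(total_bases / 1e9))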

Quality Scores¶

In this section we will review the predicted quality of the sequencing data using the mean per-base quality score of reads written in the sequencing_summary.txt file.

Let's start by simply plotting a histogram of the quality scores:

In [18]:
# Histogram of read quality (click play)

passes = seq_summary['passes_filtering']
datas = [
    seq_summary.loc[passes == status]['mean_qscore_template']
    for status in [True, False]]
plot = hist.histogram(
    datas, colors=['#54B8B1', '#EF4135'],
    names=['Pass', 'Fail'], bins=200, xlim=(1, None),
    title="Read quality score distribution")
plot.xaxis.axis_label = 'Read Quality Score'
plot.yaxis.axis_label = 'Count reads / M'
aplanat.show(plot, background="#f4f4f4")

pass_mean_qscore = seq_summary[seq_summary['passes_filtering'] == True]["mean_qscore_template"].mean()
exec_summary.append('Mean qscore (pass)', pass_mean_qscore, 'thumbs-up', '')

report.markdown("""
## Read quality

Distribution of read quality for pass and fail reads.
""", key='qual-head')
report.plot(plot, key='qual-plot')

The read quality score is calculated as the mean of the per-base quality scores of a read. Each per-base quality score estimates the probability that the base was called in error, expressed on a logarithmic scale: $QV = -10\log_{10}(P_{error})$.
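For intuition, the formula is easily inverted to give the per-base error probability implied by a given score; a quick sketch:

In [ ]:
# convert quality scores to error probabilities: P = 10^(-Q/10)
for qv in (7, 10, 20):
    p_error = 10 ** (-qv / 10)
    print('Q{}: P(error) = {:.3f}'.format(qv, p_error))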

As is evident from the histogram above, a quality score of 7 is the threshold value for determining the Pass or Fail status of reads. The boundary is chosen to be in the tail of the distribution. This QV filter is applied during the base-calling process as a modifiable parameter. For downstream analyses we recommend processing only the higher-quality sequence reads.

Performance through time¶

A key metric in the quality review of a sequencing run is an analysis of its temporal performance. During a run each sequencing channel will address a selection of different pores, and individual pores may become temporarily or permanently blocked. It is therefore expected that sequencing productivity will decrease during a run. It is useful to consider whether the observed productivity decline is normal or whether it happens more rapidly than expected; a rapid decline could be indicative of contaminants within the sequencing library.

Throughput¶

Plotting the number of bases generated per unit time over the course of a sequencing run can reveal unexpected behaviours. In an ideal experiment there should not be any sudden decreases in performance.

In [2]:
def process_form(inputs):
    seq_summary['time'] = (seq_summary['start_time'] / 60 / inputs.discretize).astype(int)
    # group the data by time interval and pass/fail
    groups = seq_summary.groupby(['passes_filtering', 'time'], as_index=False)
    # sum the bases per group
    throughput = groups['sequence_length_template'].agg('sum')

    data = [
        throughput[throughput.passes_filtering == status]
        for status in [True, False]]
    plot = lines.line(
        [d.time / (60 / inputs.discretize) for d in data],
        [d.sequence_length_template / inputs.discretize / 1e6 for d in data],
        names=['Pass', 'Fail'],
        colors=['#54B8B1', '#EF4135'],
        xlim=(0,None), ylim=(0, None),
        title='Throughput',
        x_axis_label = 'time / hr',
        y_axis_label = 'throughput / Mbases per minute')
    aplanat.show(plot, background="#f4f4f4")

    report.markdown("""
    ## Performance through time

    The following charts illustrate read, yield, and flowcell performance through
    the course of the sequencing experiment.
    """, key='time-head')
    report.plot(plot, key='yield-time-plot')


tp_inputs = InputForm(
    InputSpec('discretize', 'Time discretization (min)', (1, 60, 1)), description_width="180px")
tp_inputs.add_process_button(process_form)
tp_inputs.display()

The above plot shows a steady decline in the sequencing rate for Pass reads, with a constant background of Fail reads. The dips in throughput every hour are normal: the sequencing control software MinKNOW periodically reassesses from which pores to record data.

A second way of viewing the sequencing throughput is to plot the cumulative yield over time.

In [19]:
# Plotting base yield through time (click play)

# group the data by pass/fail
groups = seq_summary.set_index(['passes_filtering']).groupby(level=0, as_index=False)
# calculate a cumulative sum of the number of bases sequenced (per group)
sum_tp = groups['sequence_length_template'].cumsum().reset_index()
# attach the start times from the original table
sum_tp['start_time'] = seq_summary['start_time']

data = [
    sum_tp[::1000][sum_tp[::1000].passes_filtering == status]
    for status in [True, False]]
plot = lines.line(
    [d.start_time / 3600 for d in data],
    [d.sequence_length_template / 1e9 for d in data],
    names=['Pass', 'Fail'],
    colors=['#54B8B1', '#EF4135'],
    xlim=(0,None), ylim=(0, None),
    title='Cumulative yield',
    x_axis_label = 'time / hr',
    y_axis_label = 'yield / Gbases')
aplanat.show(plot, background="#f4f4f4")
report.plot(plot, key='yield-time-plot')

Sequencing speed and fragment length through time¶

As well as changes in throughput over time another diagnostic metric to evaluate is the sequencing speed: the number of bases processed per second by the enzymes controlling the movement of DNA through the nanopore channels.

In [20]:
# Plotting Sequencing speed plot (click play)
from bokeh import layouts
hour = (seq_summary.start_time / 3600).astype(int)
speed = seq_summary.sequence_length_template / seq_summary.duration
spd_plot = bars.boxplot_series(
    hour, speed, title='Sequencing speed',
    x_axis_label = 'time / hr',
    y_axis_label = 'speed / (bases / s)')

length = seq_summary.sequence_length_template 
len_plot = bars.boxplot_series(
    hour, length, title='Sequencing Lengths',
    x_axis_label='time / hr',
    y_axis_label='read length / bases')
len_plot.x_range = spd_plot.x_range

p = layouts.column(spd_plot, len_plot)
aplanat.show(p, background="#f4f4f4")
report.plot(p, key='speed-length-boxplots')

Active Channels¶

The graphs above all depict the changing rate of sequencing from the perspective of the volume of data produced. One contribution to the falling sequencing rate is that the number of functional channels falls throughout the experiment.

A channel is defined as being active if one or more sequence reads are observed per time window. It is expected that over the course of the run pores will block and the number of channels producing data will decrease. Changing the pore used by the channel and strategies to unblock pores mean that the number of functional channels may increase or decrease at a given timepoint but generally the number of channels producing data will decrease over time.

In [3]:
def process_form(inputs):
    # calculate a start time for each read rounded to discretize minutes
    seq_summary['time'] = (seq_summary['start_time'] / 60 / inputs.discretize).astype(int)
    # group the data by time interval
    groups = seq_summary.groupby('time', as_index=False)
    # count the unique channels in each interval
    chan_counts = groups['channel'].apply(lambda x: len(x.unique()))
    hours = chan_counts.index / (60 / inputs.discretize)

    plot = lines.line(
        [hours],
        [chan_counts['channel']],
        xlim=(0,None), ylim=(0, None),
        title='Functional channels ({}min period)'.format(inputs.discretize),
        x_axis_label = 'time / hr',
        y_axis_label = 'active channels',
        colors = ["#0084A9"])
    aplanat.show(plot, background="#f4f4f4")
    report.plot(plot, 'active-channels-plot')

ac_inputs = InputForm(
    InputSpec('discretize', 'Time discretization (min)', (1, 60, 1)), description_width="180px")
ac_inputs.add_process_button(process_form)
ac_inputs.display()

Sequencing channel activity plot¶

The nanopores through which DNA is passed, and signal collected, are arrayed as a 2-dimensional matrix. A heatmap can be plotted showing channel productivity against spatial position on the matrix. Such a plot enables the identification of spatial artifacts that could result from membrane damage through e.g. the introduction of an air bubble. This heatmap representation of spatial activity shows only gross spatial aberrations. On the MinION, GridION, and PromethION platforms (but not Flongle) each channel can address four different pores (a "mux"); the activity plot below therefore shows the number of sequences produced per channel, not strictly per pore.

Counting reads and channel map

We will break apart displaying the channel activity into two steps since the code is quite long. First we count reads per channel from our dataset; for this we need to calculate the positions of channels on the flowcell:

In [21]:
# Calculating spatial position of channels on flowcells (click play)
import numpy as np
def minion_array():
    # The array is composed of blocks of 64 channels; the low
    # half is to the right (high to the left), working inwards
    # in 4 x 8 blocks
    def channel_block(i):
        m = np.zeros((4, 16), dtype=int)
        m[4::-1, :7:-1] = np.arange(i, i + 32).reshape(4, 8)
        m[4::-1, 0:8] = np.arange(i + 32, i + 64).reshape(4, 8)
        return m
    # the blocks ascend from high to low, except the
    # lowest block is at the top
    first_channel = list(range(65, 450, 64)) + [1]
    channel_array = np.vstack([channel_block(x) for x in first_channel])
    ca = pd.DataFrame(channel_array).reset_index().melt('index')
    ca.columns = ['row', 'column', 'channel']
    return ca

def flongle_array():
    # channels are in a simple grid except two upper- and
    # lower-most channels on right-hand column are missing
    channel_array = np.concatenate([
        np.arange(1, 13), np.array([0]),
        np.arange(13, 25), np.array([0]),
        np.arange(25, 115), np.array([0]),
        np.arange(115, 127), np.array([0])]).reshape(10, 13)
    ca = pd.DataFrame(channel_array).reset_index().melt('index')
    ca.columns = ['row', 'column', 'channel']
    return ca

def promethion_array():
    # Array is simple blocks of 25*10 channels
    channel_array = np.hstack([
        np.arange(x, x + 250).reshape(25, 10)
        for x in range(1, 2752, 250)])
    ca = pd.DataFrame(channel_array).reset_index().melt('index')
    ca.columns = ['row', 'column', 'channel']
    return ca

def guess_device(max_channel):
    device = "minion"
    if max_channel > 512:
        device = "promethion"
    if max_channel <= 130:
        device = "flongle"
    return device

channel_maps = {
    'minion': minion_array(),
    'flongle': flongle_array(),
    'promethion': promethion_array()}

Using the channel map from the above code cell we can create a table with columns row, column, and reads:

In [22]:
device = guess_device(seq_summary['channel'].max())
channel_map = channel_maps[device]

# count pass reads per channel...
counts = seq_summary[seq_summary['passes_filtering']] \
    .groupby('channel') \
    .size().to_frame('reads') \
    .reset_index()
# ...and merge with the channel map
counts = counts.merge(channel_map, on='channel', how='outer').fillna(0)

Now that we have our counts, we can plot a heat map:

In [23]:
# Plotting activity heat-map plot (click play)
from aplanat import spatial

plot = spatial.heatmap(
    counts['column'], counts['row'], counts['reads'],
    name='# reads')
aplanat.show(plot, background="#f4f4f4")
report.markdown("### Active channels", key='active-channels-head')
report.plot(plot, key='active-channels-heat')

Headline numbers¶

As a final summary of the experiment the infographic below displays a small set of headline numbers captured through the course of the analysis:

In [24]:
# Executive summary (click play)
pinger.send_notebook_ping('stop', 'basic_qc_tutorial')

exec_plot = aplanat.graphics.infographic(exec_summary.values(), ncols=3)
aplanat.show(exec_plot, background="#f4f4f4")
report.plot(exec_plot, key='exec-plot')

fname = os.path.join(os.path.dirname(inputs.summary_file), 'report.html')
report.write(fname)
print("Report written to: {}.".format(fname))
Report written to: report.html.

Summary¶

This tutorial has stepped through basic QC analysis of a Nanopore sequencing run. The sequencing throughput and read lengths have been explored, as has the progression of the experiment through time.

The analysis presented can be run on any sequencing_summary.txt from MinKNOW or Guppy. The code will run within the EPI2ME Labs notebook server environment.