Introduction to Fastq files

The fastq format is a plain-text data format in which each sequence record (usually) spans four lines, pairing a sequence with its corresponding quality score values. There are different ways of encoding quality in a .fastq file; however, files from ONT sequencing devices use Sanger Phred scores. A sequence record is made up of 4 lines:

line 1: Sequence ID and sequence description
line 2: Sequence line, e.g. ATCGs
line 3: Plus symbol (optionally followed by a repeat of the description)
line 4: Quality values for the sequence line

IMPORTANT: Lines 2 and 4 must have the same length or the sequence record is not valid.

For example, a sample record looks like:

@sequence_id sequence_description
ATCG
+
!^%%

The sequence ID must not contain any spaces. Anything after the first space in the sequence ID line will be considered the "description".

A .fastq file may contain multiple records. The default number of records in a fastq file generated during a nanopore run is 4000 reads (16000 lines).
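
Each character in the quality line encodes the Phred quality score of the corresponding base: in the Sanger encoding used by ONT devices, the score is the character's ASCII code minus 33, and a score Q corresponds to an error probability of 10^(-Q/10). The cell below is a minimal sketch of this decoding applied to the quality line of the example record above (the values are purely illustrative):

In [ ]:
# decode the quality line of the example record above
quality_string = "!^%%"

# Sanger (Phred+33) encoding: ASCII code minus 33 gives the Phred score
phred_scores = [ord(char) - 33 for char in quality_string]
print(phred_scores)  # [0, 61, 4, 4]

# each Phred score Q corresponds to an error probability of 10^(-Q/10)
error_probs = [10 ** (-q / 10) for q in phred_scores]
print(error_probs)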

Useful snippets

The following snippets demonstrate common tasks you might want to perform on a single .fastq file or a set of such files. For many tasks we recommend the excellent seqkit program.

Before anything else we will create and set a working directory:

In [ ]:
# create a work directory and move into it
directory = "fastq_tutorial"
working_dir='/epi2melabs/{}/'.format(directory)
!mkdir -p "$working_dir"
%cd "$working_dir"

from epi2melabs import ping
pinger = ping.Pingu()
pinger.send_notebook_ping('start', 'fastq_introduction')

# Download sample data
bucket = "ont-exd-int-s3-euwst1-epi2me-labs"
domain = "s3-eu-west-1.amazonaws.com"
site = "https://{}.{}".format(bucket, domain)

!wget "$site/fast_introduction/archive.tar.gz"
!tar -xzvf archive.tar.gz
%cd test0

The snippets all have their code to the left-hand side and a form to the right which can be used to change their inputs (as an alternative to directly editing the code).

How many records in my .fastq file?

To count the number of records in a .fastq file we can use the Linux word count command (wc) to count the number of lines in the file, dividing by four to account for the four lines per record:

In [ ]:
filename = "example3.fastq"
!echo $(( $(wc -l < $filename) / 4 )) reads
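
As an alternative, the seqkit program (which we use elsewhere in this tutorial) can summarise a file directly. The sketch below assumes seqkit is installed; seqkit stats reports the number of sequences along with basic length statistics (the exact columns may differ between seqkit versions):

In [ ]:
filename = "example3.fastq"

# summarise the file: number of sequences plus basic length statistics
!seqkit stats "$filename"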

List all the fastqs in a directory

As Oxford Nanopore Technologies' sequencing devices output multiple .fastq files during the course of an experiment, it can be useful to find and list all such files. We can do this with the Linux find command:

In [ ]:
directory = "."

!find $directory -name "*.fastq"

The default directory value here (.) means "the current working directory."
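
Note that sequencing devices may also write gzip-compressed files with a .fastq.gz extension. A small variation on the command above (a sketch, not part of the original workflow) will list both compressed and uncompressed files:

In [ ]:
directory = "."

# match both plain and gzip-compressed fastq files
!find $directory -name "*.fastq" -o -name "*.fastq.gz"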

Concatenate all fastqs in a directory into a single file

Many bioinformatics programs require all sequence data to be present in a single .fastq file. In order to process sequences spread across multiple files we must therefore concatenate (or "cat") all the .fastq files into a single consolidated file. To perform this task we can use a combination of the Linux find, xargs, and cat commands:

In [ ]:
directory = "."
output_fastq = "all_records.fastq"

!find $directory -type f \( -iname "*.fastq" ! -iname "$output_fastq" \) | \
    xargs cat > "$output_fastq"
!echo $(( $(wc -l < $output_fastq) / 4 )) reads

Again the default directory value here (.) means "the current working directory."

You may often see a simple form of the above:

cat *.fastq > output.fastq

however, this command can fail with an "Argument list too long" error when the number of .fastq files is very large, because the shell expands every filename onto a single command line; the find/xargs construction above avoids this by batching the filenames.

Remove all duplicates in a fastq

It can sometimes be the case that, for whatever reason, a .fastq file contains duplicates of the same read. To remove these we can use the rmdup command of the seqkit program:

In [ ]:
input_fastq = "all_records.fastq"
output_fastq = "deduplicated.fastq"

!seqkit rmdup "$input_fastq" -o "$output_fastq"

For the example data, 200 duplicate records are identified because the three files (containing 100 records each) are in fact copies of the same file.
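
By default seqkit rmdup identifies duplicates by their record ID. If identical sequences might be present under different IDs, the -s (--by-seq) option compares the sequence content instead; the output filename below is illustrative:

In [ ]:
input_fastq = "all_records.fastq"
output_fastq = "deduplicated_by_seq.fastq"

# compare sequence content rather than record IDs when searching for duplicates
!seqkit rmdup -s "$input_fastq" -o "$output_fastq"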

Compress or extract a fastq file

We can save hard disk space on our computer by compressing .fastq files. To do this we recommend using bgzip, which allows for indexing and fast retrieval of sequences by bioinformatics programs:

In [ ]:
input_fastq = "example3.fastq"
compressed_fastq = "example3.fastq.gz"

!ls -lh "$input_fastq"
!bgzip "$input_fastq"
!ls -lh "$compressed_fastq"

The size of the compressed file is roughly half that of the original. To decompress the compressed file, we again use bgzip:

In [ ]:
compressed_fastq = "example3.fastq.gz"

!bgzip -d "$compressed_fastq"
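
Note that a compressed file does not need to be decompressed on disk simply to count its reads: it can be streamed through zcat (or gunzip -c) and counted as before. The sketch below assumes a compressed file is present:

In [ ]:
compressed_fastq = "example3.fastq.gz"

# stream the compressed file and count its lines, dividing by four per record
!echo $(( $(zcat "$compressed_fastq" | wc -l) / 4 )) reads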

Compress a directory structure

In order to compress a directory structure we can use the Linux tar command with the compression option:

In [ ]:
directory = "pass"
archive = "archive.tar.gz"

# the options here mean: create, gzip compress, verbose, output file
!tar -czvf "$archive" "$directory"

When compressing directories and their contents in this way it is good practice to compress a single top-level directory, so that when the archive is decompressed a single top-level directory is retrieved (and the user's working directory isn't polluted).

To decompress the archive we use a similar command:

In [ ]:
archive = "archive.tar.gz"

# A temporary folder (tmp) is created here simply to avoid confusion with the
# original directory compressed in the previous example. This is not necessary
# in practice.

# the options here mean: extract, gzip compressed, verbose, input file
!rm -rf tmp && mkdir tmp && cd tmp && \
    tar -xzvf ../"$archive"
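
Before extracting an archive it can also be useful to check its contents; tar's -t option lists them without writing any files:

In [ ]:
archive = "archive.tar.gz"

# the options here mean: list contents, gzip compressed, verbose, input file
!tar -tzvf "$archive"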

Visualizing fastq

The snippets below demonstrate basic parsing of fastq data in Python. We do not recommend using this code in practice as much of the information is more readily available in the sequencing_summary.txt file produced by Oxford Nanopore Technologies' sequencing devices. See our Basic QC Tutorial for more examples.

In [ ]:
# plotting basic summary graphs
pinger.send_notebook_ping('stop', 'fastq_introduction')

import numpy as np
from pysam import FastxFile
from bokeh.layouts import gridplot

qualities = list()
mean_qualities = list()
lengths = list()

# open the file and iterate through its records
with FastxFile("all_records.fastq") as fq:
    for rec in fq:
        # ONT calculation for "mean Q score"
        quals = np.fromiter(
            (ord(x) - 33 for x in rec.quality),
            dtype=int, count=len(rec.quality))
        mean_p = np.mean(np.power(10, quals/-10))
        mean_qualities.append(-10*np.log10(mean_p))
        # all qualities
        qualities.extend(quals)
        lengths.append(len(quals))

# use the aplanat library to plot some graphs of the
# collected statistics
import aplanat
from aplanat.hist import histogram

p1 = histogram(
    [np.array(mean_qualities)], title="Read quality scores",
    x_axis_label="quality", y_axis_label="count",
    height=250, width=300)
p2 = histogram(
    [qualities], title="Base quality scores",
    x_axis_label="quality", y_axis_label="count",
    height=250, width=300)
p3 = histogram(
    [lengths], title="Read lengths",
    x_axis_label="read length / bases", y_axis_label="count",
    height=250, width=300)
aplanat.show(gridplot((p1, p2, p3), ncols=3), background="#f4f4f4")