Source code for atactk.data

#
# atactk: ATAC-seq toolkit
#
# Copyright 2015 The Parker Lab at the University of Michigan
#
# Licensed under Version 3 of the GPL or any later version
#


"""
Code for reading and manipulating data commonly used in ATAC-seq pipelines.
"""

from __future__ import print_function

import csv
import gzip
import pysam
import sys


FEATURE_FIELDNAMES = [
    'reference',
    'start',
    'end',
    'name',
    'score',
    'strand',
]

NUCLEOTIDE_COMPLEMENTS = {
    "A": "T",
    "C": "G",
    "G": "C",
    "N": "N",
    "T": "A",
    "a": "t",
    "c": "g",
    "g": "c",
    "n": "n",
    "t": "a",
}


[docs]class ExtendedFeature(object):
    """
    A feature plus a fixed extended region.

    You can define the region by passing the `extension` parameter to the constructor, e.g.::

        feature = ExtendedFeature(extension=100, **bed_record)

    Most of :class:`ExtendedFeature`'s attributes map to the first six
    fields in a BED file. Where our names for the fields differ, the
    BED format name from https://genome.ucsc.edu/FAQ/FAQformat.html is
    included in parentheses below.

    Attributes
    ----------

    reference: str
        The reference sequence on which the feature is located. (``chrom``)
    feature_start: int
        The starting position of the feature in the reference sequence, zero-based. (``chromStart``)
    feature_end: int
        The ending position of the feature in the reference sequence, which is one past the last base in the feature. (``chromEnd``)
    name: str
        The name of the feature.
    score: float
        A numeric score.
    strand: str
        Either ``+`` or ``-``.
    """

    def __init__(self, reference=None, start=None, end=None, name=None, score=0, strand=None, extension=100):

        # required BED fields
        self.reference = reference
        self.feature_start = int(start)
        self.feature_end = int(end)

        # optional BED fields
        self.name = name
        self.score = float(score)
        self.strand = strand

        # region adjustments
        self.extension = int(extension)
        self.is_reverse = strand == '-'
        self.region_start = self.feature_start - self.extension
        self.region_end = self.feature_end + self.extension

    def __str__(self):
        return '\t'.join(str(attribute or '') for attribute in [
            self.reference,
            self.feature_start,
            self.feature_end,
            self.name,
            self.score,
            self.strand,
            self.extension,
        ])

    @property
    def feature_length(self):
        return self.feature_end - self.feature_start

    @property
    def region_length(self):
        return self.region_end - self.region_start

[docs]def complement(seq):
    """
    Return the complement of the supplied nucleic sequence.

    Nucleic of course implies that the only recognized bases are A, C,
    G, T and N. Case will be preserved.

    Parameters
    ----------
    seq: str
        A nucleic sequence.

    Returns
    -------
    str
        The complement of the given sequence.
    """
    return ''.join(NUCLEOTIDE_COMPLEMENTS[base] for base in seq)


[docs]def reverse_complement(seq):
    """
    Return the reverse complement of the supplied nucleic sequence.

    Parameters
    ----------
    seq: str
        A nucleic sequence.

    Returns
    -------
    str
        The reverse complement of the given sequence.

    See also
    --------
    :func:`~atactk.data.complement`
    """
    return complement(reversed(seq))


[docs]def open_maybe_gzipped(filename):
    """
    Open a possibly gzipped file.

    Parameters
    ----------
    filename: str
        The name of the file to open.

    Returns
    -------
    file
        An open file object.
    """
    with open(filename, 'rb') as test_read:
        byte1, byte2 = ord(test_read.read(1)), ord(test_read.read(1))
        if byte1 == 0x1f and byte2 == 0x8b:
            f = gzip.open(filename, mode='rt')
        else:
            f = open(filename, 'rt')
    return f


[docs]def count_features(filename):
    count = 0
    for line in open_maybe_gzipped(filename):
        count += 1
    return count


[docs]def read_features(filename, extension=100, feature_class=ExtendedFeature):
    """
    Return a generator of :class:`ExtendedFeature` instances from the named tab-separated value file.

    Most BED-like files should work; we read the three required and
    first three optional BED fields to get coordinates, and any extra
    fields are ignored.

    Parameters
    ----------
    filename: str
        The (optionally gzipped) tab-separated value file from which to read features. Use '-' to read from standard input.
    extension: int
        The number of bases to score on either side of each feature.
    feature_class: class
        Each row of the file will be instantiated with this class.

    Yields
    ------
    feature
        An :class:`ExtendedFeature` instance for each row of the file.
    """

    if filename == '-':
        source = sys.stdin
    else:
        source = open_maybe_gzipped(filename)
    reader = csv.DictReader(source, fieldnames=FEATURE_FIELDNAMES, restkey='extra_fields', dialect='excel-tab')
    for row in reader:
        del row['extra_fields']
        yield feature_class(extension=extension, **row)


ALIGNMENT_FILE_CACHE = {}


[docs]def open_alignment_file(alignment_filename):
    if alignment_filename in ALIGNMENT_FILE_CACHE:
        return ALIGNMENT_FILE_CACHE[alignment_filename]

    alignment_file = pysam.AlignmentFile(alignment_filename, 'rb')
    try:
        alignment_file.check_index()
    except AttributeError:
        raise AttributeError('The alignments file {} is not in BAM format. Please supply an indexed BAM file.'.format(alignment_filename))
    except ValueError:
        raise ValueError('The alignment file {} is not usable. Please supply an indexed BAM file.'.format(alignment_filename))
    ALIGNMENT_FILE_CACHE[alignment_filename] = alignment_file
    return alignment_file


[docs]def filter_aligned_segments(aligned_segments, include_flags, exclude_flags, quality):
    """
    Filter aligned segments using SAM flags and mapping quality.

    Parameters
    ----------
    aligned_segments: list
        Aligned reads to filter.
    include_flags: list
        Reads matching any include flag will be returned.
    exclude_flags: list
        Reads matching any exclude flag will not be returned.
    quality: int
        Only reads with at least this mapping quality will be returned.

    Returns
    -------
    filtered_aligned_segments: list
        The set of the aligned segments supplied to the function which
        meet the specified criteria.

    Examples
    --------

    You probably want `include_flags` of [83, 99, 147, 163] and
    `exclude_flags` of [4, 8].

    Flag 4 means the read is unmapped, 8 means the mate is unmapped.

    Properly paired and mapped forward aligned segments have flags in [99, 163]

    99:
       -   1: read paired
       -   2: read mapped in proper pair
       -  32: mate reverse strand
       -  64: first in pair

    163:
       -   1: read paired
       -   2: read mapped in proper pair
       -  32: mate reverse strand
       - 128: second in pair

    Properly paired and mapped reverse aligned segments have flags in [83, 147].

    83:
       -   1: read paired
       -   2: read mapped in proper pair
       -  16: read reverse strand
       -  64: first in pair

    147:
       -   1: read paired
       -   2: read mapped in proper pair
       -  16: read reverse strand
       - 128: second in pair
    """

    filtered_aligned_segments = [a for a in aligned_segments if all([
        a.mapping_quality >= quality,
        any(map(lambda f: (a.flag & f) == f, include_flags)),
        all(map(lambda f: (a.flag & f) == 0, exclude_flags))
    ])]
    return filtered_aligned_segments


[docs]def make_fastq_pair_reader(fastq_file1, fastq_file2):
    """
    Return a generator producing pairs of records from two FASTQ files.

    The intent is to produce read pairs from paired-end sequence data.

    Parameters
    ----------
    fastq_file1: str
        The name of the first FASTQ file.

    fastq_file2: str
        The name of the second FASTQ file.

    Yields
    ------
    tuple
        A tuple containing two 4-element lists, one for each FASTQ
        record, representing the ID, sequence, comment, and quality lines.
    """

    f1 = open_maybe_gzipped(fastq_file1)
    f2 = open_maybe_gzipped(fastq_file2)
    while True:
        yield (
            [
                next(f1).strip(),  # name
                next(f1).strip(),  # sequence
                next(f1).strip(),  # comment ('+' line)
                next(f1).strip()   # quality
            ],
            [
                next(f2).strip(),  # name
                next(f2).strip(),  # sequence
                next(f2).strip(),  # comment ('+' line)
                next(f2).strip()   # quality
            ],
        )