#
# atactk: ATAC-seq toolkit
#
# Copyright 2015 The Parker Lab at the University of Michigan
#
# Licensed under Version 3 of the GPL or any later version
#
"""
Code for reading and manipulating data commonly used in ATAC-seq pipelines.
"""
from __future__ import print_function
import csv
import gzip
import pysam
import sys
FEATURE_FIELDNAMES = [
'reference',
'start',
'end',
'name',
'score',
'strand',
]
NUCLEOTIDE_COMPLEMENTS = {
"A": "T",
"C": "G",
"G": "C",
"N": "N",
"T": "A",
"a": "t",
"c": "g",
"g": "c",
"n": "n",
"t": "a",
}
[docs]class ExtendedFeature(object):
"""
A feature plus a fixed extended region.
You can define the region by passing the `extension` parameter to the constructor, e.g.::
feature = ExtendedFeature(extension=100, **bed_record)
Most of :class:`ExtendedFeature`'s attributes map to the first six
fields in a BED file. Where our names for the fields differ, the
BED format name from https://genome.ucsc.edu/FAQ/FAQformat.html is
included in parentheses below.
Attributes
----------
reference: str
The reference sequence on which the feature is located. (``chrom``)
feature_start: int
The starting position of the feature in the reference sequence, zero-based. (``chromStart``)
feature_end: int
The ending position of the feature in the reference sequence, which is one past the last base in the feature. (``chromEnd``)
name: str
The name of the feature.
score: float
A numeric score.
strand: str
Either ``+`` or ``-``.
"""
def __init__(self, reference=None, start=None, end=None, name=None, score=0, strand=None, extension=100):
# required BED fields
self.reference = reference
self.feature_start = int(start)
self.feature_end = int(end)
# optional BED fields
self.name = name
self.score = float(score)
self.strand = strand
# region adjustments
self.extension = int(extension)
self.is_reverse = strand == '-'
self.region_start = self.feature_start - self.extension
self.region_end = self.feature_end + self.extension
def __str__(self):
return '\t'.join(str(attribute or '') for attribute in [
self.reference,
self.feature_start,
self.feature_end,
self.name,
self.score,
self.strand,
self.extension,
])
@property
def feature_length(self):
return self.feature_end - self.feature_start
@property
def region_length(self):
return self.region_end - self.region_start
[docs]def complement(seq):
"""
Return the complement of the supplied nucleic sequence.
Nucleic of course implies that the only recognized bases are A, C,
G, T and N. Case will be preserved.
Parameters
----------
seq: str
A nucleic sequence.
Returns
-------
str
The complement of the given sequence.
"""
return ''.join(NUCLEOTIDE_COMPLEMENTS[base] for base in seq)
[docs]def reverse_complement(seq):
"""
Return the reverse complement of the supplied nucleic sequence.
Parameters
----------
seq: str
A nucleic sequence.
Returns
-------
str
The reverse complement of the given sequence.
See also
--------
:func:`~atactk.data.complement`
"""
return complement(reversed(seq))
[docs]def open_maybe_gzipped(filename):
"""
Open a possibly gzipped file.
Parameters
----------
filename: str
The name of the file to open.
Returns
-------
file
An open file object.
"""
with open(filename, 'rb') as test_read:
byte1, byte2 = ord(test_read.read(1)), ord(test_read.read(1))
if byte1 == 0x1f and byte2 == 0x8b:
f = gzip.open(filename, mode='rt')
else:
f = open(filename, 'rt')
return f
[docs]def count_features(filename):
count = 0
for line in open_maybe_gzipped(filename):
count += 1
return count
[docs]def read_features(filename, extension=100, feature_class=ExtendedFeature):
"""
Return a generator of :class:`ExtendedFeature` instances from the named tab-separated value file.
Most BED-like files should work; we read the three required and
first three optional BED fields to get coordinates, and any extra
fields are ignored.
Parameters
----------
filename: str
The (optionally gzipped) tab-separated value file from which to read features. Use '-' to read from standard input.
extension: int
The number of bases to score on either side of each feature.
feature_class: class
Each row of the file will be instantiated with this class.
Yields
------
feature
An :class:`ExtendedFeature` instance for each row of the file.
"""
if filename == '-':
source = sys.stdin
else:
source = open_maybe_gzipped(filename)
reader = csv.DictReader(source, fieldnames=FEATURE_FIELDNAMES, restkey='extra_fields', dialect='excel-tab')
for row in reader:
del row['extra_fields']
yield feature_class(extension=extension, **row)
ALIGNMENT_FILE_CACHE = {}
[docs]def open_alignment_file(alignment_filename):
if alignment_filename in ALIGNMENT_FILE_CACHE:
return ALIGNMENT_FILE_CACHE[alignment_filename]
alignment_file = pysam.AlignmentFile(alignment_filename, 'rb')
try:
alignment_file.check_index()
except AttributeError:
raise AttributeError('The alignments file {} is not in BAM format. Please supply an indexed BAM file.'.format(alignment_filename))
except ValueError:
raise ValueError('The alignment file {} is not usable. Please supply an indexed BAM file.'.format(alignment_filename))
ALIGNMENT_FILE_CACHE[alignment_filename] = alignment_file
return alignment_file
[docs]def filter_aligned_segments(aligned_segments, include_flags, exclude_flags, quality):
"""
Filter aligned segments using SAM flags and mapping quality.
Parameters
----------
aligned_segments: list
Aligned reads to filter.
include_flags: list
Reads matching any include flag will be returned.
exclude_flags: list
Reads matching any exclude flag will not be returned.
quality: int
Only reads with at least this mapping quality will be returned.
Returns
-------
filtered_aligned_segments: list
The set of the aligned segments supplied to the function which
meet the specified criteria.
Examples
--------
You probably want `include_flags` of [83, 99, 147, 163] and
`exclude_flags` of [4, 8].
Flag 4 means the read is unmapped, 8 means the mate is unmapped.
Properly paired and mapped forward aligned segments have flags in [99, 163]
99:
- 1: read paired
- 2: read mapped in proper pair
- 32: mate reverse strand
- 64: first in pair
163:
- 1: read paired
- 2: read mapped in proper pair
- 32: mate reverse strand
- 128: second in pair
Properly paired and mapped reverse aligned segments have flags in [83, 147].
83:
- 1: read paired
- 2: read mapped in proper pair
- 16: read reverse strand
- 64: first in pair
147:
- 1: read paired
- 2: read mapped in proper pair
- 16: read reverse strand
- 128: second in pair
"""
filtered_aligned_segments = [a for a in aligned_segments if all([
a.mapping_quality >= quality,
any(map(lambda f: (a.flag & f) == f, include_flags)),
all(map(lambda f: (a.flag & f) == 0, exclude_flags))
])]
return filtered_aligned_segments
[docs]def make_fastq_pair_reader(fastq_file1, fastq_file2):
"""
Return a generator producing pairs of records from two FASTQ files.
The intent is to produce read pairs from paired-end sequence data.
Parameters
----------
fastq_file1: str
The name of the first FASTQ file.
fastq_file2: str
The name of the second FASTQ file.
Yields
------
tuple
A tuple containing two 4-element lists, one for each FASTQ
record, representing the ID, sequence, comment, and quality lines.
"""
f1 = open_maybe_gzipped(fastq_file1)
f2 = open_maybe_gzipped(fastq_file2)
while True:
yield (
[
next(f1).strip(), # name
next(f1).strip(), # sequence
next(f1).strip(), # comment ('+' line)
next(f1).strip() # quality
],
[
next(f2).strip(), # name
next(f2).strip(), # sequence
next(f2).strip(), # comment ('+' line)
next(f2).strip() # quality
],
)