Source code for atactk.metrics

#
# atactk: ATAC-seq toolkit
#
# Copyright 2015 The Parker Lab at the University of Michigan
#
# Licensed under Version 3 of the GPL or any later version
#


"""
Code for making quantitative observations about ATAC-seq experiments.
"""

import collections
import functools

import atactk.data
import atactk.util


[docs]def reduce_scores(scores, resolution): """ Reduce a sequence of scores by summing every `resolution` values. Called with `scores` of [0, 1, 1, 4, 2], you'd get the following results at various resolutions: ========== ====== Resolution Result ========== ====== 1 [0, 1, 1, 4, 2] 2 [1, 5, 2] 3 [2, 6] 4 [6, 2] 10 [8] ========== ====== """ if resolution == 1: return scores return [sum(chunk) for chunk in atactk.util.partition(resolution, scores)]
[docs]def aggregate_scores(scores, extension, resolution): """ Adjust scores in the extended region around a feature. Given a sequence containing the score at each base in a region, the size of the extended region around the feature, and the desired resolution in that extended region, reduce the extended scores. Parameters ---------- scores: list A list containing a score for each base in a region around a feature. extension: int The number of bases at the beginning and end of the list considered the extended region. resolution: int The desired scoring resolution in the extended region. See Also -------- reduce_scores: Reduce scores by summing every `resolution` values. """ return ( reduce_scores(scores[:extension], resolution) + scores[extension:-extension] + reduce_scores(scores[-extension:], resolution) )
[docs]def find_cut_point(aligned_segment, cut_point_offset=4): """Return the position of the given aligned segment's ATAC-seq cut point. Parameters ---------- aligned_segment: :class:`pysam.AlignedSegment` https://pysam.readthedocs.org/en/latest/api.html#pysam.AlignedSegment Returns ------- int Position of the ATAC-seq cut point. """ if aligned_segment.is_reverse: # the cut point is the reference_end minus (cut_point_offset + 1) # (pysam's reference_end is one past the last aligned residue) cut_point = aligned_segment.reference_end - (cut_point_offset + 1) else: cut_point = aligned_segment.reference_start + cut_point_offset # start of the read plus offset return cut_point
[docs]def count_cut_points(aligned_segments, start, end, cut_point_offset=4): """ Return any cut points in the region from the aligned segments. Parameters ---------- aligned_segments: list A list of :class:`pysam.AlignedSegment`. start: int The start of the region of interest. end: int The end of the region of interest. Returns ------- list A list of counts, one for each position from `start` to `end`, of cut points in the aligned segments that fell between the `start` and `end`.. """ cut_points_in_region = [] for segment in aligned_segments: cut_point = find_cut_point(segment, cut_point_offset) if start <= cut_point < end: cut_points_in_region.append(cut_point) # initialize the region with zero counts counts = {p: 0 for p in range(start, end)} # add the cut points counts.update(collections.Counter(cut_points_in_region)) cut_point_counts = [counts[position] for position in sorted(counts.keys())] return cut_point_counts
[docs]def add_cut_points_to_region_tree(region_tree, group_key, strand, cut_points): """ Record cut points by position, template size, and strand. The tree consists of nested dictionaries. The first level keys are positions in a region. The second level keys are the names of template size groups. The third level keys are the strand, and the values are the cut point counts for that position, template size group, and strand. Parameters ---------- region_tree: dict The tree to which the cut point counts will be added. group_key: str The key corresponding to the template size group. strand: str The strand of the cut point's aligned segment: ``+`` or ``-``. cut_points: list The count of cut points at each position in the region that matched the template size group and strand. Notes ----- Only positive counts are recorded. It takes a lot of time and space to record so many zeroes, and it's better to produce them on demand via :class:`collections.defaultdict`. So instead, collect all the scores, and after that work is done, update a tree built with :class:`collections.defaultdict`, then work with that. See the ``make_cut_matrix`` script included with ``atactk`` for an example. """ for position, count in enumerate(cut_points, 0 - (len(cut_points) // 2)): if count > 0: if position not in region_tree: region_tree[position] = {} if group_key not in region_tree[position]: region_tree[position][group_key] = {} if strand not in region_tree[position][group_key]: region_tree[position][group_key][strand] = count else: region_tree[position][group_key][strand] += count
[docs]def score_feature(alignment_filename, bin_groups, include_flags, exclude_flags, quality, cut_point_offset, feature): """ Count the number of transposition events around the given feature. Parameters ---------- alignment_filename: str The BAM file containing aligned reads. bin_groups: iterable A sequence of iterables containing bins and the resolution with which they should be scored. include_flags: iterable The SAM flags to use when selecting aligned segments to score. exclude_flags: iterable The SAM flags to use when excluding aligned segments to score; any flag present on a read excludes it. quality: int The minimum mapping quality a read must have to be scored. feature: ExtendedFeature The feature to score. Returns ------- tuple A tuple of `(row, tree)` where * `row` is a tab-separated list of scores in the region around the feature * `tree` is a three-level dict holding a score for each position, in each of the template size bins given, on each strand, e.g.:: >>> tree[0]['36_149']['F'] 22 >>> tree[0]['36_149']['R'] 15 See Also -------- add_cut_points_to_region_tree: Where the tree for the aggregate matrix is described more fully. """ alignment_file = atactk.data.open_alignment_file(alignment_filename) aligned_segments = alignment_file.fetch(feature.reference, max(0, feature.region_start), feature.region_end) aligned_segments = atactk.data.filter_aligned_segments(aligned_segments, include_flags, exclude_flags, quality) row = [] tree = {} for group in bin_groups: group_rows = [] group_key = ','.join('%s_%s' % (bin[0], bin[1]) for bin in group) for (minimum_length, maximum_length, resolution) in group: bin_scores = [] aligned_segments_in_bin = [a for a in aligned_segments if minimum_length <= abs(a.isize) <= maximum_length] forward_aligned_segments = [a for a in aligned_segments_in_bin if not a.is_reverse] reverse_aligned_segments = [a for a in aligned_segments_in_bin if a.is_reverse] forward_cut_points = count_cut_points(forward_aligned_segments, feature.region_start, feature.region_end, cut_point_offset) reverse_cut_points = count_cut_points(reverse_aligned_segments, feature.region_start, feature.region_end, cut_point_offset) if feature.is_reverse: # need to orient the cut point positions to the motif in the matrix forward_cut_points, reverse_cut_points = list(reversed(reverse_cut_points)), list(reversed(forward_cut_points)) # for the discrete matrix: scores for each feature bin_scores.extend(aggregate_scores(forward_cut_points, feature.extension, resolution)) bin_scores.extend(aggregate_scores(reverse_cut_points, feature.extension, resolution)) group_rows.append(bin_scores) # for the aggregate matrix: scores for the entire region add_cut_points_to_region_tree(tree, group_key, 'F', forward_cut_points) add_cut_points_to_region_tree(tree, group_key, 'R', reverse_cut_points) if len(group) == 1: row.extend(group_rows[0]) else: row.extend(functools.reduce(atactk.util.add_lists, group_rows)) row = '\t'.join(str(score) for score in row) return feature, row, tree