Source code for wub.bam.stats

# -*- coding: utf-8 -*-

import six
from six.moves import reduce
import sys
from collections import defaultdict, OrderedDict
import tqdm
import numpy as np

from wub.bam import common as bam_common
from wub.util import seq as seq_util


def _frag_dict_to_array(fd, chrom_lengths):
    """ Convert fragment coverage dictionary into a numpy array. """
    res = {}
    for ref, frags in six.iteritems(fd):
        cov = np.zeros((chrom_lengths[ref],), dtype=int)
        for pos, count in six.iteritems(frags):
            cov[pos] = count
        res[ref] = cov
    return res
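
# A minimal sketch of what _frag_dict_to_array produces (illustrative values,
# not from a real BAM): a per-position count dict becomes a dense coverage vector.
#
#   >>> _frag_dict_to_array({'chr1': {0: 2, 3: 1}}, {'chr1': 5})
#   {'chr1': array([2, 0, 0, 1, 0])}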


def frag_coverage(bam, chrom_lengths, region=None, min_aqual=0, ref_cov=True, verbose=True):
    """ Calculate fragment coverage vectors on the forward and reverse strands.

    :param bam: Input bam file.
    :param chrom_lengths: Dictionary of chromosome names and lengths.
    :param region: Restrict parsing to the specified region.
    :param min_aqual: Minimum mapping quality.
    :param ref_cov: Record relative reference coverage for each aligned read.
    :param verbose: Display progress bar.
    :returns: Forward and reverse fragment coverage vectors.
    :rtype: dict
    """
    frags_fwd = defaultdict(lambda: defaultdict(int))
    frags_rev = defaultdict(lambda: defaultdict(int))
    aln_ref_cov = defaultdict(list)

    bam_reader = bam_common.pysam_open(bam, in_format='BAM')
    ue = True
    if region is not None:
        ue = False
    bam_iter = bam_reader.fetch(region=region, until_eof=ue)

    try:
        total_reads = bam_reader.mapped + bam_reader.unmapped
    except Exception:
        total_reads = None

    if verbose and region is None:
        sys.stdout.write(
            "Gathering fragment statistics from file: {}\n".format(bam))
        bam_iter = tqdm.tqdm(bam_iter, total=total_reads)

    for r in bam_iter:
        # Skip unmapped reads:
        if r.is_unmapped:
            continue
        # Skip if mapping quality is too low:
        if r.mapq < min_aqual:
            continue
        pos = r.reference_start
        ref = r.reference_name
        if r.is_reverse:
            frags_rev[ref][pos] += 1
        else:
            frags_fwd[ref][pos] += 1
        if ref_cov:
            aln_ref_cov[ref].append(
                r.reference_length / float(chrom_lengths[ref]))

    frags_fwd = _frag_dict_to_array(frags_fwd, chrom_lengths)
    frags_rev = _frag_dict_to_array(frags_rev, chrom_lengths)
    res = {'frags_fwd': frags_fwd,
           'frags_rev': frags_rev,
           'ref_cov': aln_ref_cov}
    return res

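# Usage sketch (hypothetical file name; assumes an indexed BAM so that pysam
# can report mapped/unmapped totals and fetch by region):
#
#   import pysam
#   with pysam.AlignmentFile('aln.bam', 'rb') as bf:
#       lengths = dict(zip(bf.references, bf.lengths))
#   cov = frag_coverage('aln.bam', lengths, min_aqual=10)
#   cov['frags_fwd']['chr1']  # numpy vector of fragment start counts, + strand
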
def _update_read_stats(r, res, min_aqual):
    """ Update read statistics. """
    if r.is_unmapped:
        res['unmapped'] += 1
        res['unaligned_lengths'].append(r.query_length)
        if r.query_qualities is not None:
            res['unaligned_quals'].append(
                seq_util.mean_qscore(r.query_qualities, qround=False))
    elif r.mapping_quality >= min_aqual:
        res['mapped'] += 1
        if r.query_qualities is not None:
            res['aligned_quals'].append(
                seq_util.mean_qscore(r.query_qualities, qround=False))
        res['alignment_lengths'].append(r.query_alignment_length)
        res['aligned_lengths'].append(r.infer_query_length())
        res['mapping_quals'].append(r.mapping_quality)
    else:
        res['mapped'] += 1
        if r.query_qualities is not None:
            res['mqfail_aligned_quals'].append(
                seq_util.mean_qscore(r.query_qualities, qround=False))
        res['mqfail_alignment_lengths'].append(r.query_alignment_length)
        res['mqfail_aligned_lengths'].append(r.infer_query_length())
        res['mapping_quals'].append(r.mapping_quality)

def read_stats(bam, min_aqual=0, region=None, with_clipps=False, verbose=True):
    """ Parse reads in BAM file and record various statistics.

    :param bam: BAM file.
    :param min_aqual: Minimum mapping quality, skip read if mapping quality is lower.
    :param region: samtools region.
    :param with_clipps: Take into account clipps when calculating accuracy.
    :param verbose: Show progress bar.
    :returns: A dictionary with various global and per-read statistics.
    :rtype: dict
    """
    res = {'unmapped': 0,
           'mapped': 0,
           'unaligned_quals': [],
           'unaligned_lengths': [],
           'aligned_quals': [],
           'alignment_lengths': [],
           'aligned_lengths': [],
           'mqfail_aligned_quals': [],
           'mqfail_alignment_lengths': [],
           'mqfail_aligned_lengths': [],
           'mapping_quals': [],
           }
    base_stats = {'aln_length': 0, 'match': 0, 'mismatch': 0,
                  'deletion': 0, 'insertion': 0, 'clipps': 0}
    read_stats = OrderedDict([
        ("name", []),
        ("ref", []),
        ("coverage", []),
        ("direction", []),
        ("aln_length", []),
        ("insertion", []),
        ("deletion", []),
        ("mismatch", []),
        ("match", []),
        ("identity", []),
        ("accuracy", []),
        ("clipps", [])
    ])

    bam_reader = bam_common.pysam_open(bam, in_format='BAM')
    ue = True
    if region is not None:
        ue = False
    bam_iter = bam_reader.fetch(region=region, until_eof=ue)

    try:
        total_reads = bam_reader.mapped + bam_reader.unmapped
    except Exception:
        total_reads = None

    if verbose and region is None:
        sys.stdout.write(
            "Gathering read statistics from file: {}\n".format(bam))
        bam_iter = tqdm.tqdm(bam_iter, total=total_reads)

    for r in bam_iter:
        # Update basic read statistics:
        _update_read_stats(r, res, min_aqual)

        # Get detailed statistics from aligned read and
        # update global stats:
        bs = stats_from_aligned_read(r, with_clipps)
        # bs is None for unaligned reads.
        if bs is not None:
            for k in six.iterkeys(base_stats):
                base_stats[k] += bs[k]
            for stat, value in six.iteritems(bs):
                read_stats[stat].append(value)

    # Calculate global identity and accuracy:
    base_stats['identity'] = float(
        base_stats['match']) / (base_stats['match'] + base_stats['mismatch'])
    clipps = 0
    if with_clipps:
        clipps = base_stats['clipps']
    base_stats['accuracy'] = 1.0 - (float(base_stats['insertion'] +
                                          base_stats['deletion'] +
                                          base_stats['mismatch'] + clipps) /
                                    base_stats['aln_length'])

    res['base_stats'] = base_stats
    res['read_stats'] = read_stats
    bam_reader.close()
    return res

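# Usage sketch (hypothetical path):
#
#   stats = read_stats('aln.bam', min_aqual=5)
#   stats['base_stats']['accuracy']   # global alignment accuracy
#   stats['read_stats']['identity']   # per-read identities, parallel to 'name'
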
def pileup_stats(bam, region=None, verbose=True, with_quals=True):
    """ Parse pileup columns and extract quality values.

    :param bam: Input BAM file.
    :param region: samtools region.
    :param verbose: Show progress bar.
    :param with_quals: Return quality values per position.
    :returns: Dictionaries per reference with per-base coverage and quality values.
    :rtype: dict
    """
    st = defaultdict(lambda: defaultdict(list))
    cst = defaultdict(lambda: defaultdict(int))
    samfile = bam_common.pysam_open(bam, in_format='BAM')
    pileup_iter = samfile.pileup(region=region, min_base_quality=0)

    start, end = None, None
    if region is not None:
        tmp = region.split(":")
        _, start, end = tmp[0], int(tmp[1]) - 1, int(tmp[2])

    if verbose:
        sys.stdout.write(
            "Gathering pileup statistics from file: {}\n".format(bam))
        total_bases = sum(samfile.lengths)
        if region is not None:
            tmp = region.split(":")
            total_bases = int(tmp[2]) - int(tmp[1])
        pileup_iter = tqdm.tqdm(pileup_iter, total=total_bases)

    for pileupcolumn in pileup_iter:
        # Skip columns outside the requested region:
        if region is not None and (pileupcolumn.reference_pos < start or
                                   pileupcolumn.reference_pos >= end):
            continue
        cst[pileupcolumn.reference_name][
            pileupcolumn.reference_pos] = pileupcolumn.nsegments
        for pileupread in pileupcolumn.pileups:
            if not pileupread.is_del and not pileupread.is_refskip:
                if (pileupread.alignment.query_qualities is not None) and with_quals:
                    st[pileupcolumn.reference_name][pileupcolumn.reference_pos].append(
                        pileupread.alignment.query_qualities[pileupread.query_position])

    samfile.close()
    return {'qualities': dict(st), 'coverage': dict(cst)}

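# Usage sketch (hypothetical path; note that the boundary filtering above
# parses the region string as colon-separated "ref:start:end"):
#
#   ps = pileup_stats('aln.bam', region='chr1:100:200')
#   ps['coverage']['chr1']    # dict: reference position -> segment count
#   ps['qualities']['chr1']   # dict: reference position -> list of base qualities
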
def _register_event(e, query, ref, qpos, rpos, etype, context_sizes, fixed_context=None):
    """Register event within a context."""
    start = rpos - context_sizes[0]
    end = rpos + context_sizes[1] + 1
    # If the context goes outside the reference boundaries,
    # do not register it:
    if start < 0 or end > len(ref):
        return
    context = str(ref[start:end])

    if etype == 'match':
        base_to = query[qpos]
    elif etype == 'deletion':
        base_to = '-'
        context = fixed_context
    elif etype == 'insertion':
        base_to = '*'
    else:
        raise Exception('Unsupported event type!')

    e[context][base_to] += 1


def _register_insert(insert, rpos, insertion_lengths, insertion_composition):
    """Register insertion length and base composition."""
    insertion_lengths[len(insert)] += 1
    for base, count in six.iteritems(seq_util.base_composition(insert)):
        insertion_composition[base] += count


def _register_deletion(deletion, match_pos, context_sizes, ref, events, deletion_lengths, r, t):
    """Register deletion event and deletion length."""
    if deletion[0] > 0:
        # We have an active deletion if the length is larger than zero.
        right_context_end = match_pos + context_sizes[1] + 1
        if right_context_end <= len(ref):
            # Deletions are registered with a fixed context, as we keep track
            # of the left context in the deletion tuple:
            _register_event(events, query=r.query_sequence, ref=ref.seq,
                            qpos=t[0], rpos=t[0], etype='deletion',
                            context_sizes=context_sizes,
                            fixed_context=deletion[1] + str(ref.seq[match_pos:right_context_end]))
            deletion_lengths[deletion[0]] += 1
        # Reset deletion size to zero and context to None: we have left the
        # deletion event.
        deletion[0] = 0
        deletion[1] = None


def _update_events(r, ref, events, indel_dists, context_sizes, base_stats):
    """Update event details."""
    match_pos = 0
    insert = ''
    deletion = [0, None]
    aligned_pairs = r.get_aligned_pairs()
    # Remove soft clips:
    if r.cigartuples[0][0] == 4:
        aligned_pairs = aligned_pairs[r.cigartuples[0][1]:]
    if r.cigartuples[-1][0] == 4:
        aligned_pairs = aligned_pairs[:-r.cigartuples[-1][1]]

    for t in aligned_pairs:
        if t[0] is None:
            # Deletion.
            # Increase deleted base count:
            base_stats['deletion'] += 1
            if deletion[0] == 0:
                # We are at the left edge of the deletion, register context:
                deletion[1] = str(ref.seq[t[1] - context_sizes[0]:t[1]])
            deletion[0] += 1
            # Set match position to the last seen reference base:
            match_pos = t[1]
            # Register insert if we are switching from insertion to deletion:
            if insert != '':
                _register_event(events, query=r.query_sequence, ref=ref.seq,
                                qpos=t[0], rpos=match_pos, etype='insertion',
                                context_sizes=context_sizes)
                _register_insert(
                    insert, match_pos, indel_dists['insertion_lengths'],
                    indel_dists['insertion_composition'])
                insert = ''
        elif t[1] is None:
            # Insertion.
            # Increase inserted base count:
            base_stats['insertion'] += 1
            # Append base to insert:
            insert += r.query_sequence[t[0]]
            # Register deletion if we are switching from deletion to insertion:
            _register_deletion(
                deletion, match_pos, context_sizes, ref, events,
                indel_dists['deletion_lengths'], r, t)
        else:
            # Match or mismatch.
            # Increase match and mismatch base counts:
            if r.query_sequence[t[0]] == ref.seq[t[1]]:
                base_stats['match'] += 1
            else:
                base_stats['mismatch'] += 1
            _register_event(events, query=r.query_sequence, ref=ref.seq,
                            qpos=t[0], rpos=t[1], etype='match',
                            context_sizes=context_sizes)
            match_pos = t[1]
            # We are switching from an insertion:
            if insert != '':
                _register_event(events, query=r.query_sequence, ref=ref.seq,
                                qpos=t[0], rpos=match_pos, etype='insertion',
                                context_sizes=context_sizes)
                _register_insert(
                    insert, match_pos, indel_dists['insertion_lengths'],
                    indel_dists['insertion_composition'])
                insert = ''
            # We are switching from a deletion:
            _register_deletion(
                deletion, match_pos, context_sizes, ref, events,
                indel_dists['deletion_lengths'], r, t)

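# Shape of the resulting 'events' structure for context_sizes=(1, 1)
# (illustrative counts, not real data): keys are reference contexts around the
# event; values count the observed outcome, with the query base recorded for
# matches and mismatches, '-' for deletions and '*' for insertions.
#
#   events = {'ACG': {'C': 941, 'T': 13, '-': 7, '*': 2}, ...}
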
def error_and_read_stats(bam, refs, context_sizes=(1, 1), region=None, min_aqual=0, verbose=True):
    """Gather read statistics and context-dependent error statistics from BAM file.

    WARNING: contexts overstepping the reference start/end boundaries are not registered.

    Definition of context: for substitutions the event is happening from the "central base",
    in the case of indels the events are located between the central base and the base before.

    :param bam: Input BAM file.
    :param refs: Dictionary of references.
    :param context_sizes: The size of the left and right contexts.
    :param region: samtools region.
    :param min_aqual: Minimum mapping quality.
    :param verbose: Show progress bar.
    :returns: Dictionary with read and error statistics.
    :rtype: dict
    """
    events = defaultdict(lambda: defaultdict(int))

    read_stats = {'unmapped': 0,
                  'mapped': 0,
                  'unaligned_quals': [],
                  'unaligned_lengths': [],
                  'aligned_quals': [],
                  'alignment_lengths': [],
                  'aligned_lengths': [],
                  'mqfail_aligned_quals': [],
                  'mqfail_alignment_lengths': [],
                  'mqfail_aligned_lengths': [],
                  'mapping_quals': [],
                  }

    indel_dists = {'insertion_lengths': defaultdict(int),
                   'deletion_lengths': defaultdict(int),
                   'insertion_composition': defaultdict(int)}

    bam_reader = bam_common.pysam_open(bam, in_format='BAM')
    base_stats = {'match': 0, 'mismatch': 0, 'deletion': 0, 'insertion': 0}

    read_iter = bam_reader.fetch(region=region, until_eof=True)

    if verbose:
        sys.stdout.write(
            "Gathering read and error statistics from file: {}\n".format(bam))
        try:
            total_reads = bam_reader.mapped + bam_reader.unmapped
        except Exception:
            total_reads = None
        read_iter = tqdm.tqdm(read_iter, total=total_reads)

    for r in read_iter:
        _update_read_stats(r, read_stats, min_aqual)
        if r.is_unmapped:
            continue
        if r.mapping_quality < min_aqual:
            continue
        ref = refs[r.reference_name]
        _update_events(r, ref, events, indel_dists, context_sizes, base_stats)

    base_stats['aln_length'] = base_stats['match'] + base_stats['mismatch'] + \
        base_stats['insertion'] + base_stats['deletion']
    base_stats['identity'] = float(
        base_stats['match']) / (base_stats['match'] + base_stats['mismatch'])
    base_stats['accuracy'] = 1.0 - \
        float(base_stats['mismatch'] + base_stats['insertion'] + base_stats['deletion']) / \
        base_stats['aln_length']

    res = {'events': dict(events),
           'read_stats': dict(read_stats),
           'indel_dists': dict(indel_dists),
           'base_stats': base_stats}
    return res

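# Usage sketch (hypothetical paths; refs must map reference names to objects
# exposing a .seq attribute, e.g. Bio.SeqIO SeqRecord objects keyed by id):
#
#   from Bio import SeqIO
#   refs = SeqIO.to_dict(SeqIO.parse('ref.fas', 'fasta'))
#   err = error_and_read_stats('aln.bam', refs, context_sizes=(1, 1))
#   err['events']                           # context -> outcome counts (see above)
#   err['indel_dists']['deletion_lengths']  # histogram of deletion lengths
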
def stats_from_aligned_read(read, with_clipps=False):
    """Create summary information for an aligned read (modified from tang.util.bio).

    :param read: :class:`pysam.AlignedSegment` object.
    :param with_clipps: Take into account clips when calculating accuracy.
    """
    if read.flag == 4:
        return None

    tags = dict(read.tags)
    if 'NM' not in tags:
        raise IOError(
            "Read is missing required 'NM' tag. Try running 'samtools fillmd -S - ref.fa'.")

    name = read.qname
    match = reduce(lambda x, y: x + y[1] if y[0] == 0 else x, read.cigar, 0)
    ins = reduce(lambda x, y: x + y[1] if y[0] == 1 else x, read.cigar, 0)
    delt = reduce(lambda x, y: x + y[1] if y[0] == 2 else x, read.cigar, 0)
    # NM is the edit distance: NM = INS + DEL + SUB
    sub = tags['NM'] - ins - delt
    length = match + ins + delt

    # Count clipped bases (soft and hard clips):
    clipps = reduce(
        lambda x, y: x + y[1] if (y[0] == 4 or y[0] == 5) else x, read.cigar, 0)

    if with_clipps:
        length += clipps

    iden = float(match - sub) / match
    if with_clipps:
        acc = 1.0 - (float(tags['NM'] + clipps) / length)
    else:
        acc = 1.0 - (float(tags['NM']) / length)

    coverage = float(read.query_alignment_length) / read.infer_query_length()
    direction = '-' if read.is_reverse else '+'

    results = OrderedDict([
        ("name", name),
        ("ref", read.reference_name),
        ("coverage", coverage),
        ("direction", direction),
        ("aln_length", length),
        ("insertion", ins),
        ("deletion", delt),
        ("mismatch", sub),
        ("match", match - sub),
        ("identity", iden),
        ("accuracy", acc),
        ("clipps", clipps),
    ])

    return results

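# Usage sketch (hypothetical file name): summarise each aligned record
# straight from pysam.
#
#   import pysam
#   with pysam.AlignmentFile('aln.bam', 'rb') as bf:
#       for rec in bf:
#           s = stats_from_aligned_read(rec)
#           if s is not None:  # None for unmapped reads
#               print(s['name'], s['accuracy'])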