Source code for wub.bam.read_counter

# -*- coding: utf-8 -*-
"""Count reads per reference in BAM/SAM file."""
import sys

import pysam
import numpy as np
from collections import defaultdict
from wub.util import seq as seq_util
import tqdm


[docs]def count_reads(alignment_file, in_format='BAM', min_aln_qual=0, verbose=False, reads_gc=False): """Count reads mapping to references in a BAM file. :param alignment_file: BAM file. :param min_aln_qual: Minimum mapping quality. :param verbose: Verbose if True. :param read_gc: Calculate mean GC content of reads for each reference. :returns: Dictionary with read counts per reference and read GC contents. :rtype: tuple of dicts """ counts = defaultdict(int) gc_means = defaultdict(list) if in_format == 'BAM': mode = "rb" elif in_format == 'SAM': mode = "r" else: raise Exception("Invalid format: {}".format(in_format)) aln_iter = pysam.AlignmentFile(alignment_file, mode) if verbose and in_format == "BAM": try: total_reads = aln_iter.mapped + aln_iter.unmapped except: total_reads = None sys.stdout.write( "Gathering read statistics from file: {}\n".format(alignment_file)) if in_format == "BAM": aln_iter = tqdm.tqdm(aln_iter, total=total_reads) for segment in aln_iter: if segment.is_unmapped: continue if segment.mapping_quality >= min_aln_qual: counts[segment.reference_name] += 1 if reads_gc: gc_means[segment.reference_name].append(seq_util.gc_content(segment.query_alignment_sequence)) gc_cont = {} if reads_gc: # Calculate mean of mean GC contents: for trs, gc_ms in gc_means.items(): gc_cont[trs] = np.mean(gc_ms) aln_iter.close() return dict(counts), gc_cont
[docs]def count_reads_realtime(alignment_file='-', in_format='SAM', min_aln_qual=0, yield_freq=1, verbose=False): """Online counting of reads mapping to references in a SAM/BAM stream from stdin. :param alignment_file: BAM file (stdin). :param min_aln_qual: Minimum mapping quality. :param yield_freq: Yield frequency. :param verbose: Minimum mapping quality. :returns: Generator of dictionary with read counts per reference. :rtype: generator """ counts = defaultdict(int) if in_format == 'BAM': mode = "rb" elif in_format == 'SAM': mode = "r" else: raise Exception("Invalid format: {}".format(in_format)) aln_iter = pysam.AlignmentFile(alignment_file, mode) if verbose: sys.stdout.write( "Online counting of read statistics from file: {}\n".format(alignment_file)) aln_iter = iter(tqdm.tqdm(aln_iter)) nr_mapped = 0 while True: try: segment = aln_iter.next() except StopIteration: # Final yield: yield counts return if segment.is_unmapped: continue if segment.mapping_quality >= min_aln_qual: counts[segment.reference_name] += 1 nr_mapped += 1 if nr_mapped % yield_freq == 0: yield counts aln_iter.close()