# -*- coding: utf-8 -*-
import six
from six.moves import reduce
import sys
from collections import defaultdict, OrderedDict
import tqdm
import numpy as np
from wub.bam import common as bam_common
from wub.util import seq as seq_util
def _frag_dict_to_array(fd, chrom_lengths):
""" Convert fragment coverage dictionary into a numpy array. """
res = {}
for ref, frags in six.iteritems(fd):
cov = np.zeros((chrom_lengths[ref],), dtype=int)
for pos, count in six.iteritems(frags):
cov[pos] = count
res[ref] = cov
return res
[docs]def frag_coverage(bam, chrom_lengths, region=None, min_aqual=0, ref_cov=True, verbose=True):
""" Calculate fragment coverage vectors on the forward and reverse strands.
:param bam: Input bam file.
:param chrom_lengths: Dictionary of chromosome names and lengths.
:param region: Restrict parsing to the specified region.
:param min_aqual: Minimum mapping quality.
:param verbose: Display progress bar.
:returns: Forward and reverse fragment coverage vectors.
:rtype: dict
"""
frags_fwd = defaultdict(lambda: defaultdict(int))
frags_rev = defaultdict(lambda: defaultdict(int))
aln_ref_cov = (defaultdict(list))
bam_reader = bam_common.pysam_open(bam, in_format='BAM')
ue = True
if region is not None:
ue = False
bam_iter = bam_reader.fetch(region=region, until_eof=ue)
try:
total_reads = bam_reader.mapped + bam_reader.unmapped
except:
total_reads = None
if verbose and region is None:
sys.stdout.write(
"Gathering fragment statistics from file: {}\n".format(bam))
bam_iter = tqdm.tqdm(bam_iter, total=total_reads)
for r in bam_iter:
# Skip unmapped reads:
if r.is_unmapped:
continue
# Skip if mapping quality is too low:
if r.mapq < min_aqual:
continue
pos = r.reference_start
ref = r.reference_name
if r.is_reverse:
frags_rev[r.reference_name][pos] += 1
else:
frags_fwd[r.reference_name][pos] += 1
if ref_cov:
aln_ref_cov[ref].append(r.reference_length / float(chrom_lengths[ref]))
frags_fwd = _frag_dict_to_array(frags_fwd, chrom_lengths)
frags_rev = _frag_dict_to_array(frags_rev, chrom_lengths)
res = {'frags_fwd': frags_fwd, 'frags_rev': frags_rev, 'ref_cov': aln_ref_cov}
return res
def _update_read_stats(r, res, min_aqual):
""" Update read statistics. """
if r.is_unmapped:
res['unmapped'] += 1
res['unaligned_lengths'].append(r.query_length)
if r.query_qualities is not None:
res['unaligned_quals'].append(
seq_util.mean_qscore(r.query_qualities, qround=False))
elif r.mapping_quality >= min_aqual:
res['mapped'] += 1
if r.query_qualities is not None:
res['aligned_quals'].append(
seq_util.mean_qscore(r.query_qualities, qround=False))
res['alignment_lengths'].append(r.query_alignment_length)
res['aligned_lengths'].append(r.infer_query_length())
res['mapping_quals'].append(r.mapping_quality)
else:
res['mapped'] += 1
if r.query_qualities is not None:
res['mqfail_aligned_quals'].append(
seq_util.mean_qscore(r.query_qualities, qround=False))
res['mqfail_alignment_lengths'].append(r.query_alignment_length)
res['mqfail_aligned_lengths'].append(r.infer_query_length())
res['mapping_quals'].append(r.mapping_quality)
[docs]def read_stats(bam, min_aqual=0, region=None, with_clipps=False, verbose=True):
""" Parse reads in BAM file and record various statistics.
:param bam: BAM file.
:param min_aqual: Minimum mapping quality, skip read if mapping quality is lower.
:param region: smatools region.
:param with_clipps: Take into account clipps when calculating accuracy.
:param verbose: Show progress bar.
:returns: A dictionary with various global and per-read statistics.
:rtype: dict
"""
res = {'unmapped': 0,
'mapped': 0,
'unaligned_quals': [],
'unaligned_lengths': [],
'aligned_quals': [],
'alignment_lengths': [],
'aligned_lengths': [],
'mqfail_aligned_quals': [],
'mqfail_alignment_lengths': [],
'mapping_quals': [],
}
base_stats = {'aln_length': 0, 'match': 0, 'mismatch': 0,
'deletion': 0, 'insertion': 0, 'clipps': 0}
read_stats = OrderedDict([
("name", []),
("ref", []),
("coverage", []),
("direction", []),
("aln_length", []),
("insertion", []),
("deletion", []),
("mismatch", []),
("match", []),
("identity", []),
("accuracy", []),
("clipps", [])
])
bam_reader = bam_common.pysam_open(bam, in_format='BAM')
ue = True
if region is not None:
ue = False
bam_iter = bam_reader.fetch(region=region, until_eof=ue)
try:
total_reads = bam_reader.mapped + bam_reader.unmapped
except:
total_reads = None
if verbose and region is None:
sys.stdout.write(
"Gathering read statistics from file: {}\n".format(bam))
bam_iter = tqdm.tqdm(bam_iter, total=total_reads)
for r in bam_iter:
# Update basic read statistics:
_update_read_stats(r, res, min_aqual)
# Get detailed statistics from aligned read and
# updated global stats:
bs = stats_from_aligned_read(r, with_clipps)
# bs is None for unaligned reads.
if bs is not None:
for k in six.iterkeys(base_stats):
base_stats[k] += bs[k]
for stat, value in six.iteritems(bs):
read_stats[stat].append(value)
# Calculate global identity and accuracy:
base_stats['identity'] = float(
base_stats['match']) / (base_stats['match'] + base_stats['mismatch'])
clipps = 0
if with_clipps:
clipps = base_stats['clipps']
base_stats['accuracy'] = 1.0 - (float(base_stats['insertion'] +
base_stats['deletion'] + base_stats['mismatch'] + clipps) / base_stats['aln_length'])
res['base_stats'] = base_stats
res['read_stats'] = read_stats
bam_reader.close()
return res
[docs]def pileup_stats(bam, region=None, verbose=True, with_quals=True):
""" Parse pileup columns and extract quality values.
:param bam: Input BAM file.
:param region: samtools region.
:param verbose: Show progress bar.
:param with_quals: Return quality values per position.
:returns: Dictionaries per reference with per-base coverage and quality values.
:rtype: dict
"""
st = defaultdict(lambda: defaultdict(list))
cst = defaultdict(lambda: defaultdict(int))
samfile = bam_common.pysam_open(bam, in_format='BAM')
pileup_iter = samfile.pileup(region=region, min_base_quality=0)
start, end = None, None
if region is not None:
tmp = region.split(":")
_, start, end = tmp[0], int(tmp[1]) - 1, int(tmp[2])
if verbose:
sys.stdout.write(
"Gathering pileup statistics from file: {}\n".format(bam))
total_bases = sum(samfile.lengths)
if region is not None:
tmp = region.split(":")
total_bases = int(tmp[2]) - int(tmp[1])
pileup_iter = tqdm.tqdm(pileup_iter, total=total_bases)
for pileupcolumn in pileup_iter:
if region is not None and (pileupcolumn.reference_pos < start or pileupcolumn.reference_pos >= end):
continue
# print pileupcolumn.reference_name, pileupcolumn.reference_pos,
# pileupcolumn.nsegments
cst[pileupcolumn.reference_name][
pileupcolumn.reference_pos] = pileupcolumn.nsegments
for pileupread in pileupcolumn.pileups:
if not pileupread.is_del and not pileupread.is_refskip:
# print pileupcolumn.reference_name, pileupcolumn.reference_pos,
# pileupread.alignment.query_qualities[pileupread.query_position]
if (pileupread.alignment.query_qualities is not None) and with_quals:
st[pileupcolumn.reference_name][pileupcolumn.reference_pos].append(
pileupread.alignment.query_qualities[pileupread.query_position])
samfile.close()
return {'qualities': dict(st), 'coverage': dict(cst)}
def _register_event(e, query, ref, qpos, rpos, etype, context_sizes, fixed_context=None):
"""Register event withing a context."""
start = rpos - context_sizes[0]
end = rpos + context_sizes[1] + 1
context = str(ref[start:end])
# If context goes outside the reference boundaries,
# do not register it:
if start < 0 or end > len(ref):
return
if etype == 'match':
base_to = query[qpos]
elif etype == 'deletion':
base_to = '-'
context = fixed_context
elif etype == 'insertion':
base_to = '*'
else:
raise Exception('Unsupported event type!')
e[context][base_to] += 1
def _register_insert(insert, rpos, insertion_lengths, insertion_composition):
"""Register insertion length and base composition."""
insertion_lengths[len(insert)] += 1
for base, count in six.iteritems(seq_util.base_composition(insert)):
insertion_composition[base] += count
def _register_deletion(deletion, match_pos, context_sizes, ref, events, deletion_lengths, r, t):
"""Register deletion and deletion lengths."""
if deletion[0] > 0: # We have an active deletion if length is larger than zero.
right_contex_end = match_pos + context_sizes[1] + 1
if right_contex_end <= len(ref):
# Deletions are registered with fixed context as we keep trakc of the left context
# in the deletion tuple:
_register_event(events, query=r.query_sequence, ref=ref.seq, qpos=t[
0], rpos=t[0], etype='deletion', context_sizes=context_sizes, fixed_context=deletion[1] + str(ref.seq[match_pos:right_contex_end]))
deletion_lengths[deletion[0]] += 1
# Reset deletion size to zero and context to None: we left deletion
# event.
deletion[0] = 0
deletion[1] = None
def _update_events(r, ref, events, indel_dists, context_sizes, base_stats):
"""Update event details."""
match_pos = 0
insert = ''
deletion = [0, None]
aligned_pairs = r.get_aligned_pairs()
# Remove soft clips:
if r.cigartuples[0][0] == 4:
aligned_pairs = aligned_pairs[r.cigartuples[0][1]:]
if r.cigartuples[-1][0] == 4:
aligned_pairs = aligned_pairs[:-r.cigartuples[-1][1]]
for t in aligned_pairs:
if t[0] is None:
# deletion
# Increse deleted base count:
base_stats['deletion'] += 1
if deletion[0] == 0:
# We are at the left edge of the deletion, register context:
deletion[1] = str(ref.seq[t[1] - context_sizes[0]:t[1]])
deletion[0] += 1
# Set match position to the last seen reference base:
match_pos = t[1]
# Register insert if we are switching from deleltion to insertion:
if insert != '':
_register_event(events, query=r.query_sequence, ref=ref.seq, qpos=t[
0], rpos=match_pos, etype='insertion', context_sizes=context_sizes)
_register_insert(
insert, match_pos, indel_dists['insertion_lengths'], indel_dists['insertion_composition'])
insert = ''
elif t[1] is None:
# insertion
# Increase inserted base count:
base_stats['insertion'] += 1
# Append base to insert:
insert += r.query_sequence[t[0]]
# If switching from deleltion to insertion:
_register_deletion(
deletion, match_pos, context_sizes, ref, events, indel_dists['deletion_lengths'], r, t)
else:
# match or mismatch
# Increase mismatch and mathc base counts:
if r.query_sequence[t[0]] == ref.seq[t[1]]:
base_stats['match'] += 1
else:
base_stats['mismatch'] += 1
_register_event(events, query=r.query_sequence, ref=ref.seq, qpos=t[
0], rpos=t[1], etype='match', context_sizes=context_sizes)
match_pos = t[1]
# We are switching from and insertion:
if insert != '':
_register_event(events, query=r.query_sequence, ref=ref.seq, qpos=t[
0], rpos=match_pos, etype='insertion', context_sizes=context_sizes)
_register_insert(
insert, match_pos, indel_dists['insertion_lengths'], indel_dists['insertion_composition'])
insert = ''
# We are switching from a deletion:
_register_deletion(
deletion, match_pos, context_sizes, ref, events, indel_dists['deletion_lengths'], r, t)
[docs]def error_and_read_stats(bam, refs, context_sizes=(1, 1), region=None, min_aqual=0, verbose=True):
"""Gather read statistics and context-dependend error statistics from BAM file.
WARNING: context overstepping reference start/end boundaries are not registered.
Definition of context: for substitutions the event is happening from the "central base", in the case of indels the events are located
between the central base and the base before.
:param bam: Input BAM file.
:param refs: Dictionary of references.
:param context_sizes: The size of the left and right contexts.
:param region: samtools regions.
:param min_qual: Minimum mappign quality.
:param verbose: Show progress bar.
:returns: Dictionary with read and error statistics.
:rtype: dict
"""
events = defaultdict(lambda: defaultdict(int))
read_stats = defaultdict(list)
read_stats = {'unmapped': 0,
'mapped': 0,
'unaligned_quals': [],
'unaligned_lengths': [],
'aligned_quals': [],
'alignment_lengths': [],
'aligned_lengths': [],
'mqfail_aligned_quals': [],
'mqfail_alignment_lengths': [],
'mapping_quals': [],
}
indel_dists = {'insertion_lengths': defaultdict(int), 'deletion_lengths': defaultdict(
int), 'insertion_composition': defaultdict(int)}
bam_reader = bam_common.pysam_open(bam, in_format='BAM')
base_stats = {'match': 0, 'mismatch': 0, 'deletion': 0, 'insertion': 0}
read_iter = bam_reader.fetch(region=region, until_eof=True)
if verbose:
sys.stdout.write(
"Gathering read and error statistics from file: {}\n".format(bam))
try:
total_reads = bam_reader.mapped + bam_reader.unmapped
except:
total_reads = None
read_iter = tqdm.tqdm(read_iter, total=total_reads)
for r in read_iter:
_update_read_stats(r, read_stats, min_aqual)
if r.is_unmapped:
continue
if r.mapping_quality < min_aqual:
continue
ref = refs[r.reference_name]
_update_events(r, ref, events, indel_dists, context_sizes, base_stats)
base_stats['aln_length'] = base_stats['match'] + base_stats['mismatch'] + \
base_stats['insertion'] + base_stats['deletion']
base_stats['identity'] = float(
base_stats['match']) / (base_stats['match'] + base_stats['mismatch'])
base_stats['accuracy'] = 1.0 - \
float(base_stats['mismatch'] + base_stats['insertion'] + base_stats['deletion']) / \
base_stats['aln_length']
res = {'events': dict(events), 'read_stats': dict(
read_stats), 'indel_dists': dict(indel_dists), 'base_stats': base_stats}
return res
[docs]def stats_from_aligned_read(read, with_clipps=False):
"""Create summary information for an aligned read (modified from tang.util.bio).
:param read: :class:`pysam.AlignedSegment` object
:param with_clipps:
"""
tags = dict(read.tags)
try:
tags.get('NM')
except:
raise IOError(
"Read is missing required 'NM' tag. Try running 'samtools fillmd -S - ref.fa'.")
name = read.qname
if read.flag == 4:
return None
match = reduce(lambda x, y: x + y[1] if y[0] == 0 else x, read.cigar, 0)
ins = reduce(lambda x, y: x + y[1] if y[0] == 1 else x, read.cigar, 0)
delt = reduce(lambda x, y: x + y[1] if y[0] == 2 else x, read.cigar, 0)
# NM is edit distance: NM = INS + DEL + SUB
sub = tags['NM'] - ins - delt
length = match + ins + delt
# Count clips:
clipps = reduce(
lambda x, y: x + y[1] if (y[0] == 4 or y[0] == 5) else x, read.cigar, 0)
if with_clipps:
length += clipps
iden = float(match - sub) / match
if with_clipps:
acc = 1.0 - (float(tags['NM'] + clipps) / length)
else:
acc = 1.0 - (float(tags['NM']) / length)
coverage = float(read.query_alignment_length) / read.infer_query_length()
direction = '-' if read.is_reverse else '+'
results = OrderedDict([
("name", name),
("ref", read.reference_name),
("coverage", coverage),
("direction", direction),
("aln_length", length),
("insertion", ins),
("deletion", delt),
("mismatch", sub),
("match", match - sub),
("identity", iden),
("accuracy", acc),
("clipps", clipps),
])
return results