Source code for wub.bam.filter

# -*- coding: utf-8 -*-
"""Filter SAM/BAM records by various criteria."""

import itertools


[docs]def get_alignment_score(segement): """Get alignment score from pysam segment. :param segment: Pysam aligned segment. :returns: Alignment score. :rtype: int """ score = 0 try: score = segement.get_tag('AS') except: pass return score
[docs]def filter_top_per_query(records_iter): """Filter pysam records keeping top scoring per query. Assumes records are sorted by name. :param records_iter: Iterator of pysam aligned segments. :returns: Generator of filtered records. :rtype: generator """ buff = [] for rec in itertools.chain(records_iter, [None]): if len(buff) == 0: buff.append(rec) elif rec is None or buff[-1].query_name != rec.query_name: sorted_buff = sorted(buff, key=get_alignment_score, reverse=True) buff = [rec] yield sorted_buff[0] else: buff.append(rec)
[docs]def filter_query_coverage(records_iter, minimum_coverage): """Filter pysam records keeping the ones with sufficient query coverage. :param records_iter: Iterator of pysam aligned segments. :param minimum_coverage: Minimum fraction of covered query. :returns: Generator of filtered records. :rtype: generator """ for rec in records_iter: if rec.is_unmapped: yield rec elif (float(rec.query_alignment_length) / rec.infer_query_length()) >= minimum_coverage: yield rec
[docs]def filter_ref_coverage(records_iter, minimum_coverage, header): """Filter pysam records keeping the ones with sufficient reference coverage. :param records_iter: Iterator of pysam aligned segments. :param minimum_coverage: Minimum fraction of covered reference. :param header: SAM header with reference lengths. :returns: Generator of filtered records. :rtype: generator """ ref_lengths = dict((h['SN'], int(h['LN'])) for h in header['SQ']) for rec in records_iter: if rec.is_unmapped: yield rec elif (float(rec.query_alignment_length) / ref_lengths[rec.reference_name]) >= minimum_coverage: yield rec