Source code for wub.read_stats.contig_stats

from Bio import SeqIO
import pandas as pd
from wub.util.misc import _getextension
from wub.util.seq import mean_qscore


[docs]def readfast(fast): """ reads a fasta or fastq file. :param fast: fastq or fasta :return: list of records with attr :rtype: generator object """ extension = _getextension(fast) for rec in SeqIO.parse(open(fast), extension): yield rec
def _cumsum(df, col): ''' Calculates the cumulative sum of column :param df: dataframe with sequence length :param col: identify the sequence length :return: dataframe with cumulative sum of length :rtype: dataframe ''' df = df.sort_values(by=col, ascending=False).reset_index(drop=True) df['cumsum'] = df[col].cumsum() return df
[docs]def N50(df, col, percent=50): """ Calculate the N50 by default however, by changing percent to 75, N75 can be calculated. :param df: dataframe with seqlen column :param col: column with sequence length :param percent: percentage to be calculated :return: N50 Value :rtype: int """ df1 = _cumsum(df, col) df1['cumsum'] = df1[col].cumsum() n50 = df1['cumsum'].max() * percent / 100 return df1.where(df1['cumsum'] >= n50)[col].dropna().head(1).tolist()[0]
[docs]def L50(df, col, percent=50): """ Calculate the L50 by default however, by changing percent to 75, N75 can be calculated :param df: dataframe with seqlen column :param col: column with sequence length :param percent: percentage to be calculated :return: N50 Value :rtype: int """ df1 = _cumsum(df, col).copy() return df1[df1 >= N50(df, col, percent)][col].count()
[docs]def GC_per_read(seq_rec, fq=False): """ Calculates the number of bases per sequence, GC content and mean Q score if fastq is given :param seq_rec: sequence records with attr from biopython :param fq: boolean :return: dataframe :rtype: dataframe """ d = [] bases = ["A", "T", "C", "G", 'N'] # total_lengths = 0 for rec in seq_rec: tmp = {"SeqID": rec.id, "Seqlen": len(rec.seq), "A": 0, "T": 0, "G": 0, "C": 0, "N": 0} for base in bases: tmp[base] += rec.seq.count(base) if fq: tmp['mean_q'] = round(mean_qscore(rec.letter_annotations[ "phred_quality"], qround=False), 2) d.append(tmp) raw = pd.DataFrame(d).set_index('SeqID') raw['GC content (%)'] = raw.apply(lambda x: float( (x['G']) + x['C']) / x['Seqlen'] * 100.0, axis=1) for base in bases: raw[base + ' (%)'] = (raw[base] / raw["Seqlen"]) * 100.0 raw["other base"] = raw['Seqlen'] - raw[bases].sum(axis=1) return raw
[docs]def get_stats(df): """ Calcualtes the summary stats :param df: dataframe from GC_per_read :return: summary Series :rtype: Series """ stats = pd.Series({}) df = df.copy() Mbase = 1000000.0 bases = ["A", "T", "C", "G", 'N'] total_len = int(df["Seqlen"].sum()) total_bases = df[bases].sum().sum() stats['N75'] = N50(df, 'Seqlen', 75) stats['N50'] = N50(df, 'Seqlen', 50) stats['N25'] = N50(df, 'Seqlen', 25) stats['L75'] = L50(df, "Seqlen", 75) stats['L50'] = L50(df, "Seqlen", 50) stats['L25'] = L50(df, "Seqlen", 25) stats['Max contig'] = df['Seqlen'].max() stats['Min contig'] = df['Seqlen'].min() stats['Avg length'] = df['Seqlen'].mean() stats['Length SD'] = df['Seqlen'].std() stats['Total length (Mb)'] = total_len / Mbase stats['Total bases (Mb)'] = total_len / Mbase stats['Other bases (Mb)'] = (total_len - total_bases) / Mbase stats['No. contigs'] = df['Seqlen'].count() stats["Greater then 10 Kb"] = df[df['Seqlen'] >= 10000.0].Seqlen.count() stats["Greater then 100 Kb"] = df[df['Seqlen'] >= 100000.0].Seqlen.count() stats["Greater then 500 Kb"] = df[df['Seqlen'] >= 500000.0].Seqlen.count() stats["Greater then 1 Mb"] = df[df['Seqlen'] >= 1000000.0].Seqlen.count() stats['Yield > 10kb (Mb)'] = df[df['Seqlen'] >= 10000.0]['Seqlen'].sum() / Mbase stats['Yield > 50kb (Mb)'] = df[df['Seqlen'] >= 50000.0]['Seqlen'].sum() / Mbase if 'mean_q' in df.columns: stats['Max Qscore'] = df['mean_q'].max() stats['Min Qscore'] = df['mean_q'].min() stats['Avg Qscore'] = df['mean_q'].mean() stats['Qscore SD'] = df['mean_q'].std() stats['Yield >Q6 (Mb)'] = df[df['mean_q'] >= 6.0]['Seqlen'].sum() / Mbase stats['Yield >Q9 (Mb)'] = df[df['mean_q'] >= 9.0]['Seqlen'].sum() / Mbase stats["GC content"] = float(df[['G', "C"]].sum().sum()) / total_len * 100.0 for base in bases: stats[base + ' (%)'] = float(df[base].sum()) / total_len * 100.0 return stats.round(2)