- """ PreAssembly Report.
- See FALCON-pbsmrtpipe/pbfalcon/report_preassembly.py for XML version.
- """
- # Copied from
- # http://swarm/files/depot/branches/springfield/S2.3/software/smrtanalysis/bioinformatics/tools/pbreports/pbreports/report/preassembly.py
- from future.utils import viewitems
- from builtins import object
- from .FastaReader import open_fasta_reader
- from .util.io import syscall
- from . import functional
- import collections
- import glob
- import itertools
- import logging
- import os
- import pprint
- import re
- log = logging.getLogger(__name__)
- __version__ = '0.1'
- Stats = collections.namedtuple(
- 'FastaStats', ['nreads', 'total', 'n50', 'p95', 'esize'])
- # Copied from pbreports/util.py
- # We want to avoid a dependency on pbreports b/c it needs matplotlib.


def get_fasta_readlengths(fasta_file):
    """
    Get a sorted list of read lengths from a FASTA file.
    :return: sorted list of int read lengths
    """
    lens = []
    with open_fasta_reader(fasta_file) as f:
        for record in f:
            lens.append(len(record.sequence))
    lens.sort()
    return lens
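

# A minimal usage sketch (not part of the original module), assuming
# open_fasta_reader() accepts a plain FASTA path, as it is used above.
def _example_get_fasta_readlengths():
    import tempfile
    with tempfile.NamedTemporaryFile('w', suffix='.fasta', delete=False) as tf:
        tf.write('>r/1/0_3\nACG\n>r/2/0_5\nACGTA\n')
        fn = tf.name
    # Lengths come back sorted ascending.
    assert get_fasta_readlengths(fn) == [3, 5]
    os.unlink(fn)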


def get_db_readlengths(fn):
    """Use DBdump on a DAZZ_DB.

    If DBsplit was run, then we see the filtered reads only, since we do
    not provide '-u' to DBdump.
    """
    call = 'DBdump -h {}'.format(fn)
    return list(functional.parsed_readlengths_from_dbdump_output(syscall(call)))
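

# Illustration only: how the helper above consumes DBdump output. The
# 'L <well> <beg> <end>' line format is an assumption inferred from
# functional.parsed_readlengths_from_dbdump_output(), which yields one
# (end - beg) length per 'L' line.
def _example_parsed_dbdump():
    fake_dump = 'L 1 0 1000\nL 2 500 2500\n'
    assert list(functional.parsed_readlengths_from_dbdump_output(fake_dump)) == [1000, 2000]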


class FastaContainer(object):

    def __init__(self, nreads, total, file_name):
        self.nreads = nreads
        self.total = total
        self.file_name = file_name

    @staticmethod
    def from_file(file_name):
        read_lens = get_fasta_readlengths(file_name)
        nreads = len(read_lens)
        total = sum(read_lens)
        return FastaContainer(nreads, total, file_name)

    def __str__(self):
        return "N {n} Total {t} File: {f}".format(n=self.nreads, t=self.total, f=self.file_name)


def _validate_file(file_name):
    if os.path.isfile(file_name):
        return os.path.abspath(file_name)
    else:
        msg = "Unable to find {f}".format(f=file_name)
        log.error(msg)
        raise IOError(msg)


def cutoff_reads(read_lens, min_read_len):
    return [rl for rl in read_lens if rl >= min_read_len]


def read_len_above(read_lens, threshold):
    """Return the length of the read at which the cumulative total,
    summed from the longest read down, first reaches threshold.

    read_lens must be sorted ascending. Returns None if the total
    never reaches threshold.
    """
    subtotal = 0
    # Reverse-order calculation is faster.
    for rl in reversed(read_lens):
        subtotal += rl
        if subtotal >= threshold:
            return rl
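

# Worked example (illustration only): for sorted lengths [2, 4, 5, 6, 8],
# total=25 and threshold=int(25*0.50)=12. Walking from the longest read:
# 8 (subtotal 8), then 6 (subtotal 14 >= 12), so the N50 is 6.
def _example_read_len_above():
    assert read_len_above([2, 4, 5, 6, 8], 12) == 6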


def percentile(read_lens, p):
    """Return the p-th percentile read length, for sorted read_lens."""
    # Clamp the index so that p=1.0 returns the last element
    # instead of raising IndexError.
    idx = min(int(len(read_lens) * p), len(read_lens) - 1)
    return read_lens[idx]
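

# Worked example (illustration only), including the clamped p=1.0 case.
def _example_percentile():
    lens = list(range(1, 11))             # [1..10], sorted
    assert percentile(lens, 0.95) == 10   # index int(10*0.95) == 9
    assert percentile(lens, 1.0) == 10    # clamped to the last element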


def stats_from_sorted_readlengths(read_lens):
    nreads = len(read_lens)
    total = sum(read_lens)
    sum_squares = sum(r * r for r in read_lens)
    n50 = read_len_above(read_lens, int(total * 0.50))
    p95 = percentile(read_lens, 0.95)
    # The e-size is the expected length of the read covering a randomly
    # chosen base: sum(len^2) / sum(len).
    esize = sum_squares / total
    # Note: our n50 differs from pbreports.util.compute_n50() when the
    # midpoint falls between two reads; ours is more correct in that case.
    return Stats(nreads=nreads, total=total, n50=n50, p95=p95, esize=esize)
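

# Worked example (illustration only), tying the pieces together for
# sorted lengths [2, 4, 5, 6, 8]; esize = (4+16+25+36+64)/25 = 5.8.
def _example_stats():
    s = stats_from_sorted_readlengths([2, 4, 5, 6, 8])
    assert (s.nreads, s.total, s.n50, s.p95) == (5, 25, 6, 8)
    assert abs(s.esize - 5.8) < 1e-9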


def read_lens_from_fofn(fofn_fn):
    """Return a sorted list of read lengths, gathered from every FASTA
    named in the file-of-filenames.
    """
    fns = [fn.strip() for fn in open(fofn_fn) if fn.strip()]
    # get_fasta_readlengths() returns sorted lists, so sorting the chain is roughly linear.
    return list(sorted(itertools.chain.from_iterable(get_fasta_readlengths(fn) for fn in fns)))


def read_lens_from_db(db_fn):
    """Return sorted read-lengths from a DAZZ_DB.
    """
    return list(sorted(get_db_readlengths(db_fn)))


def abs_filenames(fofn_fn):
    """Return the filenames listed in the FOFN, resolving relative paths
    against the FOFN's own directory.
    """
    fofn_dir = os.path.dirname(fofn_fn)

    def abs_fn(fn):
        if os.path.isabs(fn):
            return fn
        return os.path.join(fofn_dir, fn)
    fns = [abs_fn(fn.strip()) for fn in open(fofn_fn) if fn.strip()]
    return fns
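

# A minimal sketch (illustration only; paths are hypothetical): relative
# FOFN entries resolve against the FOFN's own directory.
def _example_abs_filenames():
    import tempfile
    d = tempfile.mkdtemp()
    fofn = os.path.join(d, 'input.fofn')
    with open(fofn, 'w') as f:
        f.write('reads/a.fasta\n/abs/b.fasta\n')
    assert abs_filenames(fofn) == [os.path.join(d, 'reads', 'a.fasta'), '/abs/b.fasta']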


def metric_fragmentation(preads_fofn):
    # https://jira.pacificbiosciences.com/browse/SAT-105
    # Roughly equivalent shell pipeline:
    #   sed -nr 's;>prolog/([0-9]*)[0-9]/.*;\1;p' %s/*.fasta | sort | uniq -c | awk '{print $1}' | sort | uniq -c
    fastas = abs_filenames(preads_fofn)
    assert fastas, 'No fasta found in {!r}'.format(preads_fofn)
    # Group preads by seed (the pread id with its final digit dropped),
    # count preads per seed, and print one histogram line per distinct
    # count: '<num_seeds> <preads_per_seed>'.
    call = r"""perl -e 'while (<>) { if ( m{>[^/]+/(\d+)\d/} ) { $id{$1}++; } }; while (my ($k, $v) = each %%id) { $counts{$v}++; }; while (my ($k, $v) = each %%counts) { print "$v $k\n"; };' %s""" % (' '.join(fastas))
    counts = syscall(call)
    return functional.calc_metric_fragmentation(counts)
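

# Pure-Python equivalent of the perl grouping above (illustration only,
# on hypothetical pread headers): dropping the final digit of the id
# maps preads 120..129 to seed group '12'.
def _example_fragmentation_grouping():
    headers = ['>prolog/120/0_500', '>prolog/121/0_400', '>prolog/130/0_300']
    groups = collections.defaultdict(int)
    for h in headers:
        mo = re.match(r'>[^/]+/(\d+)\d/', h)
        if mo:
            groups[mo.group(1)] += 1
    assert dict(groups) == {'12': 2, '13': 1}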


def metric_truncation(db, preads_fofn):
    # https://jira.pacificbiosciences.com/browse/SAT-105
    fastas = abs_filenames(preads_fofn)
    assert fastas, 'No fasta found in {!r}'.format(preads_fofn)
    # Sum the pread extents per seed group, printing one line per group:
    # '<group_id> <total_bases>'. The +1 is because DBdump read-ids start
    # at 1, whereas these group-ids start at 0.
    call = r"""perl -e 'while (<>) { if ( m{>[^/]+/0*(\d+)\d/(\d+)_(\d+)} ) { $lengths{(1 + $1)} += ($3 - $2); } }; while (my ($k, $v) = each %%lengths) { print "$k $v\n"; };' %s""" % (' '.join(fastas))
    length_pairs_output = syscall(call)
    call = 'DBdump -rh {}'.format(db)
    dbdump_output = syscall(call)
    return functional.calc_metric_truncation(dbdump_output, length_pairs_output)
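

# Pure-Python equivalent of the perl above (illustration only, with
# hypothetical headers): both preads belong to seed group 12, so the
# printed key is 12+1=13 and the summed extent is 500 + 300 = 800.
def _example_truncation_lengths():
    headers = ['>prolog/000120/0_500', '>prolog/000121/100_400']
    lengths = collections.defaultdict(int)
    for h in headers:
        mo = re.match(r'>[^/]+/0*(\d+)\d/(\d+)_(\d+)', h)
        if mo:
            lengths[1 + int(mo.group(1))] += int(mo.group(3)) - int(mo.group(2))
    assert dict(lengths) == {13: 800}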


def stats_dict(stats_raw_reads, stats_seed_reads, stats_corrected_reads,
               genome_length, length_cutoff, fragmentation, truncation):
    """All stats_* inputs are Stats namedtuples.
    genome_length and length_cutoff can be None.
    """
    log.info('stats for raw reads: %s' % repr(stats_raw_reads))
    log.info('stats for seed reads: %s' % repr(stats_seed_reads))
    log.info('stats for corrected reads: %s' % repr(stats_corrected_reads))
    kwds = {}
    # Use -1 as a sentinel so the coverage ratios below stay computable.
    genome_length = -1 if not genome_length else genome_length
    kwds['genome_length'] = genome_length
    kwds['length_cutoff'] = 0 if length_cutoff is None else length_cutoff
    kwds['raw_reads'] = stats_raw_reads.nreads
    kwds['raw_bases'] = stats_raw_reads.total
    kwds['raw_mean'] = stats_raw_reads.total / stats_raw_reads.nreads
    kwds['raw_n50'] = stats_raw_reads.n50
    kwds['raw_p95'] = stats_raw_reads.p95
    kwds['raw_coverage'] = stats_raw_reads.total / genome_length
    kwds['raw_esize'] = stats_raw_reads.esize
    kwds['seed_reads'] = stats_seed_reads.nreads
    kwds['seed_bases'] = stats_seed_reads.total
    kwds['seed_mean'] = stats_seed_reads.total / stats_seed_reads.nreads
    kwds['seed_n50'] = stats_seed_reads.n50
    kwds['seed_p95'] = stats_seed_reads.p95
    kwds['seed_coverage'] = stats_seed_reads.total / genome_length
    kwds['seed_esize'] = stats_seed_reads.esize
    kwds['preassembled_reads'] = stats_corrected_reads.nreads
    kwds['preassembled_bases'] = stats_corrected_reads.total
    kwds['preassembled_mean'] = stats_corrected_reads.total / stats_corrected_reads.nreads
    kwds['preassembled_n50'] = stats_corrected_reads.n50
    kwds['preassembled_p95'] = stats_corrected_reads.p95
    kwds['preassembled_coverage'] = stats_corrected_reads.total / genome_length
    kwds['preassembled_esize'] = stats_corrected_reads.esize
    kwds['preassembled_yield'] = stats_corrected_reads.total / stats_seed_reads.total
    kwds['preassembled_seed_fragmentation'] = fragmentation
    kwds['preassembled_seed_truncation'] = truncation

    def round_if_float(v):
        return round(v, 3) if isinstance(v, float) else v
    result = {k: round_if_float(v) for (k, v) in viewitems(kwds)}
    return result
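

# A minimal sketch (illustration only; all numbers are made up) of
# calling stats_dict() directly with Stats tuples.
def _example_stats_dict():
    raw = Stats(nreads=100, total=1000000, n50=12000, p95=20000, esize=13000.0)
    seed = Stats(nreads=40, total=600000, n50=15000, p95=21000, esize=16000.0)
    pre = Stats(nreads=35, total=500000, n50=14000, p95=20000, esize=15000.0)
    d = stats_dict(raw, seed, pre, genome_length=1000000, length_cutoff=10000,
                   fragmentation=-1, truncation=-1)
    assert d['preassembled_yield'] == round(500000 / 600000, 3)  # 0.833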


# DEPRECATED
def make_dict(
        i_preads_fofn_fn,
        i_raw_reads_fofn_fn,
        genome_length,
        length_cutoff,
        fragmentation=-1,
        truncation=-1,
):
    raw_reads = read_lens_from_fofn(i_raw_reads_fofn_fn)
    stats_raw_reads = stats_from_sorted_readlengths(raw_reads)
    seed_reads = cutoff_reads(raw_reads, length_cutoff)
    stats_seed_reads = stats_from_sorted_readlengths(seed_reads)
    preads = read_lens_from_fofn(i_preads_fofn_fn)
    stats_preads = stats_from_sorted_readlengths(preads)
    report_dict = stats_dict(
        stats_raw_reads=stats_raw_reads,
        stats_seed_reads=stats_seed_reads,
        stats_corrected_reads=stats_preads,
        genome_length=genome_length,
        length_cutoff=length_cutoff,
        fragmentation=fragmentation,
        truncation=truncation,
    )
    return report_dict


def calc_dict(
        i_preads_fofn_fn,
        i_raw_reads_db_fn,
        genome_length,
        length_cutoff,
):
    try:
        frag = metric_fragmentation(i_preads_fofn_fn)
    except Exception:
        frag = -1.0
        log.exception('Using arbitrary fragmentation metric: {}'.format(frag))
    try:
        trunc = metric_truncation(i_raw_reads_db_fn, i_preads_fofn_fn)
    except Exception:
        trunc = -1.0
        log.exception('Using arbitrary truncation metric: {}'.format(trunc))
    raw_reads = read_lens_from_db(i_raw_reads_db_fn)
    stats_raw_reads = stats_from_sorted_readlengths(raw_reads)
    seed_reads = cutoff_reads(raw_reads, length_cutoff)
    stats_seed_reads = stats_from_sorted_readlengths(seed_reads)
    preads = read_lens_from_fofn(i_preads_fofn_fn)
    stats_preads = stats_from_sorted_readlengths(preads)
    report_dict = stats_dict(
        stats_raw_reads=stats_raw_reads,
        stats_seed_reads=stats_seed_reads,
        stats_corrected_reads=stats_preads,
        genome_length=genome_length,
        length_cutoff=length_cutoff,
        fragmentation=frag,
        truncation=trunc,
    )
    log.info('Calculated pre-assembly stats:\n{}'.format(
        pprint.pformat(report_dict)))
    return report_dict
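

# Hypothetical usage sketch (paths are made up; requires DAZZ_DB's DBdump
# on $PATH and a FALCON run layout, so this is a sketch rather than a test):
#
#   report = calc_dict(
#       i_preads_fofn_fn='1-preads_ovl/input_preads.fofn',
#       i_raw_reads_db_fn='0-rawreads/raw_reads.db',
#       genome_length=4600000,
#       length_cutoff=10000,
#   )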