  1. """ PreAssembly Report.
  2. See FALCON-pbsmrtpipe/pbfalcon/report_preassembly.py for XML version.
  3. """
  4. # Copied from
  5. # http://swarm/files/depot/branches/springfield/S2.3/software/smrtanalysis/bioinformatics/tools/pbreports/pbreports/report/preassembly.py
  6. from future.utils import viewitems
  7. from builtins import object
  8. from .FastaReader import open_fasta_reader
  9. from .util.io import syscall
  10. from . import functional
  11. import collections
  12. import glob
  13. import itertools
  14. import logging
  15. import os
  16. import pprint
  17. import re
  18. log = logging.getLogger(__name__)
  19. __version__ = '0.1'
  20. Stats = collections.namedtuple(
  21. 'FastaStats', ['nreads', 'total', 'n50', 'p95', 'esize'])
  22. # Copied from pbreports/util.py
  23. # We want to avoid a dependency on pbreports b/c it needs matplotlib.
def get_fasta_readlengths(fasta_file):
    """
    Get a sorted list of contig lengths.
    :return: list of int, sorted ascending
    """
    lens = []
    with open_fasta_reader(fasta_file) as f:
        for record in f:
            lens.append(len(record.sequence))
    lens.sort()
    return lens


def get_db_readlengths(fn):
    """Use DBdump on a DAZZ_DB.
    If DBsplit was run, then we see the filtered reads only, since we do not provide '-u' to DBdump.
    """
    call = 'DBdump -h {}'.format(fn)
    return list(functional.parsed_readlengths_from_dbdump_output(syscall(call)))


class FastaContainer(object):

    def __init__(self, nreads, total, file_name):
        self.nreads = nreads
        self.total = total
        self.file_name = file_name

    @staticmethod
    def from_file(file_name):
        # nreads, total = _compute_values(file_name)
        read_lens = get_fasta_readlengths(file_name)
        nreads = len(read_lens)
        total = sum(read_lens)
        return FastaContainer(nreads, total, file_name)

    def __str__(self):
        return "N {n} Total {t} File: {f}".format(n=self.nreads, t=self.total, f=self.file_name)


def _validate_file(file_name):
    if os.path.isfile(file_name):
        return os.path.abspath(file_name)
    else:
        msg = "Unable to find {f}".format(f=file_name)
        log.error(msg)
        raise IOError(msg)


def cutoff_reads(read_lens, min_read_len):
    return [rl for rl in read_lens if rl >= min_read_len]


def read_len_above(read_lens, threshold):
    subtotal = 0
    # Reverse-order calculation is faster.
    for irev, rl in enumerate(reversed(read_lens)):
        subtotal += rl
        if subtotal >= threshold:
            return rl
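# A worked N50 example (illustrative values only): for sorted read_lens [2, 4, 6, 8]
# and threshold int(20 * 0.50) == 10, the reverse cumulative sum is 8, then 8 + 6 == 14 >= 10,
# so read_len_above() returns 6. stats_from_sorted_readlengths() below uses exactly this call
# to compute the N50.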


def percentile(read_lens, p):
    # Clamp the index so that p=1.0 returns the last (longest) read instead of raising IndexError.
    idx = min(int(len(read_lens) * p), len(read_lens) - 1)
    return read_lens[idx]


def stats_from_sorted_readlengths(read_lens):
    nreads = len(read_lens)
    total = sum(read_lens)
    sum_squares = sum(r * r for r in read_lens)
    n50 = read_len_above(read_lens, int(total * 0.50))
    p95 = percentile(read_lens, 0.95)
    esize = sum_squares / total
    #alt_n50 = pbreports.util.compute_n50(read_lens)
    # log.info('our n50=%s, pbreports=%s' %(n50, alt_n50)) # Ours is more correct when median is between 2 reads.
    return Stats(nreads=nreads, total=total, n50=n50, p95=p95, esize=esize)
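# Worked example (illustrative values only): for read_lens == [2, 4, 6, 8]:
#   total == 20, sum_squares == 4 + 16 + 36 + 64 == 120, n50 == 6 (see read_len_above above),
#   p95 == read_lens[3] == 8, esize == 120 / 20 == 6.0,
# giving Stats(nreads=4, total=20, n50=6, p95=8, esize=6.0).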


def read_lens_from_fofn(fofn_fn):
    """Return sorted list.
    """
    fns = [fn.strip() for fn in open(fofn_fn) if fn.strip()]
    # get_fasta_readlengths() returns sorted, so sorting the chain is roughly linear.
    return list(sorted(itertools.chain.from_iterable(get_fasta_readlengths(fn) for fn in fns)))


def read_lens_from_db(db_fn):
    """Return sorted read-lengths from a DAZZ_DB.
    """
    return list(sorted(get_db_readlengths(db_fn)))


def abs_filenames(fofn_fn):
    fofn_dir = os.path.dirname(fofn_fn)

    def abs_fn(fn):
        if os.path.isabs(fn):
            return fn
        return os.path.join(fofn_dir, fn)
    fns = [abs_fn(fn.strip()) for fn in open(fofn_fn) if fn.strip()]
    return fns
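# For example (hypothetical paths): if /data/run/input.fofn contains the line
# "preads/out.0.fasta", abs_filenames('/data/run/input.fofn') returns
# ['/data/run/preads/out.0.fasta']; entries that are already absolute pass through unchanged.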


def metric_fragmentation(preads_fofn):
    # https://jira.pacificbiosciences.com/browse/SAT-105
    # sed -nr 's;>prolog/([0-9]*)[0-9]/.*;\1;p' %s/*.fasta | sort | uniq -c | awk '{print $1}' | sort | uniq -c
    fastas = abs_filenames(preads_fofn)
    assert fastas, 'No fasta found in {!r}'.format(preads_fofn)
    call = r"""perl -e 'while (<>) { if ( m{>[^/]+/(\d+)\d/} ) { $id{$1}++; } }; while (my ($k, $v) = each %%id) { $counts{$v}++; }; while (my ($k, $v) = each %%counts) { print "$v $k\n"; };' %s""" % (' '.join(fastas))
    counts = syscall(call)
    return functional.calc_metric_fragmentation(counts)
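# Reading the one-liner above: %id counts preads per seed-read id (the capture drops the
# trailing digit of the numeric header field), %counts histograms those per-seed counts, and
# each output line is "<number_of_seeds> <preads_per_seed>". That histogram text is what
# functional.calc_metric_fragmentation() reduces to the single fragmentation value.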


def metric_truncation(db, preads_fofn):
    # https://jira.pacificbiosciences.com/browse/SAT-105
    fastas = abs_filenames(preads_fofn)
    assert fastas, 'No fasta found in {!r}'.format(preads_fofn)
    call = r"""perl -e 'while (<>) { if ( m{>[^/]+/0*(\d+)\d/(\d+)_(\d+)} ) { $lengths{(1 + $1)} += ($3 - $2); } }; while (my ($k, $v) = each %%lengths) { print "$k $v\n"; };' %s""" % (' '.join(fastas))
    # The +1 is because DBdump read-ids start at 1, but these start at 0.
    length_pairs_output = syscall(call)
    call = 'DBdump -rh {}'.format(db)
    dbdump_output = syscall(call)
    return functional.calc_metric_truncation(dbdump_output, length_pairs_output)
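# Reading the one-liner above: each output line is "<seed_read_id> <summed_pread_bases>",
# where the bases come from the <start>_<end> span in each pread header and the id is shifted
# by +1 to line up with DBdump numbering; functional.calc_metric_truncation() combines those
# totals with the per-read lengths reported by 'DBdump -rh' to produce the truncation value.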


def stats_dict(stats_raw_reads, stats_seed_reads, stats_corrected_reads, genome_length, length_cutoff,
               fragmentation, truncation):
    """The stats_* arguments are Stats namedtuples (see stats_from_sorted_readlengths).
    genome_length and length_cutoff can be None.
    """
    log.info('stats for raw reads: %s' % repr(stats_raw_reads))
    log.info('stats for seed reads: %s' % repr(stats_seed_reads))
    log.info('stats for corrected reads: %s' % repr(stats_corrected_reads))
    kwds = {}
    genome_length = -1 if not genome_length else genome_length
    kwds['genome_length'] = genome_length
    kwds['length_cutoff'] = 0 if length_cutoff is None else length_cutoff
    kwds['raw_reads'] = stats_raw_reads.nreads
    kwds['raw_bases'] = stats_raw_reads.total
    kwds['raw_mean'] = stats_raw_reads.total / stats_raw_reads.nreads
    kwds['raw_n50'] = stats_raw_reads.n50
    kwds['raw_p95'] = stats_raw_reads.p95
    kwds['raw_coverage'] = stats_raw_reads.total / genome_length
    kwds['raw_esize'] = stats_raw_reads.esize
    kwds['seed_reads'] = stats_seed_reads.nreads
    kwds['seed_bases'] = stats_seed_reads.total
    kwds['seed_mean'] = stats_seed_reads.total / stats_seed_reads.nreads
    kwds['seed_n50'] = stats_seed_reads.n50
    kwds['seed_p95'] = stats_seed_reads.p95
    kwds['seed_coverage'] = stats_seed_reads.total / genome_length
    kwds['seed_esize'] = stats_seed_reads.esize
    kwds['preassembled_reads'] = stats_corrected_reads.nreads
    kwds['preassembled_bases'] = stats_corrected_reads.total
    kwds['preassembled_mean'] = stats_corrected_reads.total / stats_corrected_reads.nreads
    kwds['preassembled_n50'] = stats_corrected_reads.n50
    kwds['preassembled_p95'] = stats_corrected_reads.p95
    kwds['preassembled_coverage'] = stats_corrected_reads.total / genome_length
    kwds['preassembled_esize'] = stats_corrected_reads.esize
    kwds['preassembled_yield'] = stats_corrected_reads.total / stats_seed_reads.total
    kwds['preassembled_seed_fragmentation'] = fragmentation
    kwds['preassembled_seed_truncation'] = truncation

    def round_if_float(v):
        return v if type(v) is not float else round(v, 3)
    result = {k: round_if_float(v) for (k, v) in viewitems(kwds)}
    return result
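# Shape of the returned mapping (illustrative values only, true division assumed), using the
# tiny Stats(nreads=4, total=20, n50=6, p95=8, esize=6.0) from the example above for all three
# read classes, with genome_length=100 and length_cutoff=3:
#   {'genome_length': 100, 'length_cutoff': 3,
#    'raw_reads': 4, 'raw_bases': 20, 'raw_mean': 5.0, 'raw_n50': 6, 'raw_p95': 8,
#    'raw_coverage': 0.2, 'raw_esize': 6.0,
#    'preassembled_yield': 1.0, ...}
# plus the analogous seed_* and preassembled_* keys, and the fragmentation/truncation
# values passed in.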


# DEPRECATED
def make_dict(
        i_preads_fofn_fn,
        i_raw_reads_fofn_fn,
        genome_length,
        length_cutoff,
        fragmentation=-1,
        truncation=-1,
):
    raw_reads = read_lens_from_fofn(i_raw_reads_fofn_fn)
    stats_raw_reads = stats_from_sorted_readlengths(raw_reads)
    seed_reads = cutoff_reads(raw_reads, length_cutoff)
    stats_seed_reads = stats_from_sorted_readlengths(seed_reads)
    preads = read_lens_from_fofn(i_preads_fofn_fn)
    stats_preads = stats_from_sorted_readlengths(preads)
    report_dict = stats_dict(
        stats_raw_reads=stats_raw_reads,
        stats_seed_reads=stats_seed_reads,
        stats_corrected_reads=stats_preads,
        genome_length=genome_length,
        length_cutoff=length_cutoff,
        fragmentation=fragmentation,
        truncation=truncation,
    )
    return report_dict


def calc_dict(
        i_preads_fofn_fn,
        i_raw_reads_db_fn,
        genome_length,
        length_cutoff,
):
    try:
        frag = metric_fragmentation(i_preads_fofn_fn)
    except Exception:
        frag = -1.0
        log.exception('Using arbitrary fragmentation metric: {}'.format(frag))
    try:
        trunc = metric_truncation(i_raw_reads_db_fn, i_preads_fofn_fn)
    except Exception:
        trunc = -1.0
        log.exception('Using arbitrary truncation metric: {}'.format(trunc))
    raw_reads = read_lens_from_db(i_raw_reads_db_fn)
    stats_raw_reads = stats_from_sorted_readlengths(raw_reads)
    seed_reads = cutoff_reads(raw_reads, length_cutoff)
    stats_seed_reads = stats_from_sorted_readlengths(seed_reads)
    preads = read_lens_from_fofn(i_preads_fofn_fn)
    stats_preads = stats_from_sorted_readlengths(preads)
    report_dict = stats_dict(
        stats_raw_reads=stats_raw_reads,
        stats_seed_reads=stats_seed_reads,
        stats_corrected_reads=stats_preads,
        genome_length=genome_length,
        length_cutoff=length_cutoff,
        fragmentation=frag,
        truncation=trunc,
    )
    log.info('Calculated pre-assembly stats:\n{}'.format(
        pprint.pformat(report_dict)))
    return report_dict
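

# Minimal usage sketch (hypothetical paths; assumes DAZZ_DB's DBdump is on PATH and that the
# FOFN lists the corrected-read fasta files). Not part of the library API proper.
if __name__ == '__main__':
    import json
    report = calc_dict(
        i_preads_fofn_fn='input_preads.fofn',  # hypothetical FOFN of pread fastas
        i_raw_reads_db_fn='raw_reads.db',      # hypothetical DAZZ_DB of raw reads
        genome_length=4600000,                 # assumed; may be None
        length_cutoff=12000,                   # assumed seed-read length cutoff
    )
    print(json.dumps(report, indent=2, sort_keys=True))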