zmw_subsample.py 6.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160
  1. """
  2. Takes a CSV file with a list of ZMWs with their corresponding lengths.
  3. The script outputs a JSON file with a whitelist of ZMWs selected by a given
  4. strategy (random, longest, etc.) and desired coverage of a genome.
  5. Author: Ivan Sovic
  6. """
  7. import falcon_kit.util.system as system
  8. import falcon_kit.io as io
  9. import os
  10. import sys
  11. import argparse
  12. import logging
  13. import contextlib
  14. import itertools
  15. import random
  16. import json
  17. LOG = logging.getLogger()
  18. STRATEGY_RANDOM = 'random'
  19. STRATEGY_LONGEST = 'longest'
  20. def strategy_func_random(zmws):
  21. """
  22. >>> random.seed(12345); strategy_func_random([])
  23. []
  24. >>> random.seed(12345); strategy_func_random([('synthetic/1', 9)])
  25. [('synthetic/1', 9)]
  26. >>> random.seed(12345); strategy_func_random([('synthetic/1', 9), ('synthetic/2', 21), ('synthetic/3', 9), ('synthetic/4', 15), ('synthetic/5', 20)])
  27. [('synthetic/5', 20), ('synthetic/3', 9), ('synthetic/2', 21), ('synthetic/1', 9), ('synthetic/4', 15)]
  28. """
  29. ret = list(zmws)
  30. random.shuffle(ret)
  31. return ret
  32. def strategy_func_longest(zmws):
  33. """
  34. >>> strategy_func_longest([])
  35. []
  36. >>> strategy_func_longest([('synthetic/1', 9)])
  37. [('synthetic/1', 9)]
  38. >>> strategy_func_longest([('synthetic/1', 9), ('synthetic/2', 21), ('synthetic/3', 9), ('synthetic/4', 15), ('synthetic/5', 20)])
  39. [('synthetic/2', 21), ('synthetic/5', 20), ('synthetic/4', 15), ('synthetic/1', 9), ('synthetic/3', 9)]
  40. """
  41. return sorted(zmws, key = lambda x: x[1], reverse = True)
  42. STRATEGY_TYPE_TO_FUNC = { STRATEGY_RANDOM: strategy_func_random,
  43. STRATEGY_LONGEST: strategy_func_longest,
  44. }
  45. def select_zmws(zmws, min_requested_bases):
  46. """
  47. >>> select_zmws([], 0)
  48. ([], 0)
  49. >>> select_zmws([], 10)
  50. ([], 0)
  51. >>> select_zmws([('zmw/1', 1), ('zmw/2', 2), ('zmw/3', 5), ('zmw/4', 7), ('zmw/5', 10), ('zmw/6', 15)], 10)
  52. (['zmw/1', 'zmw/2', 'zmw/3', 'zmw/4'], 15)
  53. >>> select_zmws([('zmw/1', 1), ('zmw/2', 2), ('zmw/3', 5), ('zmw/4', 7), ('zmw/5', 10), ('zmw/6', 15)], 20)
  54. (['zmw/1', 'zmw/2', 'zmw/3', 'zmw/4', 'zmw/5'], 25)
  55. >>> select_zmws([('zmw/1', 1), ('zmw/1', 2), ('zmw/1', 5), ('zmw/1', 7), ('zmw/1', 10), ('zmw/1', 15)], 20)
  56. (['zmw/1', 'zmw/1', 'zmw/1', 'zmw/1', 'zmw/1'], 25)
  57. """
  58. # Select the first N ZMWs which sum up to the desired coverage.
  59. num_bases = 0
  60. subsampled_zmws = []
  61. for zmw_name, seq_len in zmws:
  62. num_bases += seq_len
  63. subsampled_zmws.append(zmw_name)
  64. if num_bases >= min_requested_bases:
  65. break
  66. return subsampled_zmws, num_bases
  67. def calc_stats(total_unique_molecular_bases, total_bases, output_bases, genome_size, coverage):
  68. """
  69. >>> calc_stats(0, 0, 0, 0, 0) == \
  70. {'genome_size': 0, 'coverage': 0, 'total_bases': 0, 'total_unique_molecular_bases': 0, \
  71. 'output_bases': 0, 'unique_molecular_avg_cov': 0.0, 'output_avg_cov': 0.0, 'total_avg_cov': 0.0}
  72. True
  73. >>> calc_stats(10000, 100000, 2000, 1000, 2) == \
  74. {'genome_size': 1000, 'coverage': 2, 'total_bases': 100000, 'total_unique_molecular_bases': 10000, \
  75. 'output_bases': 2000, 'unique_molecular_avg_cov': 10.0, 'output_avg_cov': 2.0, 'total_avg_cov': 100.0}
  76. True
  77. """
  78. unique_molecular_avg_cov = 0.0 if genome_size == 0 else float(total_unique_molecular_bases) / float(genome_size)
  79. total_avg_cov = 0.0 if genome_size == 0 else float(total_bases) / float(genome_size)
  80. output_avg_cov = 0.0 if genome_size == 0 else float(output_bases) / float(genome_size)
  81. ret = {
  82. 'genome_size': genome_size,
  83. 'coverage': coverage,
  84. 'total_bases': total_bases,
  85. 'total_unique_molecular_bases': total_unique_molecular_bases,
  86. 'output_bases': output_bases,
  87. 'total_avg_cov': total_avg_cov,
  88. 'unique_molecular_avg_cov': unique_molecular_avg_cov,
  89. 'output_avg_cov': output_avg_cov,
  90. }
  91. return ret
  92. def collect_zmws(fp_in):
  93. zmws = []
  94. seen_zmws = set()
  95. unique_molecular_size = 0
  96. total_size = 0
  97. for line in fp_in:
  98. sl = line.strip().split()
  99. movie_zmw, zmw_median_len, zmw_total_len, zmw_num_passes = sl[0], int(sl[1]), int(sl[2]), int(sl[3])
  100. assert movie_zmw not in seen_zmws, 'Duplicate ZMWs detected in the input. Offender: "{}".'.format(movie_zmw)
  101. unique_molecular_size += zmw_median_len
  102. total_size += zmw_total_len
  103. zmws.append((movie_zmw, zmw_median_len))
  104. seen_zmws.add(movie_zmw)
  105. return zmws, unique_molecular_size, total_size
  106. def run(fp_in, coverage, genome_size, strategy_func):
  107. zmws, total_unique_molecular_bases, total_bases = collect_zmws(fp_in)
  108. zmws = strategy_func(zmws)
  109. subsampled_zmws, output_bases = select_zmws(zmws, coverage * genome_size)
  110. stats_dict = calc_stats(total_unique_molecular_bases, total_bases, output_bases, genome_size, coverage)
  111. return subsampled_zmws, zmws, stats_dict
  112. def parse_args(argv):
  113. parser = argparse.ArgumentParser(description="Produces a list of ZMW where the median unique molecular "\
  114. "coverage sums up to the desired coverage of the given genome size, "\
  115. "given a specified subsampling strategy. Input is a TSV passed via stdin. "\
  116. "Output is to stdout.",
  117. formatter_class=argparse.ArgumentDefaultsHelpFormatter)
  118. parser.add_argument('--strategy', type=str, default='random',
  119. help='Subsampling strategy: random, longest')
  120. parser.add_argument('--coverage', type=float, default=60,
  121. help='Desired coverage for subsampling.')
  122. parser.add_argument('--genome-size', type=float, default=0,
  123. help='Genome size estimate of the input dataset.', required=True)
  124. parser.add_argument('--random-seed', type=int, default=12345,
  125. help='Seed value used for the random generator.', required=False)
  126. parser.add_argument('out_fn', type=str, default='zmw.whitelist.json',
  127. help='Output JSON file with subsampled ZMWs.')
  128. args = parser.parse_args(argv[1:])
  129. return args
  130. def main(argv=sys.argv):
  131. args = parse_args(argv)
  132. logging.basicConfig(level=logging.INFO)
  133. strategy_func = STRATEGY_TYPE_TO_FUNC[args.strategy]
  134. LOG.info('Using subsampling strategy: "{}"'.format(args.strategy))
  135. system.set_random_seed(args.random_seed)
  136. zmws_whitelist, zmws_all, stats_dict = run(
  137. sys.stdin, args.coverage, args.genome_size, strategy_func)
  138. io.serialize(args.out_fn, zmws_whitelist)
  139. if __name__ == "__main__": # pragma: no cover
  140. main(sys.argv) # pragma: no cover