fetch_reads.py 5.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158
  1. from falcon_kit.FastaReader import open_fasta_reader
  2. import argparse
  3. import contextlib
  4. import os
  5. import glob
  6. import sys
  7. import re
  8. def fetch_ref_and_reads(base_dir, fofn, ctg_id, out_dir, min_ctg_lenth):
  9. read_fofn = fofn
  10. if out_dir == None:
  11. out_dir = os.path.join(base_dir, '3-unzip/reads')
  12. ctg_fa = os.path.join(base_dir, '2-asm-falcon/p_ctg.fa')
  13. read_map_dir = os.path.join(base_dir, '2-asm-falcon/read_maps')
  14. rawread_id_file = os.path.join(
  15. read_map_dir, 'dump_rawread_ids', 'rawread_ids')
  16. pread_id_file = os.path.join(read_map_dir, 'dump_pread_ids', 'pread_ids')
  17. rid_to_oid = open(rawread_id_file).read().split(
  18. '\n') # daligner raw read id to the original ids
  19. pid_to_fid = open(pread_id_file).read().split(
  20. '\n') # daligner pread id to the fake ids
  21. assert rid_to_oid, 'Empty rid_to_oid. Maybe empty {!r}?'.format(
  22. rawread_id_file)
  23. assert pid_to_fid, 'Empty pid_to_fid. Maybe empty {!r}?'.format(
  24. pread_id_file)
  25. def pid_to_oid(pid):
  26. fid = pid_to_fid[int(pid)]
  27. rid = int(fid.split('/')[1]) // 10
  28. return rid_to_oid[int(rid)]
  29. with open_fasta_reader(ctg_fa) as ref_fasta:
  30. all_ctg_ids = set()
  31. for s in ref_fasta:
  32. s_id = s.name.split()[0]
  33. if ctg_id != 'all' and s_id != ctg_id:
  34. continue
  35. if len(s.sequence) < min_ctg_lenth:
  36. continue
  37. if ctg_id != 'all':
  38. ref_out = open(os.path.join(
  39. out_dir, '%s_ref.fa' % ctg_id), 'w')
  40. else:
  41. ref_out = open(os.path.join(out_dir, '%s_ref.fa' % s_id), 'w')
  42. print('>%s' % s_id, file=ref_out)
  43. print(s.sequence, file=ref_out)
  44. all_ctg_ids.add(s_id)
  45. ref_out.close()
  46. read_set = {}
  47. ctg_id_hits = {}
  48. map_fn = os.path.join(read_map_dir, 'rawread_to_contigs')
  49. with open(map_fn, 'r') as f:
  50. for row in f:
  51. row = row.strip().split()
  52. hit_ctg = row[1]
  53. hit_ctg = hit_ctg.split('-')[0]
  54. if int(row[3]) == 0:
  55. o_id = rid_to_oid[int(row[0])]
  56. read_set[o_id] = hit_ctg
  57. ctg_id_hits[hit_ctg] = ctg_id_hits.get(hit_ctg, 0) + 1
  58. assert read_set, 'Empty read_set. Maybe empty {!r}?'.format(map_fn)
  59. map_fn = os.path.join(read_map_dir, 'pread_to_contigs')
  60. with open(map_fn, 'r') as f:
  61. for row in f:
  62. row = row.strip().split()
  63. hit_ctg = row[1]
  64. hit_ctg = hit_ctg.split('-')[0]
  65. if hit_ctg not in read_set and int(row[3]) == 0:
  66. o_id = pid_to_oid(row[0])
  67. read_set[o_id] = hit_ctg
  68. ctg_id_hits[hit_ctg] = ctg_id_hits.get(hit_ctg, 0) + 1
  69. with open(os.path.join(out_dir, 'ctg_list'), 'w') as f:
  70. for ctg_id in sorted(list(all_ctg_ids)):
  71. if ctg_id_hits.get(ctg_id, 0) < 5:
  72. continue
  73. # ignore small circle contigs, they need different approach
  74. if ctg_id[-1] not in ['F', 'R']:
  75. continue
  76. print(ctg_id, file=f)
  77. read_out_files = {}
  78. @contextlib.contextmanager
  79. def reopened_fasta_out(ctg_id):
  80. # A convenient closure, with a contextmanager.
  81. if ctg_id not in read_out_files:
  82. read_out = open(os.path.join(out_dir, '%s_reads.fa' % ctg_id), 'w')
  83. read_out_files[ctg_id] = 1
  84. else:
  85. read_out = open(os.path.join(out_dir, '%s_reads.fa' % ctg_id), 'a')
  86. yield read_out
  87. read_out.close()
  88. with open(read_fofn, 'r') as f:
  89. for r_fn in f:
  90. r_fn = r_fn.strip()
  91. # will soon handle .dexta too
  92. with open_fasta_reader(r_fn) as read_fa_file:
  93. for r in read_fa_file:
  94. rid = r.name.split()[0]
  95. if rid not in read_set:
  96. ctg_id = 'unassigned'
  97. else:
  98. ctg_id = read_set[rid]
  99. if ctg_id == 'NA' or ctg_id not in all_ctg_ids:
  100. ctg_id = 'unassigned'
  101. with reopened_fasta_out(ctg_id) as read_out:
  102. print('>' + rid, file=read_out)
  103. print(r.sequence, file=read_out)
  104. def parse_args(argv):
  105. parser = argparse.ArgumentParser(
  106. description='using the read to contig mapping data to partition the reads grouped by contigs')
  107. parser.add_argument('--base_dir', type=str, default='./',
  108. help='the base working dir of a falcon assembly')
  109. parser.add_argument('--fofn', type=str, default='./input.fofn',
  110. help='path to the file of the list of raw read fasta files')
  111. parser.add_argument('--ctg_id', type=str, default='all',
  112. help='contig identifier in the contig fasta file')
  113. parser.add_argument('--out_dir', default=None, type=str,
  114. help='the output base_dir, default to `base_dir/3-unzip/reads` directory')
  115. parser.add_argument('--min_ctg_lenth', default=20000, type=int,
  116. help='the minimum length of the contig for the outputs, default=20000')
  117. #parser.add_argument('--ctg_fa', type=str, default='./2-asm-falcon/p_ctg.fa', help='path to the contig fasta file')
  118. #parser.add_argument('--read_map_dir', type=str, default='./2-asm-falcon/read_maps', help='path to the read-contig map directory')
  119. # we can run this in parallel mode in the furture
  120. # parser.add_argument('--n_core', type=int, default=4,
  121. # help='number of processes used for generating consensus')
  122. args = parser.parse_args(argv[1:])
  123. return args
  124. def main(argv=sys.argv):
  125. args = parse_args(argv)
  126. fetch_ref_and_reads(**vars(args))
  127. if __name__ == '__main__':
  128. main()