dedup_a_tigs.py 5.6 KB

from falcon_kit.FastaReader import open_fasta_reader
import argparse
import sys

# import falcon_kit.align_dw as align
import falcon_kit.align_edlib as align


class TooLongError(Exception): pass


def log(msg):
    sys.stderr.write(msg)
    sys.stderr.write('\n')


def yield_single_compound(fp_in):
    """
    Loads all a_ctg with the same ctg_id and a_id.
    The header of a_ctg is:
        p_ctg_id-a_id-sub_id v w total_length total_score num_path_edges delta_len idt cov
    The first sequence in the returned list is the base a_ctg (the exact sequence
    which is part of the primary contig, i.e. the base of the bubble).
    """
    ret = []
    prev_id = None
    for r in fp_in:
        tig_id, v, w, len_, ovl, ne, delta_l, idt, cov = r.name.split()
        p_ctg_id, a_id, sub_id = tig_id.split('-')
        curr_id = (p_ctg_id, a_id)
        if prev_id is not None and prev_id != curr_id:
            yield ret
            ret = []
        prev_id = curr_id
        ret.append(r)
    yield ret


def filter_duplicate(compound_a_ctg, max_idt, max_aln_cov, min_len_diff, min_seq_len, ploidy):
    """
    Takes a list of a_ctg sequences in a compound unitig (bubble) which need
    to be deduplicated according to the parameters.
    The zeroth sequence in the list is the "base" sequence. This sequence is
    already part of the primary path by definition, and will not be output.
    """
    ret = []

    # Sanity check.
    if len(compound_a_ctg) == 0: return ret  # pragma: no cover

    # Zeroth sequence is the base seq.
    ref_seqs = [compound_a_ctg[0]]

    for i in range(1, len(compound_a_ctg)):
        header = compound_a_ctg[i].name.split()
        a_ctg_id, v, w, len_, ovl, ne, delta_l, idt, cov = header
        a_ctg_seq = compound_a_ctg[i].sequence

        # Reset the values.
        delta_l, idt, cov = 0.0, 1.0, 1.0
        is_duplicate = False

        # Align against the base sequence and all non-filtered alternate branches.
        loop_to = len(ref_seqs) if ploidy <= 0 else min(ploidy, len(ref_seqs))
        for j in range(0, loop_to):
            # Just fetch the components for readability.
            ref_ctg_id = ref_seqs[j].name.split()[0]
            ref_seq = ref_seqs[j].sequence
            log('[i = %d, j = %d] Comparing: query "%s" vs ref "%s".' % (i, j, a_ctg_id, ref_ctg_id))

            # Align.
            delta_l, idt, cov = align.get_aln_results(ref_seq, a_ctg_seq, min_seq_len)

            # Round to the floor of 2 decimal places. Needed to reproduce
            # old behaviour.
            idt = float('%.2f' % (idt))
            cov = float('%.2f' % (cov))
            log(' Rounded: new_delta_l = %d, new_idt = %.2f, new_cov = %.2f' % (delta_l, idt, cov))

            # Check if this is a duplicate.
            # The same conditions apply as in the old version.
            if 100 * idt > max_idt and \
                    100 * cov > max_aln_cov and \
                    abs(delta_l) < min_len_diff:
                is_duplicate = True
                log(' -> Duplicate!')
                break

        if not is_duplicate:
            # This branch is not a duplicate. Add it to the references,
            # so that the following branches can be compared to it afterwards.
            ref_seqs.append(compound_a_ctg[i])

            # Append the non-duplicates.
            new_header = ' '.join([a_ctg_id, v, w, len_, ovl, ne,
                                   str(delta_l), '%.2f' % (idt), '%.2f' % (cov)])
            ret.append((compound_a_ctg[i], new_header))

        log('')

    return ret


def run(fp_out, fp_in, max_idt, max_aln_cov, min_len_diff, min_seq_len, ploidy):
    for compound_a_ctg in yield_single_compound(fp_in):
        filtered_a_ctg = filter_duplicate(compound_a_ctg, max_idt, max_aln_cov, min_len_diff, min_seq_len, ploidy)
        for a_ctg, new_header in filtered_a_ctg:
            fp_out.write('>%s\n' % (new_header))
            fp_out.write(a_ctg.sequence)
            fp_out.write('\n')


def parse_args(argv):
    parser = argparse.ArgumentParser(
        description='Removes duplicate a-tigs, iff *all* conditions are violated. Assumes the working directory has the a_ctg_all.fa file, and produces a_ctg.fa',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--max-idt', type=int,
                        help="Keep a-tig if the identity (in %%) to the primary contig is <= max_idt", default=96)
    parser.add_argument('--max-aln-cov', type=int,
                        help="Keep a-tig if the alignment coverage (in %%) on the a-tig is <= max_aln_cov", default=97)
    parser.add_argument('--min-len-diff', type=int,
                        help="Keep a-tig if the length difference is > min_len_diff", default=500)
    parser.add_argument('--min-seq-len', type=int,
                        help="Branches with length less than this threshold will always be deduplicated.", default=2000)
    parser.add_argument('--ploidy', type=int,
                        help="For a diploid genome, 2 branches per SV are expected. This parameter limits the number of pairwise comparisons. If <= 0, this threshold is not applied.", default=2)
    parser.add_argument('--a-ctg-all', type=str,
                        help="Input set of all associate contigs for deduplication.", default="a_ctg_all.fa")
    args = parser.parse_args(argv[1:])
    return args


def main(argv=sys.argv):
    args = parse_args(argv)
    with open_fasta_reader(args.a_ctg_all) as fp_in:
        run(sys.stdout, fp_in, args.max_idt, args.max_aln_cov, args.min_len_diff, args.min_seq_len, args.ploidy)


if __name__ == "__main__":  # pragma: no cover
    main(sys.argv)  # pragma: no cover
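

# Example invocation (a sketch; assumes the script is run directly by file name
# from a FALCON working directory containing a_ctg_all.fa, with stdout captured
# as a_ctg.fa). All options shown are the argparse defaults above.
#
#   python dedup_a_tigs.py --max-idt 96 --max-aln-cov 97 --min-len-diff 500 \
#       --min-seq-len 2000 --ploidy 2 > a_ctg.fa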