fc_asm_graph.py 5.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180
  1. from builtins import zip
  2. from builtins import object
  3. from .FastaReader import open_fasta_reader
  4. from .io import FilePercenter
  5. import networkx as nx
  6. RCMAP = dict(list(zip("ACGTacgtNn-", "TGCAtgcaNn-")))
  7. def reverse_end(node_id):
  8. node_id, end = node_id.split(":")
  9. new_end = "B" if end == "E" else "E"
  10. return node_id + ":" + new_end
  11. class AsmGraph(object):
  12. def __init__(self, sg_file, utg_file, ctg_file):
  13. self.sg_edges = {}
  14. self.sg_edge_seqs = {}
  15. self.utg_data = {}
  16. self.ctg_data = {}
  17. self.utg_to_ctg = {}
  18. self.node_to_ctg = {}
  19. self.node_to_utg = {}
  20. self.load_sg_data(sg_file)
  21. self.load_utg_data(utg_file)
  22. self.load_ctg_data(ctg_file)
  23. self.build_node_map()
  24. def load_sg_data(self, sg_file):
  25. counter = FilePercenter(sg_file)
  26. with open(sg_file) as f:
  27. for l in f:
  28. counter(len(l))
  29. l = l.strip().split()
  30. v, w = l[0:2]
  31. seq_id, b, e = l[2:5]
  32. b, e = int(b), int(e)
  33. score, idt = l[5:7]
  34. score, idt = int(score), float(idt)
  35. type_ = l[7]
  36. self.sg_edges[(v, w)] = ((seq_id, b, e), score, idt, type_)
  37. def load_sg_seq(self, fasta_fn):
  38. all_read_ids = set() # read ids in the graph
  39. for v, w in self.sg_edges:
  40. type_ = self.sg_edges[(v, w)][-1]
  41. if type_ != "G":
  42. continue
  43. v = v.split(":")[0]
  44. w = w.split(":")[0]
  45. all_read_ids.add(v)
  46. all_read_ids.add(w)
  47. seqs = {}
  48. # load all p-read name into memory
  49. with open_fasta_reader(fasta_fn) as f:
  50. for r in f:
  51. if r.name not in all_read_ids:
  52. continue
  53. seqs[r.name] = r.sequence.upper()
  54. for v, w in self.sg_edges:
  55. seq_id, s, t = self.sg_edges[(v, w)][0]
  56. type_ = self.sg_edges[(v, w)][-1]
  57. if type_ != "G":
  58. continue
  59. if s < t:
  60. e_seq = seqs[seq_id][s:t]
  61. else:
  62. e_seq = "".join([RCMAP[c] for c in seqs[seq_id][t:s][::-1]])
  63. self.sg_edge_seqs[(v, w)] = e_seq
  64. def get_seq_from_path(self, path):
  65. if len(self.sg_edge_seqs) == 0:
  66. return ""
  67. v = path[0]
  68. seqs = []
  69. for w in path[1:]:
  70. seqs.append(self.sg_edge_seqs[(v, w)])
  71. v = w
  72. return "".join(seqs)
  73. def load_utg_data(self, utg_file):
  74. counter = FilePercenter(utg_file)
  75. with open(utg_file) as f:
  76. for l in f:
  77. counter(len(l))
  78. l = l.strip().split()
  79. s, v, t = l[0:3]
  80. type_, length, score = l[3:6]
  81. length, score = int(length), int(score)
  82. path_or_edges = l[6]
  83. self.utg_data[(s, t, v)] = (
  84. type_, length, score, path_or_edges)
  85. def load_ctg_data(self, ctg_file):
  86. counter = FilePercenter(ctg_file)
  87. with open(ctg_file) as f:
  88. for l in f:
  89. counter(len(l))
  90. l = l.strip().split()
  91. ctg_id, ctg_type = l[0:2]
  92. start_edge = l[2]
  93. end_node = l[3]
  94. length = int(l[4])
  95. score = int(l[5])
  96. path = tuple((e.split("~") for e in l[6].split("|")))
  97. self.ctg_data[ctg_id] = (
  98. ctg_type, start_edge, end_node, length, score, path)
  99. for u in path:
  100. s, v, t = u
  101. # rint s,v,t
  102. type_, length, score, path_or_edges = self.utg_data[(
  103. s, t, v)]
  104. if type_ != "compound":
  105. self.utg_to_ctg[(s, t, v)] = ctg_id
  106. else:
  107. for svt in path_or_edges.split("|"):
  108. s, v, t = svt.split("~")
  109. self.utg_to_ctg[(s, t, v)] = ctg_id
  110. def get_sg_for_utg(self, utg_id):
  111. sg = nx.DiGraph()
  112. type_, length, score, path_or_edges = self.utg_data[utg_id]
  113. if type_ == "compound":
  114. for svt in path_or_edges.split("|"):
  115. s, v, t = svt.split("~")
  116. type_, length, score, one_path = self.utg_data[(s, t, v)]
  117. one_path = one_path.split("~")
  118. nx.add_path(sg, one_path)
  119. else:
  120. one_path = path_or_edges.split("~")
  121. nx.add_path(sg, one_path)
  122. return sg
  123. def get_sg_for_ctg(self, ctg_id):
  124. sg = nx.DiGraph()
  125. utgs = []
  126. path = self.ctg_data[ctg_id][-1]
  127. for s, v, t in path:
  128. type_, length, score, path_or_edges = self.utg_data[(s, t, v)]
  129. utgs.append((type_, path_or_edges))
  130. for t, utg in utgs:
  131. if t == "simple":
  132. one_path = utg.split("~")
  133. nx.add_path(sg, one_path)
  134. elif t == "compound":
  135. for svt in utg.split("|"):
  136. s, v, t = svt.split("~")
  137. type_, length, score, one_path = self.utg_data[(s, t, v)]
  138. one_path = one_path.split("~")
  139. nx.add_path(sg, one_path)
  140. return sg
  141. def build_node_map(self):
  142. for ctg_id in self.ctg_data:
  143. sg = self.get_sg_for_ctg(ctg_id)
  144. for n in sg.nodes():
  145. self.node_to_ctg.setdefault(n, set())
  146. self.node_to_ctg[n].add(ctg_id)
  147. for u_id in self.utg_data:
  148. if self.utg_data[u_id][0] == "compound":
  149. continue
  150. sg = self.get_sg_for_utg(u_id)
  151. for n in sg.nodes():
  152. self.node_to_utg.setdefault(n, set())
  153. self.node_to_utg[n].add(u_id)