123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181 |
- from builtins import zip
- from builtins import range
- from falcon_kit import kup, falcon, DWA
- from falcon_kit.fc_asm_graph import AsmGraph
- import networkx as nx
- import sys
- RCMAP = dict(list(zip("ACGTacgtNn-", "TGCAtgcaNn-")))
- def rc(seq):
- return "".join([RCMAP[c] for c in seq[::-1]])
- def get_aln_data(t_seq, q_seq):
- aln_data = []
- K = 8
- seq0 = t_seq
- lk_ptr = kup.allocate_kmer_lookup(1 << (K * 2))
- sa_ptr = kup.allocate_seq(len(seq0))
- sda_ptr = kup.allocate_seq_addr(len(seq0))
- kup.add_sequence(0, K, seq0, len(seq0), sda_ptr, sa_ptr, lk_ptr)
- q_id = "dummy"
- kmer_match_ptr = kup.find_kmer_pos_for_seq(
- q_seq, len(q_seq), K, sda_ptr, lk_ptr)
- kmer_match = kmer_match_ptr[0]
- aln_range_ptr = kup.find_best_aln_range(kmer_match_ptr, K, K * 5, 12)
- aln_range = aln_range_ptr[0]
- x, y = list(zip(* [(kmer_match.query_pos[i], kmer_match.target_pos[i])
- for i in range(kmer_match.count)]))
- kup.free_kmer_match(kmer_match_ptr)
- s1, e1, s2, e2 = aln_range.s1, aln_range.e1, aln_range.s2, aln_range.e2
- if e1 - s1 > 100:
- alignment = DWA.align(q_seq[s1:e1], e1 - s1,
- seq0[s2:e2], e2 - s2,
- 1500, 1)
- if alignment[0].aln_str_size > 100:
- aln_data.append((q_id, 0, s1, e1, len(q_seq), s2, e2, len(
- seq0), alignment[0].aln_str_size, alignment[0].dist))
- aln_str1 = alignment[0].q_aln_str
- aln_str0 = alignment[0].t_aln_str
- DWA.free_alignment(alignment)
- kup.free_kmer_lookup(lk_ptr)
- kup.free_seq_array(sa_ptr)
- kup.free_seq_addr_array(sda_ptr)
- return aln_data, x, y
- def main(argv=sys.argv):
- G_asm = AsmGraph("sg_edges_list", "utg_data", "ctg_paths")
- G_asm.load_sg_seq("preads4falcon.fasta")
- utg_out = open("utgs.fa", "w")
- for utg in G_asm.utg_data:
- s, t, v = utg
- type_, length, score, path_or_edges = G_asm.utg_data[(s, t, v)]
- if type_ == "simple":
- path_or_edges = path_or_edges.split("~")
- seq = G_asm.get_seq_from_path(path_or_edges)
- print(">%s~%s~%s-%d %d %d" % (
- s, v, t, 0, length, score), file=utg_out)
- print(seq, file=utg_out)
- if type_ == "compound":
- c_graph = nx.DiGraph()
- all_alt_path = []
- path_or_edges = [c.split("~") for c in path_or_edges.split("|")]
- for ss, vv, tt in path_or_edges:
- type_, length, score, sub_path = G_asm.utg_data[(ss, tt, vv)]
- sub_path = sub_path.split("~")
- v1 = sub_path[0]
- for v2 in sub_path[1:]:
- c_graph.add_edge(
- v1, v2, e_score=G_asm.sg_edges[(v1, v2)][1])
- v1 = v2
- shortest_path = nx.shortest_path(c_graph, s, t, "e_score")
- score = nx.shortest_path_length(c_graph, s, t, "e_score")
- all_alt_path.append((score, shortest_path))
- # a_ctg_data.append( (s, t, shortest_path) ) #first path is the same as the one used in the primary contig
- while 1:
- if s == t:
- break
- n0 = shortest_path[0]
- for n1 in shortest_path[1:]:
- c_graph.remove_edge(n0, n1)
- n0 = n1
- try:
- shortest_path = nx.shortest_path(c_graph, s, t, "e_score")
- score = nx.shortest_path_length(c_graph, s, t, "e_score")
- #a_ctg_data.append( (s, t, shortest_path) )
- all_alt_path.append((score, shortest_path))
- except nx.exception.NetworkXNoPath:
- break
- # if len(shortest_path) < 2:
- # break
- all_alt_path.sort()
- all_alt_path.reverse()
- shortest_path = all_alt_path[0][1]
- score, atig_path = all_alt_path[0]
- atig_output = []
- atig_path_edges = list(zip(atig_path[:-1], atig_path[1:]))
- sub_seqs = []
- total_length = 0
- total_score = 0
- for vv, ww in atig_path_edges:
- r, aln_score, idt, typs_ = G_asm.sg_edges[(vv, ww)]
- e_seq = G_asm.sg_edge_seqs[(vv, ww)]
- rid, ss, tt = r
- sub_seqs.append(e_seq)
- total_length += abs(ss - tt)
- total_score += aln_score
- base_seq = "".join(sub_seqs)
- atig_output.append(
- (s, t, atig_path, total_length, total_score, base_seq, atig_path_edges, 1, 1))
- duplicated = True
- for score, atig_path in all_alt_path[1:]:
- atig_path_edges = list(zip(atig_path[:-1], atig_path[1:]))
- sub_seqs = []
- total_length = 0
- total_score = 0
- for vv, ww in atig_path_edges:
- r, aln_score, idt, type_ = G_asm.sg_edges[(vv, ww)]
- e_seq = G_asm.sg_edge_seqs[(vv, ww)]
- rid, ss, tt = r
- sub_seqs.append(e_seq)
- total_length += abs(ss - tt)
- total_score += aln_score
- seq = "".join(sub_seqs)
- aln_data, x, y = get_aln_data(base_seq, seq)
- if len(aln_data) != 0:
- idt = 1.0 - 1.0 * aln_data[-1][-1] / aln_data[-1][-2]
- cov = 1.0 * (aln_data[-1][3] -
- aln_data[-1][2]) / aln_data[-1][4]
- if idt < 0.96 or cov < 0.98:
- duplicated = False
- atig_output.append(
- (s, t, atig_path, total_length, total_score, seq, atig_path_edges, idt, cov))
- else:
- duplicated = False
- atig_output.append(
- (s, t, atig_path, total_length, total_score, seq, atig_path_edges, 0, 0))
- # if len(atig_output) == 1:
- # continue
- sub_id = 0
- for data in atig_output:
- v0, w0, tig_path, total_length, total_score, seq, atig_path_edges, a_idt, cov = data
- print(">%s~%s~%s-%d %d %d" % (
- v0, "NA", w0, sub_id, total_length, total_score), file=utg_out)
- print(seq, file=utg_out)
- sub_id += 1
- if __name__ == "__main__":
- main(sys.argv)
|