123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243 |
- import argparse
- import os
- import sys
- import json
- from falcon_kit.fc_asm_graph import AsmGraph
- from falcon_kit.FastaReader import FastaReader
- from falcon_kit.gfa_graph import *
- import falcon_kit.tiling_path
- def load_seqs(fasta_fn, store_only_seq_len):
- """
- If store_only_seq_len is True, then the seq is discarded and
- only it's length stored.
- """
- seqs = {}
- f = FastaReader(fasta_fn)
- if store_only_seq_len == False:
- for r in f:
- seqs[r.name.split()[0]] = (len(r.sequence), r.sequence.upper())
- else:
- for r in f:
- seqs[r.name.split()[0]] = (len(r.sequence), '*')
- return seqs
- def load_pread_overlaps(fp_in):
- preads_overlap_dict = {}
- for line in fp_in:
- sl = line.strip().split()
- if len(sl) < 13:
- continue
- # Example line: 000000009 000000082 -3004 99.90 0 4038 7043 7043 1 6488 9492 9492 overlap 000000F.5000003.0 000000F.5000003.0
- preads_overlap_dict[(sl[0], sl[1])] = sl[0:4] + [int(val) for val in sl[4:12]] + sl[12:]
-
- # Overlaps are not always symmetrically represented in the preads.ovl for some reason, so add the
- # reverse overlap here as well, but do not overwrite existing (just to be safe).
- if (sl[1], sl[0]) not in preads_overlap_dict:
- preads_overlap_dict[(sl[1], sl[0])] = [sl[1], sl[0], sl[2], sl[3]] + [int(val) for val in sl[8:12]] + [int(val) for val in sl[4:8]] + sl[12:]
- return preads_overlap_dict
- def load_sg_edges(fp_in):
- """
- Loads all sg_edges_list so that haplotig paths can be reversed if needed.
- with open(os.path.join(fc_asm_path, "sg_edges_list"), 'r') as fp:
- sg_edges_dict = load_sg_edges(fp)
- """
- sg_edges_dict = {}
- for line in fp_in:
- sl = line.strip().split()
- if len(sl) < 8:
- continue
- # Example line: 000000512:B 000000679:E 000000679 4290 7984 4290 99.95 TR
- sg_edges_dict[(sl[0], sl[1])] = sl[0:3] + [int(val) for val in sl[3:6]] + [float(sl[6])] + sl[7:]
- return sg_edges_dict
- def add_node(gfa_graph, v, preads_dict):
- v_name, v_orient = v.split(':')
- v_len, v_seq = preads_dict[v_name]
- gfa_graph.add_node(v_name, v_len, v_seq)
- def add_edge(gfa_graph, v, w, edge_split_line, preads_overlap_dict, sg_edges_dict):
- edge_name = 'edge-%d' % (len(gfa_graph.edges))
- v_name, v_orient = v.split(':')
- w_name, w_orient = w.split(':')
- v_orient = '+' if v_orient == 'E' else '-'
- w_orient = '+' if w_orient == 'E' else '-'
- cigar = '*'
- # Get the SG edge and the overlap, and set the tags and labels.
- sg_edge = sg_edges_dict[(v, w)]
- overlap = preads_overlap_dict[(v_name, w_name)]
- labels = {'tp': edge_split_line, 'sg_edge': sg_edge, 'overlap': overlap}
- tags = {}
- # Example overlap:
- # 000000001 000000170 -6104 99.75 0 1909 8010 8010 1 1250 7354 7354 overlap 000000F.5000003.0 000000F.5000003.0
- # Handle the overlap coordinates - GFA format requires the coordinates to be with
- # respect to the fwd strand, and the M4 format reports overlaps on the
- # strand of the alignment.
- _, _, score, idt, v_rev, v_start, v_end, v_len, w_rev, w_start, w_end, w_len = overlap[0:12]
- if v_rev == 1:
- v_start, v_end = v_end, v_start
- v_start = v_len - v_start
- v_end = v_len - v_end
- if w_rev == 1:
- w_start, w_end = w_end, w_start
- w_start = w_len - w_start
- w_end = w_len - w_end
- gfa_graph.add_edge(edge_name, v_name, v_orient, w_name, w_orient, v_start, v_end, w_start, w_end, cigar, tags = tags, labels = labels)
- def add_tiling_paths_to_gfa(gfa_graph, tiling_paths, preads_dict, preads_overlap_dict, sg_edges_dict):
- # Add nodes.
- for ctg_id, tiling_path in tiling_paths.items():
- for edge in tiling_path.edges:
- add_node(gfa_graph, edge.v, preads_dict)
- add_node(gfa_graph, edge.w, preads_dict)
- # Add edges.
- for ctg_id, tiling_path in tiling_paths.items():
- for edge in tiling_path.edges:
- add_edge(gfa_graph, edge.v, edge.w, edge.get_split_line(), preads_overlap_dict, sg_edges_dict)
- # Add path.
- for ctg_id, tiling_path in tiling_paths.items():
- path_nodes = []
- path_cigars = []
- if len(tiling_path.edges) == 0:
- continue
- # Add the first node to the path.
- v = tiling_path.edges[0].v
- v_name, v_orient = v.split(':')
- cigar = '%dM' % (tiling_path.coords[v]) # This will be 0 if the contig is improper, and length of v otherwise.
- path_nodes.append(v_name)
- path_cigars.append(cigar)
- # Add the rest of the nodes.
- for edge in tiling_path.edges:
- w_name, w_orient = edge.w.split(':')
- cigar = '%dM' % (abs(edge.e - edge.b))
- path_nodes.append(w_name)
- path_cigars.append(cigar)
- gfa_graph.add_path(ctg_id, path_nodes, path_cigars)
- def add_string_graph_to_gfa(gfa_graph, sg_edges_list, utg_data, ctg_paths, preads_dict, preads_overlap_dict, sg_edges_dict):
- asm_graph = AsmGraph(sg_edges_list, utg_data, ctg_paths)
- for v, w in asm_graph.sg_edges:
- add_node(gfa_graph, v, preads_dict)
- add_node(gfa_graph, w, preads_dict)
- for v, w in asm_graph.sg_edges:
- edge_data = asm_graph.sg_edges[(v, w)]
- if edge_data[-1] != 'G':
- continue
- add_edge(gfa_graph, v, w, edge_data, preads_overlap_dict, sg_edges_dict)
- def run(fp_out, p_ctg_tiling_path, a_ctg_tiling_path,
- preads_fasta, p_ctg_fasta, a_ctg_fasta,
- sg_edges_list, preads_ovl, utg_data, ctg_paths,
- add_string_graph, write_reads,
- min_p_len, min_a_len, only_these_contigs):
- """
- This method produces a GFAGraph object containing info required
- to write both the GFA-1 and GFA-2 formatted assemblies.
- However, it does not write the GFA formats directly, but instead
- dumps a JSON file to disk.
- The JSON file is converted to a GFA-1 or a GFA-2 with outside scripts.
- The graphical output is produced from either the entire string
- graph (only the non-filtered edges are considered) or from only
- the tiling paths. String graph can show the neighborhood of contig
- breaks, whereas the tiling path output is more sparse.
- Output is written to stdout.
- """
- gfa_graph = GFAGraph()
- # Load preads.
- preads_dict = load_seqs(preads_fasta, (not write_reads))
- # Load the pread overlaps
- with open(preads_ovl, 'r') as fp:
- preads_overlap_dict = load_pread_overlaps(fp)
- # Load the SG edges.
- with open(sg_edges_list, 'r') as fp:
- sg_edges_dict = load_sg_edges(fp)
- # Load the primary and associate contig files.
- p_ctg_seqs = load_seqs(p_ctg_fasta, True)
- a_ctg_seqs = load_seqs(a_ctg_fasta, True)
- # Collect the sequence lengths from the above dicts.
- p_ctg_lens = {key: val[0] for key, val in p_ctg_seqs.items()}
- a_ctg_lens = {key: val[0] for key, val in a_ctg_seqs.items()}
- # Create whitelists for filtering contigs.
- p_ctg_whitelist = set(p_ctg_seqs.keys())
- a_ctg_whitelist = set([key for key in list(a_ctg_seqs.keys())])
- if only_these_contigs:
- p_ctg_whitelist = set(open(only_these_contigs).read().splitlines()) & set(p_ctg_whitelist)
- a_ctg_whitelist = set([key for key in list(a_ctg_seqs.keys()) if key.split('-')[0].split('_')[0] in p_ctg_whitelist])
- # Load the tiling paths and assign coordinates.
- p_paths = falcon_kit.tiling_path.load_tiling_paths(p_ctg_tiling_path, whitelist_seqs=p_ctg_whitelist, contig_lens=p_ctg_lens)
- a_paths = falcon_kit.tiling_path.load_tiling_paths(a_ctg_tiling_path, whitelist_seqs=a_ctg_whitelist, contig_lens=a_ctg_lens)
- add_tiling_paths_to_gfa(gfa_graph, p_paths, preads_dict, preads_overlap_dict, sg_edges_dict)
- add_tiling_paths_to_gfa(gfa_graph, a_paths, preads_dict, preads_overlap_dict, sg_edges_dict)
- if add_string_graph:
- add_string_graph_to_gfa(gfa_graph, sg_edges_list, utg_data, ctg_paths, preads_dict, preads_overlap_dict, sg_edges_dict)
- fp_out.write(serialize_gfa(gfa_graph))
- fp_out.write('\n')
- def parse_args(argv):
- parser = argparse.ArgumentParser(description="Generates GFA output (on stdout) from FALCON's assembly.",
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
- parser.add_argument('--p-ctg-tiling-path', type=str, default='p_ctg_tiling_path',
- help='location of the p_ctg tiling path file')
- parser.add_argument('--a-ctg-tiling-path', type=str, default='a_ctg_tiling_path',
- help='location of the a_ctg tiling path file')
- parser.add_argument('--preads-fasta', type=str, default='preads4falcon.fasta',
- help='path to the preads4falcon.fasta file')
- parser.add_argument('--p-ctg-fasta', type=str, default='p_ctg.fa',
- help='path to the primary contigs file')
- parser.add_argument('--a-ctg-fasta', type=str, default='a_ctg.fa',
- help='path to the associate contigs file')
- parser.add_argument('--sg-edges-list', type=str, default='sg_edges_list',
- help='string graph edges file from Falcon assembly')
- parser.add_argument('--preads-ovl', type=str, default='preads.ovl',
- help='the preads overlap file')
- parser.add_argument('--utg-data', type=str,
- default='utg_data', help='unitig data file from Falcon')
- parser.add_argument('--ctg-paths', type=str, default='ctg_paths',
- help='contig paths file from Falcon assembly')
- parser.add_argument('--add-string-graph', action='store_true',
- help="in addition to tiling paths, output other edges and nodes from the final string graph")
- parser.add_argument('--write-reads', '-r', action='store_true',
- help="output read sequences in S lines")
- # parser.add_argument('--write-contigs', '-c', action='store_true',
- # help="output contig sequences as S lines")
- parser.add_argument('--min-p-len', type=int, default=0,
- help='primary contig paths with length smaller than this will not be reported')
- parser.add_argument('--min-a-len', type=int, default=0,
- help='associate contig paths with length smaller than this will not be reported')
- parser.add_argument('--only-these-contigs', type=str, default='',
- help='limit output to specified contigs listed in file (one per line)')
- args = parser.parse_args(argv[1:])
- return args
- def main(argv=sys.argv):
- args = parse_args(argv)
- run(sys.stdout, **vars(args))
- if __name__ == '__main__': # pragma: no cover
- main()
|