123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158 |
- from falcon_kit.FastaReader import open_fasta_reader
- import argparse
- import contextlib
- import os
- import glob
- import sys
- import re
- def fetch_ref_and_reads(base_dir, fofn, ctg_id, out_dir, min_ctg_lenth):
- read_fofn = fofn
- if out_dir == None:
- out_dir = os.path.join(base_dir, '3-unzip/reads')
- ctg_fa = os.path.join(base_dir, '2-asm-falcon/p_ctg.fa')
- read_map_dir = os.path.join(base_dir, '2-asm-falcon/read_maps')
- rawread_id_file = os.path.join(
- read_map_dir, 'dump_rawread_ids', 'rawread_ids')
- pread_id_file = os.path.join(read_map_dir, 'dump_pread_ids', 'pread_ids')
- rid_to_oid = open(rawread_id_file).read().split(
- '\n') # daligner raw read id to the original ids
- pid_to_fid = open(pread_id_file).read().split(
- '\n') # daligner pread id to the fake ids
- assert rid_to_oid, 'Empty rid_to_oid. Maybe empty {!r}?'.format(
- rawread_id_file)
- assert pid_to_fid, 'Empty pid_to_fid. Maybe empty {!r}?'.format(
- pread_id_file)
- def pid_to_oid(pid):
- fid = pid_to_fid[int(pid)]
- rid = int(fid.split('/')[1]) // 10
- return rid_to_oid[int(rid)]
- with open_fasta_reader(ctg_fa) as ref_fasta:
- all_ctg_ids = set()
- for s in ref_fasta:
- s_id = s.name.split()[0]
- if ctg_id != 'all' and s_id != ctg_id:
- continue
- if len(s.sequence) < min_ctg_lenth:
- continue
- if ctg_id != 'all':
- ref_out = open(os.path.join(
- out_dir, '%s_ref.fa' % ctg_id), 'w')
- else:
- ref_out = open(os.path.join(out_dir, '%s_ref.fa' % s_id), 'w')
- print('>%s' % s_id, file=ref_out)
- print(s.sequence, file=ref_out)
- all_ctg_ids.add(s_id)
- ref_out.close()
- read_set = {}
- ctg_id_hits = {}
- map_fn = os.path.join(read_map_dir, 'rawread_to_contigs')
- with open(map_fn, 'r') as f:
- for row in f:
- row = row.strip().split()
- hit_ctg = row[1]
- hit_ctg = hit_ctg.split('-')[0]
- if int(row[3]) == 0:
- o_id = rid_to_oid[int(row[0])]
- read_set[o_id] = hit_ctg
- ctg_id_hits[hit_ctg] = ctg_id_hits.get(hit_ctg, 0) + 1
- assert read_set, 'Empty read_set. Maybe empty {!r}?'.format(map_fn)
- map_fn = os.path.join(read_map_dir, 'pread_to_contigs')
- with open(map_fn, 'r') as f:
- for row in f:
- row = row.strip().split()
- hit_ctg = row[1]
- hit_ctg = hit_ctg.split('-')[0]
- if hit_ctg not in read_set and int(row[3]) == 0:
- o_id = pid_to_oid(row[0])
- read_set[o_id] = hit_ctg
- ctg_id_hits[hit_ctg] = ctg_id_hits.get(hit_ctg, 0) + 1
- with open(os.path.join(out_dir, 'ctg_list'), 'w') as f:
- for ctg_id in sorted(list(all_ctg_ids)):
- if ctg_id_hits.get(ctg_id, 0) < 5:
- continue
- # ignore small circle contigs, they need different approach
- if ctg_id[-1] not in ['F', 'R']:
- continue
- print(ctg_id, file=f)
- read_out_files = {}
- @contextlib.contextmanager
- def reopened_fasta_out(ctg_id):
- # A convenient closure, with a contextmanager.
- if ctg_id not in read_out_files:
- read_out = open(os.path.join(out_dir, '%s_reads.fa' % ctg_id), 'w')
- read_out_files[ctg_id] = 1
- else:
- read_out = open(os.path.join(out_dir, '%s_reads.fa' % ctg_id), 'a')
- yield read_out
- read_out.close()
- with open(read_fofn, 'r') as f:
- for r_fn in f:
- r_fn = r_fn.strip()
- # will soon handle .dexta too
- with open_fasta_reader(r_fn) as read_fa_file:
- for r in read_fa_file:
- rid = r.name.split()[0]
- if rid not in read_set:
- ctg_id = 'unassigned'
- else:
- ctg_id = read_set[rid]
- if ctg_id == 'NA' or ctg_id not in all_ctg_ids:
- ctg_id = 'unassigned'
- with reopened_fasta_out(ctg_id) as read_out:
- print('>' + rid, file=read_out)
- print(r.sequence, file=read_out)
- def parse_args(argv):
- parser = argparse.ArgumentParser(
- description='using the read to contig mapping data to partition the reads grouped by contigs')
- parser.add_argument('--base_dir', type=str, default='./',
- help='the base working dir of a falcon assembly')
- parser.add_argument('--fofn', type=str, default='./input.fofn',
- help='path to the file of the list of raw read fasta files')
- parser.add_argument('--ctg_id', type=str, default='all',
- help='contig identifier in the contig fasta file')
- parser.add_argument('--out_dir', default=None, type=str,
- help='the output base_dir, default to `base_dir/3-unzip/reads` directory')
- parser.add_argument('--min_ctg_lenth', default=20000, type=int,
- help='the minimum length of the contig for the outputs, default=20000')
- #parser.add_argument('--ctg_fa', type=str, default='./2-asm-falcon/p_ctg.fa', help='path to the contig fasta file')
- #parser.add_argument('--read_map_dir', type=str, default='./2-asm-falcon/read_maps', help='path to the read-contig map directory')
- # we can run this in parallel mode in the furture
- # parser.add_argument('--n_core', type=int, default=4,
- # help='number of processes used for generating consensus')
- args = parser.parse_args(argv[1:])
- return args
- def main(argv=sys.argv):
- args = parse_args(argv)
- fetch_ref_and_reads(**vars(args))
- if __name__ == '__main__':
- main()
|