liuwei
/
pb_assembly


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158
							

from falcon_kit.FastaReader import open_fasta_reader
import argparse
import contextlib
import os
import glob
import sys
import re


def fetch_ref_and_reads(base_dir, fofn, ctg_id, out_dir, min_ctg_lenth):
    read_fofn = fofn
    if out_dir == None:
        out_dir = os.path.join(base_dir, '3-unzip/reads')

    ctg_fa = os.path.join(base_dir, '2-asm-falcon/p_ctg.fa')
    read_map_dir = os.path.join(base_dir, '2-asm-falcon/read_maps')

    rawread_id_file = os.path.join(
        read_map_dir, 'dump_rawread_ids', 'rawread_ids')
    pread_id_file = os.path.join(read_map_dir, 'dump_pread_ids', 'pread_ids')

    rid_to_oid = open(rawread_id_file).read().split(
        '\n')  # daligner raw read id to the original ids
    pid_to_fid = open(pread_id_file).read().split(
        '\n')  # daligner pread id to the fake ids
    assert rid_to_oid, 'Empty rid_to_oid. Maybe empty {!r}?'.format(
        rawread_id_file)
    assert pid_to_fid, 'Empty pid_to_fid. Maybe empty {!r}?'.format(
        pread_id_file)

    def pid_to_oid(pid):
        fid = pid_to_fid[int(pid)]
        rid = int(fid.split('/')[1]) // 10
        return rid_to_oid[int(rid)]

    with open_fasta_reader(ctg_fa) as ref_fasta:
        all_ctg_ids = set()
        for s in ref_fasta:
            s_id = s.name.split()[0]
            if ctg_id != 'all' and s_id != ctg_id:
                continue

            if len(s.sequence) < min_ctg_lenth:
                continue

            if ctg_id != 'all':
                ref_out = open(os.path.join(
                    out_dir, '%s_ref.fa' % ctg_id), 'w')
            else:
                ref_out = open(os.path.join(out_dir, '%s_ref.fa' % s_id), 'w')

            print('>%s' % s_id, file=ref_out)
            print(s.sequence, file=ref_out)
            all_ctg_ids.add(s_id)
            ref_out.close()

    read_set = {}
    ctg_id_hits = {}

    map_fn = os.path.join(read_map_dir, 'rawread_to_contigs')
    with open(map_fn, 'r') as f:
        for row in f:
            row = row.strip().split()
            hit_ctg = row[1]
            hit_ctg = hit_ctg.split('-')[0]
            if int(row[3]) == 0:
                o_id = rid_to_oid[int(row[0])]
                read_set[o_id] = hit_ctg
                ctg_id_hits[hit_ctg] = ctg_id_hits.get(hit_ctg, 0) + 1
    assert read_set, 'Empty read_set. Maybe empty {!r}?'.format(map_fn)
    map_fn = os.path.join(read_map_dir, 'pread_to_contigs')
    with open(map_fn, 'r') as f:
        for row in f:
            row = row.strip().split()
            hit_ctg = row[1]
            hit_ctg = hit_ctg.split('-')[0]
            if hit_ctg not in read_set and int(row[3]) == 0:
                o_id = pid_to_oid(row[0])
                read_set[o_id] = hit_ctg
                ctg_id_hits[hit_ctg] = ctg_id_hits.get(hit_ctg, 0) + 1

    with open(os.path.join(out_dir, 'ctg_list'), 'w') as f:
        for ctg_id in sorted(list(all_ctg_ids)):
            if ctg_id_hits.get(ctg_id, 0) < 5:
                continue
            # ignore small circle contigs, they need different approach
            if ctg_id[-1] not in ['F', 'R']:
                continue
            print(ctg_id, file=f)

    read_out_files = {}

    @contextlib.contextmanager
    def reopened_fasta_out(ctg_id):
                # A convenient closure, with a contextmanager.
        if ctg_id not in read_out_files:
            read_out = open(os.path.join(out_dir, '%s_reads.fa' % ctg_id), 'w')
            read_out_files[ctg_id] = 1
        else:
            read_out = open(os.path.join(out_dir, '%s_reads.fa' % ctg_id), 'a')
        yield read_out
        read_out.close()

    with open(read_fofn, 'r') as f:
        for r_fn in f:
            r_fn = r_fn.strip()
            # will soon handle .dexta too
            with open_fasta_reader(r_fn) as read_fa_file:
                for r in read_fa_file:
                    rid = r.name.split()[0]
                    if rid not in read_set:
                        ctg_id = 'unassigned'
                    else:
                        ctg_id = read_set[rid]

                    if ctg_id == 'NA' or ctg_id not in all_ctg_ids:
                        ctg_id = 'unassigned'

                    with reopened_fasta_out(ctg_id) as read_out:
                        print('>' + rid, file=read_out)
                        print(r.sequence, file=read_out)


def parse_args(argv):
    parser = argparse.ArgumentParser(
        description='using the read to contig mapping data to partition the reads grouped by contigs')
    parser.add_argument('--base_dir', type=str, default='./',
                        help='the base working dir of a falcon assembly')
    parser.add_argument('--fofn', type=str, default='./input.fofn',
                        help='path to the file of the list of raw read fasta files')
    parser.add_argument('--ctg_id', type=str, default='all',
                        help='contig identifier in the contig fasta file')
    parser.add_argument('--out_dir', default=None, type=str,
                        help='the output base_dir, default to `base_dir/3-unzip/reads` directory')
    parser.add_argument('--min_ctg_lenth', default=20000, type=int,
                        help='the minimum length of the contig for the outputs, default=20000')
    #parser.add_argument('--ctg_fa', type=str, default='./2-asm-falcon/p_ctg.fa', help='path to the contig fasta file')
    #parser.add_argument('--read_map_dir', type=str, default='./2-asm-falcon/read_maps', help='path to the read-contig map directory')
    # we can run this in parallel mode in the furture
    # parser.add_argument('--n_core', type=int, default=4,
    #                    help='number of processes used for generating consensus')

    args = parser.parse_args(argv[1:])
    return args


def main(argv=sys.argv):
    args = parse_args(argv)
    fetch_ref_and_reads(**vars(args))


if __name__ == '__main__':
    main()