123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249 |
- """A pre-processor for DAZZ_DB/fasta2DB.
- Since fasta2DB has several constraints
- (a single movie per fasta, limited line-width, actual filenames),
- we write intermediate fasta
- files to disk. To reduce disk I/O, we can also compress them.
- Currently, we ignore zmw numbers and instead use a global counter.
- Inputs may be compressed, and may be either fasta or fastq.
- (For now, we ignore QVs.)
- """
- from future.utils import itervalues
- from builtins import object
- from ..util.system import abs_fns
- import argparse
- import glob
- import gzip
- import logging
- import os
- import re
- import sys
- log = logging.getLogger()
- DNA_BASES = ['A', 'C', 'G', 'T']
- COMPLEMENT = {
- 'A': 'T',
- 'C': 'G',
- 'G': 'C',
- 'T': 'A',
- }
- def complement(x): return (COMPLEMENT[base] for base in x)
- zmw_counter = None
- def WriteSplit(write, seq, split=8000):
- i = 0
- while i < len(seq):
- slice = seq[i:i + split]
- write(slice)
- write('\n')
- i += split
- def parse_header(header, zmw_counter=None):
- """
- >>> parse_header('>mine foo bar', 1)
- ('mine', 1, 'foo bar', 2)
- >>> parse_header('>mine/123/5_75 foo bar')
- ('mine', 123, '5_75 foo bar', None)
- For now, ignore the zmw and instead use a global counter.
- """
- if '/' in header:
- parts = header[1:].split('/')
- else:
- parts = header[1:].split(None, 1)
- movie = parts[0]
- if zmw_counter is None:
- zmw = int(parts[1])
- else:
- zmw = zmw_counter
- zmw_counter += 1
- if len(parts) > 1:
- extra = parts[-1]
- else:
- extra = ''
- return movie, zmw, extra, zmw_counter
- re_range = re.compile('^(\d+)_(\d+)\s*(.*)$')
- def process_fasta(ifs, movie2write):
- header = ifs.readline().strip()
- if header[0] != '>':
- raise Exception('{!r} is not a fasta file.'.format(ifs.name))
- while header:
- global zmw_counter
- movie, zmw, extra, zmw_counter = parse_header(header, zmw_counter)
- write = movie2write[movie]
- # log.info('header={!r}'.format(header))
- seq = ''
- line = ifs.readline().strip()
- while line and not line.startswith('>'):
- seq += line.strip()
- line = ifs.readline().strip()
- length = len(seq)
- #log.info('seq:{!r}...({})'.format(seq[:5], length))
- beg, end = 0, length
- mo = re_range.search(extra)
- if mo:
- beg, end, extra = mo.groups()
- beg = int(beg)
- end = int(end)
- if (end - beg) != length:
- end = beg + length
- # Probably never happens tho.
- if extra:
- extra = ' ' + extra
- new_header = '>{movie}/{zmw}/{beg}_{end}{extra}\n'.format(**locals())
- write(new_header)
- WriteSplit(write, seq)
- header = line
- def process_fastq(ifs, movie2write):
- header = ifs.readline().strip()
- if header[0] != '@':
- raise Exception('{!r} is not a fastq file.'.format(ifs.name))
- while header:
- global zmw_counter
- movie, zmw, extra, zmw_counter = parse_header(header, zmw_counter)
- write = movie2write[movie]
- # log.info('header={!r}'.format(header))
- seq = ifs.readline().strip()
- header2 = ifs.readline().strip()
- quals = ifs.readline().strip()
- length = len(seq)
- #log.info('seq:{!r}...({})'.format(seq[:5], length))
- new_header = '>{movie}/{zmw}/0_{length} {extra}\n'.format(**locals())
- write(new_header)
- WriteSplit(write, seq)
- header = ifs.readline().strip()
- def process_try_both(ifs, movie2write):
- try:
- process_fasta(ifs, movie2write)
- except Exception:
- log.exception('bad fasta: {!r}; trying as fastq...'.format(ifs.name))
- process_fastq(ifs, movie2write)
- def process(ifn, movie2write):
- root, ext = os.path.splitext(ifn)
- if ifn.endswith('.gz'):
- Open = gzip.GzipFile
- ext = os.path.splitext(root)[1]
- elif ifn.endswith('.bz2'):
- import bz2
- Open = bz2.BZ2File
- ext = os.path.splitext(root)[1]
- else:
- Open = open
- log.info('ext={!r}'.format(ext))
- if ext in ('.fasta', '.fa'):
- func = process_fasta
- elif ext in ('.fastq', '.fq'):
- func = process_fastq
- else:
- func = process_try_both
- with Open(ifn) as ifs:
- func(ifs, movie2write)
- class WriterMap(object):
- def basenames(self):
- return list(self.__obn2movie.keys())
- def close(self):
- for ofs in itervalues(self.__movie2ofs):
- ofs.close()
- def __getitem__(self, movie):
- """Get or create a 'write' function.
- """
- ofs = self.__movie2ofs.get(movie)
- if ofs is None:
- obn = self.__basename(movie)
- self.__obn2movie[obn] = movie
- if os.path.exists(obn):
- log.info('Over-writing {!r}'.format(obn))
- else:
- log.info('Creating {!r}'.format(obn))
- ofs = self.__open(obn, mode='w')
- self.__movie2ofs[movie] = ofs
- return ofs.write
- def __init__(self, Basename, Open):
- self.__obn2movie = dict()
- self.__movie2ofs = dict()
- self.__basename = Basename
- self.__open = Open
- def get_writer(Gzip=False):
- if Gzip:
- def Basename(movie): return movie + '.fasta.gz'
- import functools
- Open = functools.partial(gzip.GzipFile, compresslevel=1)
- # A little better, a little slower:
- #import bz2
- #Open = bz2.BZ2File
- #Basename = lambda movie: movie + '.fasta.bz2'
- else:
- def Basename(movie): return movie + '.fasta'
- Open = open
- movie2write = WriterMap(Basename, Open)
- return movie2write
- def fixall(ifns, Gzip=False):
- """Given an iterator of input absolute filenames (fasta or fastq),
- return a list of output basenames of resulting .fasta(.gz) files, relative to CWD.
- """
- if Gzip:
- open = gzip.GzipFile
- movie2write = get_writer(Gzip)
- for ifn in ifns:
- process(ifn, movie2write)
- movie2write.close()
- return movie2write.basenames()
- def main():
- parser = argparse.ArgumentParser()
- parser.add_argument('--gzip',
- action='store_true',
- help='Compress intermediate fasta with gzip. (Not currently implemented.)')
- parser.add_argument('--zmw-start',
- type=int,
- help='Ignore the zmw number in the fasta header. Instead, use a global counter, starting at this numer.')
- # parser.add_argument('--clean',
- # action='store_true',
- # help='Remove intermediate fasta when done.')
- # parser.add_argument('--fofn',
- # help='Dump intermediate FOFN. This can be used directly by "fasta2DB foo -ffofn" if fasta are uncompressed.')
- # parser.add_argument('--fasta2DB',
- # help='Pass these arguments along to fasta2DB. These should exclude fasta inputs.')
- #global ARGS
- ARGS = parser.parse_args()
- global zmw_counter
- zmw_counter = ARGS.zmw_start
- for obn in fixall(abs_fns(sys.stdin, os.getcwd()), Gzip=ARGS.gzip):
- sys.stdout.write('{}\n'.format(os.path.abspath(obn)))
- if __name__ == "__main__":
- logging.basicConfig()
- log.setLevel(logging.DEBUG)
- main()
|