fasta2fasta.py 7.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249
  1. """A pre-processor for DAZZ_DB/fasta2DB.
  2. Since fasta2DB has several constraints
  3. (a single movie per fasta, limited line-width, actual filenames),
  4. we write intermediate fasta
  5. files to disk. To reduce disk I/O, we can also compress them.
  6. Currently, we ignore zmw numbers and instead use a global counter.
  7. Inputs may be compressed, and may be either fasta or fastq.
  8. (For now, we ignore QVs.)
  9. """
  10. from future.utils import itervalues
  11. from builtins import object
  12. from ..util.system import abs_fns
  13. import argparse
  14. import glob
  15. import gzip
  16. import logging
  17. import os
  18. import re
  19. import sys
  20. log = logging.getLogger()
  21. DNA_BASES = ['A', 'C', 'G', 'T']
  22. COMPLEMENT = {
  23. 'A': 'T',
  24. 'C': 'G',
  25. 'G': 'C',
  26. 'T': 'A',
  27. }
  28. def complement(x): return (COMPLEMENT[base] for base in x)
  29. zmw_counter = None
  30. def WriteSplit(write, seq, split=8000):
  31. i = 0
  32. while i < len(seq):
  33. slice = seq[i:i + split]
  34. write(slice)
  35. write('\n')
  36. i += split
  37. def parse_header(header, zmw_counter=None):
  38. """
  39. >>> parse_header('>mine foo bar', 1)
  40. ('mine', 1, 'foo bar', 2)
  41. >>> parse_header('>mine/123/5_75 foo bar')
  42. ('mine', 123, '5_75 foo bar', None)
  43. For now, ignore the zmw and instead use a global counter.
  44. """
  45. if '/' in header:
  46. parts = header[1:].split('/')
  47. else:
  48. parts = header[1:].split(None, 1)
  49. movie = parts[0]
  50. if zmw_counter is None:
  51. zmw = int(parts[1])
  52. else:
  53. zmw = zmw_counter
  54. zmw_counter += 1
  55. if len(parts) > 1:
  56. extra = parts[-1]
  57. else:
  58. extra = ''
  59. return movie, zmw, extra, zmw_counter
  60. re_range = re.compile('^(\d+)_(\d+)\s*(.*)$')
  61. def process_fasta(ifs, movie2write):
  62. header = ifs.readline().strip()
  63. if header[0] != '>':
  64. raise Exception('{!r} is not a fasta file.'.format(ifs.name))
  65. while header:
  66. global zmw_counter
  67. movie, zmw, extra, zmw_counter = parse_header(header, zmw_counter)
  68. write = movie2write[movie]
  69. # log.info('header={!r}'.format(header))
  70. seq = ''
  71. line = ifs.readline().strip()
  72. while line and not line.startswith('>'):
  73. seq += line.strip()
  74. line = ifs.readline().strip()
  75. length = len(seq)
  76. #log.info('seq:{!r}...({})'.format(seq[:5], length))
  77. beg, end = 0, length
  78. mo = re_range.search(extra)
  79. if mo:
  80. beg, end, extra = mo.groups()
  81. beg = int(beg)
  82. end = int(end)
  83. if (end - beg) != length:
  84. end = beg + length
  85. # Probably never happens tho.
  86. if extra:
  87. extra = ' ' + extra
  88. new_header = '>{movie}/{zmw}/{beg}_{end}{extra}\n'.format(**locals())
  89. write(new_header)
  90. WriteSplit(write, seq)
  91. header = line
  92. def process_fastq(ifs, movie2write):
  93. header = ifs.readline().strip()
  94. if header[0] != '@':
  95. raise Exception('{!r} is not a fastq file.'.format(ifs.name))
  96. while header:
  97. global zmw_counter
  98. movie, zmw, extra, zmw_counter = parse_header(header, zmw_counter)
  99. write = movie2write[movie]
  100. # log.info('header={!r}'.format(header))
  101. seq = ifs.readline().strip()
  102. header2 = ifs.readline().strip()
  103. quals = ifs.readline().strip()
  104. length = len(seq)
  105. #log.info('seq:{!r}...({})'.format(seq[:5], length))
  106. new_header = '>{movie}/{zmw}/0_{length} {extra}\n'.format(**locals())
  107. write(new_header)
  108. WriteSplit(write, seq)
  109. header = ifs.readline().strip()
  110. def process_try_both(ifs, movie2write):
  111. try:
  112. process_fasta(ifs, movie2write)
  113. except Exception:
  114. log.exception('bad fasta: {!r}; trying as fastq...'.format(ifs.name))
  115. process_fastq(ifs, movie2write)
  116. def process(ifn, movie2write):
  117. root, ext = os.path.splitext(ifn)
  118. if ifn.endswith('.gz'):
  119. Open = gzip.GzipFile
  120. ext = os.path.splitext(root)[1]
  121. elif ifn.endswith('.bz2'):
  122. import bz2
  123. Open = bz2.BZ2File
  124. ext = os.path.splitext(root)[1]
  125. else:
  126. Open = open
  127. log.info('ext={!r}'.format(ext))
  128. if ext in ('.fasta', '.fa'):
  129. func = process_fasta
  130. elif ext in ('.fastq', '.fq'):
  131. func = process_fastq
  132. else:
  133. func = process_try_both
  134. with Open(ifn) as ifs:
  135. func(ifs, movie2write)
  136. class WriterMap(object):
  137. def basenames(self):
  138. return list(self.__obn2movie.keys())
  139. def close(self):
  140. for ofs in itervalues(self.__movie2ofs):
  141. ofs.close()
  142. def __getitem__(self, movie):
  143. """Get or create a 'write' function.
  144. """
  145. ofs = self.__movie2ofs.get(movie)
  146. if ofs is None:
  147. obn = self.__basename(movie)
  148. self.__obn2movie[obn] = movie
  149. if os.path.exists(obn):
  150. log.info('Over-writing {!r}'.format(obn))
  151. else:
  152. log.info('Creating {!r}'.format(obn))
  153. ofs = self.__open(obn, mode='w')
  154. self.__movie2ofs[movie] = ofs
  155. return ofs.write
  156. def __init__(self, Basename, Open):
  157. self.__obn2movie = dict()
  158. self.__movie2ofs = dict()
  159. self.__basename = Basename
  160. self.__open = Open
  161. def get_writer(Gzip=False):
  162. if Gzip:
  163. def Basename(movie): return movie + '.fasta.gz'
  164. import functools
  165. Open = functools.partial(gzip.GzipFile, compresslevel=1)
  166. # A little better, a little slower:
  167. #import bz2
  168. #Open = bz2.BZ2File
  169. #Basename = lambda movie: movie + '.fasta.bz2'
  170. else:
  171. def Basename(movie): return movie + '.fasta'
  172. Open = open
  173. movie2write = WriterMap(Basename, Open)
  174. return movie2write
  175. def fixall(ifns, Gzip=False):
  176. """Given an iterator of input absolute filenames (fasta or fastq),
  177. return a list of output basenames of resulting .fasta(.gz) files, relative to CWD.
  178. """
  179. if Gzip:
  180. open = gzip.GzipFile
  181. movie2write = get_writer(Gzip)
  182. for ifn in ifns:
  183. process(ifn, movie2write)
  184. movie2write.close()
  185. return movie2write.basenames()
  186. def main():
  187. parser = argparse.ArgumentParser()
  188. parser.add_argument('--gzip',
  189. action='store_true',
  190. help='Compress intermediate fasta with gzip. (Not currently implemented.)')
  191. parser.add_argument('--zmw-start',
  192. type=int,
  193. help='Ignore the zmw number in the fasta header. Instead, use a global counter, starting at this numer.')
  194. # parser.add_argument('--clean',
  195. # action='store_true',
  196. # help='Remove intermediate fasta when done.')
  197. # parser.add_argument('--fofn',
  198. # help='Dump intermediate FOFN. This can be used directly by "fasta2DB foo -ffofn" if fasta are uncompressed.')
  199. # parser.add_argument('--fasta2DB',
  200. # help='Pass these arguments along to fasta2DB. These should exclude fasta inputs.')
  201. #global ARGS
  202. ARGS = parser.parse_args()
  203. global zmw_counter
  204. zmw_counter = ARGS.zmw_start
  205. for obn in fixall(abs_fns(sys.stdin, os.getcwd()), Gzip=ARGS.gzip):
  206. sys.stdout.write('{}\n'.format(os.path.abspath(obn)))
  207. if __name__ == "__main__":
  208. logging.basicConfig()
  209. log.setLevel(logging.DEBUG)
  210. main()