FastaReader.py 7.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279
  1. from builtins import next
  2. from builtins import range
  3. from builtins import object
  4. from os.path import abspath, expanduser
  5. from .io import NativeIO as StringIO
  6. from .io import FilePercenter
  7. import contextlib
  8. import gzip
  9. import hashlib
  10. import logging
  11. import os
  12. import re
  13. import subprocess
  14. import sys
  15. import warnings
  16. LOG = logging.getLogger(__name__)
  17. ##
  18. # Utility functions for FastaReader
  19. ##
  20. def wrap(s, columns):
  21. return "\n".join(s[start:start + columns]
  22. for start in range(0, len(s), columns))
  23. def splitFastaHeader(name):
  24. """
  25. Split a FASTA/FASTQ header into its id and metadata components
  26. >>> splitFastaHeader('>m54329_180926_230856/34669168/0_42 FOO=BAR X=Y')
  27. ('>m54329_180926_230856/34669168/0_42', 'FOO=BAR X=Y')
  28. """
  29. nameParts = re.split(r'\s', name, maxsplit=1)
  30. id_ = nameParts[0]
  31. if len(nameParts) > 1:
  32. metadata = nameParts[1].strip()
  33. else:
  34. metadata = None
  35. return (id_, metadata)
  36. def splitFileContents(f, delimiter, BLOCKSIZE=8192):
  37. """
  38. Same semantics as f.read().split(delimiter), but with memory usage
  39. determined by largest chunk rather than entire file size
  40. """
  41. remainder = StringIO()
  42. while True:
  43. block = f.read(BLOCKSIZE)
  44. if not block:
  45. break
  46. parts = block.split(delimiter)
  47. remainder.write(parts[0])
  48. for part in parts[1:]:
  49. yield remainder.getvalue()
  50. remainder = StringIO()
  51. remainder.write(part)
  52. yield remainder.getvalue()
  53. class FastaRecord(object):
  54. """
  55. A FastaRecord object models a named sequence in a FASTA file.
  56. """
  57. DELIMITER = ">"
  58. COLUMNS = 60
  59. def __init__(self, name, sequence):
  60. try:
  61. assert "\n" not in name
  62. assert "\n" not in sequence
  63. assert self.DELIMITER not in sequence
  64. self._name = name
  65. self._sequence = sequence
  66. self._md5 = hashlib.md5(self._sequence.encode('ascii')).hexdigest()
  67. self._id, self._metadata = splitFastaHeader(name)
  68. except AssertionError:
  69. raise ValueError("Invalid FASTA record data")
  70. @property
  71. def name(self):
  72. """
  73. The name of the sequence in the FASTA file, equal to the entire
  74. FASTA header following the '>' character
  75. """
  76. return self._name
  77. @property
  78. def id(self):
  79. """
  80. The id of the sequence in the FASTA file, equal to the FASTA header
  81. up to the first whitespace.
  82. """
  83. return self._id
  84. @property
  85. def metadata(self):
  86. """
  87. The metadata associated with the sequence in the FASTA file, equal to
  88. the contents of the FASTA header following the first whitespace
  89. """
  90. return self._metadata
  91. @property
  92. def sequence(self):
  93. """
  94. The sequence for the record as present in the FASTA file.
  95. (Newlines are removed but otherwise no sequence normalization
  96. is performed).
  97. """
  98. return self._sequence
  99. @property
  100. def length(self):
  101. """
  102. Get the length of the FASTA sequence
  103. """
  104. return len(self._sequence)
  105. @property
  106. def md5(self):
  107. """
  108. The MD5 checksum (hex digest) of `sequence`
  109. """
  110. return self._md5
  111. @classmethod
  112. def fromString(cls, s):
  113. """
  114. Interprets a string as a FASTA record. Does not make any
  115. assumptions about wrapping of the sequence string.
  116. """
  117. try:
  118. lines = s.splitlines()
  119. assert len(lines) > 1
  120. assert lines[0][0] == cls.DELIMITER
  121. name = lines[0][1:]
  122. sequence = "".join(lines[1:])
  123. return FastaRecord(name, sequence)
  124. except AssertionError:
  125. raise ValueError("String not recognized as a valid FASTA record")
  126. def __eq__(self, other):
  127. if isinstance(other, self.__class__):
  128. return (self.name == other.name and
  129. self._sequence == other._sequence)
  130. else:
  131. return False
  132. def __ne__(self, other):
  133. return not self.__eq__(other)
  134. def __str__(self):
  135. """
  136. Output a string representation of this FASTA record, observing
  137. standard conventions about sequence wrapping.
  138. """
  139. return (">%s\n" % self.name) + \
  140. wrap(self._sequence, self.COLUMNS)
  141. # These are refactored from ReaderBase/FastaReader.
  142. def yield_fasta_record(f, fn=None, log=LOG.info):
  143. """
  144. f: fileobj
  145. fn: str - filename (for exceptions); inferred from f.name if not provided
  146. """
  147. if not fn and f is not sys.stdin:
  148. fn = getattr(f, 'name', None)
  149. if fn is not None and not os.path.exists(fn):
  150. log('Not sure what to do with FASTA file name="{}"'.format(fn))
  151. fn = ''
  152. counter = FilePercenter(fn, log=log)
  153. try:
  154. parts = splitFileContents(f, ">")
  155. assert "" == next(parts)
  156. for part in parts:
  157. counter(len(part))
  158. yield FastaRecord.fromString(">" + part)
  159. except AssertionError:
  160. raise Exception("Invalid FASTA file {!r}".format(fn))
  161. def yield_fasta_records(f, fn=None, log=LOG.info):
  162. warnings.warn('use yield_fasta_record() instead', DeprecationWarning)
  163. return yield_fasta_record(f, fn, log)
  164. def stream_stdout(call, fn):
  165. args = call.split()
  166. proc = subprocess.Popen(args, stdin=open(fn), stdout=subprocess.PIPE)
  167. return proc.stdout
  168. @contextlib.contextmanager
  169. def open_fasta_reader(fn, log=LOG.info):
  170. """
  171. fn: str - filename
  172. Note: If you already have a fileobj, you can iterate over yield_fasta_record() directly.
  173. Streaming reader for FASTA files, useable as a one-shot iterator
  174. over FastaRecord objects. Agnostic about line wrapping.
  175. Example:
  176. .. doctest::
  177. TODO: Get data.
  178. > from pbcore import data
  179. > filename = data.getTinyFasta()
  180. > r = FastaReader(filename)
  181. > with open_fasta_reader(filename) as r:
  182. ... for record in r:
  183. ... print record.name, len(record.sequence), record.md5
  184. ref000001|EGFR_Exon_2 183 e3912e9ceacd6538ede8c1b2adda7423
  185. ref000002|EGFR_Exon_3 203 4bf218da37175a91869033024ac8f9e9
  186. ref000003|EGFR_Exon_4 215 245bc7a046aad0788c22b071ed210f4d
  187. ref000004|EGFR_Exon_5 157 c368b8191164a9d6ab76fd328e2803ca
  188. """
  189. filename = abspath(expanduser(fn))
  190. mode = 'r'
  191. if filename.endswith(".gz"):
  192. ofs = gzip.open(filename, mode)
  193. elif filename.endswith(".dexta"):
  194. ofs = stream_stdout("undexta -vkU -w60 -i", filename)
  195. elif '-' == fn:
  196. ofs = sys.stdin
  197. filename = fn
  198. else:
  199. ofs = open(filename, mode)
  200. yield yield_fasta_record(ofs, filename, log=log)
  201. ofs.close()
  202. @contextlib.contextmanager
  203. def open_fasta_writer(fn, log=LOG.info):
  204. """
  205. fn: str - filename
  206. Yield a writer, possibly compressing. This is not actually
  207. fasta-specific, except it can also write Gene Myers "dexta".
  208. Wraps as the default COLUMNS=60.
  209. Example:
  210. with open_fasta_reader(ifn) as rin:
  211. with open_fasta_writer(ofn) as writer
  212. for record in rin:
  213. writer.write(str(record))
  214. """
  215. filename = abspath(expanduser(fn))
  216. if filename.endswith(".gz"):
  217. ofs = gzip.open(filename, 'wb')
  218. elif filename.endswith(".dexta"):
  219. ofs = stream_stdout("dexta -vk -i", filename)
  220. elif '-' == fn:
  221. ofs = sys.stdout
  222. filename = fn
  223. else:
  224. ofs = open(filename, 'w')
  225. yield ofs
  226. ofs.close()
  227. class FastaReader(object):
  228. """Deprecated, but should still work (with filenames).
  229. """
  230. def __iter__(self):
  231. with open_fasta_reader(self.filename, log=self.log) as reader:
  232. for rec in reader:
  233. yield rec
  234. def __init__(self, f, log=LOG.info):
  235. self.filename = f
  236. self.log = log