# hgap4_adapt.py
  1. """Given a full HGAP4 run,
  2. generate directories and symlinks to make it look like
  3. a pypeflow run.
  4. Then, fc_run/fc_unzip/fc_quiver can operate on it. In fact, fc_run
  5. should be already satisfied.
  6. One caveat: At the moment, parts of falcon-unzip actually write into the
  7. falcon dirs. We should fix that. But for now, we create writable run-dirs.
  8. We do *not* write into the HGAP4 run-dir.
  9. """
  10. from future.utils import viewitems
  11. from ..util.system import (cd, touch, make_dirs)
  12. import argparse
  13. import contextlib
  14. import glob
  15. import json
  16. import logging
  17. import os
  18. import sys
# Module-level logger; emits one INFO line per symlink created.
LOG = logging.getLogger(__name__)
"""
Note that, even though this program is designed to let unzip run,
it has nothing to do with unzip. It merely mimics falcon, so that
the falcon jobs appear as fully satisfied to pypeflow. That is why
this program is in this repo rather than in FALCON_unzip.
However, if HGAP4/pbsmrtpipe-tasks change, then this would need to
be updated.
"""
"""Post-FALCON steps:
pbcoretools.tasks.fasta2referenceset-0
pbalign.tasks.pbalign-0 *******
pbreports.tasks.summarize_coverage-0
genomic_consensus.tasks.variantcaller-0 *******
pbreports.tasks.coverage_report_hgap-0
genomic_consensus.tasks.gff2bed-0
pbcoretools.tasks.contigset2fasta-0
pbreports.tasks.polished_assembly-0
pbreports.tasks.mapping_stats_hgap-0
falcon_ns.tasks.task_report_preassembly_yield-0
Or with chunking:
pbcoretools.tasks.fasta2referenceset-0
pbcoretools.tasks.subreadset_align_scatter-1
pbalign.tasks.pbalign-2
pbalign.tasks.pbalign-1
.pbcoretools.tasks.subreadset_align_scatter-b473df0f-c3d5-46ab-8c45-be5054ea0dbd-gathered-pipeline.chunks.json
pbcoretools.tasks.gather_alignmentset-1
pbreports.tasks.summarize_coverage-0
pbcoretools.tasks.alignment_contig_scatter-1
pbreports.tasks.coverage_report_hgap-0
genomic_consensus.tasks.variantcaller-2
genomic_consensus.tasks.variantcaller-1
.pbcoretools.tasks.alignment_contig_scatter-7eda161b-3ed9-4891-97f3-300dd975407a-gathered-pipeline.chunks.json
pbcoretools.tasks.gather_gff-1
pbreports.tasks.mapping_stats_hgap-0
pbcoretools.tasks.gather_fastq-1
pbcoretools.tasks.gather_contigset-1
pbcoretools.tasks.gather_vcf-1
genomic_consensus.tasks.gff2bed-0
pbreports.tasks.polished_assembly-0
pbcoretools.tasks.contigset2fasta-0
"""
@contextlib.contextmanager
def mkcd(newdir):
    """Create *newdir* (and parents) if needed, then chdir into it
    for the duration of the with-block, restoring the old CWD on exit
    (via the project `cd` context manager).
    """
    make_dirs(newdir)
    with cd(newdir):
        yield
def symlink(jo):
    """Caller should first cd into link-dir.

    Given *jo*, the path to an HGAP4 job-output directory (must contain
    a 'tasks/' subdirectory), build a pypeflow-style falcon run layout
    in the CWD: create each stage dir, symlink the corresponding HGAP4
    task outputs into it, and touch the 'run.sh.done' sentinel so
    pypeflow considers every falcon stage already satisfied.
    Also writes 'fc_run.generated.cfg' in the CWD.
    """
    def assert_exists(path):
        assert os.path.exists(path), 'File does not exist: {!r}'.format(path)

    def assert_dir(path):
        assert os.path.isdir(path), 'Not a directory: {!r}'.format(path)
    assert_dir(jo)
    taskdir = os.path.join(jo, 'tasks')
    assert_dir(taskdir)

    def touch_done():
        """Standard pypeflow convention.
        """
        touch('run.sh.done')

    def abstdir(basetdir):
        # Absolute path of one HGAP4 task-dir under jo/tasks/.
        return os.path.abspath(os.path.join(jo, 'tasks', basetdir))

    def link(targetdir, basename, linkname=None):
        # Symlink targetdir/basename (as a relative path) into the CWD
        # as linkname (default: same basename). Idempotent: an existing
        # correct link is kept; a stale one is replaced.
        if not linkname:
            linkname = basename
        reldir = os.path.relpath(targetdir)
        target = os.path.join(reldir, basename)
        assert_exists(os.path.abspath(target))
        if os.path.lexists(linkname):
            if os.readlink(linkname) == target:
                return  # already points at the right target
            os.unlink(linkname)
        LOG.info('link {!r} to {}/{}'.format(linkname, reldir, basename))
        os.symlink(target, linkname)

    # Define task symlinkers

    def task_make_fofn_abs_raw():
        """deprecated
        "input_fofn" from cfg
        """
        rdir = abstdir('falcon_ns2.tasks.task_falcon_make_fofn_abs-0')
        with mkcd('0-rawreads/raw-fofn-abs/'):
            link(rdir, 'file.fofn', 'input.fofn')
            # touch('input.fofn')
            touch_done()

    def task_build_rdb():
        rdir = abstdir('falcon_ns2.tasks.task_falcon0_dazzler_build_raw-0')
        with mkcd('0-rawreads/build/'):
            #touch('length_cutoff', 'rdb_build_done', 'run_jobs.sh', 'raw_reads.db')
            link(rdir, 'raw_reads.db')
            link(rdir, '.raw_reads.bps')
            link(rdir, '.raw_reads.idx')
            link(rdir, '.raw_reads.dust.data')
            link(rdir, '.raw_reads.dust.anno')
            touch_done()

    def task_tan_split():
        rdir = abstdir('falcon_ns2.tasks.task_falcon0_dazzler_tan_split-0')
        with mkcd('0-rawreads/tan-split/'):
            #link(rdir, 'split.json', 'tan-uows.json')
            # Empty dict => no units-of-work remain for this stage.
            with open('tan-uows.json', 'w') as stream:
                data = dict()
                stream.write(json.dumps(data))
            link(rdir, 'bash_template.txt', 'bash_template.sh')
            touch_done()

    def task_tan_gathered():
        # No outputs needed; only the done-sentinel.
        with mkcd('0-rawreads/tan-gathered/'):
            touch_done()

    def task_tan_combine():
        rdir = abstdir('falcon_ns2.tasks.task_falcon0_dazzler_tan_combine-0')
        with mkcd('0-rawreads/tan-combine/'):
            link(rdir, 'raw_reads.db')
            link(rdir, '.raw_reads.bps')
            link(rdir, '.raw_reads.idx')
            link(rdir, '.raw_reads.dust.data')
            link(rdir, '.raw_reads.dust.anno')
            link(rdir, '.raw_reads.tan.data')
            link(rdir, '.raw_reads.tan.anno')
            touch_done()

    def task_daligner_split():
        rdir = abstdir('falcon_ns2.tasks.task_falcon0_dazzler_daligner_split-0')
        with mkcd('0-rawreads/daligner-split/'):
            #link(rdir, 'split.json', 'all-units-of-work.json')
            with open('all-units-of-work.json', 'w') as stream:
                data = dict()
                stream.write(json.dumps(data))
            link(rdir, 'bash_template.txt', 'bash_template.sh')
            touch_done()

    def task_daligner_gathered():
        with mkcd('0-rawreads/daligner-gathered/'):
            touch_done()

    def task_daligner_combine():
        rdir = abstdir('falcon_ns2.tasks.task_falcon0_dazzler_daligner_combine-0')
        with mkcd('0-rawreads/daligner-combine/'):
            link(rdir, 'las_paths.json', 'gathered-las.json')
            touch_done()

    def task_lamerge_split():
        rdir = abstdir('falcon_ns2.tasks.task_falcon0_dazzler_lamerge_split-0')
        with mkcd('0-rawreads/las-merge-split/'):
            #link(rdir, 'split.json', 'all-units-of-work.json')
            with open('all-units-of-work.json', 'w') as stream:
                data = dict()
                stream.write(json.dumps(data))
            link(rdir, 'bash_template.txt', 'las-merge-bash-template.sh')
            touch_done()

    def task_lamerge_gathered():
        with mkcd('0-rawreads/las-merge-gathered/'):
            touch_done()

    def task_lamerge_combine():
        # falcon_unzip/rr_hctg_track.py looks at las-merge-combine/las_paths.json, with abspaths
        rdir = abstdir(
            'falcon_ns2.tasks.task_falcon0_dazzler_lamerge_combine-0')
        with mkcd('0-rawreads/las-merge-combine/'):
            link(rdir, 'las_paths.json', 'las_fofn.json') # unzip/quiver, for now
            link(rdir, 'las_paths.json')
            link(rdir, 'block2las.json')
            touch_done()

    def task_cns_split():
        rdir = abstdir('falcon_ns2.tasks.task_falcon0_dazzler_cns_split-0')
        with mkcd('0-rawreads/cns-split/'):
            #link(rdir, 'split.json', 'all-units-of-work.json')
            with open('split.json', 'w') as stream:
                data = dict()
                stream.write(json.dumps(data))
            #link(rdir, 'bash_template.txt', 'bash_template.sh')
            touch_done()

    def task_cns_gather():
        #rdir = abstdir('falcon_ns2.tasks.task_falcon0_dazzler_cns_split-0')
        #rdir = abstdir('falcon_ns2.tasks.task_falcon0_run_cns_post_gather-0')
        with mkcd('0-rawreads/cns-gather/'):
            touch_done()

    #def task_cns_combine():
    #    rdir = abstdir('falcon_ns2.tasks.task_falcon0_dazzler_cns_combine-0')
    #    with mkcd('0-rawreads/cns-combine/'):
    #        touch_done()

    def task_preads():
        rdir = abstdir('falcon_ns2.tasks.task_falcon0_run_cns_post_gather-0')
        with mkcd('0-rawreads/preads/'):
            link(rdir, 'input-preads.fofn', 'input_preads.fofn')
            touch_done()

    def task_report_pre_assembly():
        """Nothing is linked here; only the done-sentinel is created.
        """
        with mkcd('0-rawreads/report/'):
            # touch('pre_assembly_stats.json')
            touch_done()

    def task_build_pdb():
        """Link the preads Dazzler DB files (stage 1).
        """
        rdir = abstdir('falcon_ns2.tasks.task_falcon1_dazzler_build_p-0')
        with mkcd('1-preads_ovl/build/'):
            #touch('pdb_build_done', 'run_jobs.sh', 'preads.db')
            link(rdir, 'preads.db')
            link(rdir, '.preads.bps')
            link(rdir, '.preads.idx')
            link(rdir, '.preads.dust.data')
            link(rdir, '.preads.dust.anno')
            touch_done()

    def task_daligner_split1():
        #rdir = abstdir('falcon_ns2.tasks.task_falcon1_dazzler_daligner_split-0')
        with mkcd('1-preads_ovl/daligner-split/'):
            #link(rdir, 'split.json', 'all-units-of-work.json')
            with open('all-units-of-work.json', 'w') as stream:
                data = dict()
                stream.write(json.dumps(data))
            #link(rdir, 'bash_template.txt', 'bash_template.sh')
            touch_done()

    def task_daligner_gathered1():
        with mkcd('1-preads_ovl/daligner-gathered/'):
            touch_done()

    def task_daligner_combine1():
        rdir = abstdir('falcon_ns2.tasks.task_falcon1_dazzler_daligner_combine-0')
        with mkcd('1-preads_ovl/daligner-combine/'):
            link(rdir, 'las_paths.json', 'gathered-las.json')
            touch_done()

    def task_lamerge_split1():
        #rdir = abstdir('falcon_ns2.tasks.task_falcon1_dazzler_lamerge_split-0')
        with mkcd('1-preads_ovl/las-merge-split/'):
            #link(rdir, 'split.json', 'all-units-of-work.json')
            with open('all-units-of-work.json', 'w') as stream:
                data = dict()
                stream.write(json.dumps(data))
            #link(rdir, 'bash_template.txt', 'las-merge-bash-template.sh')
            touch_done()

    def task_lamerge_gathered1():
        with mkcd('1-preads_ovl/las-merge-gathered/'):
            touch_done()

    def task_lamerge_combine1():
        rdir = abstdir(
            'falcon_ns2.tasks.task_falcon1_dazzler_lamerge_combine-0')
        with mkcd('1-preads_ovl/las-merge-combine/'):
            link(rdir, 'las_paths.json', 'las_fofn.json') # unzip/quiver, for now
            link(rdir, 'las_paths.json', 'las_paths.json')
            link(rdir, 'block2las.json', 'block2las.json')
            touch_done()

    def task_run_db2falcon():
        rdir = abstdir('falcon_ns2.tasks.task_falcon1_run_db2falcon-0')
        with mkcd('1-preads_ovl/db2falcon/'):
            link(rdir, 'preads4falcon.fasta')
            touch_done()

    def task_run_falcon_asm():
        rdir = abstdir('falcon_ns2.tasks.task_falcon2_run_falcon_asm-0')
        with mkcd('2-asm-falcon/'):
            # workflow depends on:
            touch('falcon_asm_done')
            # get_read_ctg_map needs:
            link(rdir, 'sg_edges_list')
            link(rdir, 'utg_data')
            link(rdir, 'ctg_paths')
            # fetch_reads needs:
            link(rdir, 'p_ctg.fa')
            link(rdir, 'a_ctg.fa')
            link(rdir, 'p_ctg_tiling_path')
            link(rdir, 'a_ctg_tiling_path')
            touch_done()

    # Run the symlinkers in pipeline order.
    #task_make_fofn_abs_raw()
    task_build_rdb()
    task_tan_split()
    task_tan_gathered()
    task_tan_combine()
    task_daligner_split()
    task_daligner_gathered()
    task_daligner_combine()
    task_lamerge_split()
    task_lamerge_gathered()
    task_lamerge_combine()
    task_cns_split()
    task_cns_gather()
    #task_cns_combine()
    task_preads()
    task_report_pre_assembly()
    task_build_pdb()
    task_daligner_split1()
    task_daligner_gathered1()
    task_daligner_combine1()
    task_lamerge_split1()
    task_lamerge_gathered1()
    task_lamerge_combine1()
    task_run_db2falcon()
    task_run_falcon_asm()

    def dump_fc_run(fn):
        # Write a starter fc_run .cfg into *fn*, seeded from the HGAP4
        # input fofn and length-cutoff. The user must still supply
        # input_bam_fofn by hand.
        input_fofn = os.path.join(abstdir('falcon_ns2.tasks.task_falcon_make_fofn_abs-0'), 'file.fofn')
        length_cutoff = int(open(os.path.join(abstdir('falcon_ns2.tasks.task_falcon0_dazzler_build_raw-0'), 'length_cutoff.txt')).read())
        with open(fn, 'w') as stream:
            p = lambda x: stream.write(x + '\n')
            p('[General]')
            p('input_fofn = {}'.format(input_fofn))
            p('length_cutoff = {}'.format(length_cutoff))
            p('[Unzip]')
            p('input_fofn = {}'.format(input_fofn))
            p('# You need to find this!')
            p('input_bam_fofn = {}'.format('input_bam.fofn'))
            p('[job.defaults]')
            p('pwatcher_type = blocking')
            #p('submit = /bin/bash -c "${JOB_SCRIPT}"')
            p('submit = /bin/bash -c "${JOB_SCRIPT}" > "${JOB_STDOUT}" 2> "${JOB_STDERR}"')
    dump_fc_run('fc_run.generated.cfg')
  315. def get_parser():
  316. class HelpF(argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter):
  317. pass
  318. description = 'Given a full HGAP4 run, generate directories and symlinks to make it look like a pypeflow run.'
  319. epilog = """
  320. Typically:
  321. mkdir mydir/
  322. cd mydir/
  323. python3 -m falcon_kit.mains.hgap4_adapt --job-output-dir=../job_output/
  324. fc_run fc_run.cfg -- (A)
  325. fc_unzip.py fc_unzip.cfg -- (B)
  326. fc_quiver.py fc_unzip.cfg -- (C)
  327. You need to create/modify the .cfg files.
  328. (A) This should be a no-op, and you do not need to run this at all. Just a sanity check.
  329. It will tell you that everything is already satisfied. But it
  330. cannot work unless you provide `input.fofn` (which can be empty) and set it to `input_fofn`
  331. in your .cfg.
  332. (B)/(C) These will need both `input_fofn` and `input_bam_fofn`. The latter
  333. should name actual BAM files to use for Quiver (also for partitioning for pbalign).
  334. For more help on .cfg files, see
  335. * https://github.com/PacificBiosciences/FALCON/wiki
  336. * https://github.com/PacificBiosciences/FALCON_unzip/wiki
  337. * https://github.com/PacificBiosciences/FALCON-integrate/wiki
  338. """
  339. parser = argparse.ArgumentParser(
  340. description=description,
  341. epilog=epilog,
  342. formatter_class=HelpF)
  343. parser.add_argument('--job-output-dir', default='job_output',
  344. help='Directory of HGAP4 job_output. (A symlink or relative path is fine.) Task-dirs are under here in "tasks/"')
  345. return parser
  346. def main(argv=sys.argv):
  347. args = get_parser().parse_args(argv[1:])
  348. symlink(args.job_output_dir)
if __name__ == '__main__':
    # Enable INFO-level logging so each symlink operation is reported.
    logging.basicConfig(level=logging.INFO)
    main()