pype_tasks.py 9.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195
  1. from future.utils import viewitems
  2. from future.utils import itervalues
  3. # PypeTask functions now need to be module-level.
  4. from . import run_support as support
  5. from . import bash # for scattering
  6. # from pypeflow.simple_pwatcher_bridge import fn # not really needed
  7. import collections
  8. import json
  9. import logging
  10. import os.path
  11. LOG = logging.getLogger(__name__)
# Shell command templates for the three bam2dexta steps; each one runs a
# sub-command (split, apply, combine) of `falcon_kit.mains.bam2dexta`.
# The {input.*}, {output.*} and {params.*} fields are str.format-style
# placeholders -- presumably filled in by the workflow engine (the call site
# is not visible in this file; TODO confirm).
TASK_BAM2DEXTA_SPLIT_SCRIPT = """\
python3 -m falcon_kit.mains.bam2dexta split --wildcards={params.wildcards} --bam={input.bam} --split-fn={output.split} --bash-template-fn={output.bash_template}
"""
TASK_BAM2DEXTA_APPLY_SCRIPT = """\
python3 -m falcon_kit.mains.bam2dexta apply --bam-fn={input.bam} --dexta-fn={output.dexta}
"""
TASK_BAM2DEXTA_COMBINE_SCRIPT = """\
python3 -m falcon_kit.mains.bam2dexta combine --gathered-fn={input.gathered} --dexta-fofn-fn={output.fofn}
"""
# Shell command templates for the consensus steps (split, per-unit task,
# gather) and for the pre-assembly report. Each runs one
# `falcon_kit.mains.*` module; {input.*}, {output.*} and {params.*} are
# str.format-style placeholders (call site not visible here).
TASK_CONSENSUS_SPLIT_SCRIPT = """\
python3 -m falcon_kit.mains.consensus_split --wildcards={params.wildcards} --p-id2las-fn={input.p_id2las} --db-fn={input.raw_reads_db} --length-cutoff-fn={input.length_cutoff} --config-fn={input.config} --split-fn={output.split} --bash-template-fn={output.bash_template}
"""
TASK_CONSENSUS_TASK_SCRIPT = """\
python3 -m falcon_kit.mains.consensus_task --nproc={params.pypeflow_nproc} --las-fn={input.las} --db-fn={input.db} --length-cutoff-fn={input.length_cutoff} --config-fn={input.config} --fasta-fn={output.fasta}
"""
TASK_CONSENSUS_GATHER_SCRIPT = """\
python3 -m falcon_kit.mains.consensus_gather_fasta_fofn --gathered-fn={input.gathered} --db-fn={input.raw_reads_db} --config-fn={input.config} --preads-fofn-fn={output.preads_fofn}
"""
# Takes the config, length cutoff, raw-reads DB and preads FOFN as inputs and
# names a single report file as output.
TASK_REPORT_PRE_ASSEMBLY_SCRIPT = """\
python3 -m falcon_kit.mains.task_report_pre_assembly --config-fn={input.config} --length-cutoff-fn={input.length_cutoff} --raw-reads-db-fn={input.raw_reads_db} --preads-fofn-fn={input.preads_fofn} --pre-assembly-report-fn={output.pre_assembly_report}
"""
# Shell command template for the `build` sub-command of
# `falcon_kit.mains.dazzler`. Both the DB and the length-cutoff file are
# passed as {output.*} placeholders. The two trailing `#` lines are part of
# the generated script text (shell comments), not Python comments.
TASK_DB_BUILD_SCRIPT = """\
python3 -m falcon_kit.mains.dazzler --config-fn={input.config} --db-fn={output.db} build --input-fofn-fn={input.input_fofn} --length-cutoff-fn={output.length_cutoff}
# TODO: Verify that db exists.
#ln -sf {output.length_cutoff} length_cutoff
"""
# Shell command templates for the tan-split / tan-apply / tan-combine
# sub-commands of `falcon_kit.mains.dazzler`.
# NOTE(review): these pass `--config=` / `--db=` while TASK_DB_BUILD_SCRIPT
# passes `--config-fn=` / `--db-fn=`; presumably the CLI accepts both
# spellings -- confirm against falcon_kit.mains.dazzler.
TASK_DB_TAN_SPLIT_SCRIPT = """\
python3 -m falcon_kit.mains.dazzler --config={input.config} --db={input.db} tan-split --split={output.split} --bash-template={output.bash_template}
"""
TASK_DB_TAN_APPLY_SCRIPT = """\
python3 -m falcon_kit.mains.dazzler --config={input.config} --db={input.db} tan-apply --script={input.script} --job-done={output.job_done}
"""
TASK_DB_TAN_COMBINE_SCRIPT = """\
python3 -m falcon_kit.mains.dazzler --config={input.config} --db={input.db} tan-combine --gathered={input.gathered} --new-db={output.new_db}
"""
# Shell command templates for the rep-split / rep-apply / rep-combine and
# rep-daligner-split sub-commands of `falcon_kit.mains.dazzler`.
# {params.group_size} and {params.coverage_limit} are passed as `-g` / `-c`
# in the rep-split and rep-combine templates and as `--group-size` /
# `--coverage-limit` in the rep-daligner-split template.
TASK_DB_REP_SPLIT_SCRIPT = """\
python3 -m falcon_kit.mains.dazzler --config={input.config} --db={input.db} rep-split --las-paths-fn={input.las_paths} --wildcards={params.wildcards} -g{params.group_size} -c{params.coverage_limit} --split={output.split} --bash-template={output.bash_template}
"""
TASK_DB_REP_APPLY_SCRIPT = """\
python3 -m falcon_kit.mains.dazzler --config={input.config} --db={input.db} rep-apply --script={input.script} --job-done={output.job_done}
"""
TASK_DB_REP_COMBINE_SCRIPT = """\
python3 -m falcon_kit.mains.dazzler --config={input.config} --db={input.db} rep-combine -g{params.group_size} --gathered={input.gathered} --new-db={output.new_db}
"""
TASK_DB_REP_DALIGNER_SPLIT_SCRIPT = """\
python3 -m falcon_kit.mains.dazzler --config={input.config} --db={input.db} --nproc={params.pypeflow_nproc} rep-daligner-split --wildcards={params.wildcards} --group-size={params.group_size} --coverage-limit={params.coverage_limit} --split-fn={output.split} --bash-template-fn={output.bash_template}
"""
# Shell command templates for the daligner-split / daligner-apply /
# daligner-combine sub-commands of `falcon_kit.mains.dazzler`.
# Only the split template passes `--nproc` ({params.pypeflow_nproc}).
TASK_DB_DALIGNER_SPLIT_SCRIPT = """\
python3 -m falcon_kit.mains.dazzler --config={input.config} --db={input.db} --nproc={params.pypeflow_nproc} daligner-split --wildcards={params.wildcards} --length-cutoff-fn={input.length_cutoff} --split-fn={output.split} --bash-template-fn={output.bash_template}
"""
TASK_DB_DALIGNER_APPLY_SCRIPT = """\
python3 -m falcon_kit.mains.dazzler --config={input.config} --db={input.db} daligner-apply --script={input.script} --job-done={output.job_done}
"""
TASK_DB_DALIGNER_COMBINE_SCRIPT = """\
python3 -m falcon_kit.mains.dazzler --config={input.config} --db={input.db} daligner-combine --gathered={input.gathered} --las-paths-fn={output.las_paths}
"""
# Shell command templates for the merge-split / merge-apply / merge-combine
# sub-commands of `falcon_kit.mains.dazzler`. Unlike the other dazzler
# templates in this file, these pass no `--db`; the split step takes
# `--db-prefix` ({params.db_prefix}) instead.
TASK_DB_LAMERGE_SPLIT_SCRIPT = """\
python3 -m falcon_kit.mains.dazzler --config={input.config} merge-split --db-prefix={params.db_prefix} --las-paths={input.las_paths} --wildcards={params.wildcards} --split-fn={output.split} --bash-template-fn={output.bash_template}
"""
TASK_DB_LAMERGE_APPLY_SCRIPT = """\
python3 -m falcon_kit.mains.dazzler --config={input.config} merge-apply --las-paths={input.las_paths} --las-fn={output.las_fn}
"""
TASK_DB_LAMERGE_COMBINE_SCRIPT = """\
python3 -m falcon_kit.mains.dazzler --config={input.config} merge-combine --gathered={input.gathered} --las-paths-fn={output.las_paths} --block2las-fn={output.block2las}
"""
# Shell pipelines that write read ids to a file: `DBshow -n` output has '>'
# characters deleted by `tr`, then awk prints the first field of each line.
# The doubled braces `{{print $1}}` render as a literal `{print $1}` once the
# template goes through str.format (which is how the task_dump_* functions in
# this file fill these templates). `LD_LIBRARY_PATH=` sets that variable to
# empty for the awk command only.
TASK_DUMP_RAWREAD_IDS_SCRIPT = """\
DBshow -n {input.rawread_db} | tr -d '>' | LD_LIBRARY_PATH= awk '{{print $1}}' > {output.rawread_id_file}
"""
TASK_DUMP_PREAD_IDS_SCRIPT = """\
DBshow -n {input.pread_db} | tr -d '>' | LD_LIBRARY_PATH= awk '{{print $1}}' > {output.pread_id_file}
"""
# Shell command template for `falcon_kit.mains.generate_read_to_ctg_map`;
# consumes the two id files above plus sg_edges_list, utg_data and ctg_paths,
# and names one output file.
TASK_GENERATE_READ_TO_CTG_MAP_SCRIPT = """\
python3 -m falcon_kit.mains.generate_read_to_ctg_map --rawread-id={input.rawread_id_file} --pread-id={input.pread_id_file} --sg-edges-list={input.sg_edges_list} --utg-data={input.utg_data} --ctg-paths={input.ctg_paths} --output={output.read_to_contig_map}
"""
# Shell script template: runs `DB2Falcon -U` on {input.preads_db}, exits with
# status 1 if {output.preads4falcon} is not a regular file afterwards, and
# otherwise touches {output.job_done} as a completion marker.
# The `#` lines inside the string are shell comments in the generated script.
TASK_RUN_DB_TO_FALCON_SCRIPT = """\
# Given preads.db,
# write preads4falcon.fasta (implicitly) in CWD.
time DB2Falcon -U {input.preads_db}
[ -f {output.preads4falcon} ] || exit 1
touch {output.job_done}
"""
# Shell script template for the assembly stage. In order, it runs the
# falcon_kit.mains modules ovlp_filter, ovlp_to_graph, graph_to_contig,
# dedup_a_tigs, dedup_a_tp, collect_pread_gfa (twice), collect_contig_gfa,
# gen_gfa_v1 / gen_gfa_v2, and finally touches {output.falcon_asm_done}.
# Intermediate files (preads.ovl, *.gfa.json, ...) are written with relative
# names, i.e. into the script's working directory.
# `>|` is the shell redirection that overwrites the target even when the
# `noclobber` option is set.
# The `#` lines inside the string are shell comments in the generated script.
TASK_RUN_FALCON_ASM_SCRIPT = """\
# Given, las_fofn.json,
# write preads.ovl:
# mobs uses binwrappers, so it does not see our "entry-points".
# So, after dropping "src/py_scripts/*.py", we can call these via python3 -m:
time python3 -m falcon_kit.mains.ovlp_filter --db {input.db_file} --las-fofn {input.las_fofn} {params.overlap_filtering_setting} --min-len {params.length_cutoff_pr} --out-fn preads.ovl
ln -sf {input.preads4falcon_fasta} ./preads4falcon.fasta
# Given preads.ovl,
# write sg_edges_list, c_path, utg_data, ctg_paths.
time python3 -m falcon_kit.mains.ovlp_to_graph {params.fc_ovlp_to_graph_option} --overlap-file preads.ovl >| fc_ovlp_to_graph.log
# Given sg_edges_list, utg_data, ctg_paths, preads4falcon.fasta,
# write p_ctg.fa and a_ctg_all.fa,
# plus a_ctg_base.fa, p_ctg_tiling_path, a_ctg_tiling_path, a_ctg_base_tiling_path:
time python3 -m falcon_kit.mains.graph_to_contig
# Given a_ctg_all.fa, write a_ctg.fa:
time python3 -m falcon_kit.mains.dedup_a_tigs >| a_ctg.fa
# Given a_ctg.fa and a_ctg_all_tiling_path, write a_ctg_tiling_path:
time python3 -m falcon_kit.mains.dedup_a_tp >| a_ctg_tiling_path
# Collect all info needed to format the GFA-1 and GFA-2 representations of
# the assembly graphs.
time python3 -m falcon_kit.mains.collect_pread_gfa >| asm.gfa.json
time python3 -m falcon_kit.mains.collect_pread_gfa --add-string-graph >| sg.gfa.json
time python3 -m falcon_kit.mains.collect_contig_gfa >| contig.gfa.json
# Output the assembly pread graph.
time python3 -m falcon_kit.mains.gen_gfa_v1 asm.gfa.json >| asm.gfa
time python3 -m falcon_kit.mains.gen_gfa_v2 asm.gfa.json >| asm.gfa2
# Output the string graph.
time python3 -m falcon_kit.mains.gen_gfa_v1 sg.gfa.json >| sg.gfa
time python3 -m falcon_kit.mains.gen_gfa_v2 sg.gfa.json >| sg.gfa2
# Output the contig graph with associate contigs attached to each primary contig.
time python3 -m falcon_kit.mains.gen_gfa_v2 contig.gfa.json >| contig.gfa2
#rm -f ./preads4falcon.fasta
touch {output.falcon_asm_done}
"""
  127. def fn(p): return p
  128. def system(call, check=False):
  129. LOG.debug('$(%s)' % repr(call))
  130. rc = os.system(call)
  131. msg = 'Call %r returned %d.' % (call, rc)
  132. if rc:
  133. LOG.warning(msg)
  134. if check:
  135. raise Exception(msg)
  136. else:
  137. LOG.debug(msg)
  138. return rc
  139. def task_dump_rawread_ids(self):
  140. rawread_db = fn(self.rawread_db)
  141. rawread_id_file = fn(self.rawread_id_file)
  142. input = object()
  143. input.rawread_db = rawread_db
  144. output = object()
  145. output.rawread_id_file = rawread_id_file
  146. system(TASK_DUMP_RAWREAD_IDS_SCRIPT.format(**locals()))
  147. def task_dump_pread_ids(self):
  148. pread_db = fn(self.pread_db)
  149. pread_id_file = fn(self.pread_id_file)
  150. input = object()
  151. input.pread_db = pread_db
  152. output = object()
  153. output.pread_id_file = pread_id_file
  154. system(TASK_DUMP_PREAD_IDS_SCRIPT.format(**locals()))
  155. def task_generate_read_to_ctg_map(self):
  156. input = object()
  157. input.rawread_id_file = fn(self.rawread_id_file)
  158. input.pread_id_file = fn(self.pread_id_file)
  159. input.sg_edges_list = fn(self.sg_edges_list)
  160. input.utg_data = fn(self.utg_data)
  161. input.ctg_paths = fn(self.ctg_paths)
  162. output = object()
  163. output.read_to_contig_map = fn(self.read_to_contig_map)
  164. system(TASK_GENERATE_READ_TO_CTG_MAP_SCRIPT.format(**locals()))