
init commit

liuwei 5 years ago
commit d892e7a198
100 changed files with 18798 additions and 0 deletions
  1. 279 0
      FALCON/falcon_kit/FastaReader.py
  2. 8 0
      FALCON/falcon_kit/__init__.py
  3. 97 0
      FALCON/falcon_kit/align_dw.py
  4. 155 0
      FALCON/falcon_kit/align_edlib.py
  5. 38 0
      FALCON/falcon_kit/bash.py
  6. 179 0
      FALCON/falcon_kit/falcon_kit.py
  7. 180 0
      FALCON/falcon_kit/fc_asm_graph.py
  8. 580 0
      FALCON/falcon_kit/functional.py
  9. 238 0
      FALCON/falcon_kit/gfa_graph.py
  10. 199 0
      FALCON/falcon_kit/io.py
  11. 57 0
      FALCON/falcon_kit/mains/LAmerge.py
  12. 52 0
      FALCON/falcon_kit/mains/LAsort.py
  13. 0 0
      FALCON/falcon_kit/mains/__init__.py
  14. 34 0
      FALCON/falcon_kit/mains/actg_coordinate.py
  15. 219 0
      FALCON/falcon_kit/mains/bam2dexta.py
  16. 79 0
      FALCON/falcon_kit/mains/calc_cutoff.py
  17. 103 0
      FALCON/falcon_kit/mains/collect_contig_gfa.py
  18. 243 0
      FALCON/falcon_kit/mains/collect_pread_gfa.py
  19. 411 0
      FALCON/falcon_kit/mains/consensus.py
  20. 90 0
      FALCON/falcon_kit/mains/consensus_gather_fasta_fofn.py
  21. 148 0
      FALCON/falcon_kit/mains/consensus_split.py
  22. 197 0
      FALCON/falcon_kit/mains/consensus_task.py
  23. 35 0
      FALCON/falcon_kit/mains/contig_annotate.py
  24. 59 0
      FALCON/falcon_kit/mains/copy_fofn.py
  25. 97 0
      FALCON/falcon_kit/mains/copy_mapped.py
  26. 73 0
      FALCON/falcon_kit/mains/cromwell_run_uows_tar.py
  27. 76 0
      FALCON/falcon_kit/mains/cromwell_symlink.py
  28. 55 0
      FALCON/falcon_kit/mains/cromwell_undot.py
  29. 58 0
      FALCON/falcon_kit/mains/cromwell_write_json.py
  30. 85 0
      FALCON/falcon_kit/mains/ctg_link_analysis.py
  31. 1533 0
      FALCON/falcon_kit/mains/dazzler.py
  32. 70 0
      FALCON/falcon_kit/mains/db.py
  33. 138 0
      FALCON/falcon_kit/mains/dedup_a_tigs.py
  34. 48 0
      FALCON/falcon_kit/mains/dedup_a_tp.py
  35. 249 0
      FALCON/falcon_kit/mains/fasta2fasta.py
  36. 384 0
      FALCON/falcon_kit/mains/fasta_filter.py
  37. 220 0
      FALCON/falcon_kit/mains/fasta_subsample.py
  38. 158 0
      FALCON/falcon_kit/mains/fetch_reads.py
  39. 29 0
      FALCON/falcon_kit/mains/gen_gfa_v1.py
  40. 29 0
      FALCON/falcon_kit/mains/gen_gfa_v2.py
  41. 97 0
      FALCON/falcon_kit/mains/generate_read_to_ctg_map.py
  42. 68 0
      FALCON/falcon_kit/mains/generic_gather.py
  43. 117 0
      FALCON/falcon_kit/mains/generic_run_units_of_work.py
  44. 89 0
      FALCON/falcon_kit/mains/generic_scatter_one_uow.py
  45. 82 0
      FALCON/falcon_kit/mains/generic_scatter_uows.py
  46. 101 0
      FALCON/falcon_kit/mains/generic_scatter_uows_tar.py
  47. 74 0
      FALCON/falcon_kit/mains/generic_tar_uows.py
  48. 69 0
      FALCON/falcon_kit/mains/generic_unsplit.py
  49. 107 0
      FALCON/falcon_kit/mains/get_read_ctg_map.py
  50. 338 0
      FALCON/falcon_kit/mains/graph_to_contig.py
  51. 181 0
      FALCON/falcon_kit/mains/graph_to_utgs.py
  52. 413 0
      FALCON/falcon_kit/mains/hgap4_adapt.py
  53. 46 0
      FALCON/falcon_kit/mains/las_write_empty.py
  54. 325 0
      FALCON/falcon_kit/mains/ovlp_filter.py
  55. 142 0
      FALCON/falcon_kit/mains/ovlp_stats.py
  56. 1600 0
      FALCON/falcon_kit/mains/ovlp_to_graph.py
  57. 178 0
      FALCON/falcon_kit/mains/pr_ctg_track.py
  58. 63 0
      FALCON/falcon_kit/mains/reduce_preads.py
  59. 48 0
      FALCON/falcon_kit/mains/report_pre_assembly.py
  60. 185 0
      FALCON/falcon_kit/mains/rr_ctg_track.py
  61. 710 0
      FALCON/falcon_kit/mains/run1.py
  62. 79 0
      FALCON/falcon_kit/mains/symlink_mapped.py
  63. 85 0
      FALCON/falcon_kit/mains/task_report_pre_assembly.py
  64. 40 0
      FALCON/falcon_kit/mains/tasks.py
  65. 53 0
      FALCON/falcon_kit/mains/zmw_collect.py
  66. 160 0
      FALCON/falcon_kit/mains/zmw_subsample.py
  67. 36 0
      FALCON/falcon_kit/multiproc.py
  68. 203 0
      FALCON/falcon_kit/pype.py
  69. 195 0
      FALCON/falcon_kit/pype_tasks.py
  70. 586 0
      FALCON/falcon_kit/run_support.py
  71. 250 0
      FALCON/falcon_kit/snakemake.py
  72. 272 0
      FALCON/falcon_kit/stats_preassembly.py
  73. 0 0
      FALCON/falcon_kit/testkit/__init__.py
  74. 61 0
      FALCON/falcon_kit/testkit/test_assembly.py
  75. 36 0
      FALCON/falcon_kit/testkit/test_foo.py
  76. 199 0
      FALCON/falcon_kit/tiling_path.py
  77. 0 0
      FALCON/falcon_kit/util/__init__.py
  78. 72 0
      FALCON/falcon_kit/util/dataset_split.py
  79. 273 0
      FALCON/falcon_kit/util/io.py
  80. 72 0
      FALCON/falcon_kit/util/ordered_set.py
  81. 126 0
      FALCON/falcon_kit/util/system.py
  82. 97 0
      FALCON/setup.py
  83. 337 0
      FALCON/src/c/DW_banded.c
  84. 20 0
      FALCON/src/c/Makefile
  85. 16 0
      FALCON/src/c/Makefile.osx
  86. 178 0
      FALCON/src/c/common.h
  87. 84 0
      FALCON/src/c/ext_falcon.c
  88. 841 0
      FALCON/src/c/falcon.c
  89. 591 0
      FALCON/src/c/kmer_lookup.c
  90. 20 0
      FALCON/src/py_scripts/fc_run.py
  91. 0 0
      pypeFlow/pwatcher/__init__.py
  92. 512 0
      pypeFlow/pwatcher/blocking.py
  93. 785 0
      pypeFlow/pwatcher/fs_based.py
  94. 0 0
      pypeFlow/pwatcher/mains/__init__.py
  95. 149 0
      pypeFlow/pwatcher/mains/fs_heartbeat.py
  96. 36 0
      pypeFlow/pwatcher/mains/job_start.sh
  97. 176 0
      pypeFlow/pwatcher/mains/network_heartbeat.py
  98. 12 0
      pypeFlow/pwatcher/mains/pwatcher.py
  99. 131 0
      pypeFlow/pwatcher/mains/pypeflow_example.py
  100. 0 0
      pypeFlow/pwatcher/mains/query_server.py

+ 279 - 0
FALCON/falcon_kit/FastaReader.py

@@ -0,0 +1,279 @@
+
+
+from builtins import next
+from builtins import range
+from builtins import object
+from os.path import abspath, expanduser
+from .io import NativeIO as StringIO
+from .io import FilePercenter
+import contextlib
+import gzip
+import hashlib
+import logging
+import os
+import re
+import subprocess
+import sys
+import warnings
+
+LOG = logging.getLogger(__name__)
+
+##
+# Utility functions for FastaReader
+##
+
+
+def wrap(s, columns):
+    return "\n".join(s[start:start + columns]
+                     for start in range(0, len(s), columns))
+
+
+def splitFastaHeader(name):
+    """
+    Split a FASTA/FASTQ header into its id and metadata components
+
+    >>> splitFastaHeader('>m54329_180926_230856/34669168/0_42 FOO=BAR X=Y')
+    ('>m54329_180926_230856/34669168/0_42', 'FOO=BAR X=Y')
+    """
+    nameParts = re.split(r'\s', name, maxsplit=1)
+    id_ = nameParts[0]
+    if len(nameParts) > 1:
+        metadata = nameParts[1].strip()
+    else:
+        metadata = None
+    return (id_, metadata)
+
+
+def splitFileContents(f, delimiter, BLOCKSIZE=8192):
+    """
+    Same semantics as f.read().split(delimiter), but with memory usage
+    determined by largest chunk rather than entire file size
+    """
+    remainder = StringIO()
+    while True:
+        block = f.read(BLOCKSIZE)
+        if not block:
+            break
+        parts = block.split(delimiter)
+        remainder.write(parts[0])
+        for part in parts[1:]:
+            yield remainder.getvalue()
+            remainder = StringIO()
+            remainder.write(part)
+    yield remainder.getvalue()
+
+
+class FastaRecord(object):
+    """
+    A FastaRecord object models a named sequence in a FASTA file.
+    """
+    DELIMITER = ">"
+    COLUMNS = 60
+
+    def __init__(self, name, sequence):
+        try:
+            assert "\n" not in name
+            assert "\n" not in sequence
+            assert self.DELIMITER not in sequence
+            self._name = name
+            self._sequence = sequence
+            self._md5 = hashlib.md5(self._sequence.encode('ascii')).hexdigest()
+            self._id, self._metadata = splitFastaHeader(name)
+        except AssertionError:
+            raise ValueError("Invalid FASTA record data")
+
+    @property
+    def name(self):
+        """
+        The name of the sequence in the FASTA file, equal to the entire
+        FASTA header following the '>' character
+        """
+        return self._name
+
+    @property
+    def id(self):
+        """
+        The id of the sequence in the FASTA file, equal to the FASTA header
+        up to the first whitespace.
+        """
+        return self._id
+
+    @property
+    def metadata(self):
+        """
+        The metadata associated with the sequence in the FASTA file, equal to
+        the contents of the FASTA header following the first whitespace
+        """
+        return self._metadata
+
+    @property
+    def sequence(self):
+        """
+        The sequence for the record as present in the FASTA file.
+        (Newlines are removed but otherwise no sequence normalization
+        is performed).
+        """
+        return self._sequence
+
+    @property
+    def length(self):
+        """
+        Get the length of the FASTA sequence
+        """
+        return len(self._sequence)
+
+    @property
+    def md5(self):
+        """
+        The MD5 checksum (hex digest) of `sequence`
+        """
+        return self._md5
+
+    @classmethod
+    def fromString(cls, s):
+        """
+        Interprets a string as a FASTA record.  Does not make any
+        assumptions about wrapping of the sequence string.
+        """
+        try:
+            lines = s.splitlines()
+            assert len(lines) > 1
+            assert lines[0][0] == cls.DELIMITER
+            name = lines[0][1:]
+            sequence = "".join(lines[1:])
+            return FastaRecord(name, sequence)
+        except AssertionError:
+            raise ValueError("String not recognized as a valid FASTA record")
+
+    def __eq__(self, other):
+        if isinstance(other, self.__class__):
+            return (self.name == other.name and
+                    self._sequence == other._sequence)
+        else:
+            return False
+
+    def __ne__(self, other):
+        return not self.__eq__(other)
+
+    def __str__(self):
+        """
+        Output a string representation of this FASTA record, observing
+        standard conventions about sequence wrapping.
+        """
+        return (">%s\n" % self.name) + \
+            wrap(self._sequence, self.COLUMNS)
+
+
+# These are refactored from ReaderBase/FastaReader.
+
+def yield_fasta_record(f, fn=None, log=LOG.info):
+    """
+    f: fileobj
+    fn: str - filename (for exceptions); inferred from f.name if not provided
+    """
+    if not fn and f is not sys.stdin:
+        fn = getattr(f, 'name', None)
+        if fn is not None and not os.path.exists(fn):
+            log('Not sure what to do with FASTA file name="{}"'.format(fn))
+            fn = ''
+    counter = FilePercenter(fn, log=log)
+    try:
+        parts = splitFileContents(f, ">")
+        assert "" == next(parts)
+        for part in parts:
+            counter(len(part))
+            yield FastaRecord.fromString(">" + part)
+    except AssertionError:
+        raise Exception("Invalid FASTA file {!r}".format(fn))
+
+def yield_fasta_records(f, fn=None, log=LOG.info):
+    warnings.warn('use yield_fasta_record() instead', DeprecationWarning)
+    return yield_fasta_record(f, fn, log)
+
+def stream_stdout(call, fn):
+    args = call.split()
+    proc = subprocess.Popen(args, stdin=open(fn), stdout=subprocess.PIPE)
+    return proc.stdout
+
+
+@contextlib.contextmanager
+def open_fasta_reader(fn, log=LOG.info):
+    """
+    fn: str - filename
+
+    Note: If you already have a fileobj, you can iterate over yield_fasta_record() directly.
+
+    Streaming reader for FASTA files, usable as a one-shot iterator
+    over FastaRecord objects.  Agnostic about line wrapping.
+    Example:
+    .. doctest::
+        TODO: Get data.
+        > from pbcore import data
+        > filename = data.getTinyFasta()
+        > with open_fasta_reader(filename) as r:
+        ...  for record in r:
+        ...     print(record.name, len(record.sequence), record.md5)
+        ref000001|EGFR_Exon_2 183 e3912e9ceacd6538ede8c1b2adda7423
+        ref000002|EGFR_Exon_3 203 4bf218da37175a91869033024ac8f9e9
+        ref000003|EGFR_Exon_4 215 245bc7a046aad0788c22b071ed210f4d
+        ref000004|EGFR_Exon_5 157 c368b8191164a9d6ab76fd328e2803ca
+    """
+    filename = abspath(expanduser(fn))
+    mode = 'r'
+    if filename.endswith(".gz"):
+        ofs = gzip.open(filename, mode)
+    elif filename.endswith(".dexta"):
+        ofs = stream_stdout("undexta -vkU -w60 -i", filename)
+    elif '-' == fn:
+        ofs = sys.stdin
+        filename = fn
+    else:
+        ofs = open(filename, mode)
+    yield yield_fasta_record(ofs, filename, log=log)
+    ofs.close()
+
+
+@contextlib.contextmanager
+def open_fasta_writer(fn, log=LOG.info):
+    """
+    fn: str - filename
+
+    Yield a writer, possibly compressing. This is not actually
+    fasta-specific, except it can also write Gene Myers "dexta".
+
+    Wraps as the default COLUMNS=60.
+
+    Example:
+
+        with open_fasta_reader(ifn) as rin:
+            with open_fasta_writer(ofn) as writer:
+                for record in rin:
+                    writer.write(str(record))
+    """
+    filename = abspath(expanduser(fn))
+    if filename.endswith(".gz"):
+        ofs = gzip.open(filename, 'wb')
+    elif filename.endswith(".dexta"):
+        ofs = stream_stdout("dexta -vk -i", filename)
+    elif '-' == fn:
+        ofs = sys.stdout
+        filename = fn
+    else:
+        ofs = open(filename, 'w')
+
+    yield ofs
+    ofs.close()
+
+
+class FastaReader(object):
+    """Deprecated, but should still work (with filenames).
+    """
+    def __iter__(self):
+        with open_fasta_reader(self.filename, log=self.log) as reader:
+            for rec in reader:
+                yield rec
+
+    def __init__(self, f, log=LOG.info):
+        self.filename = f
+        self.log = log

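A minimal usage sketch for the reader above, assuming the falcon-kit package from this commit is installed; the input path reads.fasta is hypothetical:

    from falcon_kit.FastaReader import FastaRecord, open_fasta_reader

    # Parse one record from a string; fromString() joins wrapped sequence lines.
    rec = FastaRecord.fromString(">read/1/0_8 FOO=BAR\nACGT\nACGT")
    assert rec.id == "read/1/0_8"
    assert rec.metadata == "FOO=BAR"
    assert rec.sequence == "ACGTACGT" and rec.length == 8

    # Stream records from a file (plain, .gz, .dexta, or '-' for stdin).
    with open_fasta_reader("reads.fasta") as reader:
        for record in reader:
            print(record.id, record.length, record.md5)
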
+ 8 - 0
FALCON/falcon_kit/__init__.py

@@ -0,0 +1,8 @@
+from .falcon_kit import *
+__version__ = '1.4.2' # should match setup.py
+
+try:
+    import sys, pkg_resources
+    sys.stderr.write('{}\n'.format(pkg_resources.get_distribution('falcon-kit')))
+except Exception:
+    pass

+ 97 - 0
FALCON/falcon_kit/align_dw.py

@@ -0,0 +1,97 @@
+
+
+
+import argparse
+import sys
+from falcon_kit import kup, falcon, DWA
+
+class TooLongError(Exception): pass
+
+def log(msg):
+    sys.stderr.write(msg)
+    sys.stderr.write('\n')
+
+def get_aln_data(t_seq, q_seq):
+    """
+    Inputs are Unicode.
+    """
+    t_seq = t_seq.encode('ascii')
+    q_seq = q_seq.encode('ascii')
+    aln_data = []
+    #x = []
+    #y = []
+    K = 8
+    seq0 = t_seq
+    lk_ptr = kup.allocate_kmer_lookup(1 << (K * 2))
+    sa_ptr = kup.allocate_seq(len(seq0))
+    sda_ptr = kup.allocate_seq_addr(len(seq0))
+    kup.add_sequence(0, K, seq0, len(seq0), sda_ptr, sa_ptr, lk_ptr)
+    q_id = "dummy"
+
+    kmer_match_ptr = kup.find_kmer_pos_for_seq(
+        q_seq, len(q_seq), K, sda_ptr, lk_ptr)
+    kmer_match = kmer_match_ptr[0]
+
+    if kmer_match.count != 0:
+        aln_range_ptr = kup.find_best_aln_range(kmer_match_ptr, K, K * 5, 12)
+        aln_range = aln_range_ptr[0]
+        #x, y = list(zip(* [(kmer_match.query_pos[i], kmer_match.target_pos[i])
+        #              for i in range(kmer_match.count)]))
+
+        s1, e1, s2, e2 = aln_range.s1, aln_range.e1, aln_range.s2, aln_range.e2
+
+        log('Mapped (q, s1 = {}, e1 = {}, len1 = {}, (e1 - s1) = {}, t, s2 = {}, e2 = {}, (e2 - s2) = {}, len2 = {})'.format(
+                s1, e1, e1 - s1, len(q_seq), s2, e2, e2 - s2, len(t_seq)))
+
+        max_len = 250000 # to keep allocations < 16GB, given band_tol=1500
+        if (e1 - s1) >= max_len or (e2 - s2) >= max_len:
+            # DW.align() would crash, so raise here.
+            # (500000 is the approx. upper bound for int overflow,
+            #  but some users run out of memory anyway.)
+            raise TooLongError('q_len={} or t_len={} is too big (>= max_len={})'.format(
+                (e1-s1), (e2-s2), max_len))
+        if e1 - s1 > 100:
+            log('Calling DW_banded.align(q, s1 = {}, e1 = {}, len1 = {}, (e1 - s1) = {}, t, s2 = {}, e2 = {}, (e2 - s2) = {}, len2 = {})'.format(
+                s1, e1, e1 - s1, len(q_seq), s2, e2, e2 - s2, len(t_seq)))
+            alignment = DWA.align(q_seq[s1:e1], e1 - s1,
+                                  seq0[s2:e2], e2 - s2,
+                                  1500, 1)
+
+            if alignment[0].aln_str_size > 100:
+                aln_data.append((q_id, 0, s1, e1, len(q_seq), s2, e2, len(
+                    seq0), alignment[0].aln_str_size, alignment[0].dist))
+                #aln_str1 = alignment[0].q_aln_str
+                #aln_str0 = alignment[0].t_aln_str
+
+            DWA.free_alignment(alignment)
+
+        kup.free_aln_range(aln_range_ptr)
+
+    kup.free_kmer_match(kmer_match_ptr)
+    kup.free_kmer_lookup(lk_ptr)
+    kup.free_seq_array(sa_ptr)
+    kup.free_seq_addr_array(sda_ptr)
+    return aln_data #, x, y
+
+def get_aln_results(ref_seq, query_seq, min_seq_len):
+    # Align the a_ctg against the base.
+    log('Aligning (DW): len(ref_seq) = %d, len(query_seq) = %d' % (len(ref_seq), len(query_seq)))
+    delta_len = len(query_seq) - len(ref_seq)
+    idt = 0.0
+    cov = 0.0
+    if len(ref_seq) > min_seq_len and len(query_seq) > min_seq_len:
+        try:
+            aln_data = get_aln_data(ref_seq, query_seq)
+            if len(aln_data) != 0:
+                idt = 1.0 - 1.0 * aln_data[-1][-1] / aln_data[-1][-2]
+                cov = 1.0 * (aln_data[-1][3] - aln_data[-1][2]) / aln_data[-1][4]
+            else:
+                log('len(aln_data) == 0!')
+        except TooLongError:
+            log('WARNING: Seqs were too long for get_aln_data(), so we set idt/cov low enough to prevent filtering by dedup_a_tigs. len(ref_seq) = {}, len(query_seq) = {}'.format(len(ref_seq), len(query_seq)))
+            idt = -1.0
+            cov = -1.0
+    return delta_len, idt, cov

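A sketch of calling get_aln_results() above, assuming the ext_falcon C extension has been built and installed; the sequences are randomly generated for illustration only:

    import random
    from falcon_kit.align_dw import get_aln_results

    random.seed(0)
    ref = "".join(random.choice("ACGT") for _ in range(5000))
    qry = ref[100:4900]  # a hypothetical near-duplicate of the reference

    delta_len, idt, cov = get_aln_results(ref, qry, min_seq_len=2000)
    # delta_len = len(qry) - len(ref); idt/cov remain 0.0 if no alignment was kept,
    # and are set to -1.0 when the mapped region exceeds max_len (TooLongError).
    print(delta_len, idt, cov)
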
+ 155 - 0
FALCON/falcon_kit/align_edlib.py

@@ -0,0 +1,155 @@
+
+
+
+import argparse
+import sys
+
+from falcon_kit import kup, falcon
+import edlib
+
+def log(msg):
+    sys.stderr.write(msg)
+    sys.stderr.write('\n')
+
+def count_cigar_ops(cigar):
+    """
+    For curious people: regexes are very slow for parsing CIGAR strings.
+
+    cigar: Unicode
+    """
+    b = 0
+    num_m, num_i, num_d = 0, 0, 0
+    for i in range(len(cigar)):
+        if cigar[i] <= '9':
+            continue
+        # Ensure there is at least one digit before the op char.
+        assert(b < i)
+        count = int(cigar[b:i])
+        op = cigar[i]
+        b = i + 1
+        if op == 'D':
+            num_d += count
+        elif op == 'I':
+            num_i += count
+        elif op in ['M', '=', 'X']:
+            num_m += count
+        else:        # pragma: no cover
+            pass     # pragma: no cover
+    # Ensure there are no dangling characters after the last op.
+    assert(b == len(cigar))
+    total_len = num_d + num_i + num_m
+    return num_m, num_i, num_d, total_len
+
+def get_aln_data(t_seq, q_seq):
+    """
+    Inputs in bytes.
+    """
+    aln_data = []
+    K = 8
+    seq0 = t_seq
+    lk_ptr = kup.allocate_kmer_lookup(1 << (K * 2))
+    sa_ptr = kup.allocate_seq(len(seq0))
+    sda_ptr = kup.allocate_seq_addr(len(seq0))
+    kup.add_sequence(0, K, seq0, len(seq0), sda_ptr, sa_ptr, lk_ptr)
+    q_id = "dummy"
+
+    kmer_match_ptr = kup.find_kmer_pos_for_seq(
+        q_seq, len(q_seq), K, sda_ptr, lk_ptr)
+    kmer_match = kmer_match_ptr[0]
+
+    if kmer_match.count != 0:
+        aln_range_ptr = kup.find_best_aln_range(kmer_match_ptr, K, K * 5, 12)
+        aln_range = aln_range_ptr[0]
+
+        s1, e1, s2, e2 = aln_range.s1, aln_range.e1, aln_range.s2, aln_range.e2
+
+        log('Mapped (q, s1 = {}, e1 = {}, len1 = {}, (e1 - s1) = {}, t, s2 = {}, e2 = {}, (e2 - s2) = {}, len2 = {})'.format(
+                s1, e1, e1 - s1, len(q_seq), s2, e2, e2 - s2, len(t_seq)))
+
+        if e1 - s1 > 100:
+            log('Calling edlib.align(q, s1 = {}, e1 = {}, len1 = {}, (e1 - s1) = {}, t, s2 = {}, e2 = {}, (e2 - s2) = {}, len2 = {})'.format(
+                s1, e1, e1 - s1, len(q_seq), s2, e2, e2 - s2, len(t_seq)))
+
+            # Align using Edlib instead of DWA.
+            edlib_result = edlib.align(q_seq[s1:e1], seq0[s2:e2], mode="NW")
+
+            delta_l = len(q_seq) - len(t_seq)
+            cov = float(e1 - s1) / float(len(q_seq))
+            idt = float(e1 - s1 - edlib_result['editDistance']) / float(e1 - s1)
+
+            aln_data.append((q_id, 0, s1, e1, len(q_seq), s2, e2, len(seq0),
+                            delta_l, idt, cov))
+
+        kup.free_aln_range(aln_range_ptr)
+
+    kup.free_kmer_match(kmer_match_ptr)
+    kup.free_kmer_lookup(lk_ptr)
+    kup.free_seq_array(sa_ptr)
+    kup.free_seq_addr_array(sda_ptr)
+    return aln_data #, x, y
+
+def get_global_aln_results(ref_seq, query_seq, min_seq_len):
+    """
+    Aligns two sequences globally, and returns (delta_len, idt, cov) values
+    compatible with the legacy deduplication code.
+    Currently unused - it was used in an intermediate version, and might be useful
+    at some point in the future.
+
+    Inputs in Unicode.
+    """
+    ref_seq = ref_seq.encode('ascii')
+    query_seq = query_seq.encode('ascii')
+
+    log('Aligning (Edlib): len(ref_seq) = %d, len(query_seq) = %d' % (len(ref_seq), len(query_seq)))
+    delta_len = len(query_seq) - len(ref_seq)
+
+    idt = 0.0
+    cov = 0.0
+
+    if len(ref_seq) < min_seq_len or len(query_seq) < min_seq_len:
+        return delta_len, idt, cov
+
+    result = edlib.align(query_seq, ref_seq, mode="NW", task="path")
+    # result map is Unicode
+    cigar = result['cigar']
+    num_m, num_i, num_d, total_len = count_cigar_ops(cigar)
+    num_eq = (num_m + num_i + num_d) - result['editDistance']
+    num_x = num_m - num_eq
+
+    idt_query = float(num_eq) / float(num_eq + num_x + num_i)
+    idt_ref = float(num_eq) / float(num_eq + num_x + num_d)
+    idt = min(idt_query, idt_ref)
+
+    cov = 1.0
+
+    log('  - Alignment stats: num_m = %d, num_i = %d, num_d = %d, total_len = %d, num_eq = %d, num_x = %d' % (num_m, num_i, num_d, total_len, num_eq, num_x))
+
+    return delta_len, idt, cov
+
+def get_aln_results(ref_seq, query_seq, min_seq_len):
+    """
+    Runs the legacy mapping code, and aligns the selected region using Edlib
+    instead of the legacy DWA alignment with quadratic memory.
+
+    Inputs in Unicode.
+    """
+    ref_seq = ref_seq.encode('ascii')
+    query_seq = query_seq.encode('ascii')
+
+    log('Aligning (Edlib): len(ref_seq) = %d, len(query_seq) = %d' % (len(ref_seq), len(query_seq)))
+
+    delta_len = len(query_seq) - len(ref_seq)
+    idt = 0.0
+    cov = 0.0
+
+    if len(ref_seq) < min_seq_len or len(query_seq) < min_seq_len:
+        return delta_len, idt, cov
+
+    aln_data = get_aln_data(ref_seq, query_seq)
+    if len(aln_data) != 0:
+        delta_len, idt, cov = aln_data[-1][8:11]
+
+    else:
+        log('len(aln_data) == 0!')
+
+    return delta_len, idt, cov

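count_cigar_ops() is plain Python bookkeeping and easy to check; a small sketch, assuming falcon-kit and its edlib dependency are importable:

    from falcon_kit.align_edlib import count_cigar_ops

    # 5 matches, 1 mismatch, 4 matches, 2 insertions, 3 deletions.
    num_m, num_i, num_d, total_len = count_cigar_ops('5=1X4=2I3D')
    assert (num_m, num_i, num_d, total_len) == (10, 2, 3, 15)
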
+ 38 - 0
FALCON/falcon_kit/bash.py

@@ -0,0 +1,38 @@
+"""Most bash-scripting is generated here.
+"""
+
+
+BASH = '/bin/bash'
+BUG_avoid_Text_file_busy = True
+# http://stackoverflow.com/questions/1384398/usr-bin-perl-bad-interpreter-text-file-busy/
+
+
+def write_sub_script(ofs, script):
+    # We use shebang + chmod so we can see the sub-script in 'top'.
+    # In order to avoid '/bin/bash: bad interpreter: Text file busy',
+    # we 'touch' the sub-script after chmod.
+    #   http://superuser.com/questions/934300/bin-bash-bad-interpreter-text-file-busy-even-though-the-file-editor-closed
+    ofs.write('#!{}\n'.format(BASH))
+    ofs.write('set -vex\n')
+    ofs.write(script)
+
+    if BUG_avoid_Text_file_busy:
+        exe = BASH
+    else:
+        # We prefer to run via shebang b/c we want the script-name to appear to 'top',
+        # but some users have a problem with that, e.g.
+        #   https://github.com/PacificBiosciences/FALCON/issues/269
+        # Another idea never worked reliably:
+        # chmod +x {sub_script_bfn}
+        # touch {sub_script_bfn}
+        # We are trying to avoid this problem:
+        #   /bin/bash: bad interpreter: Text file busy
+        exe = ''
+    return exe
+
+
+def write_script(script, script_fn, job_done_fn=None):
+    if job_done_fn:
+        script += '\ntouch {}\n'.format(job_done_fn)
+    with open(script_fn, 'w') as ofs:
+        exe = write_sub_script(ofs, script)

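A short sketch of write_script(), which wraps a command in the bash preamble and an optional sentinel touch; the file names task.sh and task_done are hypothetical:

    from falcon_kit.bash import write_script

    write_script('echo hello world\n', 'task.sh', job_done_fn='task_done')
    # task.sh now holds the '#!/bin/bash' shebang, 'set -vex',
    # the command, and a trailing 'touch task_done'.
    print(open('task.sh').read())
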
+ 179 - 0
FALCON/falcon_kit/falcon_kit.py

@@ -0,0 +1,179 @@
+
+
+__all__ = [
+    'kup', 'DWA', 'falcon',
+    'KmerLookup', 'KmerMatch', 'AlnRange', 'ConsensusData',
+    'Alignment', 'get_alignment',
+]
+
+from ctypes import *
+import os
+import ext_falcon
+#module_path = os.path.split(__file__)[0]
+
+
+seq_coor_t = c_int
+base_t = c_uint8
+
+
+class KmerLookup(Structure):
+    _fields_ = [("start", seq_coor_t),
+                ("last", seq_coor_t),
+                ("count", seq_coor_t)]
+
+
+class KmerMatch(Structure):
+    _fields_ = [("count", seq_coor_t),
+                ("query_pos", POINTER(seq_coor_t)),
+                ("target_pos", POINTER(seq_coor_t))]
+
+
+class AlnRange(Structure):
+    _fields_ = [("s1", seq_coor_t),
+                ("e1", seq_coor_t),
+                ("s2", seq_coor_t),
+                ("e2", seq_coor_t),
+                ("score", c_long)]
+
+
+class ConsensusData(Structure):
+    _fields_ = [("sequence", c_char_p),
+                ("eff_cov", POINTER(c_uint))]
+
+
+try:
+    falcon_dll = CDLL(ext_falcon.__file__)
+except OSError:
+    # It seems that setup.py has changed the __file__ it attaches to an extension module.
+    # I have no idea why, but this works around it.
+    falcon_dll = CDLL(os.path.join(os.path.dirname(__file__),
+                                   '..', os.path.basename(ext_falcon.__file__)))
+
+kup = falcon_dll
+
+kup.allocate_kmer_lookup.argtypes = [seq_coor_t]
+kup.allocate_kmer_lookup.restype = POINTER(KmerLookup)
+kup.init_kmer_lookup.argtypes = [POINTER(KmerLookup), seq_coor_t]
+kup.free_kmer_lookup.argtypes = [POINTER(KmerLookup)]
+
+kup.allocate_seq.argtypes = [seq_coor_t]
+kup.allocate_seq.restype = POINTER(base_t)
+kup.init_seq_array.argtypes = [POINTER(base_t), seq_coor_t]
+kup.free_seq_array.argtypes = [POINTER(base_t)]
+
+kup.allocate_seq_addr.argtypes = [seq_coor_t]
+kup.allocate_seq_addr.restype = POINTER(seq_coor_t)
+kup.free_seq_addr_array.argtypes = [POINTER(seq_coor_t)]
+
+kup.add_sequence.argtypes = [seq_coor_t, c_uint, POINTER(c_char), seq_coor_t, POINTER(seq_coor_t),
+                             POINTER(c_uint8), POINTER(KmerLookup)]
+kup.mask_k_mer.argtypes = [c_long, POINTER(KmerLookup), c_long]
+kup.find_kmer_pos_for_seq.argtypes = [POINTER(c_char), seq_coor_t, c_uint, POINTER(seq_coor_t),
+                                      POINTER(KmerLookup)]
+kup.find_kmer_pos_for_seq.restype = POINTER(KmerMatch)
+kup.free_kmer_match.argtypes = [POINTER(KmerMatch)]
+
+
+kup.find_best_aln_range.argtypes = [
+    POINTER(KmerMatch), seq_coor_t, seq_coor_t, seq_coor_t]
+kup.find_best_aln_range.restype = POINTER(AlnRange)
+kup.find_best_aln_range2.argtypes = [
+    POINTER(KmerMatch), seq_coor_t, seq_coor_t, seq_coor_t]
+kup.find_best_aln_range2.restype = POINTER(AlnRange)
+kup.free_aln_range.argtypes = [POINTER(AlnRange)]
+
+
+class Alignment(Structure):
+    """
+    typedef struct {
+        seq_coor_t aln_str_size ;
+        seq_coor_t dist ;
+        seq_coor_t aln_q_s;
+        seq_coor_t aln_q_e;
+        seq_coor_t aln_t_s;
+        seq_coor_t aln_t_e;
+        char * q_aln_str;
+        char * t_aln_str;
+    } alignment;
+    """
+    _fields_ = [("aln_str_size", seq_coor_t),
+                ("dist", seq_coor_t),
+                ("aln_q_s", seq_coor_t),
+                ("aln_q_e", seq_coor_t),
+                ("aln_t_s", seq_coor_t),
+                ("aln_t_e", seq_coor_t),
+                ("q_aln_str", c_char_p),
+                ("t_aln_str", c_char_p)]
+
+
+DWA = falcon_dll
+
+DWA.align.argtypes = [POINTER(c_char), c_long, POINTER(
+    c_char), c_long, c_long, c_int]
+DWA.align.restype = POINTER(Alignment)
+DWA.free_alignment.argtypes = [POINTER(Alignment)]
+
+
+falcon = falcon_dll
+
+falcon.generate_consensus.argtypes = [
+    POINTER(c_char_p), c_uint, c_uint, c_uint, c_uint, c_uint, c_double]
+falcon.generate_consensus.restype = POINTER(ConsensusData)
+falcon.free_consensus_data.argtypes = [POINTER(ConsensusData)]
+
+
+def get_alignment(seq1, seq0):
+    K = 8
+    lk_ptr = kup.allocate_kmer_lookup(1 << (K * 2))
+    sa_ptr = kup.allocate_seq(len(seq0))
+    sda_ptr = kup.allocate_seq_addr(len(seq0))
+    kup.add_sequence(0, K, seq0, len(seq0), sda_ptr, sa_ptr, lk_ptr)
+
+    kmer_match_ptr = kup.find_kmer_pos_for_seq(
+        seq1, len(seq1), K, sda_ptr, lk_ptr)
+    kmer_match = kmer_match_ptr[0]
+    aln_range_ptr = kup.find_best_aln_range(kmer_match_ptr, K, K * 10, 50)
+    #x,y = zip( * [ (kmer_match.query_pos[i], kmer_match.target_pos[i]) for i in range(kmer_match.count )] )
+    kup.free_kmer_match(kmer_match_ptr)
+    aln_range = aln_range_ptr[0]
+    s1, e1, s2, e2 = aln_range.s1, aln_range.e1, aln_range.s2, aln_range.e2
+    kup.free_aln_range(aln_range_ptr)
+
+    if e1 - s1 > 500:
+        #s1 = 0 if s1 < 14 else s1 - 14
+        #s2 = 0 if s2 < 14 else s2 - 14
+        e1 = len(seq1) if e1 >= len(seq1) - 2 * K else e1 + K * 2
+        e2 = len(seq0) if e2 >= len(seq0) - 2 * K else e2 + K * 2
+
+        alignment = DWA.align(seq1[s1:e1], e1 - s1,
+                              seq0[s2:e2], e2 - s2,
+                              100,
+                              0)
+        # print seq1[s1:e1]
+        # print seq0[s2:e2]
+        # if alignment[0].aln_str_size > 500:
+
+        #aln_str1 = alignment[0].q_aln_str
+        #aln_str0 = alignment[0].t_aln_str
+        aln_size = alignment[0].aln_str_size
+        aln_dist = alignment[0].dist
+        aln_q_s = alignment[0].aln_q_s
+        aln_q_e = alignment[0].aln_q_e
+        aln_t_s = alignment[0].aln_t_s
+        aln_t_e = alignment[0].aln_t_e
+
+        # print "X,",alignment[0].aln_q_s, alignment[0].aln_q_e
+        # print "Y,",alignment[0].aln_t_s, alignment[0].aln_t_e
+
+        # print aln_str1
+        # print aln_str0
+
+        DWA.free_alignment(alignment)
+
+    kup.free_seq_addr_array(sda_ptr)
+    kup.free_seq_array(sa_ptr)
+    kup.free_kmer_lookup(lk_ptr)
+    if e1 - s1 > 500 and aln_size > 500:
+        return s1, s1 + aln_q_e - aln_q_s, s2, s2 + aln_t_e - aln_t_s, aln_size, aln_dist
+    else:
+        return None

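The ctypes bindings above operate on byte strings; a hedged sketch of get_alignment(), assuming the ext_falcon shared library has been built (sequences are randomly generated for illustration):

    import random
    from falcon_kit.falcon_kit import get_alignment

    random.seed(42)
    seq0 = "".join(random.choice("ACGT") for _ in range(3000)).encode('ascii')
    seq1 = seq0[200:2800]  # hypothetical query contained in the target

    hit = get_alignment(seq1, seq0)
    if hit is not None:
        s1, e1, s2, e2, aln_size, aln_dist = hit
        print(s1, e1, s2, e2, aln_size, aln_dist)
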
+ 180 - 0
FALCON/falcon_kit/fc_asm_graph.py

@@ -0,0 +1,180 @@
+
+
+from builtins import zip
+from builtins import object
+from .FastaReader import open_fasta_reader
+from .io import FilePercenter
+import networkx as nx
+
+RCMAP = dict(list(zip("ACGTacgtNn-", "TGCAtgcaNn-")))
+
+
+def reverse_end(node_id):
+    node_id, end = node_id.split(":")
+    new_end = "B" if end == "E" else "E"
+    return node_id + ":" + new_end
+
+
+class AsmGraph(object):
+
+    def __init__(self, sg_file, utg_file, ctg_file):
+        self.sg_edges = {}
+        self.sg_edge_seqs = {}
+        self.utg_data = {}
+        self.ctg_data = {}
+        self.utg_to_ctg = {}
+        self.node_to_ctg = {}
+        self.node_to_utg = {}
+
+        self.load_sg_data(sg_file)
+        self.load_utg_data(utg_file)
+        self.load_ctg_data(ctg_file)
+
+        self.build_node_map()
+
+    def load_sg_data(self, sg_file):
+        counter = FilePercenter(sg_file)
+        with open(sg_file) as f:
+            for l in f:
+                counter(len(l))
+                l = l.strip().split()
+                v, w = l[0:2]
+                seq_id, b, e = l[2:5]
+                b, e = int(b), int(e)
+                score, idt = l[5:7]
+                score, idt = int(score), float(idt)
+                type_ = l[7]
+                self.sg_edges[(v, w)] = ((seq_id, b, e), score, idt, type_)
+
+    def load_sg_seq(self, fasta_fn):
+        all_read_ids = set()  # read ids in the graph
+
+        for v, w in self.sg_edges:
+            type_ = self.sg_edges[(v, w)][-1]
+            if type_ != "G":
+                continue
+            v = v.split(":")[0]
+            w = w.split(":")[0]
+            all_read_ids.add(v)
+            all_read_ids.add(w)
+
+        seqs = {}
+        # load all p-read name into memory
+        with open_fasta_reader(fasta_fn) as f:
+            for r in f:
+                if r.name not in all_read_ids:
+                    continue
+                seqs[r.name] = r.sequence.upper()
+
+        for v, w in self.sg_edges:
+            seq_id, s, t = self.sg_edges[(v, w)][0]
+            type_ = self.sg_edges[(v, w)][-1]
+
+            if type_ != "G":
+                continue
+
+            if s < t:
+                e_seq = seqs[seq_id][s:t]
+            else:
+                e_seq = "".join([RCMAP[c] for c in seqs[seq_id][t:s][::-1]])
+            self.sg_edge_seqs[(v, w)] = e_seq
+
+    def get_seq_from_path(self, path):
+        if len(self.sg_edge_seqs) == 0:
+            return ""
+        v = path[0]
+        seqs = []
+        for w in path[1:]:
+            seqs.append(self.sg_edge_seqs[(v, w)])
+            v = w
+        return "".join(seqs)
+
+    def load_utg_data(self, utg_file):
+        counter = FilePercenter(utg_file)
+        with open(utg_file) as f:
+            for l in f:
+                counter(len(l))
+                l = l.strip().split()
+                s, v, t = l[0:3]
+                type_, length, score = l[3:6]
+                length, score = int(length), int(score)
+                path_or_edges = l[6]
+                self.utg_data[(s, t, v)] = (
+                    type_, length, score, path_or_edges)
+
+    def load_ctg_data(self, ctg_file):
+        counter = FilePercenter(ctg_file)
+        with open(ctg_file) as f:
+            for l in f:
+                counter(len(l))
+                l = l.strip().split()
+                ctg_id, ctg_type = l[0:2]
+                start_edge = l[2]
+                end_node = l[3]
+                length = int(l[4])
+                score = int(l[5])
+                path = tuple((e.split("~") for e in l[6].split("|")))
+                self.ctg_data[ctg_id] = (
+                    ctg_type, start_edge, end_node,  length, score, path)
+                for u in path:
+                    s, v, t = u
+                    # print(s, v, t)
+                    type_, length, score, path_or_edges = self.utg_data[(
+                        s, t, v)]
+                    if type_ != "compound":
+                        self.utg_to_ctg[(s, t, v)] = ctg_id
+                    else:
+                        for svt in path_or_edges.split("|"):
+                            s, v, t = svt.split("~")
+                            self.utg_to_ctg[(s, t, v)] = ctg_id
+
+    def get_sg_for_utg(self, utg_id):
+        sg = nx.DiGraph()
+        type_, length, score, path_or_edges = self.utg_data[utg_id]
+        if type_ == "compound":
+            for svt in path_or_edges.split("|"):
+                s, v, t = svt.split("~")
+                type_, length, score, one_path = self.utg_data[(s, t, v)]
+                one_path = one_path.split("~")
+                nx.add_path(sg, one_path)
+        else:
+            one_path = path_or_edges.split("~")
+            nx.add_path(sg, one_path)
+        return sg
+
+    def get_sg_for_ctg(self, ctg_id):
+        sg = nx.DiGraph()
+        utgs = []
+        path = self.ctg_data[ctg_id][-1]
+        for s, v, t in path:
+            type_, length, score, path_or_edges = self.utg_data[(s, t, v)]
+            utgs.append((type_, path_or_edges))
+
+        for t, utg in utgs:
+            if t == "simple":
+                one_path = utg.split("~")
+                nx.add_path(sg, one_path)
+            elif t == "compound":
+                for svt in utg.split("|"):
+                    s, v, t = svt.split("~")
+                    type_, length, score, one_path = self.utg_data[(s, t, v)]
+                    one_path = one_path.split("~")
+                    nx.add_path(sg, one_path)
+
+        return sg
+
+    def build_node_map(self):
+
+        for ctg_id in self.ctg_data:
+            sg = self.get_sg_for_ctg(ctg_id)
+            for n in sg.nodes():
+                self.node_to_ctg.setdefault(n, set())
+                self.node_to_ctg[n].add(ctg_id)
+
+        for u_id in self.utg_data:
+            if self.utg_data[u_id][0] == "compound":
+                continue
+            sg = self.get_sg_for_utg(u_id)
+            for n in sg.nodes():
+                self.node_to_utg.setdefault(n, set())
+                self.node_to_utg[n].add(u_id)

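AsmGraph is built from three text outputs of the assembly step; a sketch assuming FALCON's conventional file names (sg_edges_list, utg_data, ctg_paths) in the working directory:

    from falcon_kit.fc_asm_graph import AsmGraph

    g = AsmGraph('sg_edges_list', 'utg_data', 'ctg_paths')
    for ctg_id, (ctg_type, start_edge, end_node, length, score, path) in g.ctg_data.items():
        sg = g.get_sg_for_ctg(ctg_id)  # networkx.DiGraph over string-graph nodes
        print(ctg_id, ctg_type, length, sg.number_of_nodes())
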
+ 580 - 0
FALCON/falcon_kit/functional.py

@@ -0,0 +1,580 @@
+"""Purely functional code.
+"""
+
+
+
+
+
+from future.utils import viewitems
+from .io import NativeIO as StringIO
+import collections
+import logging
+import re
+
+LOG = logging.getLogger(__name__)
+
+
+def _verify_pairs(pairs1, pairs2):
+    if pairs1 != pairs2:  # pragma: no cover
+        print('pair2dali:', pairs1)
+        print('pair2sort:', pairs2)
+        print('dali-sort:', set(pairs1) - set(pairs2))
+        print('sort-dali:', set(pairs2) - set(pairs1))
+        print('pair2dali:', len(pairs1))
+        print('pair2sort:', len(pairs2))
+        assert pairs1 == pairs2
+
+
+def skip_LAcheck(bash):
+    def lines():
+        for line in StringIO(bash):
+            if 'LAcheck' in line:
+                yield 'set +e\n'
+                yield line
+                yield 'set -e\n'
+            else:
+                yield line
+    return ''.join(lines())
+
+
+def get_daligner_job_descriptions_sans_LAcheck(run_jobs_stream, db_prefix, single=False):
+    """Strip LAcheck (somehow) from each bash script.
+    (For now, we will run it but not fail on error.)
+    """
+    descs = get_daligner_job_descriptions(run_jobs_stream, db_prefix, single)
+    result = {}
+    for (k, v) in viewitems(descs):
+        bash = skip_LAcheck(v)
+        bash = bash.replace(
+            'LAsort', 'python3 -m falcon_kit.mains.LAsort {}'.format(db_prefix))
+        bash = bash.replace(
+            'LAmerge', 'python3 -m falcon_kit.mains.LAmerge {}'.format(db_prefix))
+        result[k] = bash
+    return result
+
+
+def get_daligner_job_descriptions(run_jobs_stream, db_prefix, single=False):
+    """Return a dict of job-desc-tuple -> HPCdaligner bash-job.
+
+    Comments are ignored.
+
+    E.g., each item will look like:
+      ('.2', '.1', '.2', '.3'): 'daligner
+
+    Rationale
+    ---------
+    For i/o efficiency, we want to combine daligner calls with LAsort/LAmerge lines. But
+    Gene has done this himself too. So now, we only want the daligner calls here.
+
+    Later, we will do the extra LAmerge lines, grouped by A-read.
+    """
+    re_block_dali = re.compile(r'%s(\.\d+|)' % db_prefix)
+
+    def blocks_dali(line):
+        """Return ['.1', '.2', ...]
+        Can return [''] if only 1 block.
+        """
+        return [mo.group(1) for mo in re_block_dali.finditer(line)]
+    # X == blocks[0]; A/B/C = blocks[...]
+
+    lines = [line.strip() for line in run_jobs_stream]
+    # in case caller passed filename, not stream
+    assert any(len(l) > 1 for l in lines), repr('\n'.join(lines))
+
+    lines_dali = [l for l in lines if l.startswith(
+        'daligner')]  # could be daligner_p
+    result = {}
+    for dali in lines_dali:
+        id = tuple(blocks_dali(dali))
+        early_checks = [
+            "LAcheck -v {db_prefix} *.las".format(db_prefix=db_prefix)]
+        script = '\n'.join([dali] + early_checks) + '\n'
+        result[id] = script
+    return result
+
+
+re_first_block_las = re.compile(r'^(?:\S+)(?:\s+-\S+)*\s+[^\.]+\.(\d+|)')
+
+
+def first_block_las(line):
+    """
+    >>> first_block_las('LAsort -v -a foo.1.foo.1.C0')
+    1
+    """
+    mo = re_first_block_las.search(line)
+    try:
+        return int(mo.group(1))
+    except Exception as e:
+        raise Exception('Pattern {!r} does not match line {!r}: {}'.format(
+            re_first_block_las.pattern, line, e))
+
+
+def get_las_filenames(mjob_data, db_prefix):
+    """Given result of get_mjob_data(),
+    return {int: final_las_filename}
+    """
+    # This is our best guess.
+    # (We might not even need this, since we know the output filename of each merge-task by convention.)
+    # Eventually, we need to re-write HPC.daligner.
+    result = {}
+    re_LAmerge = re.compile(r'^LAmerge\s+(?:\-\S+\s+)(\S+)')
+    re_LAcheck = re.compile(r'^LAcheck\s+(?:\-\S+\s+)\S+\s+(\S+)')
+    for (p_id, bash_lines) in viewitems(mjob_data):
+        if not bash_lines:
+            # The daligner+LAsort+LAmerge job produced the final .las
+            # for this block. We will symlink it later.
+            las_fn = '{}.{}.las'.format(db_prefix, p_id)
+            result[p_id] = las_fn
+            continue
+        # Find the last line which can tell us the final .las name.
+        i = len(bash_lines) - 1
+        while bash_lines[i].split()[0] not in ('LAmerge', 'LAcheck'):
+            i -= 1
+        # Now we will raise an exception if there were none. But in theory, there
+        # must be at least an LAsort.
+        first_word = bash_lines[i].split()[0]
+        if first_word == 'LAmerge':
+            regex = re_LAmerge
+        elif first_word == 'LAcheck':
+            regex = re_LAcheck
+        else:
+            raise Exception('first_word={!r} in line#{} of {!r}'.format(
+                first_word, i, bash_lines))
+        mo = regex.search(bash_lines[i])
+        if not mo:
+            raise Exception('Regex {!r} failed on {!r}'.format(
+                regex.pattern, bash_lines[i]))
+        las_fn = mo.group(1) + '.las'
+        result[p_id] = las_fn
+    return result
+
+
+def get_mjob_data(run_jobs_stream):
+    """Given output of HPC.daligner,
+    return {int: [bash-lines]}
+    """
+    f = run_jobs_stream
+
+    # Strip either '&& rm ...' or '; rm ...' ?
+    #re_strip_rm = re.compile(r'^(.*) ((\&\&)|;) .*$')
+
+    mjob_data = {}
+    for l in f:
+        l = l.strip()
+        first_word = l.split()[0]
+        if first_word not in ("LAsort", "LAmerge", "rm"):
+            continue
+        if first_word in ["LAsort"]:
+            # We now run this part w/ daligner, but we still need
+            # a small script for some book-keeping.
+            p_id = first_block_las(l)
+            mjob_data.setdefault(p_id, [])
+            # mjob_data[p_id].append(  " ".join(l) ) # Already done w/ daligner!
+            raise Exception('We do not expect to see LAsort at all anymore.')
+        elif first_word in ["LAmerge"]:
+            p_id = first_block_las(l)
+            mjob_data.setdefault(p_id, [])
+            # l = re_strip_rm.sub(r'\1', l) # (LAmerge && rm) rm is very safe if we run in /tmp
+            mjob_data[p_id].append(l)
+            #LOG.info('{}: {}'.format(p_id, l))
+        elif first_word in ["rm"]:
+            p_id = first_block_las(l)
+            mjob_data.setdefault(p_id, [])
+            mjob_data[p_id].append(l)
+            #LOG.info('rm{}: {}'.format(p_id, l))
+    #for key, data in mjob_data.items():
+    #    mjob_data[key] = '\n'.join(data)
+    return mjob_data
+
+
+def yield_args_from_line(bash_line):
+    """Given a line of LAmerge, etc.,
+    return [output_las_fn, input_las_fn0, input_las_fn1, ...]
+    """
+    for word in bash_line.split():
+        if word.startswith('-') or word in ('LAcheck', 'LAmerge', 'LAsort'):
+            continue
+        yield word
+
+
+_re_sub_daligner = re.compile(r'^daligner\b', re.MULTILINE)
+
+
+def xform_script_for_preads(script):
+    daligner_exe = 'daligner_p'
+    # , flags=re.MULTILINE) # flags in py2.7
+    return _re_sub_daligner.sub(daligner_exe, script)
+
+
+def xform_script_for_raw_reads(script):
+    return script
+
+
+def get_script_xformer(pread_aln):
+    if pread_aln:
+        return xform_script_for_preads
+    else:
+        return xform_script_for_raw_reads
+
+
+class GenomeCoverageError(Exception):
+    pass
+
+
+def calc_cutoff_from_reverse_sorted_readlength_counts(rl_counts, target):
+    """Return first read_len which gives at least 'target' bases.
+    """
+    total = sum(pair[0] * pair[1] for pair in rl_counts)
+    subtotal = 0
+    if target > total:
+        msg = 'Not enough reads available for desired genome coverage (bases needed={} > actual={})'.format(
+            target, total)
+        raise GenomeCoverageError(msg)
+    cutoff = 0
+    for (rl, count) in rl_counts:
+        subtotal += rl * count
+        if subtotal >= target:
+            cutoff = rl
+            break
+    else:  # pragma: no cover
+        msg = 'Impossible target (probably a bug): target={target}, subtotal={subtotal}, total={total}'.format(
+            **locals())
+        raise Exception(msg)
+    return cutoff
+
+
+def num2int(num):
+    """
+    >>> num2int('1,000,000')
+    1000000
+    """
+    return int(num.replace(',', ''))
+
+
+def get_reverse_sorted_readlength_counts_from_DBstats(DBstats_output):
+    """Return pairs of (readlength, count).
+        Bin:      Count  % Reads  % Bases     Average
+    169,514:          1      0.0      0.0      169514
+    ...
+    ->
+    [(169514, 1), ...]
+    """
+    rl_counts = list()
+    lines = DBstats_output.splitlines()
+    re_stat = re.compile(
+        r'^\s*(?P<bin>\S+):\s+(?P<count>\S+)\s+\S+\s+\S+\s+\S+\s*$')
+    for line in lines:
+        match = re_stat.search(line)
+        if not match:
+            continue
+        rl = num2int(match.group('bin'))
+        count = num2int(match.group('count'))
+        rl_counts.append((rl, count))
+    return rl_counts
+
+
+def calc_cutoff(target, DBstats_output):
+    """Calculate the length_cutoff needed for at least 'target' bases.
+    DBstats_output: ASCII output of 'DBstats -b1 DB',
+    """
+    rl_counts = get_reverse_sorted_readlength_counts_from_DBstats(
+        DBstats_output)
+    return calc_cutoff_from_reverse_sorted_readlength_counts(rl_counts, target)
+
+
+def parse_2columns_of_ints(data):
+    r"""Given 2 columns of integers,
+    space- and line-delimited,
+    yield tuples.
+
+    >>> tuple(parse_2columns_of_ints("1 2\n3 4"))
+    ((1, 2), (3, 4))
+    """
+    for line in data.splitlines():
+        line = line.strip()
+        if not line:
+            continue
+        yield tuple(int(x) for x in line.split())
+
+
+def weighted_average(cols):
+    """Given tuples of (weight, value),
+    return weighted average.
+
+    >>> weighted_average(((100, 1), (200, 2), (100, 5)))
+    2.5
+    """
+    return sum(w * v for (w, v) in cols) / sum(w for (w, v) in cols)
+
+
+def parsed_readlengths_from_dbdump_output(output):
+    """Given output text from the DBump command,
+    yield all read-lengths.
+    """
+    re_length = re.compile(r'^L\s+\d+\s+(\d+)\s+(\d+)$')
+    for line in output.splitlines():
+        mo = re_length.search(line)
+        if mo:
+            beg, end = mo.group(1, 2)
+            beg = int(beg)
+            end = int(end)
+            yield end - beg
+
+
+def mapped_readlengths_from_dbdump_output(output):
+    """Given output text from the DBump command,
+    return dict of (id => read-length).
+    There will be alternate lines like these:
+      R #
+      L # # #
+    https://dazzlerblog.wordpress.com/command-guides/dazz_db-command-guide/
+    """
+    lengths = dict()
+    re_rid = re.compile(r'^R\s+(\d+)$')
+    re_length = re.compile(r'^L\s+(\d+)\s+(\d+)\s+(\d+)$')
+    for line in output.splitlines():
+        mo = re_rid.search(line)
+        if mo:
+            idx = int(mo.group(1))
+            continue
+        mo = re_length.search(line)
+        if mo:
+            well, beg, end = mo.group(1, 2, 3)
+            well = int(idx)
+            beg = int(beg)
+            end = int(end)
+            lengths[idx] = (end - beg)
+            continue
+    return lengths
+
+
+def average_difference(dictA, dictB):
+    """Return the average difference of
+    values in dictA minus dictB, only
+    using values in dictA.
+    If a value is missing from dictB, raise Exception.
+    """
+    total_diff = 0.0
+    for (k, va) in viewitems(dictA):
+        vb = dictB[k]
+        total_diff += (va - vb)
+    return total_diff / len(dictA)
+
+
+def calc_metric_fragmentation(perl_counts_output):
+    # """perl -e 'while (<>) { if ( m{>[^/]+/(\d+)\d/} ) { $id{$1}++; } }; while (my ($k, $v) = each %%id) { $counts{$v}++; }; while (my ($k, $v) = each %%counts) { print "$v $k\n"; };' %s""" %(fastas)
+    cols = tuple(parse_2columns_of_ints(perl_counts_output))
+    avg = weighted_average(cols)
+    return avg
+
+
+def calc_metric_truncation(dbdump_output, length_pairs_output):
+    # """perl -e 'while (<>) { if ( m{>[^/]+/0*(\d+)\d/(\d+)_(\d+)} ) { $lengths{$1} += ($3 - $2); } }; while (my ($k, $v) = each %%lengths) { print "$k $v\n"; };' %s""" %(fastas)
+    cols = tuple(parse_2columns_of_ints(length_pairs_output))
+    pread_lengths = dict((k, v) for (k, v) in cols)
+    orig_lengths = mapped_readlengths_from_dbdump_output(dbdump_output)
+    avg = -average_difference(pread_lengths, orig_lengths)
+    return avg
+
+
+def choose_cat_fasta(fofn):
+    """Given the contents of a fasta FOFN,
+    return a command to write the contents of a fasta to stdout,
+    keeping the original file.
+    Raise Exception on error.
+
+    >>> choose_cat_fasta('abc.gz')
+    'zcat '
+    >>> choose_cat_fasta('abc.dexta')
+    'undexta -vkU -w60 -i < '
+    >>> choose_cat_fasta('abc')
+    'cat '
+    """
+    first_line = fofn.splitlines()[0]
+    if first_line.endswith('.gz'):
+        return 'zcat '
+    elif first_line.endswith('.dexta'):
+        return 'undexta -vkU -w60 -i < '
+    else:
+        return 'cat '
+
+
+re_underscore_flag = re.compile(r'(--[\w-]+)(_)')
+def dash_flags(val):
+    """
+    >>> dash_flags('--foo_bar --one_two_three')
+    '--foo-bar --one-two-three'
+    >>> dash_flags('')
+    ''
+    """
+    while True:
+        # Repeat until settled, as there might be multiple _ in the same flag.
+        new_val = re_underscore_flag.sub(r'\1-', val)
+        if new_val == val:
+            return new_val
+        val = new_val
+
+
+def cfg_tobool(v):
+    """
+    >>> cfg_tobool('yes')
+    True
+    >>> cfg_tobool('true')
+    True
+    >>> cfg_tobool('T')
+    True
+    >>> cfg_tobool('1')
+    True
+    >>> cfg_tobool('no')
+    False
+    >>> cfg_tobool('false')
+    False
+    >>> cfg_tobool('F')
+    False
+    >>> cfg_tobool('0')
+    False
+    >>> cfg_tobool('')
+    False
+    """
+    if v in (True, False, None):
+        return v
+    if not v:
+        return False
+    if v.upper()[0] in ('T', 'Y'):
+        return True
+    if v.upper()[0] in ('F', 'N'):
+        return False
+    return bool(int(v))
+
+
+# https://stackoverflow.com/questions/3387691/how-to-perfectly-override-a-dict
+# We derive from dict instead of from MutableMapping so that json.dumps() works.
+
+_RaiseKeyError = object() # singleton for no-default behavior
+
+class LowerDict(dict):  # dicts take a mapping or iterable as their optional first argument
+    __slots__ = () # no __dict__ - that would be redundant
+    def __init__(self):
+        # No args allowed, to keep it simple.
+        super(LowerDict, self).__init__(self)
+    def __getitem__(self, k):
+        return super(LowerDict, self).__getitem__(k.lower())
+    def __setitem__(self, k, v):
+        return super(LowerDict, self).__setitem__(k.lower(), v)
+    def __delitem__(self, k):
+        return super(LowerDict, self).__delitem__(k.lower())
+    def get(self, k, default=None):
+        return super(LowerDict, self).get(k.lower(), default)
+    def setdefault(self, k, default=None):
+        return super(LowerDict, self).setdefault(k.lower(), default)
+    def pop(self, k, v=_RaiseKeyError):
+        if v is _RaiseKeyError:
+            return super(LowerDict, self).pop(k.lower())
+        return super(LowerDict, self).pop(k.lower(), v)
+    #def update(self, mapping=(), **kwargs):
+    #    super(LowerDict, self).update(self._process_args(mapping, **kwargs))
+    def __contains__(self, k):
+        return super(LowerDict, self).__contains__(k.lower())
+    #def copy(self): # don't delegate w/ super - dict.copy() -> dict :(
+    #    return type(self)(self)
+    @classmethod
+    def fromkeys(cls, keys, v=None):
+        return super(LowerDict, cls).fromkeys((k.lower() for k in keys), v)
+    def __repr__(self):
+        return '{0}({1})'.format(type(self).__name__, super(LowerDict, self).__repr__())
+
+
+__loop_set = set()
+
+def toLowerDict(cfg):
+    """Change key-names to be lower-case, at all levels of dict cfg.
+    Then, return the case-insensitive LowerDict, substituted recursively.
+    """
+    if isinstance(cfg, LowerDict):
+        return cfg
+    if id(cfg) in __loop_set:
+        # Prevent infinite loop.
+        raise Exception('Already ran update_lowercase({}) (len(set)=={}):\n  {}'.format(
+            id(cfg), len(__loop_set), cfg))
+    __loop_set.add(id(cfg))
+
+    low = LowerDict()
+
+    for k,v in list(cfg.items()):
+        if isinstance(v, dict):
+            v = toLowerDict(v) # RECURSION
+        if k in low:
+            msg = 'Collision for "{}" in dict:\n{}'.format(k, cfg)
+            if v != low[k]:
+                raise Exception(msg)
+        low[k] = v
+    return low
+
+
+def parse_REPmask_code(code):
+    """
+    Return list of 3 (group_size, coverage_limit) pairs.
+
+    group_size==0 indicates "no-op".
+    Otherwise, super-high coverage_limit indicates "do work, but produce empty mask-track".
+
+    >>> parse_REPmask_code('1,10/2,20/3,300')
+    [(1, 10), (2, 20), (3, 300)]
+    """
+    ec = 0 # arbitrary
+    result = [(0, ec), (0, ec), (0, ec)] # all no-op by default
+    try:
+        if '/' in code:
+            pairs = code.split('/')
+        else:
+            assert ';' in code, 'code contains neither ";" nor "/": {!r}'.format(code)
+            pairs = code.split(';')
+        assert len(pairs) <= 3
+        for i, p in enumerate(pairs):
+            g, c = list(map(int, p.split(',')))
+            result[i] = (g, c)
+    except Exception as exc:
+        LOG.exception('Failed to parse REPmask_code {!r}. Using extreme, to produce empty rep tracks.'.format(code))
+    # Validate
+    paira = result[0]
+    pairb = result[1]
+    pairc = result[2]
+    if (paira[0] != 0 and pairb[0] != 0 and pairc[0] != 0):
+        # Check only if all groups are non-zero. Otherwise, the user must know what they're doing.
+        if (paira[0] == pairb[0] or
+            pairb[0] == pairc[0]):
+            raise Exception('Non-zero group sizes must not match in parsed REPmask_code: {!r} from {!r}'.format(result, code))
+        if (paira[0] > pairb[0] or pairb[0] > pairc[0]):
+            raise Exception('Non-zero group sizes must increase monotonically in parsed REPmask_code: {!r} from {!r}'.format(result, code))
+    return result
+
+
+def dazzler_get_nblocks(db_stream):
+    """Return #blocks in dazzler-db.
+    """
+    nblock = 1
+    new_db = True
+    for l in db_stream:
+        l = l.strip().split()
+        if l[0] == "blocks" and l[1] == "=":
+            nblock = int(l[2])
+            new_db = False
+            break
+    return nblock
+
+
+re_R = re.compile(r'^\+ R (\d+)')
+
+def dazzler_num_reads(dump):
+    """Given DBdump, return number of reads. (Proper DBdump call is assumed.)
+
+    >>> dazzler_num_reads('+ R 27')
+    27
+    >>> dazzler_num_reads('')
+    -1
+    """
+    mo = re_R.search(dump)
+    if mo:
+        return int(mo.group(1))
+    else:
+        return -1

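Most of functional.py is testable without the rest of the pipeline; a small sketch of calc_cutoff() and parse_REPmask_code(), using fabricated DBstats-style text for illustration (assuming falcon-kit and its dependencies are installed):

    from falcon_kit.functional import calc_cutoff, parse_REPmask_code

    dbstats_text = """
       10,000:         10      0.0      0.0      10000
        5,000:         20      0.0      0.0       5000
    """
    # 10 reads of 10 kb plus 20 reads of 5 kb: 120 kb of target coverage is
    # first reached once reads >= 5,000 bp are included.
    assert calc_cutoff(120000, dbstats_text) == 5000

    assert parse_REPmask_code('1,10/2,20/3,300') == [(1, 10), (2, 20), (3, 300)]
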
+ 238 - 0
FALCON/falcon_kit/gfa_graph.py

@@ -0,0 +1,238 @@
+
+import os
+import sys
+import json
+
+GFA_H_TAG = 'H'
+GFA_S_TAG = 'S'
+GFA_L_TAG = 'L'
+GFA_P_TAG = 'P'
+GFA_ORIENT_FWD = '+'
+GFA_ORIENT_REV = '-'
+GFA_SEQ_UNKNOWN = '*'
+GFA_LINK_CIGAR_UNKNOWN = '*'
+GFA2_E_TAG = 'E'
+
+KW_NAME = 'name'
+KW_TAGS = 'tags'
+KW_LABELS = 'labels'
+
+KW_NODE_SEQ = 'seq'
+KW_NODE_LEN = 'len'
+
+KW_EDGE_SOURCE = 'v'
+KW_EDGE_SOURCE_ORIENT = 'v_orient'
+KW_EDGE_SINK = 'w'
+KW_EDGE_SINK_ORIENT = 'w_orient'
+KW_EDGE_CIGAR = 'cigar'
+KW_EDGE_SOURCE_START = 'v_start'
+KW_EDGE_SOURCE_END = 'v_end'
+KW_EDGE_SINK_START = 'w_start'
+KW_EDGE_SINK_END = 'w_end'
+
+KW_PATH_NODES = 'nodes'
+KW_PATH_CIGARS = 'cigars'
+
+"""
+GFA-1:
+- H line: line = '\t'.join([GFA_H_TAG, '\tVN:Z:1.0'])
+- S line: line = '\t'.join([GFA_S_TAG, rname, GFA_SEQ_UNKNOWN if (not write_reads) else r.sequence, 'LN:i:%s' % len(r.sequence)])
+- L line: line = '\t'.join([GFA_L_TAG, edge.sg_edge.v_name, edge.sg_edge.v_orient, edge.sg_edge.w_name, edge.sg_edge.w_orient, cig_str])
+- P line: line = '\t'.join([GFA_P_TAG, ctg_name, ','.join(segs), ','.join(segs_cigar)]
+
+GFA-2:
+- H line: line = '\t'.join([GFA_H_TAG, '\tVN:Z:2.0'])
+- S line: line = '\t'.join([GFA_S_TAG, rname, str(len(r.sequence)), GFA_SEQ_UNKNOWN if (not write_reads) else r.sequence])
+- E line: line = '\t'.join([GFA2_E_TAG, edge_name, source_node, sink_node, source_start, source_end, sink_start, sink_end, cig_str])
+
+"""
+
+class GFAGraph:
+    def __init__(self):
+        self.nodes = {}
+        self.edges = {}
+        self.paths = {}
+
+        """
+        Node: {KW_NAME: '01234', KW_NODE_SEQ: 'ACTG', 'len': 4}
+        Node: {'name': '56789', KW_NODE_SEQ: 'CAGT', 'len': 4}
+        Edge: {KW_NAME: 'edge1', 'source': '01234', 'sink': '56789', 'cigar': '*', 'source_start': 3, 'source_end': 4, 'sink_start': 0, 'sink_end': 1}
+        Path: {KW_NAME: '000000F', 'nodes': ['01234', '56789'], '
+        """
+
+    def add_node(self, node_name, node_len, node_seq='*', tags=None, labels=None):
+        tags = {} if tags is None else tags
+        labels = {} if labels is None else labels
+        if len(node_name) == 0:
+            raise ValueError('Node name should be a non-empty string.')
+        if node_len < 0:
+            raise ValueError('Node length should be >= 0.')
+        if len(node_seq) == 0:
+            raise ValueError('Node sequence should be a non-empty string. Use "*" instead.')
+        if not isinstance(tags, dict):
+            raise ValueError('The tags object must be a dict.')
+        if not isinstance(labels, dict):
+            raise ValueError('The labels object must be a dict.')
+
+        self.nodes[node_name] = {
+                                    KW_NAME: node_name,
+                                    KW_NODE_LEN: node_len,
+                                    KW_NODE_SEQ: node_seq,
+                                    KW_TAGS: tags,
+                                    KW_LABELS: labels
+                                }
+
+    def add_edge(self, edge_name, source, source_orient, sink, sink_orient, source_start, source_end, sink_start, sink_end, cigar, tags=None, labels=None):
+        """
+        source_orient: '+' if fwd, '-' otherwise.
+        sink_orient: '+' if fwd, '-' otherwise.
+        """
+        tags = {} if tags is None else tags
+        labels = {} if labels is None else labels
+        if len(edge_name) == 0:
+            raise ValueError('Edge name should be a non-empty string.')
+        if len(source) == 0:
+            raise ValueError('Source node not specified.')
+        if len(sink) == 0:
+            raise ValueError('Sink node not specified.')
+        if source_orient not in '+-':
+            raise ValueError('Source orientation should be either "+" or "-".')
+        if sink_orient not in '+-':
+            raise ValueError('Sink orientation should be either "+" or "-".')
+        if source_start < 0 or source_end < 0:
+            raise ValueError('Source coordinates should be >= 0.')
+        if sink_start < 0 or sink_end < 0:
+            raise ValueError('Sink coordinates should be >= 0.')
+        if len(cigar) == 0:
+            raise ValueError('Cigar string should not be empty. Use "*" instead.')
+        if source_end < source_start:
+            sys.stderr.write('ERROR with: source = %s, source_start = %s, source_end = %s, sink = %s, sink_start = %s, sink_end = %s\n' % (source, source_start, source_end, sink, sink_start, sink_end))
+            raise ValueError('Source end coordinate should be >= source start coordinate.')
+        if sink_end < sink_start:
+            raise ValueError('Sink end coordinate should be >= sink start coordinate.')
+        if not isinstance(tags, dict):
+            raise ValueError('The tags object must be a dict.')
+        if not isinstance(labels, dict):
+            raise ValueError('The labels object must be a dict.')
+
+        self.edges[str((source, sink))] = {
+                                        KW_NAME: edge_name,
+                                        KW_EDGE_SOURCE: source,
+                                        KW_EDGE_SOURCE_ORIENT: source_orient,
+                                        KW_EDGE_SINK: sink,
+                                        KW_EDGE_SINK_ORIENT: sink_orient,
+                                        KW_EDGE_SOURCE_START: source_start,
+                                        KW_EDGE_SOURCE_END: source_end,
+                                        KW_EDGE_SINK_START: sink_start,
+                                        KW_EDGE_SINK_END: sink_end,
+                                        KW_EDGE_CIGAR: cigar,
+                                        KW_TAGS: tags,
+                                        KW_LABELS: labels
+                                    }
+
+    def add_path(self, path_name, path_nodes, path_cigars, tags=None, labels=None):
+        """
+        path_nodes is a list of nodes which should be joined
+        consecutively in a path.
+        path_cigars is a list of CIGAR strings describing how
+        each pair of neighboring nodes is joined.
+        len(path_nodes) == len(path_cigars)
+        """
+        tags = {} if tags is None else tags
+        labels = {} if labels is None else labels
+        if len(path_name) == 0:
+            raise ValueError('Path name should be a non-empty string.')
+        if len(path_nodes) == 0:
+            raise ValueError('Path nodes should be a non-empty list.')
+        if len(path_cigars) == 0:
+            raise ValueError('Path cigars should be a non-empty list.')
+        if not isinstance(tags, dict):
+            raise ValueError('The tags object must be a dict.')
+        if not isinstance(labels, dict):
+            raise ValueError('The labels object must be a dict.')
+        if len(path_nodes) != len(path_cigars):
+            raise ValueError('path_nodes and path_cigars must have the same length.')
+
+        self.paths[path_name] = {
+                                    KW_NAME: path_name,
+                                    KW_PATH_NODES: path_nodes,
+                                    KW_PATH_CIGARS: path_cigars,
+                                    KW_TAGS: tags,
+                                    KW_LABELS: labels
+                                }
+
+    def write_gfa_v1(self, fp_out):
+        # Header
+        line = '\t'.join([GFA_H_TAG, 'VN:Z:1.0'])
+        fp_out.write(line + '\n')
+
+        # Sequences.
+        for node_name, node_data in self.nodes.items():
+            line = '\t'.join([  GFA_S_TAG,
+                                node_data[KW_NAME],
+                                node_data[KW_NODE_SEQ],
+                                'LN:i:%d' % node_data[KW_NODE_LEN]])
+            fp_out.write(line + '\n')
+
+        for edge, edge_data in self.edges.items():
+            cigar = edge_data[KW_EDGE_CIGAR] if edge_data[KW_EDGE_CIGAR] != '*' else '%dM' % (abs(edge_data[KW_EDGE_SINK_END] - edge_data[KW_EDGE_SINK_START]))
+
+            line = '\t'.join([str(val) for val in
+                                [  GFA_L_TAG,
+                                    edge_data[KW_EDGE_SOURCE],
+                                    edge_data[KW_EDGE_SOURCE_ORIENT],
+                                    edge_data[KW_EDGE_SINK],
+                                    edge_data[KW_EDGE_SINK_ORIENT],
+                                    cigar
+                                ]
+                            ])
+            fp_out.write(line + '\n')
+
+        for path_name, path_data in self.paths.items():
+            line = '\t'.join([GFA_P_TAG, path_data[KW_NAME], ','.join(path_data[KW_PATH_NODES]), ','.join(path_data[KW_PATH_CIGARS])])
+            fp_out.write(line + '\n')
+
+    def write_gfa_v2(self, fp_out):
+        # Header
+        line = '\t'.join([GFA_H_TAG, 'VN:Z:2.0'])
+        fp_out.write(line + '\n')
+
+        # Sequences.
+        for node_name, node_data in self.nodes.items():
+            line = '\t'.join([  GFA_S_TAG,
+                                node_data[KW_NAME],
+                                str(node_data[KW_NODE_LEN]),
+                                node_data[KW_NODE_SEQ]])
+            fp_out.write(line + '\n')
+
+        for edge, edge_data in self.edges.items():
+            v = edge_data[KW_EDGE_SOURCE]
+            w = edge_data[KW_EDGE_SINK]
+            v_len = self.nodes[v][KW_NODE_LEN]
+            w_len = self.nodes[w][KW_NODE_LEN]
+
+            # GFA-2 specifies a special char '$' when a coordinate is the same as the sequence length.
+            v_start = str(edge_data[KW_EDGE_SOURCE_START]) + ('$' if edge_data[KW_EDGE_SOURCE_START] == v_len else '')
+            v_end = str(edge_data[KW_EDGE_SOURCE_END]) + ('$' if edge_data[KW_EDGE_SOURCE_END] == v_len else '')
+            w_start = str(edge_data[KW_EDGE_SINK_START]) + ('$' if edge_data[KW_EDGE_SINK_START] == w_len else '')
+            w_end = str(edge_data[KW_EDGE_SINK_END]) + ('$' if edge_data[KW_EDGE_SINK_END] == w_len else '')
+
+            line = '\t'.join([str(val) for val in
+                                [  GFA2_E_TAG, edge_data[KW_NAME],
+                                    edge_data[KW_EDGE_SOURCE] + edge_data[KW_EDGE_SOURCE_ORIENT],
+                                    edge_data[KW_EDGE_SINK] + edge_data[KW_EDGE_SINK_ORIENT],
+                                    v_start, v_end,
+                                    w_start, w_end,
+                                    edge_data[KW_EDGE_CIGAR],
+                                ]
+                            ])
+            fp_out.write(line + '\n')
+
+def serialize_gfa(gfa_graph):
+    gfa_dict = {}
+    gfa_dict['nodes'] = gfa_graph.nodes
+    gfa_dict['edges'] = gfa_graph.edges
+    gfa_dict['paths'] = gfa_graph.paths
+    return json.dumps(gfa_dict, separators=(', ', ': '), sort_keys=True)
+
+def deserialize_gfa(fp_in):
+    gfa_dict = json.load(fp_in)
+    gfa = GFAGraph()
+    gfa.nodes = gfa_dict['nodes']
+    gfa.edges = gfa_dict['edges']
+    gfa.paths = gfa_dict['paths']
+    return gfa
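
A minimal usage sketch for GFAGraph, with made-up node/edge/path names, showing the GFA-1 writer and the JSON serializer that the collect_*_gfa scripts dump to disk:

    import sys
    from falcon_kit.gfa_graph import GFAGraph, serialize_gfa

    g = GFAGraph()
    g.add_node('000000001', 4, 'ACTG')
    g.add_node('000000002', 4, 'CAGT')
    # Coordinates are 0-based on the fwd strand; '*' means no CIGAR recorded.
    g.add_edge('edge-0', '000000001', '+', '000000002', '+', 3, 4, 0, 1, '*')
    g.add_path('000000F', ['000000001', '000000002'], ['4M', '1M'])

    g.write_gfa_v1(sys.stdout)    # emits H, S, L and P lines
    json_text = serialize_gfa(g)  # the JSON form written by collect_*_gfa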

+ 199 - 0
FALCON/falcon_kit/io.py

@@ -0,0 +1,199 @@
+
+
+
+from pypeflow.io import (
+        syscall, capture, cd,
+        mkdirs, symlink, rm, touch, filesize, exists_and_not_empty) # needed?
+import contextlib
+import io
+import logging
+import os
+import pprint
+import sys
+
+if sys.version_info >= (3, 0):
+    NativeIO = io.StringIO
+else:
+    NativeIO = io.BytesIO
+
+LOG = logging.getLogger()
+
+
+def log(*msgs):
+    LOG.debug(' '.join(repr(m) for m in msgs))
+
+
+def eng(number):
+    return '{:.1f}MB'.format(number / 2**20)
+
+
+class Percenter(object):
+    """Report progress by golden exponential.
+
+    Usage:
+        counter = Percenter('mystruct', total_len(mystruct))
+
+        for rec in mystruct:
+            counter(len(rec))
+    """
+    def __init__(self, name, total, log=LOG.info, units='units'):
+        if sys.maxsize == total:
+            log('Counting {} from "{}"'.format(units, name))
+        else:
+            log('Counting {:,d} {} from\n  "{}"'.format(total, units, name))
+        self.total = total
+        self.log = log
+        self.name = name
+        self.units = units
+        self.call = 0
+        self.count = 0
+        self.next_count = 0
+        self.a = 1 # double each time
+    def __call__(self, more, label=''):
+        self.call += 1
+        self.count += more
+        if self.next_count <= self.count:
+            self.a = 2 * self.a
+            self.a = max(self.a, more)
+            self.a = min(self.a, (self.total-self.count), round(self.total/10.0))
+            self.next_count = self.count + self.a
+            if self.total == sys.maxsize:
+                msg = '{:>10} count={:15,d} {}'.format(
+                    '#{:,d}'.format(self.call), self.count, label)
+            else:
+                msg = '{:>10} count={:15,d} {:6.02f}% {}'.format(
+                    '#{:,d}'.format(self.call), self.count, 100.0*self.count/self.total, label)
+            self.log(msg)
+    def finish(self):
+        self.log('Counted {:,d} {} in {} calls from:\n  "{}"'.format(
+            self.count, self.units, self.call, self.name))
+
+
+def FilePercenter(fn, log=LOG.info):
+    if '-' == fn or not fn:
+        size = sys.maxsize
+    else:
+        size = filesize(fn)
+        if fn.endswith('.dexta'):
+            size = size * 4
+        elif fn.endswith('.gz'):
+            size = sys.maxsize # probably 2.8x to 3.2x, but we are not sure, and higher is better than lower
+            # https://stackoverflow.com/a/22348071
+            # https://jira.pacificbiosciences.com/browse/TAG-2836
+    return Percenter(fn, size, log, units='bytes')
+
+@contextlib.contextmanager
+def open_progress(fn, mode='r', log=LOG.info):
+    """
+    Usage:
+        with open_progress('foo', log=LOG.info) as stream:
+            for line in stream:
+                use(line)
+
+    That will log progress lines.
+    """
+    def get_iter(stream, progress):
+        for line in stream:
+            progress(len(line))
+            yield line
+
+    fp = FilePercenter(fn, log=log)
+    with open(fn, mode=mode) as stream:
+        yield get_iter(stream, fp)
+    fp.finish()
+
+
+def read_as_msgpack(bytestream):
+    import msgpack
+    content = bytestream.read()
+    log('  Read {} as msgpack'.format(eng(len(content))))
+    return msgpack.unpackb(content, raw=False,
+            max_map_len=2**25,
+            max_array_len=2**25,
+    )
+
+
+def read_as_json(bytestream):
+    import json
+    content = bytestream.read().decode('ascii')
+    log('  Read {} as json'.format(eng(len(content))))
+    return json.loads(content)
+
+
+def write_as_msgpack(bytestream, val):
+    import msgpack
+    content = msgpack.packb(val)
+    log('  Serialized to {} as msgpack'.format(eng(len(content))))
+    bytestream.write(content)
+
+
+def write_as_json(bytestream, val):
+    import json
+    content = json.dumps(val, indent=2, separators=(',', ': ')).encode('ascii')
+    log('  Serialized to {} as json'.format(eng(len(content))))
+    bytestream.write(content)
+
+
+def deserialize(fn):
+    log('Deserializing from {!r}'.format(fn))
+    with open(fn, 'rb') as ifs:
+        log('  Opened for read: {!r}'.format(fn))
+        if fn.endswith('.msgpack'):
+            val = read_as_msgpack(ifs)
+        elif fn.endswith('.json'):
+            val = read_as_json(ifs)
+        else:
+            raise Exception('Unknown extension for {!r}'.format(fn))
+    log('  Deserialized {} records'.format(len(val)))
+    return val
+
+
+def serialize(fn, val):
+    """Write 'val' to 'fn', creating the parent directory if needed.
+    The extension (.json or .msgpack) selects the format.
+    """
+    log('Serializing {} records'.format(len(val)))
+    mkdirs(os.path.dirname(fn))
+    with open(fn, 'wb') as ofs:
+        log('  Opened for write: {!r}'.format(fn))
+        if fn.endswith('.msgpack'):
+            write_as_msgpack(ofs, val)
+        elif fn.endswith('.json'):
+            write_as_json(ofs, val)
+            ofs.write(b'\n') # for vim
+        else:
+            raise Exception('Unknown extension for {!r}'.format(fn))
+
+
+def yield_abspath_from_fofn(fofn_fn):
+    """Yield each filename.
+    Relative paths are resolved from the FOFN directory.
+    'fofn_fn' can be .fofn, .json, .msgpack
+    """
+    try:
+        fns = deserialize(fofn_fn)
+    except Exception:
+        # Not .json/.msgpack (or not parseable); fall back to a plain FOFN, one path per whitespace-separated token.
+        fns = open(fofn_fn).read().strip().split()
+    try:
+        basedir = os.path.dirname(fofn_fn)
+        for fn in fns:
+            if not os.path.isabs(fn):
+                fn = os.path.abspath(os.path.join(basedir, fn))
+            yield fn
+    except Exception:
+        LOG.error('Problem resolving paths in FOFN {!r}'.format(fofn_fn))
+        raise
+
+
+def rmdirs(*dirnames):
+    for d in dirnames:
+        assert os.path.normpath(d.strip()) not in ['.', '', '/']
+    syscall('rm -rf {}'.format(' '.join(dirnames)))
+
+def rmdir(d):
+    rmdirs(d)
+
+def rm_force(*fns):
+    for fn in fns:
+        if os.path.exists(fn):
+            os.unlink(fn)
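
A small sketch of the serialization helpers above (this module imports from pypeflow.io, so pypeflow must be installed; file names are made up). The file extension selects the on-disk format, and open_progress logs throughput while iterating:

    from falcon_kit import io

    records = ['read/1', 'read/2']
    io.serialize('tmp/records.json', records)
    assert io.deserialize('tmp/records.json') == records

    with io.open_progress('tmp/records.json') as stream:
        for line in stream:   # Percenter logs progress as a side effect
            pass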

+ 57 - 0
FALCON/falcon_kit/mains/LAmerge.py

@@ -0,0 +1,57 @@
+#!/usr/bin/env python
+"""Usage:
+
+    LAmerge.py DB <args>
+
+Run LAcheck on each input in args. Exclude any failures from
+the arglist. Then run LAmerge on the remaining arglist.
+
+This differs from LAsort.py in that the first las file is actually
+an *explicit* output, whereas LAsort relies on *implicit* outputs.
+"""
+
+
+import sys
+import os
+
+
+def log(msg):
+    sys.stderr.write(msg + '\n')
+
+
+def system(call, checked=False):
+    log('!{}'.format(call))
+    rc = os.system(call)
+    if rc:
+        msg = '{} <- {!r}'.format(rc, call)
+        if checked:
+            raise Exception(msg)
+        log(msg)
+    return rc
+
+
+def main(argv=sys.argv):
+    db = argv[1]
+    args = argv[2:]  # Skip program name
+    lass = list()
+    new_args = list()
+    new_args.append('LAmerge')
+    for arg in args:
+        if arg.startswith('-'):
+            new_args.append(arg)
+        else:
+            lass.append(arg)
+    outlas = lass[0]
+    new_args.append(outlas)  # This is the output las.
+    for las in lass[1:]:
+        rc = system('LAcheck -vS {} {}.las'.format(db, las))  # Assume sorted.
+        if rc:
+            log('Skipping {}.las'.format(las))
+        else:
+            new_args.append(las)
+    system(' '.join(new_args))
+    system('LAcheck -vS {} {}.las'.format(db, outlas))  # Assume sorted.
+
+
+if __name__ == "__main__":
+    main()
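
A hypothetical invocation, assuming the DALIGNER binaries (LAcheck, LAmerge) are on PATH; the database and .las names below are made up. The first non-flag argument after the DB becomes the explicit output:

    from falcon_kit.mains.LAmerge import main

    # Merges L1.1.1.las, L1.1.2.las and L1.1.3.las into raw_reads.1.las,
    # skipping any input that fails 'LAcheck -vS'.
    main(['LAmerge.py', 'raw_reads', 'raw_reads.1', 'L1.1.1', 'L1.1.2', 'L1.1.3'])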

+ 52 - 0
FALCON/falcon_kit/mains/LAsort.py

@@ -0,0 +1,52 @@
+#!/usr/bin/env python
+"""Usage:
+
+    LAsort.py DB <args>
+
+Run LAcheck on each input in args. Exclude any failures from
+the arglist. Then run LAsort on the remaining arglist.
+"""
+
+
+import sys
+import os
+
+
+def log(msg):
+    sys.stderr.write(msg + '\n')
+
+
+def system(call, checked=False):
+    log('!{}'.format(call))
+    rc = os.system(call)
+    if rc:
+        msg = '{} <- {!r}'.format(rc, call)
+        if checked:
+            raise Exception(msg)
+        log(msg)
+    return rc
+
+
+def main(argv=sys.argv):
+    log('argv:{!r}'.format(argv))
+    db = argv[1]
+    args = argv[2:]  # Skip program name
+    lass = list()
+    new_args = list()
+    new_args.append('LAsort')
+    for arg in args:
+        if arg.startswith('-'):
+            new_args.append(arg)
+        else:
+            lass.append(arg)
+    for las in lass:
+        rc = system('LAcheck -v {} {}.las'.format(db, las))
+        if rc:
+            log('Skipping {}.las'.format(las))
+        else:
+            new_args.append(las)
+    system(' '.join(new_args))
+
+
+if __name__ == "__main__":
+    main()

+ 0 - 0
FALCON/falcon_kit/mains/__init__.py


+ 34 - 0
FALCON/falcon_kit/mains/actg_coordinate.py

@@ -0,0 +1,34 @@
+
+
+
+from falcon_kit.FastaReader import open_fasta_reader
+import sys
+
+
+def main(argv=sys.argv):
+    p_ctg_coor_map = {}
+    with open("p_ctg_tiling_path") as f:
+        for row in f:
+            row = row.strip().split()
+            ctg_id, v, w, edge_rid, b, e = row[:6]
+            if ctg_id not in p_ctg_coor_map:
+                coor = 0   # the p_ctg_tiling_path is assumed to be grouped by contig, with edges in tiling-path order
+                p_ctg_coor_map[ctg_id] = {}
+                p_ctg_coor_map[ctg_id][v] = 0
+                coor += abs(int(b) - int(e))
+                p_ctg_coor_map[ctg_id][w] = coor
+                continue
+            else:
+                coor += abs(int(b) - int(e))
+                p_ctg_coor_map[ctg_id][w] = coor
+
+    with open_fasta_reader("a_ctg.fa") as a_ctg_fasta:
+        for r in a_ctg_fasta:
+            rid = r.name.split()
+            rid, v, w = rid[:3]
+            pid = rid.split("-")[0]
+            print(rid, p_ctg_coor_map[pid][v], p_ctg_coor_map[pid][w])
+
+
+if __name__ == "__main__":
+    main(sys.argv)

+ 219 - 0
FALCON/falcon_kit/mains/bam2dexta.py

@@ -0,0 +1,219 @@
+import argparse
+import collections
+import glob
+import logging
+import os
+import re
+import sys
+import time
+from .. import io, functional
+from .. import (
+        bash,  # for write_sub_script
+        pype_tasks,  # for TASKS
+)
+
+LOG = logging.getLogger()
+WAIT = 20 # seconds
+
+
+def bam2dexta_split(bam_subreadset_fn, wildcards, split_fn, bash_template_fn):
+    assert bam_subreadset_fn.endswith('.xml')
+    with open(bash_template_fn, 'w') as stream:
+        stream.write(pype_tasks.TASK_BAM2DEXTA_APPLY_SCRIPT)
+
+    split_dataset_prefix = os.path.join(os.getcwd(), 'split') # TODO: Test this as relative sub-dir.
+
+    #from ..util import dataset_split # introduces pbcore dependency
+    #bam_paths = dataset_split.split_dataset(bam_subreadset_fn, split_dataset_prefix)
+    bam_paths = [bam_subreadset_fn] # Lose parallelism, but avoid pbcore.
+
+    jobs = list()
+    for i, bam_fn in enumerate(bam_paths):
+        job_id = 'b_{:03d}'.format(i)
+
+        # Write the las files for this job.
+        #input_dir = os.path.join('bam2dexta-scripts', job_id)
+        #bam_paths_fn = os.path.join('.', input_dir, 'bam-paths.json')
+        #io.mkdirs(input_dir)
+        #io.serialize(bam_paths_fn, bam_paths)
+
+        # Record in a job-dict.
+        dexta_fn = 'subreads.{}.dexta'.format(job_id)
+        job = dict()
+        job['input'] = dict(
+                bam=bam_fn,
+        )
+        job['output'] = dict(
+                dexta=dexta_fn
+        )
+        job['params'] = dict(
+        )
+        job['wildcards'] = {wildcards: job_id}
+        jobs.append(job)
+    io.serialize(split_fn, jobs)
+
+
+def bam2dexta_apply(bam_fn, dexta_fn):
+    """Given a bam subread DataSet, write a .dexta file.
+    """
+    io.rm_force(dexta_fn)
+    tmpdir = '.' # There is no significant improvement to running on local disk.
+    cmd = 'rm -f {dexta_fn}; ls -larth {tmpdir}; (bam2fasta -u -o - {bam_fn} | dexta -i >| {tmpdir}/foo.dexta); mv -f {tmpdir}/foo.dexta {dexta_fn}'.format(
+        **locals())
+    # Note: If 'dexta' fails, the script will error. So we might still have an empty foo.dexta, but
+    # we will not have moved it to {dexta_fn}.
+    io.syscall(cmd)
+
+def bam2dexta_combine(gathered_fn, dexta_fofn_fn):
+    gathered = io.deserialize(gathered_fn)
+    d = os.path.abspath(os.path.realpath(os.path.dirname(gathered_fn)))
+    def abspath(fn):
+        if os.path.isabs(fn):
+            return fn # I expect this never to happen though.
+        return os.path.join(d, fn)
+    dexta_fns = list()
+    for job_output in gathered:
+        assert len(job_output) == 1, 'len(job_output) == {} != 1'.format(len(job_output))
+        for fn in list(job_output.values()):
+            abs_fn = abspath(fn)
+            dexta_fns.append(abs_fn)
+    dexta_paths = list()
+    for dexta_fn in sorted(dexta_fns):
+        if not os.path.exists(dexta_fn):
+            msg = 'Did not find {!r}. Waiting {} seconds.'.format(dexta_fn, WAIT)
+            LOG.info(msg)
+            time.sleep(WAIT)
+            if not os.path.exists(dexta_fn):
+                msg = 'Did not find {!r}, even after waiting {} seconds. Maybe retry later?'.format(dexta_fn, WAIT)
+                raise Exception(msg)
+        dexta_paths.append(dexta_fn)
+
+    # Serialize result.
+    #io.serialize(dexta_paths_fn, sorted(dexta_paths))
+    with open(dexta_fofn_fn, 'w') as stream:
+        stream.write('\n'.join(dexta_paths))
+        stream.write('\n')
+
+
+def setup_logging(log_level):
+    hdlr = logging.StreamHandler(sys.stderr)
+    hdlr.setLevel(log_level)
+    hdlr.setFormatter(logging.Formatter('[%(levelname)s]%(message)s'))
+    LOG.addHandler(hdlr)
+    LOG.setLevel(logging.NOTSET)
+    LOG.info('Log-level: {}'.format(log_level))
+
+def cmd_split(args):
+    bam2dexta_split(
+            args.bam_subreadset_fn,
+            args.wildcards,
+            args.split_fn, args.bash_template_fn,
+    )
+def cmd_apply(args):
+    bam2dexta_apply(args.bam_fn, args.dexta_fn)
+def cmd_combine(args):
+    bam2dexta_combine(args.gathered_fn, args.dexta_fofn_fn)
+
+#def get_ours(config_fn):
+#    ours = dict()
+#    config = io.deserialize(config_fn)
+#    LOG.info('config({!r}):\n{}'.format(config_fn, config))
+#    LOG.info('our subset of config:\n{}'.format(ours))
+#    return ours
+
+def add_split_arguments(parser):
+    parser.add_argument(
+        '--wildcards', default='bam2dexta0_id',
+        help='Comma-separated string of keys to be substituted into output paths for each job, if any. (Helps with snakemake and pypeflow; not needed in pbsmrtpipe, since outputs are pre-determined.)',
+    )
+    parser.add_argument(
+        '--bam-subreadset-fn',
+        help='input. Dataset (.xml) of bam files of subreads.'
+    )
+    parser.add_argument(
+        '--split-fn', default='bam2dexta-uows.json',
+        help='output. Units-of-work for bam2fasta/dexta.',
+    )
+    parser.add_argument(
+        '--bash-template-fn', default='bash-template.sh',
+        help='output. Script to apply later.',
+    )
+def add_apply_arguments(parser):
+    parser.add_argument(
+        '--bam-fn', required=True,
+        help='input. bam or dataset')
+    parser.add_argument(
+        '--dexta-fn', required=True,
+        help='output. The dazzler (Gene Myers) dexta-file.',
+    )
+def add_combine_arguments(parser):
+    parser.add_argument(
+        '--gathered-fn', required=True,
+        help='input. List of sentinels. Produced by gen_parallel_tasks() gathering. The .las files are next to these.',
+    )
+    parser.add_argument(
+        '--dexta-fofn-fn', required=True,
+        help='output. FOFN of dexta paths.')
+
+class HelpF(argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter):
+    pass
+
+def parse_args(argv):
+    description = 'Efficiently generate .dexta from BAM or subread datasets.'
+    epilog = 'For more details on .dexta, see https://dazzlerblog.wordpress.com/command-guides/dextractor-command-guide/'
+    parser = argparse.ArgumentParser(
+        description=description,
+        epilog=epilog,
+        formatter_class=HelpF,
+    )
+    parser.add_argument(
+        '--log-level', default='INFO',
+        help='Python logging level.',
+    )
+    parser.add_argument(
+        '--nproc', type=int, default=0,
+        help='ignored for now, but non-zero will mean "No more than this."',
+    )
+
+    help_split = 'get each bam-file (or subread dataset file)'
+    help_apply = 'run bam2fasta and dexta as a unit-of-work'
+    help_combine = 'generate a file of .dexta files'
+
+    subparsers = parser.add_subparsers(help='sub-command help')
+
+    parser_split = subparsers.add_parser('split',
+            formatter_class=HelpF,
+            description=help_split,
+            epilog='',
+            help=help_split)
+    add_split_arguments(parser_split)
+    parser_split.set_defaults(func=cmd_split)
+
+    parser_apply = subparsers.add_parser('apply',
+            formatter_class=HelpF,
+            description=help_apply,
+            epilog='',
+            help=help_apply)
+    add_apply_arguments(parser_apply)
+    parser_apply.set_defaults(func=cmd_apply)
+
+    parser_combine = subparsers.add_parser('combine',
+            formatter_class=HelpF,
+            description=help_combine,
+            epilog='',
+            help=help_combine)
+    add_combine_arguments(parser_combine)
+    parser_combine.set_defaults(func=cmd_combine)
+
+    args = parser.parse_args(argv[1:])
+    return args
+
+
+def main(argv=sys.argv):
+    args = parse_args(argv)
+    setup_logging(args.log_level)
+    args.func(args)
+
+
+if __name__ == '__main__':  # pragma: no cover
+    main()
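
The three sub-commands are wired through argparse, so a single unit of work can be run directly. A sketch, assuming the bam2fasta and dexta executables are on PATH and that the file names are made up:

    from falcon_kit.mains.bam2dexta import main

    # One unit of work: bam2fasta | dexta into subreads.b_000.dexta.
    main(['bam2dexta', 'apply',
          '--bam-fn', 'subreads.subreadset.xml',
          '--dexta-fn', 'subreads.b_000.dexta'])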

+ 79 - 0
FALCON/falcon_kit/mains/calc_cutoff.py

@@ -0,0 +1,79 @@
+
+
+
+from .. import functional as f
+import argparse
+import uuid
+import json
+import os
+import sys
+import traceback
+
+
+def main(argv=sys.argv):
+    import argparse
+
+    description = """
+Given the result of 'DBstats -u -b1' on stdin,
+print the lowest read-length required for sufficient coverage of the genome
+(i.e. 'length_cutoff').
+"""
+    epilog = """
+This is useful when length_cutoff is not provided but the genome-size
+can be estimated. The purpose is to *reduce* the amount of data seen by
+DALIGNER, since otherwise it will miss many alignments when it
+encounters resource limits.
+
+Note: If PBFALCON_ERRFILE is defined (and its directory is writable),
+we will write errors there in addition to stderr.
+"""
+    parser = argparse.ArgumentParser(description=description, epilog=epilog,
+                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument('--coverage', type=float, default=20,
+                        help='Desired coverage ratio (i.e. over-sampling)')
+    parser.add_argument('genome_size', type=int,
+                        help='Estimated number of bases in genome. (haploid?)')
+    parser.add_argument('capture',  # default='-', # I guess default is not allowed for required args.
+                        help='File with captured output of DBstats. (Otherwise, stdin.)')
+    args = parser.parse_args(argv[1:])
+
+    target = int(args.genome_size * args.coverage)
+    def capture():
+        # This generator ensures that our file is closed at end-of-program.
+        if args.capture != '-':
+            with open(args.capture) as sin:
+                yield sin
+        else:
+            yield sys.stdin
+    for sin in capture():
+        stats = sin.read()
+    try:
+        cutoff = f.calc_cutoff(target, stats)
+    except Exception as e:
+        tb = traceback.format_exc()
+        msg = 'User-provided genome_size: {}\nDesired coverage: {}\n'.format(
+            args.genome_size, args.coverage)
+        # pbfalcon wants us to write errs here.
+        errfile = os.environ.get('PBFALCON_ERRFILE')
+        if errfile:
+            with open(errfile, 'w') as ofs:
+                ofs.write(tb + msg)
+        # this is propagated to SMRT Link UI
+        # see PacBioAlarm class in pbcommand.models.common for details
+        with open("alarms.json", "w") as alarms_out:
+            alarms_out.write(json.dumps([
+                {
+                    "exception": e.__class__.__name__,
+                    "info": tb,
+                    "message": str(e) + "\n" + msg,
+                    "name": e.__class__.__name__,
+                    "severity": "ERROR",
+                    "owner": "python3",
+                    "id": str(uuid.uuid4())
+                }]))
+        raise Exception(tb + msg)
+    sys.stdout.write(str(cutoff))
+
+
+if __name__ == "__main__":
+    main(sys.argv)
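
A sketch of how this might be driven, assuming 'dbstats.txt' (a made-up name) holds the captured output of 'DBstats -u -b1'; the computed cutoff is written to stdout:

    from falcon_kit.mains.calc_cutoff import main

    # target = int(4600000 * 20.0) bases; internally this calls
    # functional.calc_cutoff(target, <contents of dbstats.txt>).
    main(['calc_cutoff', '--coverage', '20', '4600000', 'dbstats.txt'])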

+ 103 - 0
FALCON/falcon_kit/mains/collect_contig_gfa.py

@@ -0,0 +1,103 @@
+import argparse
+import os
+import sys
+import json
+
+from falcon_kit.gfa_graph import GFAGraph, serialize_gfa, deserialize_gfa
+import falcon_kit.mains.collect_pread_gfa
+import falcon_kit.tiling_path
+
+def run(fp_out, p_ctg_tiling_path, a_ctg_tiling_path,
+        p_ctg_fasta, a_ctg_fasta,
+        write_contigs,
+        min_p_len, min_a_len, only_these_contigs):
+
+    gfa_graph = GFAGraph()
+
+    # Load the primary and associate contig files.
+    p_ctg_dict = falcon_kit.mains.collect_pread_gfa.load_seqs(p_ctg_fasta, (not write_contigs))
+    p_ctg_lens = {key: val[0] for key, val in p_ctg_dict.items()}
+    p_ctg_seqs = {key: val[1] for key, val in p_ctg_dict.items()}
+
+    a_ctg_dict = falcon_kit.mains.collect_pread_gfa.load_seqs(a_ctg_fasta, (not write_contigs))
+    a_ctg_lens = {key: val[0] for key, val in a_ctg_dict.items()}
+    a_ctg_seqs = {key: val[1] for key, val in a_ctg_dict.items()}
+
+    # Create whitelists for filtering contigs.
+    p_ctg_whitelist = set(p_ctg_seqs.keys())
+    a_ctg_whitelist = set([key for key in list(a_ctg_seqs.keys())])
+    if only_these_contigs:
+        p_ctg_whitelist = set(open(only_these_contigs).read().splitlines()) & set(p_ctg_whitelist)
+        a_ctg_whitelist = set([key for key in list(a_ctg_seqs.keys()) if key.split('-')[0].split('_')[0] in p_ctg_whitelist])
+
+    # Load the tiling paths and assign coordinates.
+    p_paths = falcon_kit.tiling_path.load_tiling_paths(p_ctg_tiling_path, whitelist_seqs=p_ctg_whitelist, contig_lens=p_ctg_lens)
+    a_paths = falcon_kit.tiling_path.load_tiling_paths(a_ctg_tiling_path, whitelist_seqs=a_ctg_whitelist, contig_lens=a_ctg_lens)
+
+    # Find the associate contig placement. `a_placement` is a dict:
+    #   placement[p_ctg_id][a_ctg_id] = (start, end, p_ctg_id, a_ctg_id, first_node, last_node)
+    a_placement = falcon_kit.tiling_path.find_a_ctg_placement(p_paths, a_paths)
+
+    # Add the nodes.
+    for ctg_id, tiling_path in p_paths.items():
+        gfa_graph.add_node(ctg_id, p_ctg_lens[ctg_id], p_ctg_seqs[ctg_id])
+    for ctg_id, tiling_path in a_paths.items():
+        gfa_graph.add_node(ctg_id, a_ctg_lens[ctg_id], a_ctg_seqs[ctg_id])
+
+    # Add edges between primary and associate contigs.
+    for p_ctg_id, a_dict in a_placement.items():
+        for a_ctg_id, placement in a_dict.items():
+            start, end, p_ctg_id, a_ctg_id, first_node, last_node = placement
+
+            a_ctg_len = a_ctg_lens[a_ctg_id]
+
+            # edge_name = 'edge-%d-out-%s-to-%s' % (len(gfa_graph.edges), a_ctg_id, p_ctg_id)
+            edge_name = 'edge-%d' % (len(gfa_graph.edges))
+            gfa_graph.add_edge(edge_name, p_ctg_id, '+', a_ctg_id, '+', start, start, 0, 0, '*', tags = {}, labels = {})
+
+            # edge_name = 'edge-%d-in-%s-to-%s' % (len(gfa_graph.edges), a_ctg_id, p_ctg_id)
+            edge_name = 'edge-%d' % (len(gfa_graph.edges))
+            gfa_graph.add_edge(edge_name, a_ctg_id, '+', p_ctg_id, '+', a_ctg_len, a_ctg_len, end, end, '*', tags = {}, labels = {})
+
+    # Add circular edges to the primary contigs, if they exist.
+    for ctg_id, tiling_path in p_paths.items():
+        if len(tiling_path.edges) == 0:
+            continue
+        if tiling_path.edges[0].v != tiling_path.edges[-1].w:
+            continue
+        p_len = p_ctg_lens[ctg_id]
+        edge_name = 'edge-%d' % (len(gfa_graph.edges))
+        gfa_graph.add_edge(edge_name, ctg_id, '+', ctg_id, '+', p_len, p_len, 0, 0, '*', tags = {}, labels = {})
+
+    fp_out.write(serialize_gfa(gfa_graph))
+    fp_out.write('\n')
+
+def parse_args(argv):
+    parser = argparse.ArgumentParser(description="Generates GFA output (on stdout) from FALCON's assembly.",
+                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument('--p-ctg-tiling-path', type=str, default='p_ctg_tiling_path',
+                        help='location of the p_ctg tiling path file')
+    parser.add_argument('--a-ctg-tiling-path', type=str, default='a_ctg_tiling_path',
+                        help='location of the a_ctg tiling path file')
+    parser.add_argument('--p-ctg-fasta', type=str, default='p_ctg.fa',
+                        help='path to the primary contigs file')
+    parser.add_argument('--a-ctg-fasta', type=str, default='a_ctg.fa',
+                        help='path to the associate contigs file')
+    parser.add_argument('--write-contigs', '-c', action='store_true',
+                        help="output contig sequences as S lines")
+    parser.add_argument('--min-p-len', type=int, default=0,
+                        help='primary contig paths with length smaller than this will not be reported')
+    parser.add_argument('--min-a-len', type=int, default=0,
+                        help='associate contig paths with length smaller than this will not be reported')
+    parser.add_argument('--only-these-contigs', type=str, default='',
+                        help='limit output to specified contigs listed in file (one per line)')
+    args = parser.parse_args(argv[1:])
+    return args
+
+def main(argv=sys.argv):
+    args = parse_args(argv)
+
+    run(sys.stdout, **vars(args))
+
+if __name__ == '__main__':  # pragma: no cover
+    main()
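
Like collect_pread_gfa below, this script prints a serialized GFAGraph (JSON) on stdout. A hedged sketch of running it through the module entry point, assuming the default input files (p_ctg_tiling_path, a_ctg_tiling_path, p_ctg.fa, a_ctg.fa) exist in the current directory:

    from falcon_kit.mains.collect_contig_gfa import main

    # All arguments left at their defaults; JSON is written to stdout.
    main(['collect_contig_gfa'])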

+ 243 - 0
FALCON/falcon_kit/mains/collect_pread_gfa.py

@@ -0,0 +1,243 @@
+import argparse
+import os
+import sys
+import json
+
+from falcon_kit.fc_asm_graph import AsmGraph
+from falcon_kit.FastaReader import FastaReader
+from falcon_kit.gfa_graph import *
+import falcon_kit.tiling_path
+
+def load_seqs(fasta_fn, store_only_seq_len):
+    """
+    If store_only_seq_len is True, the sequence is discarded and
+    only its length is stored.
+    """
+    seqs = {}
+    f = FastaReader(fasta_fn)
+    if not store_only_seq_len:
+        for r in f:
+            seqs[r.name.split()[0]] = (len(r.sequence), r.sequence.upper())
+    else:
+        for r in f:
+            seqs[r.name.split()[0]] = (len(r.sequence), '*')
+    return seqs
+
+def load_pread_overlaps(fp_in):
+    preads_overlap_dict = {}
+    for line in fp_in:
+        sl = line.strip().split()
+        if len(sl) < 13:
+            continue
+        # Example line: 000000009 000000082 -3004 99.90 0 4038 7043 7043 1 6488 9492 9492 overlap 000000F.5000003.0 000000F.5000003.0
+        preads_overlap_dict[(sl[0], sl[1])] = sl[0:4] + [int(val) for val in sl[4:12]] + sl[12:]
+        
+        # Overlaps are not always symmetrically represented in the preads.ovl for some reason, so add the
+        # reverse overlap here as well, but do not overwrite existing (just to be safe).
+        if (sl[1], sl[0]) not in preads_overlap_dict:
+            preads_overlap_dict[(sl[1], sl[0])] = [sl[1], sl[0], sl[2], sl[3]] + [int(val) for val in sl[8:12]] + [int(val) for val in sl[4:8]] + sl[12:]
+
+    return preads_overlap_dict
+
+def load_sg_edges(fp_in):
+    """
+    Loads all sg_edges_list so that haplotig paths can be reversed if needed.
+    with open(os.path.join(fc_asm_path, "sg_edges_list"), 'r') as fp:
+        sg_edges_dict = load_sg_edges(fp)
+    """
+    sg_edges_dict = {}
+    for line in fp_in:
+        sl = line.strip().split()
+        if len(sl) < 8:
+            continue
+        # Example line: 000000512:B 000000679:E 000000679 4290 7984 4290 99.95 TR
+        sg_edges_dict[(sl[0], sl[1])] = sl[0:3] + [int(val) for val in sl[3:6]] + [float(sl[6])] + sl[7:]
+    return sg_edges_dict
+
+def add_node(gfa_graph, v, preads_dict):
+    v_name, v_orient = v.split(':')
+    v_len, v_seq = preads_dict[v_name]
+    gfa_graph.add_node(v_name, v_len, v_seq)
+
+def add_edge(gfa_graph, v, w, edge_split_line, preads_overlap_dict, sg_edges_dict):
+    edge_name = 'edge-%d' % (len(gfa_graph.edges))
+    v_name, v_orient = v.split(':')
+    w_name, w_orient = w.split(':')
+    v_orient = '+' if v_orient == 'E' else '-'
+    w_orient = '+' if w_orient == 'E' else '-'
+    cigar = '*'
+
+    # Get the SG edge and the overlap, and set the tags and labels.
+    sg_edge = sg_edges_dict[(v, w)]
+    overlap = preads_overlap_dict[(v_name, w_name)]
+    labels = {'tp': edge_split_line, 'sg_edge': sg_edge, 'overlap': overlap}
+    tags = {}
+
+    # Example overlap:
+    #   000000001 000000170 -6104 99.75 0 1909 8010 8010 1 1250 7354 7354 overlap 000000F.5000003.0 000000F.5000003.0
+    # Handle the overlap coordinates - GFA format requires the coordinates to be with
+    # respect to the fwd strand, and the M4 format reports overlaps on the
+    # strand of the alignment.
+    _, _, score, idt, v_rev, v_start, v_end, v_len, w_rev, w_start, w_end, w_len = overlap[0:12]
+    if v_rev == 1:
+        v_start, v_end = v_end, v_start
+        v_start = v_len - v_start
+        v_end = v_len - v_end
+    if w_rev == 1:
+        w_start, w_end = w_end, w_start
+        w_start = w_len - w_start
+        w_end = w_len - w_end
+
+    gfa_graph.add_edge(edge_name, v_name, v_orient, w_name, w_orient, v_start, v_end, w_start, w_end, cigar, tags = tags, labels = labels)
+
+def add_tiling_paths_to_gfa(gfa_graph, tiling_paths, preads_dict, preads_overlap_dict, sg_edges_dict):
+    # Add nodes.
+    for ctg_id, tiling_path in tiling_paths.items():
+        for edge in tiling_path.edges:
+            add_node(gfa_graph, edge.v, preads_dict)
+            add_node(gfa_graph, edge.w, preads_dict)
+
+    # Add edges.
+    for ctg_id, tiling_path in tiling_paths.items():
+        for edge in tiling_path.edges:
+            add_edge(gfa_graph, edge.v, edge.w, edge.get_split_line(), preads_overlap_dict, sg_edges_dict)
+
+    # Add path.
+    for ctg_id, tiling_path in tiling_paths.items():
+        path_nodes = []
+        path_cigars = []
+        if len(tiling_path.edges) == 0:
+            continue
+
+        # Add the first node to the path.
+        v = tiling_path.edges[0].v
+        v_name, v_orient = v.split(':')
+        cigar = '%dM' % (tiling_path.coords[v]) # This will be 0 if the contig is improper, and length of v otherwise.
+        path_nodes.append(v_name)
+        path_cigars.append(cigar)
+
+        # Add the rest of the nodes.
+        for edge in tiling_path.edges:
+            w_name, w_orient = edge.w.split(':')
+            cigar = '%dM' % (abs(edge.e - edge.b))
+            path_nodes.append(w_name)
+            path_cigars.append(cigar)
+
+        gfa_graph.add_path(ctg_id, path_nodes, path_cigars)
+
+def add_string_graph_to_gfa(gfa_graph, sg_edges_list, utg_data, ctg_paths, preads_dict, preads_overlap_dict, sg_edges_dict):
+    asm_graph = AsmGraph(sg_edges_list, utg_data, ctg_paths)
+
+    for v, w in asm_graph.sg_edges:
+        add_node(gfa_graph, v, preads_dict)
+        add_node(gfa_graph, w, preads_dict)
+
+    for v, w in asm_graph.sg_edges:
+        edge_data = asm_graph.sg_edges[(v, w)]
+        if edge_data[-1] != 'G':
+            continue
+        add_edge(gfa_graph, v, w, edge_data, preads_overlap_dict, sg_edges_dict)
+
+def run(fp_out, p_ctg_tiling_path, a_ctg_tiling_path,
+                      preads_fasta, p_ctg_fasta, a_ctg_fasta,
+                      sg_edges_list, preads_ovl, utg_data, ctg_paths,
+                      add_string_graph, write_reads,
+                      min_p_len, min_a_len, only_these_contigs):
+    """
+    This method produces a GFAGraph object containing info required
+    to write both the GFA-1 and GFA-2 formatted assemblies.
+    However, it does not write the GFA formats directly, but instead
+    dumps a JSON file to disk.
+    The JSON file is converted to a GFA-1 or a GFA-2 with outside scripts.
+
+    The graphical output is produced from either the entire string
+    graph (only the non-filtered edges are considered) or from only
+    the tiling paths. String graph can show the neighborhood of contig
+    breaks, whereas the tiling path output is more sparse.
+    Output is written to stdout.
+    """
+
+    gfa_graph = GFAGraph()
+
+    # Load preads.
+    preads_dict = load_seqs(preads_fasta, (not write_reads))
+
+    # Load the pread overlaps
+    with open(preads_ovl, 'r') as fp:
+        preads_overlap_dict = load_pread_overlaps(fp)
+
+    # Load the SG edges.
+    with open(sg_edges_list, 'r') as fp:
+        sg_edges_dict = load_sg_edges(fp)
+
+    # Load the primary and associate contig files.
+    p_ctg_seqs = load_seqs(p_ctg_fasta, True)
+    a_ctg_seqs = load_seqs(a_ctg_fasta, True)
+
+    # Collect the sequence lengths from the above dicts.
+    p_ctg_lens = {key: val[0] for key, val in p_ctg_seqs.items()}
+    a_ctg_lens = {key: val[0] for key, val in a_ctg_seqs.items()}
+
+    # Create whitelists for filtering contigs.
+    p_ctg_whitelist = set(p_ctg_seqs.keys())
+    a_ctg_whitelist = set([key for key in list(a_ctg_seqs.keys())])
+    if only_these_contigs:
+        p_ctg_whitelist = set(open(only_these_contigs).read().splitlines()) & set(p_ctg_whitelist)
+        a_ctg_whitelist = set([key for key in list(a_ctg_seqs.keys()) if key.split('-')[0].split('_')[0] in p_ctg_whitelist])
+
+    # Load the tiling paths and assign coordinates.
+    p_paths = falcon_kit.tiling_path.load_tiling_paths(p_ctg_tiling_path, whitelist_seqs=p_ctg_whitelist, contig_lens=p_ctg_lens)
+    a_paths = falcon_kit.tiling_path.load_tiling_paths(a_ctg_tiling_path, whitelist_seqs=a_ctg_whitelist, contig_lens=a_ctg_lens)
+
+    add_tiling_paths_to_gfa(gfa_graph, p_paths, preads_dict, preads_overlap_dict, sg_edges_dict)
+    add_tiling_paths_to_gfa(gfa_graph, a_paths, preads_dict, preads_overlap_dict, sg_edges_dict)
+
+    if add_string_graph:
+        add_string_graph_to_gfa(gfa_graph, sg_edges_list, utg_data, ctg_paths, preads_dict, preads_overlap_dict, sg_edges_dict)
+
+    fp_out.write(serialize_gfa(gfa_graph))
+    fp_out.write('\n')
+
+def parse_args(argv):
+    parser = argparse.ArgumentParser(description="Generates GFA output (on stdout) from FALCON's assembly.",
+                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument('--p-ctg-tiling-path', type=str, default='p_ctg_tiling_path',
+                        help='location of the p_ctg tiling path file')
+    parser.add_argument('--a-ctg-tiling-path', type=str, default='a_ctg_tiling_path',
+                        help='location of the a_ctg tiling path file')
+    parser.add_argument('--preads-fasta', type=str, default='preads4falcon.fasta',
+                        help='path to the preads4falcon.fasta file')
+    parser.add_argument('--p-ctg-fasta', type=str, default='p_ctg.fa',
+                        help='path to the primary contigs file')
+    parser.add_argument('--a-ctg-fasta', type=str, default='a_ctg.fa',
+                        help='path to the associate contigs file')
+    parser.add_argument('--sg-edges-list', type=str, default='sg_edges_list',
+                        help='string graph edges file from Falcon assembly')
+    parser.add_argument('--preads-ovl', type=str, default='preads.ovl',
+                        help='the preads overlap file')
+    parser.add_argument('--utg-data', type=str,
+                        default='utg_data', help='unitig data file from Falcon')
+    parser.add_argument('--ctg-paths', type=str, default='ctg_paths',
+                        help='contig paths file from Falcon assembly')
+    parser.add_argument('--add-string-graph', action='store_true',
+                        help="in addition to tiling paths, output other edges and nodes from the final string graph")
+    parser.add_argument('--write-reads', '-r', action='store_true',
+                        help="output read sequences in S lines")
+    # parser.add_argument('--write-contigs', '-c', action='store_true',
+    #                     help="output contig sequences as S lines")
+    parser.add_argument('--min-p-len', type=int, default=0,
+                        help='primary contig paths with length smaller than this will not be reported')
+    parser.add_argument('--min-a-len', type=int, default=0,
+                        help='associate contig paths with length smaller than this will not be reported')
+    parser.add_argument('--only-these-contigs', type=str, default='',
+                        help='limit output to specified contigs listed in file (one per line)')
+    args = parser.parse_args(argv[1:])
+    return args
+
+def main(argv=sys.argv):
+    args = parse_args(argv)
+
+    run(sys.stdout, **vars(args))
+
+if __name__ == '__main__':  # pragma: no cover
+    main()
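
As the run() docstring notes, this script emits a JSON graph rather than GFA directly. A minimal sketch of the later conversion step, with made-up file names:

    from falcon_kit.gfa_graph import deserialize_gfa

    with open('asm.gfa.json') as fp_in, open('asm.gfa1', 'w') as fp_out:
        deserialize_gfa(fp_in).write_gfa_v1(fp_out)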

+ 411 - 0
FALCON/falcon_kit/mains/consensus.py

@@ -0,0 +1,411 @@
+
+
+
+
+from builtins import range
+from ctypes import (POINTER, c_char_p, c_uint, c_double, string_at, pointer)
+from falcon_kit.multiproc import Pool
+from falcon_kit import falcon
+import argparse
+import logging
+import multiprocessing
+import os
+import re
+import sys
+import falcon_kit
+import falcon_kit.util.io as io
+import collections
+
+LOG = logging.getLogger()
+
+falcon.generate_consensus.argtypes = [
+    POINTER(c_char_p), c_uint, c_uint, c_uint, c_double]
+falcon.generate_consensus.restype = POINTER(falcon_kit.ConsensusData)
+falcon.free_consensus_data.argtypes = [POINTER(falcon_kit.ConsensusData)]
+
+falcon.generate_consensus_from_mapping.argtypes = [
+    POINTER(c_char_p), POINTER(POINTER(falcon_kit.AlnRange)), c_uint, c_uint, c_uint, c_double]
+falcon.generate_consensus_from_mapping.restype = POINTER(falcon_kit.ConsensusData)
+
+"""
+SeqTuple encodes a single line in a block for consensus. Legacy code used only the 'name' and 'seq' (read from input),
+but if the coordinates are already known, we can use this info.
+The `qlen` and `tlen` are necessary because this consensus code can clip the end of a sequence if it's
+beyond a certain threshold. If it's clipped, the start/end coordinates can fall within the clipped region,
+which means that the internal alignment will have to be triggered.
+The 'tstart' and 'tend' relate to the seed read, and the 'qstart' and 'qend' to the current read on the same line.
+The current query should be on the same strand as the target. For consistency, we added a 'qstrand' as well, but
+in the current LA4Falcon output it will always be 0.
+The 'aln' field can be used to provide an alignment directly from the tool which determined that these sequences
+need to go into the same block. This could be used downstream to prevent quadratic memory consumption during error
+correction, and speed up the process.
+Parameter 'is_trimmed' is a bool, indicating that the sequence was trimmed from the back because it exceeded the maximum length.
+"""
+SeqTuple = collections.namedtuple('SeqTuple', ['name', 'seq', 'qstrand', 'qstart', 'qend', 'qlen', 'tstart', 'tend', 'tlen', 'aln', 'is_mapped', 'is_trimmed'])
+
+def get_longest_reads(seqs, max_n_read, max_cov_aln, sort=True):
+    # including the sort kwarg allows us to avoid a redundant sort
+    # in get_consensus_trimmed()
+    if sort:
+        seqs = seqs[:1] + sorted(seqs[1:], key=lambda x: -len(x.seq))
+
+    longest_n_reads = max_n_read
+    if max_cov_aln > 0:
+        longest_n_reads = 1
+        seed_len = len(seqs[0].seq)
+        read_cov = 0
+        for seq in seqs[1:]:
+            if read_cov // seed_len > max_cov_aln:
+                break
+            longest_n_reads += 1
+            read_cov += len(seq.seq)
+
+        longest_n_reads = min(longest_n_reads, max_n_read)
+
+    return(seqs[:longest_n_reads])
+
+
+def get_alignment(seq1, seq0, edge_tolerance=1000):
+
+    kup = falcon_kit.kup
+    K = 8
+    lk_ptr = kup.allocate_kmer_lookup(1 << (K * 2))
+    sa_ptr = kup.allocate_seq(len(seq0))
+    sda_ptr = kup.allocate_seq_addr(len(seq0))
+    kup.add_sequence(0, K, seq0, len(seq0), sda_ptr, sa_ptr, lk_ptr)
+
+    kup.mask_k_mer(1 << (K * 2), lk_ptr, 16)
+    kmer_match_ptr = kup.find_kmer_pos_for_seq(
+        seq1, len(seq1), K, sda_ptr, lk_ptr)
+    kmer_match = kmer_match_ptr[0]
+    aln_range_ptr = kup.find_best_aln_range2(kmer_match_ptr, K, K * 50, 25)
+    #x,y = zip( * [ (kmer_match.query_pos[i], kmer_match.target_pos[i]) for i in range(kmer_match.count )] )
+    aln_range = aln_range_ptr[0]
+    kup.free_kmer_match(kmer_match_ptr)
+    s1, e1, s0, e0, km_score = aln_range.s1, aln_range.e1, aln_range.s2, aln_range.e2, aln_range.score
+    e1 += K + K // 2
+    e0 += K + K // 2
+    kup.free_aln_range(aln_range)
+    len_1 = len(seq1)
+    len_0 = len(seq0)
+    if e1 > len_1:
+        e1 = len_1
+    if e0 > len_0:
+        e0 = len_0
+
+    aln_size = 1
+    if e1 - s1 > 500:
+
+        aln_size = max(e1 - s1, e0 - s0)
+        aln_score = int(km_score * 48)
+        aln_q_s = s1
+        aln_q_e = e1
+        aln_t_s = s0
+        aln_t_e = e0
+
+    kup.free_seq_addr_array(sda_ptr)
+    kup.free_seq_array(sa_ptr)
+    kup.free_kmer_lookup(lk_ptr)
+
+    if s1 > edge_tolerance and s0 > edge_tolerance:
+        return 0, 0, 0, 0, 0, 0, "none"
+
+    if len_1 - e1 > edge_tolerance and len_0 - e0 > edge_tolerance:
+        return 0, 0, 0, 0, 0, 0, "none"
+
+    if e1 - s1 > 500 and aln_size > 500:
+        return s1, s1 + aln_q_e - aln_q_s, s0, s0 + aln_t_e - aln_t_s, aln_size, aln_score, "aln"
+    else:
+        return 0, 0, 0, 0, 0, 0, "none"
+
+
+def get_trimmed_seq(seq, s, e):
+    # Mapping info is useless after clipping, so just reset it.
+    ret = SeqTuple(name = seq.name, seq = seq.seq[s:e],
+                    qstrand = seq.qstrand, qstart = -1, qend = -1, qlen = -1,
+                    tstart = -1, tend = -1, tlen = -1,
+                    aln = '*', is_mapped = False, is_trimmed = True)
+    return ret
+
+
+def get_consensus_core(seqs, min_cov, K, min_idt, allow_external_mapping):
+    seqs_ptr = (c_char_p * len(seqs))()
+    seqs_ptr[:] = [bytes(val.seq, encoding='ascii')  for val in seqs]
+
+    all_seqs_mapped = False
+
+    if allow_external_mapping:
+        all_seqs_mapped = True
+        for seq in seqs:
+            if not seq.is_mapped:
+                all_seqs_mapped = False
+                break
+
+    if not all_seqs_mapped:
+        LOG.info('Internally mapping the sequences.')
+        consensus_data_ptr = falcon.generate_consensus(
+            seqs_ptr, len(seqs), min_cov, K, min_idt)
+    else:
+        LOG.info('Using external mapping coordinates from input.')
+        aln_ranges_ptr = (POINTER(falcon_kit.AlnRange) * len(seqs))()
+        for i, seq in enumerate(seqs):
+            a = falcon_kit.AlnRange(seq.qstart, seq.qend, seq.tstart, seq.tend, (seq.qend - seq.qstart))
+            aln_ranges_ptr[i] = pointer(a)
+        consensus_data_ptr = falcon.generate_consensus_from_mapping(
+            seqs_ptr, aln_ranges_ptr, len(seqs), min_cov, K, min_idt)
+        del aln_ranges_ptr
+
+    del seqs_ptr
+
+    if not consensus_data_ptr:
+        LOG.warning("get_consensus_core: consensus generation returned a null pointer (consensus_data_ptr={}); returning an empty consensus".format(consensus_data_ptr))
+        return ''
+    # assert consensus_data_ptr
+    consensus = string_at(consensus_data_ptr[0].sequence)[:]
+    #eff_cov = consensus_data_ptr[0].eff_cov[:len(consensus)]
+    LOG.debug(' Freeing')
+    falcon.free_consensus_data(consensus_data_ptr)
+    return consensus.decode('ascii')
+
+def get_consensus_without_trim(c_input):
+    seqs, seed_id, config = c_input
+    LOG.debug('Starting get_consensus_without_trim(len(seqs)=={}, seed_id={})'.format(
+        len(seqs), seed_id))
+    min_cov, K, max_n_read, min_idt, edge_tolerance, trim_size, min_cov_aln, max_cov_aln, allow_external_mapping = config
+    if len(seqs) > max_n_read:
+        seqs = get_longest_reads(seqs, max_n_read, max_cov_aln, sort=True)
+
+    consensus = get_consensus_core(seqs, min_cov, K, min_idt, allow_external_mapping)
+    LOG.debug(' Finishing get_consensus_without_trim(seed_id={})'.format(seed_id))
+
+    return consensus, seed_id
+
+def get_consensus_with_trim(c_input):
+    seqs, seed_id, config = c_input
+    LOG.debug('Starting get_consensus_with_trim(len(seqs)=={}, seed_id={})'.format(
+        len(seqs), seed_id))
+    min_cov, K, max_n_read, min_idt, edge_tolerance, trim_size, min_cov_aln, max_cov_aln, allow_external_mapping = config
+    trim_seqs = []
+    seed = seqs[0]
+    for seq in seqs[1:]:
+        aln_data = get_alignment(seq.seq, seed.seq, edge_tolerance)
+        s1, e1, s2, e2, aln_size, aln_score, c_status = aln_data
+        if c_status == "none":
+            continue
+        if aln_score > 1000 and e1 - s1 > 500:
+            e1 -= trim_size
+            s1 += trim_size
+            trim_seqs.append((e1 - s1, get_trimmed_seq(seq, s1, e1)))
+            # trim_seqs.append((e1 - s1, seq.seq[s1:e1]))
+    trim_seqs.sort(key=lambda x: -x[0])  # use longest alignment first
+    trim_seqs = [x[1] for x in trim_seqs]
+
+    trim_seqs = [seed] + trim_seqs
+    if len(trim_seqs[1:]) > max_n_read:
+        # seqs already sorted, don't sort again
+        trim_seqs = get_longest_reads(
+            trim_seqs, max_n_read, max_cov_aln, sort=False)
+
+    consensus = get_consensus_core(trim_seqs, min_cov, K, min_idt, allow_external_mapping)
+    LOG.debug(' Finishing get_consensus_with_trim(seed_id={})'.format(seed_id))
+
+    return consensus, seed_id
+
+def get_seq_data(config, min_n_read, min_len_aln):
+    max_len = 128000
+    min_cov, K, max_n_read, min_idt, edge_tolerance, trim_size, min_cov_aln, max_cov_aln, allow_external_mapping = config
+    seqs = []
+    seed_id = None
+    seed_len = 0
+    seqs_data = []
+    read_cov = 0
+    read_ids = set()
+    with sys.stdin as f:
+        for line in f:
+            split_line = line.strip().split()
+            if len(split_line) < 2:
+                continue
+
+            qname = split_line[0]
+            qseq = split_line[1]
+            qstrand, qstart, qend, qlen = 0, -1, -1, -1
+            tstart, tend, tlen = -1, -1, -1
+            aln, is_mapped, is_trimmed = '*', False, False
+
+            if len(split_line) >= 10:
+                qstrand = int(split_line[2])
+                qstart = int(split_line[3])
+                qend = int(split_line[4])
+                qlen = int(split_line[5])
+                tstart = int(split_line[6])
+                tend = int(split_line[7])
+                tlen = int(split_line[8])
+                aln = split_line[9]
+                is_mapped = True
+
+            new_seq = SeqTuple(name = qname, seq = qseq,
+                                qstrand = qstrand, qstart = qstart, qend = qend, qlen = qlen,
+                                tstart = tstart, tend = tend, tlen = tlen,
+                                aln = aln, is_mapped = is_mapped, is_trimmed = is_trimmed)
+
+            if len(new_seq.seq) > max_len:
+                new_seq = get_trimmed_seq(new_seq, 0, max_len - 1)
+
+            if new_seq.name not in ("+", "-", "*"):
+                if len(new_seq.seq) >= min_len_aln:
+                    if len(seqs) == 0:
+                        seqs.append(new_seq)  # the "seed"
+                        seed_len = len(new_seq.seq)
+                        seed_id = new_seq.name
+                    if new_seq.name not in read_ids:  # avoiding using the same read twice; the seed is used again here by design
+                        seqs.append(new_seq)
+                        read_ids.add(new_seq.name)
+                        read_cov += len(new_seq.seq)
+
+            elif split_line[0] == "+":
+                if len(seqs) >= min_n_read and read_cov // seed_len >= min_cov_aln:
+                    seqs = get_longest_reads(
+                        seqs, max_n_read, max_cov_aln, sort=True)
+                    yield (seqs, seed_id, config)
+                #seqs_data.append( (seqs, seed_id) )
+                seqs = []
+                read_ids = set()
+                seed_id = None
+                read_cov = 0
+            elif split_line[0] == "*":
+                seqs = []
+                read_ids = set()
+                seed_id = None
+                read_cov = 0
+            elif split_line[0] == "-":
+                # yield (seqs, seed_id)
+                #seqs_data.append( (seqs, seed_id) )
+                break
+
+
+def format_seq(seq, col):
+    return "\n".join([seq[i:(i + col)] for i in range(0, len(seq), col)])
+
+
+def parse_args(argv):
+    parser = argparse.ArgumentParser(description='a simple multi-processor consensus sequence generator',
+                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument('--n-core', type=int, default=24,
+                        help='number of processes used for generating consensus; '
+                        '0 for main process only')
+    parser.add_argument('--min-cov', type=int, default=6,
+                        help='minimum coverage; the consensus is broken wherever coverage drops below this')
+    parser.add_argument('--min-cov-aln', type=int, default=10,
+                        help='minimum coverage of alignment data; a seed read with less than MIN_COV_ALN average depth' +
+                        ' of coverage will be completely ignored')
+    parser.add_argument('--max-cov-aln', type=int, default=0,  # 0 to emulate previous behavior
+                        help='maximum coverage of alignment data; a seed read with more than MAX_COV_ALN average depth' + \
+                        ' of coverage of the longest alignments will be capped, excess shorter alignments will be ignored')
+    parser.add_argument('--min-len-aln', type=int, default=0,  # 0 to emulate previous behavior
+                        help='minimum length of a sequence in an alignment to be used in consensus; any shorter sequence will be completely ignored')
+    parser.add_argument('--min-n-read', type=int, default=10,
+                        help='1 + minimum number of reads used in generating the consensus; a seed read with fewer alignments will ' +
+                        'be completely ignored')
+    parser.add_argument('--max-n-read', type=int, default=500,
+                        help='1 + maximum number of reads used in generating the consensus')
+    parser.add_argument('--trim', action="store_true", default=False,
+                        help='trim the input sequences with k-mer sparse dynamic programming to find the mapped range')
+    parser.add_argument('--output-full', action="store_true", default=False,
+                        help='output uncorrected regions too')
+    parser.add_argument('--output-multi', action="store_true", default=False,
+                        help='output multiple corrected regions per seed read')
+    parser.add_argument('--min-idt', type=float, default=0.70,
+                        help='minimum identity of the alignments used for correction')
+    parser.add_argument('--edge-tolerance', type=int, default=1000,
+                        help='for trimming: if the unaligned edge length is > edge_tolerance, ignore the read')
+    parser.add_argument('--trim-size', type=int, default=50,
+                        help='the size for trimming both ends from the initial sparse aligned region')
+    parser.add_argument('--allow-external-mapping', action="store_true", default=False,
+                        help='if provided, externally determined mapping coordinates will be used for error correction')
+    parser.add_argument('-v', '--verbose-level', type=float, default=2.0,
+                        help='logging level (WARNING=3, INFO=2, DEBUG=1)')
+    return parser.parse_args(argv[1:])
+
+def run(args):
+    # logging.basicConfig(level=int(round(10*args.verbose_level)))
+    logging.basicConfig(level=logging.NOTSET,
+                        format='%(asctime)s: [%(module)s:%(funcName)s()line:%(lineno)d] - %(levelname)s :  %(message)s')
+
+    assert args.n_core <= multiprocessing.cpu_count(), 'Requested n_core={} > cpu_count={}'.format(
+            args.n_core, multiprocessing.cpu_count())
+
+    def Start():
+        LOG.info('====>Started a worker in {} from parent {}'.format(
+            os.getpid(), os.getppid()))
+    exe_pool = Pool(args.n_core, initializer=Start)
+    if args.trim:
+        get_consensus = get_consensus_with_trim
+    else:
+        get_consensus = get_consensus_without_trim
+
+    K = 8
+    config = args.min_cov, K, \
+        args.max_n_read, args.min_idt, args.edge_tolerance, \
+        args.trim_size, args.min_cov_aln, args.max_cov_aln, \
+        args.allow_external_mapping
+    # TODO: pass config object, not tuple, so we can add fields
+    LOG.debug("====>args={}".format(args))
+    LOG.debug("====>get_consensus={}".format(get_consensus))
+    LOG.debug("====>config={}".format(config))
+    inputs = []
+    for datum in get_seq_data(config, args.min_n_read, args.min_len_aln):
+        inputs.append((get_consensus, datum))
+    LOG.debug("====>len(get_seq_data({}, {}, {}))={}"
+              .format(config, args.min_n_read, args.min_len_aln, len(inputs)))
+    try:
+        LOG.info('====>running {!r}'.format(get_consensus))
+        for res in exe_pool.imap(io.run_func, inputs):
+            process_get_consensus_result(res, args)
+        LOG.info('====>finished {!r}'.format(get_consensus))
+    except:
+        LOG.exception('====>failed gen_consensus')
+        exe_pool.terminate()
+        raise
+
+good_region = re.compile("[ACGT]+")
+
+def process_get_consensus_result(res, args, limit=500):
+        cns, seed_id = res
+        seed_id = int(seed_id)
+        if not cns:
+            LOG.warning("====>process_get_consensus_result() data error! res={}".format(res))
+            return
+        if len(cns) < limit:
+            LOG.debug("====>process_get_consensus_result() len(cns)={} < limit[{}]"
+                        .format(len(cns), limit))
+            return
+
+        if args.output_full:
+            print('>{:d}_f'.format(seed_id))
+            print(cns)
+        else:
+            cns = good_region.findall(cns)
+            if args.output_multi:
+                seq_i = 0
+                for cns_seq in cns:
+                    if len(cns_seq) < limit:
+                        continue
+                    if seq_i >= 10:
+                        break
+                    print(">prolog/%s%01d/%d_%d" % (seed_id, seq_i, 0, len(cns_seq)))
+                    print(format_seq(cns_seq, 80))
+                    seq_i += 1
+            else:
+                if len(cns) == 0:
+                    return
+                cns.sort(key=lambda x: len(x))
+                print('>{:d}'.format(seed_id))
+                print(cns[-1])
+
+def main(argv=sys.argv):
+    args = parse_args(argv)
+    run(args)
+
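+# Typical invocation, for reference (a sketch; the real command line is built by
+# mains/consensus_task.py, so the flags and paths here are illustrative):
+#   LA4Falcon -H$CUTOFF -fo raw_reads.db merged.las \
+#     | python3 -m falcon_kit.mains.consensus --n-core=8 > consensus.fasta
+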
+if __name__ == "__main__":
+    main(sys.argv)

+ 90 - 0
FALCON/falcon_kit/mains/consensus_gather_fasta_fofn.py

@@ -0,0 +1,90 @@
+"""
+"""
+
+
+
+
+from future.utils import viewitems
+import argparse
+import logging
+import os
+import string
+import sys
+from ..util import io
+
+LOG = logging.getLogger()
+
+
+def post_hook(config_fn, db_fn, gathered_fn):
+    # gathered_fn is needed only for this hacky bypass, for pbsmrtpipe.
+    if os.path.samefile(gathered_fn, db_fn):
+        return
+    if os.path.samefile(gathered_fn, config_fn):
+        return
+    config = io.deserialize(config_fn)
+    hook = config.get('LA4Falcon_post')
+    if hook:
+        LOG.warning('Found LA4Falcon_post in General section of cfg. About to run {!r}...'.format(hook))
+        db = os.path.abspath(db_fn)
+        parent = os.path.abspath(os.path.dirname(os.getcwd()))
+        dbdir = os.path.join(config['LA4Falcon_dbdir'], 'fc-db') + parent
+        cmd = string.Template(hook).substitute(DB=db, DBDIR=dbdir)
+        io.syscall(cmd)
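+
+# For example (an illustrative cfg value, not a requirement): with
+#   LA4Falcon_post = python3 -m falcon_kit.mains.db clean ${DB} ${DBDIR}
+# in the General section, the hook string is expanded via string.Template
+# (DB and DBDIR substituted) and then run by io.syscall().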
+
+def run(gathered_fn, db_fn, config_fn, preads_fofn_fn):
+    gathered = io.deserialize(gathered_fn)
+    d = os.path.abspath(os.path.realpath(os.path.dirname(gathered_fn)))
+    def abspath(fn):
+        if os.path.isabs(fn):
+            return fn # I expect this never to happen though.
+        return os.path.join(d, fn)
+    fasta_fns = list()
+    for desc in gathered:
+        fn = abspath(desc['fasta'])
+        if 0 == io.filesize(fn):
+            LOG.warning('Skipping empty fasta {!r}'.format(fn))
+            continue
+        fasta_fns.append(fn)
+    with open(preads_fofn_fn,  'w') as f:
+        for filename in sorted(fasta_fns, key=lambda fn: (os.path.basename(fn), fn)):
+            print(filename, file=f)
+    post_hook(config_fn, db_fn, gathered_fn)
+
+
+class HelpF(argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter):
+    pass
+
+
+def parse_args(argv):
+    description = 'Turn gathered file into FOFN of fasta files.'
+    epilog = ''
+    parser = argparse.ArgumentParser(
+        description=description,
+        epilog=epilog,
+        formatter_class=HelpF,
+    )
+    parser.add_argument(
+        '--gathered-fn',
+        help='Input. JSON list of output dicts.')
+    parser.add_argument(
+        '--db-fn',
+        help='Input. Dazzler DB of raw_reads.')
+    parser.add_argument(
+        '--config-fn',
+        help='Input. JSON of relevant configuration (currently from General section of full-prog config).')
+    parser.add_argument(
+        '--preads-fofn-fn',
+        help='Output. FOFN of preads (fasta files).',
+    )
+    args = parser.parse_args(argv[1:])
+    return args
+
+
+def main(argv=sys.argv):
+    args = parse_args(argv)
+    logging.basicConfig(level=logging.INFO)
+    run(**vars(args))
+
+
+if __name__ == '__main__':  # pragma: no cover
+    main()

+ 148 - 0
FALCON/falcon_kit/mains/consensus_split.py

@@ -0,0 +1,148 @@
+
+
+
+from future.utils import viewitems
+import argparse
+import collections
+import logging
+import os
+import string
+import sys
+from .. import io
+from .. import bash
+from .. import pype_tasks
+
+LOG = logging.getLogger()
+
+def corrected_relpath(p, was_rel_to):
+    if os.path.isabs(p):
+        return p
+    #LOG.warning('{},{},{}'.format(p, was_rel_to, os.path.relpath(os.path.join(was_rel_to, p))))
+    return os.path.normpath(os.path.relpath(os.path.join(was_rel_to, p)))
+
+def read_gathered_las(path):
+    """Return dict of block->[las_paths].
+    The input is a serialized dict of p_id -> las_path; relative paths are re-relativized to the CWD.
+    """
+    result = collections.defaultdict(list)
+    dn = os.path.normpath(os.path.dirname(path))
+    p_id2las = io.deserialize(path)
+    for block, las_path in list(p_id2las.items()):
+            result[int(block)].append(corrected_relpath(las_path, dn))
+    #import pprint
+    #LOG.warning('path={!r}, result={}'.format(
+    #   path, pprint.pformat(result)))
+    return result
+
+def pre_hook(config_fn, db_fn):
+    config = io.deserialize(config_fn)
+    hook = config.get('LA4Falcon_pre')
+    if hook:
+        LOG.warning('Found LA4Falcon_pre in General section of cfg. About to run {!r}...'.format(hook))
+        if config.get('LA4Falcon_preload'):
+            LOG.error('Found both LA4Falcon_pre and LA4Falcon_preload. Why would you preload after you have copied the DB? I hope you know what you are doing.')
+        db = os.path.abspath(db_fn)
+        parent = os.path.abspath(os.path.dirname(os.getcwd()))
+        dbdir = os.path.join(config['LA4Falcon_dbdir'], 'fc-db') + parent
+        cmd = string.Template(hook).substitute(DB=db, DBDIR=dbdir)
+        io.syscall(cmd)
+
+def run(p_id2las_fn, db_fn, length_cutoff_fn, config_fn, wildcards,
+        bash_template_fn, split_fn):
+    with open(bash_template_fn, 'w') as stream:
+        stream.write(pype_tasks.TASK_CONSENSUS_TASK_SCRIPT)
+
+    db_fn = os.path.realpath(db_fn)
+    # Because DazzlerDB is not a "FileType" in pbcommand,
+    # it might be a symlink with a weird extension.
+    LOG.info('Scattering las from {!r} (based on {!r}) into {!r}.'.format(
+        p_id2las_fn, db_fn, split_fn))
+
+    wildcards = wildcards.split(',')
+    #basedir = os.path.dirname(os.path.abspath(split_fn))
+    #rootdir = os.path.dirname(os.path.dirname(basedir)) # for now
+    outdir = os.path.abspath(os.path.dirname(split_fn))
+    jobs = list()
+    p_ids_merge_las = read_gathered_las(p_id2las_fn)
+    tasks = []
+    for (p_id, las_fns) in viewitems(p_ids_merge_las):
+        assert len(las_fns) == 1, repr(las_fns)
+        # since we know each merge-task is for a single block
+        las_fn = las_fns[0]
+        cns_id = 'cns_%05d' % int(p_id)
+        cns_id2 = cns_id
+        ##out_done_fn = '%s_done' % cns_label
+        #out_file_fn = '%s.fasta' % cns_label
+        #symlinked_las_fn = '{rootdir}/0-rawreads/cns-split/{cns_id}/merged.{cns_id2}.las'.format(**locals())
+        symlinked_las_fn = '{outdir}/cns-symlinks/{cns_id}/merged.{cns_id2}.las'.format(**locals())
+        io.mkdirs(os.path.normpath(os.path.dirname(symlinked_las_fn)))
+        src = os.path.relpath(las_fn,
+            os.path.normpath(os.path.dirname(symlinked_las_fn)))
+        io.symlink(src, symlinked_las_fn)
+
+        # Record in a job-dict.
+        job = dict()
+        job['input'] = dict(
+                las = symlinked_las_fn,
+                db = db_fn,
+                length_cutoff = length_cutoff_fn,
+                config = config_fn,
+        )
+        job['output'] = dict(
+                fasta = 'consensus.{cns_id2}.fasta'.format(**locals()),
+                #'{rootdir}/0-rawreads/consensus/{cns_id}/consensus.{cns_id2}.fasta'.format(**locals()),
+        )
+        job['params'] = dict(
+        )
+        job['wildcards'] = {wildcards[0]: cns_id, wildcards[1]: cns_id}
+        jobs.append(job)
+
+    io.serialize(split_fn, jobs)
+    pre_hook(config_fn, db_fn)
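+
+# For reference, each job serialized into split_fn is a dict shaped roughly like
+# this (illustrative values; the wildcard names come from --wildcards):
+#   {"input":  {"las": "cns-symlinks/cns_00001/merged.cns_00001.las",
+#               "db": "/abs/path/raw_reads.db", "length_cutoff": "...", "config": "..."},
+#    "output": {"fasta": "consensus.cns_00001.fasta"},
+#    "params": {},
+#    "wildcards": {"<wildcard-0>": "cns_00001", "<wildcard-1>": "cns_00001"}}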
+
+
+class HelpF(argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter):
+    pass
+
+
+def parse_args(argv):
+    description = 'Prepare for parallel consensus jobs.'
+    epilog = ''
+    parser = argparse.ArgumentParser(
+        description=description,
+        epilog=epilog,
+        formatter_class=HelpF,
+    )
+    parser.add_argument(
+        '--p-id2las-fn',
+        help='Input. JSON dict of p-id to las.')
+    parser.add_argument(
+        '--db-fn',
+        help='Input. Dazzler DB of raw_reads.')
+    parser.add_argument(
+        '--length-cutoff-fn',
+        help='Input. Contains a single integer, the length-cutoff.')
+    parser.add_argument(
+        '--config-fn',
+        help='Input. JSON of relevant configuration (currently from General section of full-prog config).')
+    parser.add_argument(
+        '--wildcards',
+        help='Input. Comma-separated wildcard names. Might be needed downstream.')
+    parser.add_argument(
+        '--split-fn',
+        help='Output. JSON list of jobs, where each is a dict of input/output/params/wildcards.')
+    parser.add_argument(
+        '--bash-template-fn',
+        help='Output. Copy of the known consensus bash template, for use later.')
+    args = parser.parse_args(argv[1:])
+    return args
+
+
+def main(argv=sys.argv):
+    args = parse_args(argv)
+    logging.basicConfig(level=logging.INFO)
+    run(**vars(args))
+
+
+if __name__ == '__main__':  # pragma: no cover
+    main()

+ 197 - 0
FALCON/falcon_kit/mains/consensus_task.py

@@ -0,0 +1,197 @@
+
+
+import argparse
+import logging
+import multiprocessing
+import os
+import re
+import sys
+from .. import io
+from .. import bash
+from .dazzler import symlink_db
+
+LOG = logging.getLogger()
+
+def get_option_with_proper_nproc(regexp, opt, opt_name, nproc, cpu_count=multiprocessing.cpu_count()):
+    r"""Return opts sans the regexp match, and proper nproc.
+    >>> regexp = re.compile(r'-j[^\d]*(\d+)')
+    >>> get_option_with_proper_nproc(regexp, 'foo -j 5', 'baz', nproc=7, cpu_count=6)
+    ('foo ', 5)
+    >>> get_option_with_proper_nproc(regexp, 'foo -j 5', 'baz', nproc=3, cpu_count=4)
+    ('foo ', 3)
+    >>> get_option_with_proper_nproc(regexp, 'foo -j 5', 'baz', nproc=3, cpu_count=2)
+    ('foo ', 2)
+    """
+    job_nproc = int(nproc)
+    mo = regexp.search(opt)
+    if mo:
+        opt_nproc = int(mo.group(1))
+        if job_nproc < opt_nproc:
+            LOG.warning('NPROC={}, but {}="{}", so we will ignore that option and use {}'.format(
+                job_nproc, opt_name, opt, job_nproc))
+        elif job_nproc > opt_nproc:
+            LOG.warning('NPROC={}, but {}="{}", so we will override NPROC and use {}'.format(
+                job_nproc, opt_name, opt, opt_nproc))
+        nproc = min(job_nproc, opt_nproc)
+        opt = regexp.sub('', opt)  # strip the nproc option from the string, for now
+    else:
+        nproc = job_nproc
+    if nproc > cpu_count:
+        LOG.warning('Requested nproc={} > cpu_count={}; using {}'.format(
+            nproc, cpu_count, cpu_count))
+        nproc = cpu_count
+    return opt, nproc
+
+def get_falcon_sense_option(opt, nproc):
+    """
+    >>> get_falcon_sense_option('', 11)
+    ' --n-core=11'
+    >>> get_falcon_sense_option('--n-core=24', 10)
+    ' --n-core=10'
+    """
+    re_n_core = re.compile(r'--n-core[^\d]+(\d+)')
+    opt, nproc = get_option_with_proper_nproc(re_n_core, opt, 'falcon_sense_option', nproc)
+    opt += ' --n-core={}'.format(nproc)
+    return opt
+
+def get_pa_dazcon_option(opt, nproc):
+    """
+    >>> get_pa_dazcon_option('', 12)
+    ' -j 12'
+    >>> get_pa_dazcon_option('-j  48', 13)
+    ' -j 13'
+    """
+    re_j = re.compile(r'-j[^\d]+(\d+)')
+    opt, nproc = get_option_with_proper_nproc(re_j, opt, 'pa_dazcon_option', nproc)
+    opt += ' -j {}'.format(nproc)
+    return opt
+
+def symlink(actual):
+    """Symlink into cwd, without relativizing.
+    """
+    symbolic = os.path.basename(actual)
+    if os.path.abspath(actual) == os.path.abspath(symbolic):
+        LOG.warning('Cannot symlink {!r} as {!r}, itself.'.format(actual, symbolic))
+        return
+    rel = actual  # not really relative, but this code was copy/pasted
+    LOG.info('ln -sf {} {}'.format(rel, symbolic))
+    if os.path.lexists(symbolic):
+        if os.readlink(symbolic) == rel:
+            return
+        os.unlink(symbolic)
+    os.symlink(rel, symbolic)
+
+# This function was copied from bash.py and modified.
+def script_run_consensus(config, db_fn, las_fn, out_file_fn, nproc):
+    """config: dazcon, falcon_sense_greedy, falcon_sense_skip_contained, LA4Falcon_preload
+    """
+    symlink_db(db_fn, symlink=symlink)
+    db_fn = os.path.basename(db_fn)
+    assert os.path.exists(db_fn), os.path.abspath(db_fn)
+    io.rm(out_file_fn) # in case of resume
+    out_file_bfn = out_file_fn + '.tmp'
+    params = dict(config)
+    length_cutoff = params['length_cutoff']
+    bash_cutoff = '{}'.format(length_cutoff)
+    params['falcon_sense_option'] = get_falcon_sense_option(params.get('falcon_sense_option', ''), nproc)
+    params['pa_dazcon_option'] = get_pa_dazcon_option(params.get('pa_dazcon_option', ''), nproc)
+    params.update(locals())  # needed: the template below references db_fn, las_fn, out_file_fn, out_file_bfn
+    LA4Falcon_flags = 'P' if params.get('LA4Falcon_preload') else ''
+    if config["falcon_sense_skip_contained"]:
+        LA4Falcon_flags += 'fso'
+    elif config["falcon_sense_greedy"]:
+        LA4Falcon_flags += 'fog'
+    else:
+        LA4Falcon_flags += 'fo'
+    if LA4Falcon_flags:
+        LA4Falcon_flags = '-' + ''.join(set(LA4Falcon_flags))
+    run_consensus = "LA4Falcon -H$CUTOFF %s {db_fn} {las_fn} | python3 -m falcon_kit.mains.consensus {falcon_sense_option} >| {out_file_bfn}" % LA4Falcon_flags
+
+    if config.get('dazcon', False):
+        run_consensus = """
+which dazcon
+dazcon {pa_dazcon_option} -s {db_fn} -a {las_fn} >| {out_file_bfn}
+"""
+
+    script = """
+set -o pipefail
+CUTOFF=%(bash_cutoff)s
+%(run_consensus)s
+mv -f {out_file_bfn} {out_file_fn}
+""" % (locals())
+    return script.format(**params)
+
+
+def run(config_fn, length_cutoff_fn, las_fn, db_fn, nproc,
+        fasta_fn):
+    job_done_fn = 'job.done'
+    length_cutoff = int(open(length_cutoff_fn).read())
+    config = io.deserialize(config_fn)
+    config['length_cutoff'] = length_cutoff
+    dbdir = config.get('LA4Falcon_dbdir')
+    if dbdir:
+        # Assume we are 2 levels deeper than consensus_split was.
+        parent3 = os.path.abspath(os.path.dirname(os.path.dirname(os.path.dirname(os.getcwd()))))
+        dbdir = os.path.join(config['LA4Falcon_dbdir'], 'fc-db') + parent3
+        bn = os.path.basename(db_fn)
+        LOG.warning('Using symlinks to {} in LA4Falcon_dbdir={!r}'.format(bn, dbdir))
+        db_fn = os.path.join(dbdir, bn)
+    script = script_run_consensus(
+        config, db_fn, las_fn,
+        os.path.basename(fasta_fn), # not sure basename is really needed here
+        nproc=nproc,
+    )
+    script_fn = 'run_consensus.sh'
+    bash.write_script(script, script_fn, job_done_fn)
+    io.syscall('bash -vex {}'.format(script_fn))
+
+
+class HelpF(argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter):
+    pass
+
+
+def parse_args(argv):
+    description = 'Run consensus on a merged .las file, to produce a fasta file of preads.'
+    epilog = ''
+    parser = argparse.ArgumentParser(
+        description=description,
+        epilog=epilog,
+        formatter_class=HelpF,
+    )
+    parser.add_argument(
+        '--nproc',
+        help='Number of processors to be used.')
+    parser.add_argument(
+        '--las-fn',
+        help='Input. Merged .las file.',
+    )
+    parser.add_argument(
+        '--db-fn',
+        help='Input. Dazzler DB of raw-reads.',
+    )
+    parser.add_argument(
+        '--length-cutoff-fn',
+        help='Input. Contains a single integer, the length-cutoff.',
+    )
+    parser.add_argument(
+        '--config-fn',
+        help='Input. JSON of relevant configuration (currently from General section of full-prog config).',
+    )
+    parser.add_argument(
+        '--fasta-fn',
+        help='Output. Consensus fasta file.',
+    )
+    args = parser.parse_args(argv[1:])
+    return args
+
+
+def main(argv=sys.argv):
+    args = parse_args(argv)
+    logging.basicConfig(level=logging.INFO)
+    run(**vars(args))
+
+
+if __name__ == '__main__':  # pragma: no cover
+    main()

+ 35 - 0
FALCON/falcon_kit/mains/contig_annotate.py

@@ -0,0 +1,35 @@
+
+
+
+from falcon_kit.fc_asm_graph import AsmGraph
+import sys
+
+
+def main(argv=sys.argv):
+    G_asm = AsmGraph("sg_edges_list", "utg_data", "ctg_paths")
+
+    p_ctg_coor_map = {}
+    for fn in ("p_ctg_tiling_path", "a_ctg_tiling_path"):
+        f = open(fn)
+        for row in f:
+            row = row.strip().split()
+            ctg_id, v, w, edge_rid, b, e = row[:6]
+            if ctg_id not in p_ctg_coor_map:
+                coor = 0   # the tiling-path file should be sorted by contig, in the order of the edges in the tiling path
+                p_ctg_coor_map[ctg_id] = {}
+                p_ctg_coor_map[ctg_id][v] = 0
+                coor += abs(int(b) - int(e))
+                p_ctg_coor_map[ctg_id][w] = coor
+                G_asm.node_to_ctg[w]  # sanity check; raises KeyError if w is not in any contig
+                print(ctg_id, v, 0, " ".join(list(G_asm.node_to_ctg[v])))
+                print(ctg_id, w, coor, " ".join(list(G_asm.node_to_ctg[w])))
+                continue
+            else:
+                coor += abs(int(b) - int(e))
+                p_ctg_coor_map[ctg_id][w] = coor
+                print(ctg_id, w, coor, " ".join(list(G_asm.node_to_ctg[w])))
+        f.close()
+
+
+if __name__ == "__main__":
+    main(sys.argv)

+ 59 - 0
FALCON/falcon_kit/mains/copy_fofn.py

@@ -0,0 +1,59 @@
+
+
+import argparse
+import logging
+import os
+import sys
+from .. import io
+
+LOG = logging.getLogger()
+
+
+def run(abs, in_fn, out_fn):
+    out_dir = os.path.normpath(os.path.dirname(out_fn))
+    io.mkdirs(out_dir)
+    def identity(fn): return fn
+    def relative(fn): return os.path.relpath(fn, out_dir)
+    adjusted_fn = identity if abs else relative
+    with open(out_fn, 'w') as stream:
+        for abs_fn in io.yield_abspath_from_fofn(in_fn):
+            fn = adjusted_fn(abs_fn)
+            stream.write('{}\n'.format(fn))
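+
+# For example (illustrative paths): given a FOFN entry that resolves to
+# /data/reads/a.fasta, --abs writes that absolute path unchanged, while the
+# default rewrites it relative to the directory of out_fn via os.path.relpath.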
+
+
+class HelpF(argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter):
+    pass
+
+
+def parse_args(argv):
+    description = 'Copy FOFN. If directory changes, then relative paths must change too.'
+    epilog = ''
+    parser = argparse.ArgumentParser(
+        description=description,
+        epilog=epilog,
+        formatter_class=HelpF,
+    )
+    parser.add_argument(
+        '--in-fn',
+        help='Input. FOFN of paths relative to its own directory.'
+    )
+    parser.add_argument(
+        '--abs', action='store_true',
+        help='Store absolute paths. (Otherwise, paths will be relative to directory of output FOFN.)'
+    )
+    parser.add_argument(
+        '--out-fn',
+        help='Output. FOFN of paths relative to its own directory.'
+    )
+    args = parser.parse_args(argv[1:])
+    return args
+
+
+def main(argv=sys.argv):
+    args = parse_args(argv)
+    logging.basicConfig(level=logging.INFO)
+    run(**vars(args))
+
+
+if __name__ == '__main__':  # pragma: no cover
+    main()

+ 97 - 0
FALCON/falcon_kit/mains/copy_mapped.py

@@ -0,0 +1,97 @@
+from future.utils import viewitems
+
+import argparse
+import json
+import logging
+import os
+import shutil
+import sys
+
+LOG = logging.getLogger(__name__)
+
+def deserialize(fn):
+    with open(fn) as ifs:
+        return json.loads(ifs.read())
+
+def assert_exists(fn):
+    if not os.path.isfile(fn):
+        raise Exception('Does not exist: {!r}'.format(fn))
+
+def mkdir(dirname):
+    if not os.path.isdir(dirname):
+        # Possible race-condition, so dirs must be created serially.
+        os.makedirs(dirname)
+
+#def symlink(name, src):
+#    msg = '{} -> {}'.format(name, src)
+#    assert not os.path.lexists(name), msg
+#    #print msg
+#    os.symlink(src, name)
+
+def copy(name, rel_src):
+    try:
+        if not os.path.isabs(rel_src):
+            dn = os.path.normpath(os.path.dirname(name))
+            src = os.path.join(dn, rel_src)
+        else:
+            src = rel_src
+        shutil.copy2(src, name)
+    except Exception:
+        msg = '{} -> {}'.format(name, rel_src)
+        LOG.error(msg)
+        raise
+
+def run(special_split_fn, fn_patterns):
+    """
+    Copy each mapped input to a destination built from its pattern; relative paths are resolved against cwd.
+    For each pattern, each wildcard will be substituted everywhere, e.g.
+        fn_pattern == 'top/{key}/input_{key}.txt'
+    """
+    fnkeypattdict = dict(fnkeypatt.split('=') for fnkeypatt in fn_patterns)
+    jobs = deserialize(special_split_fn)
+    mapdir = os.path.normpath(os.path.dirname(os.path.normpath(special_split_fn)))
+    for job in jobs:
+        inputs = job['input']
+        wildcards = job['wildcards']
+        for (fnkey, fn_pattern) in viewitems(fnkeypattdict):
+            val = inputs[fnkey]
+            # val should be relative to the location of the special_split_fn.
+            #assert not os.path.isabs(val), 'mapped input (dynamic output) filename {!r} must be relative (to serialzed file location {!r})'.format(
+            #        val, special_split_fn)
+            if not os.path.isabs(val):
+                mapped_input_fn = os.path.join(mapdir, val)
+            else:
+                mapped_input_fn = val
+            assert_exists(mapped_input_fn)
+            try:
+                symlink_name = fn_pattern.format(**wildcards)
+            except Exception as err:
+                import pprint
+                msg = str(err) + ': for pattern {!r} and wildcards\n{!r}'.format(
+                        fn_pattern, pprint.pformat(wildcards))
+                raise Exception(msg)
+            outdir = os.path.normpath(os.path.dirname(symlink_name))
+            mkdir(outdir)
+            target_name = os.path.relpath(mapped_input_fn, outdir)
+            copy(symlink_name, target_name)
+
+def parse_args(argv):
+    description = 'Create copies, named by each "fn_pattern", of the files listed in the special split file.'
+    parser = argparse.ArgumentParser(
+            description=description,
+    )
+    parser.add_argument(
+            '--special-split-fn', required=True,
+            help='Serialized split-file (in our special format), where "mapped_inputs" has a map with key to filename, relative to the directory of this file.')
+    parser.add_argument(
+            'fn_patterns', nargs='+',
+            help='"fnkey=pattern" Can appear multiple times. Each is a pattern for output filename, to be substituted with keys in special_split_fn. Each fnkey=filename must appear in the input section of each job listed in special-split.')
+    return parser.parse_args(argv[1:])
+
+def main(argv=sys.argv):
+    args = parse_args(argv)
+    run(**vars(args))
+
+if __name__ == "__main__":
+    logging.basicConfig()
+    main()

+ 73 - 0
FALCON/falcon_kit/mains/cromwell_run_uows_tar.py

@@ -0,0 +1,73 @@
+
+
+import argparse
+import collections
+import glob
+import logging
+import os
+import sys
+import pypeflow.do_task
+from .. import io
+
+LOG = logging.getLogger()
+
+def dir_from_tar(tar_fn):
+    # standard convention for tar-files
+    return os.path.splitext(os.path.basename(tar_fn))[0]
+
+def run(tool, uows_tar_fn, nproc):
+    cmd = 'tar --strip-components=1 -xvf {}'.format(uows_tar_fn)
+    io.syscall(cmd)
+    #uows_dn = dir_from_tar(uows_tar_fn)
+    uows_dn = '.'
+    uows = list(sorted(glob.glob('{}/uow-*'.format(uows_dn))))
+    print(uows)
+    las_fns = list()
+    for uow in uows:
+        with io.cd(uow):
+            cmd = 'bash -vex uow.sh'
+            io.syscall(cmd)
+        #las_fns.extend(sorted(glob.glob('{}/*.las'.format(uow))))
+    #cmd = 'LAmerge {} {}'.format(
+    #    result_fn, ' '.join(las_fns))
+    #io.syscall(cmd)
+    #io.rm(*las_fns)
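+
+# Expected tar layout, as implied by the code above (a sketch, not a spec): after
+# "tar --strip-components=1 -xvf <uows_tar_fn>", the CWD contains uow-*/ directories,
+# each holding a uow.sh which is run via "bash -vex uow.sh".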
+
+
+class HelpF(argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter):
+    pass
+
+
+def parse_args(argv):
+    description = 'Run a bash script once for each unit-of-work, in its own sub-dir. Handle results case-by-case, according to "tool".'
+    epilog = '''For now, runs will be in series, since we do not know how many processors we can use.
+
+For tool=daligner, the intent is to merge the resulting .las files into a single .las (that merge is currently disabled here).
+'''
+    parser = argparse.ArgumentParser(
+        description=description,
+        epilog=epilog,
+        formatter_class=HelpF,
+    )
+    parser.add_argument(
+        '--nproc',
+        help='Number of processors to be used.')
+    parser.add_argument(
+        '--uows-tar-fn',
+        help='Input. Tarfile of directories of unit-of-work.')
+    parser.add_argument(
+        '--tool', default='daligner', choices=['daligner', 'datander'],
+        help='The tool for each unit of work. (Currently ignored.)')
+
+    args = parser.parse_args(argv[1:])
+    return args
+
+
+def main(argv=sys.argv):
+    args = parse_args(argv)
+    logging.basicConfig(level=logging.INFO)
+    run(**vars(args))
+
+
+if __name__ == '__main__':  # pragma: no cover
+    main()

+ 76 - 0
FALCON/falcon_kit/mains/cromwell_symlink.py

@@ -0,0 +1,76 @@
+
+
+import argparse
+import logging
+import os
+import sys
+from .. import io
+
+LOG = logging.getLogger()
+
+
+def symlink(actual, symbolic=None, force=True):
+    """Symlink into cwd, relatively.
+    symbolic name is basename(actual) if not provided.
+    If not force, raise when already exists and does not match.
+    But ignore symlink to self.
+    """
+    # COPIED VERBATIM FROM ./dazzler.py
+    symbolic = os.path.basename(actual) if not symbolic else symbolic
+    if os.path.abspath(actual) == os.path.abspath(symbolic):
+        LOG.warning('Cannot symlink {!r} as {!r}, itself.'.format(actual, symbolic))
+        return
+    rel = os.path.relpath(actual)
+    if force:
+        LOG.info('ln -sf {} {}'.format(rel, symbolic))
+        if os.path.lexists(symbolic):
+            if os.readlink(symbolic) == rel:
+                return
+            else:
+                os.unlink(symbolic)
+    else:
+        LOG.info('ln -s {} {}'.format(rel, symbolic))
+        if os.path.lexists(symbolic):
+            if os.readlink(symbolic) != rel:
+                msg = '{!r} already exists as {!r}, not {!r}'.format(
+                        symbolic, os.readlink(symbolic), rel)
+                raise Exception(msg)
+            else:
+                LOG.info('{!r} already points to {!r}'.format(symbolic, rel))
+                return
+    os.symlink(rel, symbolic)
+
+
+def run(input_fns):
+    for fn in input_fns:
+        symlink(fn)
+
+
+class HelpF(argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter):
+    pass
+
+
+def parse_args(argv):
+    description = 'Symlink into current directory. This helps keep command-lines short later.'
+    epilog = ''
+    parser = argparse.ArgumentParser(
+        description=description,
+        epilog=epilog,
+        formatter_class=HelpF,
+    )
+    parser.add_argument(
+        'input_fns', nargs='*',
+        help='These will be symlinked according to their basenames.')
+
+    args = parser.parse_args(argv[1:])
+    return args
+
+
+def main(argv=sys.argv):
+    args = parse_args(argv)
+    logging.basicConfig(level=logging.INFO)
+    run(**vars(args))
+
+
+if __name__ == '__main__':  # pragma: no cover
+    main()

+ 55 - 0
FALCON/falcon_kit/mains/cromwell_undot.py

@@ -0,0 +1,55 @@
+
+
+import argparse
+import glob
+import logging
+import os
+import sys
+from .. import io
+
+LOG = logging.getLogger()
+
+
+def rename(fn, prefix):
+    dn, bn = os.path.split(fn)
+    nfn = os.path.join(dn, prefix + bn)
+    cmd = 'mv -f {} {}'.format(fn, nfn)
+    LOG.info('cmd ={!r}'.format(cmd))
+    io.syscall(cmd)
+
+def run(pattern, prefix):
+    for fn in glob.iglob(pattern):
+        rename(fn, prefix)
+
+
+class HelpF(argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter):
+    pass
+
+
+def parse_args(argv):
+    description = 'Find all files matching "pattern" under CWD, and prefix with "prefix".'
+    epilog = 'We do this because Cromwell does not properly glob dot-files.'
+    parser = argparse.ArgumentParser(
+        description=description,
+        epilog=epilog,
+        formatter_class=HelpF,
+    )
+    parser.add_argument(
+        '--pattern',
+        help='Find all files matching this, including in subdirs.')
+    parser.add_argument(
+        '--prefix', default='dot',
+        help='Rename the matching files to have this prefix. E.g. ".foo" becomes "PREFIX.foo".')
+
+    args = parser.parse_args(argv[1:])
+    return args
+
+
+def main(argv=sys.argv):
+    args = parse_args(argv)
+    logging.basicConfig(level=logging.INFO)
+    run(**vars(args))
+
+
+if __name__ == '__main__':  # pragma: no cover
+    main()

+ 58 - 0
FALCON/falcon_kit/mains/cromwell_write_json.py

@@ -0,0 +1,58 @@
+"""Given a FOFN, write JSON list.
+
+Cromwell write_json() does not work as we would expect.
+
+https://github.com/broadinstitute/cromwell/issues/4625
+
+So we use write_lines() instead.
+
+Then, this little program can convert those lines into JSON.
+"""
+
+
+import argparse
+import logging
+import os
+import sys
+from .. import io
+
+LOG = logging.getLogger()
+
+
+def run(lines_fn, json_fn):
+    with open(lines_fn) as sin:
+        fns = [line.strip() for line in sin]
+    io.serialize(json_fn, fns)
+
+
+class HelpF(argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter):
+    pass
+
+
+def parse_args(argv):
+    description = 'Convert a lines file (FOFN) into a JSON list of filenames.'
+    epilog = ''
+    parser = argparse.ArgumentParser(
+        description=description,
+        epilog=epilog,
+        formatter_class=HelpF,
+    )
+    parser.add_argument(
+        '--lines-fn',
+        help='Input. Result of WDL write_lines().')
+    parser.add_argument(
+        '--json-fn',
+        help='Output. Should have been result of WDL write_json().')
+
+    args = parser.parse_args(argv[1:])
+    return args
+
+
+def main(argv=sys.argv):
+    args = parse_args(argv)
+    logging.basicConfig(level=logging.INFO)
+    run(**vars(args))
+
+
+if __name__ == '__main__':  # pragma: no cover
+    main()

+ 85 - 0
FALCON/falcon_kit/mains/ctg_link_analysis.py

@@ -0,0 +1,85 @@
+
+
+
+from falcon_kit import fc_asm_graph
+import sys
+
+
+def main(argv=sys.argv):
+    AsmGraph = fc_asm_graph.AsmGraph
+
+    G_asm = AsmGraph("sg_edges_list", "utg_data", "ctg_paths")
+
+    sg_edges = G_asm.sg_edges
+    node_to_ctg = G_asm.node_to_ctg
+    node_to_utg = G_asm.node_to_utg
+
+    ctg_data = G_asm.ctg_data
+    utg_data = G_asm.utg_data
+
+    ctg_pair_links = {}
+    for (v, w) in list(sg_edges.keys()):
+        if v in node_to_ctg and w in node_to_ctg:
+            for ctg1 in list(node_to_ctg[v]):
+                for ctg2 in list(node_to_ctg[w]):
+                    if ctg1 == ctg2:
+                        continue
+                    ctg_pair_links.setdefault((ctg1, ctg2), set())
+                    ctg_pair_links[(ctg1, ctg2)].add((v, w))
+
+    utg_pair_links = {}
+    for (v, w) in list(sg_edges.keys()):
+        if v in node_to_utg and w in node_to_utg:
+            for u1 in list(node_to_utg[v]):
+                for u2 in list(node_to_utg[w]):
+                    if u1 == u2:
+                        continue
+                    utg_pair_links.setdefault((u1, u2), set())
+                    utg_pair_links[(u1, u2)].add((v, w))
+
+    for ctg1, ctg2 in ctg_pair_links:
+        links = ctg_pair_links[(ctg1, ctg2)]
+        count = len(links)
+        if count > 0:
+            path1 = ctg_data[ctg1][-1][-5:]
+            path2 = ctg_data[ctg2][-1][:5]
+            utg1 = []
+            utg2 = []
+            for s1, v1, t1 in path1:
+                u1 = (s1, t1, v1)
+                type_, length, score, path_or_edges = utg_data[u1]
+                if type_ == "compound":
+                    for u in path_or_edges.split("|"):
+                        ss, vv, tt = u.split("~")
+                        utg1.append((ss, tt, vv))
+                else:
+                    utg1.append(u1)
+            for s2, v2, t2 in path2:
+                u2 = (s2, t2, v2)
+                type_, length, score, path_or_edges = utg_data[u2]
+                if type_ == "compound":
+                    for u in path_or_edges.split("|"):
+                        ss, vv, tt = u.split("~")
+                        utg2.append((ss, tt, vv))
+                else:
+                    utg2.append(u2)
+            # print path1
+            # print path2
+            # print len(utg1), len(utg2)
+            for u1 in utg1:
+                for u2 in utg2:
+                    u1 = tuple(u1)
+                    u2 = tuple(u2)
+                    c = utg_pair_links.get((u1, u2), set())
+                    if len(c) == 0:
+                        continue
+                    s1, t1, v1 = u1
+                    s2, t2, v2 = u2
+                    len_1 = ctg_data[ctg1][3]
+                    len_2 = ctg_data[ctg2][3]
+                    print('{} {} {:7d}\t{:7d}\t{}\t{}\t{}\t{} {} {}'.format(
+                        ctg1, ctg2, len_1, len_2, len(utg1), len(utg2), len(links), "~".join((s1, v1, t1)),  "~".join((s2, v2, t2)), len(c)))
+
+
+if __name__ == "__main__":
+    main(sys.argv)

File diff suppressed because it is too large
+ 1533 - 0
FALCON/falcon_kit/mains/dazzler.py


+ 70 - 0
FALCON/falcon_kit/mains/db.py

@@ -0,0 +1,70 @@
+"""
+This is meant to be used for LA4Falcon_pre/post hooks.
+dbdir is probably /dev/shm.
+"""
+import os, shutil, sys
+
+# We will ignore track files (.anno/.data).
+suffixes = ('.idx', '.bps')
+
+def log(msg):
+    print(msg)
+def rm(bn, dn):
+    """Remove bn from directory.
+    Skip silently if not found.
+    Leave the directory tree.
+    """
+    fn = os.path.join(dn, bn)
+    if os.path.exists(fn):
+        log('rm -f "{}"'.format(fn))
+        os.remove(fn)
+def cp(bn, src_dn, dst_dn):
+    """Copy bn from src to dst.
+    Create dirs for dst_dn as needed.
+    Over-write if exists in dst.
+    Raise Exception if bn is not found in src_dn.
+    """
+    src_fn = os.path.join(src_dn, bn)
+    dst_fn = os.path.join(dst_dn, bn)
+    if not os.path.exists(src_fn):
+        msg = 'Nothing found at "{}"'.format(src_fn)
+        raise Exception(msg)
+    if not os.path.isdir(dst_dn):
+        log('mkdir -p "{}"'.format(dst_dn))
+        os.makedirs(dst_dn)
+    if os.path.exists(dst_fn):
+        log('WARNING: {!r} already exists. Deleting and re-copying.'.format(dst_fn))
+        rm(bn, dst_dn)
+    log('cp -f "{}" "{}"'.format(src_fn, dst_fn))
+    shutil.copy2(src_fn, dst_fn)
+def clean(db, dbdir):
+    """
+    Remove db and dot-db files from dbdir.
+    Assume the same basename was used.
+    """
+    bn = os.path.basename(db)
+    assert bn.endswith('.db'), '{} does not end in .db'.format(bn)
+    dbname = bn[:-3] # drop .db
+    rm(bn, dbdir)
+    for suffix in suffixes:
+        bn = '.'+dbname+suffix
+        rm(bn, dbdir)
+def copy(db, dbdir):
+    """
+    Copy db and dot-db files into dbdir.
+    (dbdir is probably /dev/shm.)
+    """
+    dn, bn = os.path.split(db)
+    assert bn.endswith('.db'), '{} does not end in .db'.format(bn)
+    dbname = bn[:-3] # drop .db
+    cp(bn, dn, dbdir)
+    for suffix in suffixes:
+        bn = '.'+dbname+suffix
+        cp(bn, dn, dbdir)
+def main(prog, subcmd, db, dbdir):
+    cmd2func = {'clean': clean, 'copy': copy}
+    func = cmd2func[subcmd]
+    func(db, dbdir)
+
+if __name__ == "__main__":
+    main(*sys.argv)  # pylint: disable=no-value-for-parameter

+ 138 - 0
FALCON/falcon_kit/mains/dedup_a_tigs.py

@@ -0,0 +1,138 @@
+
+
+
+from falcon_kit.FastaReader import open_fasta_reader
+import argparse
+import sys
+# import falcon_kit.align_dw as align
+import falcon_kit.align_edlib as align
+
+class TooLongError(Exception): pass
+
+def log(msg):
+    sys.stderr.write(msg)
+    sys.stderr.write('\n')
+
+def yield_single_compound(fp_in):
+    """
+    Loads all a_ctg with the same ctg_id and a_id.
+    The header of a_ctg is:
+        p_ctg_id-a_id-sub_id v w total_length total_score num_path_edges delta_len idt cov
+    The first sequence in the returned list is the base a_ctg (the exact sequence
+    which is part of the primary contig (the base of the bubble)).
+    """
+    ret = []
+    prev_id = None
+    for r in fp_in:
+        tig_id, v, w, len_, ovl, ne, delta_l, idt, cov = r.name.split()
+        p_ctg_id, a_id, sub_id = tig_id.split('-')
+        curr_id = (p_ctg_id, a_id)
+        if prev_id != None and prev_id != curr_id:
+            yield ret
+            ret = []
+        prev_id = curr_id
+        ret.append(r)
+    yield ret
+
+def filter_duplicate(compound_a_ctg, max_idt, max_aln_cov, min_len_diff, min_seq_len, ploidy):
+    """
+    Takes a list of a_ctg sequences in a compound unitig (bubble) which need
+    to be deduplicated according to the parameters.
+    The zeroth sequence in the list is the "base" sequence. This sequence is
+    already part of the primary path by definition, and will not be output.
+    """
+
+    ret = []
+
+    # Sanity check.
+    if len(compound_a_ctg) == 0: return ret # pragma: no cover
+
+    ref_seqs = [compound_a_ctg[0]]
+
+    # Zeroth sequence is the base seq.
+    for i in range(1, len(compound_a_ctg)):
+        header = compound_a_ctg[i].name.split()
+        a_ctg_id, v, w, len_, ovl, ne, delta_l, idt, cov = header
+        a_ctg_seq = compound_a_ctg[i].sequence
+
+        # Reset the values
+        delta_l, idt, cov = 0.0, 1.0, 1.0
+        is_duplicate = False
+
+        # Align against the base sequence and all non-filtered alternate branches.
+        loop_to = len(ref_seqs) if ploidy <= 0 else min(ploidy, len(ref_seqs))
+        for j in range(0, loop_to):
+            # Just fetch the components for readability.
+            ref_ctg_id = ref_seqs[j].name.split()[0]
+            ref_seq = ref_seqs[j].sequence
+
+            log('[i = %d, j = %d] Comparing: query "%s" vs ref "%s".' % (i, j, a_ctg_id, ref_ctg_id))
+
+            # Align.
+            delta_l, idt, cov = align.get_aln_results(ref_seq, a_ctg_seq, min_seq_len)
+
+            # Round to 2 decimal places. Needed to reproduce old behaviour.
+            idt = float('%.2f' % (idt))
+            cov = float('%.2f' % (cov))
+
+            log('  Rounded: new_delta_l = %d, new_idt = %.2f, new_cov = %.2f' % (delta_l, idt, cov))
+
+            # Check if this is a duplicate.
+            # The same conditions apply as in the old version.
+            if 100 * idt > max_idt and \
+                    100 * cov > max_aln_cov and \
+                    abs(delta_l) < min_len_diff:
+                is_duplicate = True
+                log('    -> Duplicate!')
+                break
+
+        if not is_duplicate:
+            # This branch is not a duplicate. Add it to references,
+            # so that the following branches can be compared to it afterwards.
+            ref_seqs.append(compound_a_ctg[i])
+
+            # Append the non-duplicates.
+            new_header = ' '.join([a_ctg_id, v, w, len_, ovl, ne, str(delta_l), '%.2f' % (idt), '%.2f' % (cov)])
+            ret.append((compound_a_ctg[i], new_header))
+
+        log('')
+
+    return ret
+
+def run(fp_out, fp_in, max_idt, max_aln_cov, min_len_diff, min_seq_len, ploidy):
+    for compound_a_ctg in yield_single_compound(fp_in):
+        filtered_a_ctg = filter_duplicate(compound_a_ctg, max_idt, max_aln_cov, min_len_diff, min_seq_len, ploidy)
+
+        for a_ctg, new_header in filtered_a_ctg:
+            fp_out.write('>%s\n' % (new_header))
+            fp_out.write(a_ctg.sequence)
+            fp_out.write('\n')
+
+def parse_args(argv):
+    parser = argparse.ArgumentParser(description='Removes a duplicate a-tig iff *all* of the keep-conditions below are violated. Assumes the working directory has the a_ctg_all.fa file, and produces a_ctg.fa',
+                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument('--max-idt', type=int,
+                        help="Keep a-tig if the identity (in %%) to the primary contig is <= max_idt", default=96)
+    parser.add_argument('--max-aln-cov', type=int,
+                        help="Keep a-tig if the alignment coverage (in %%) on the a-tig is <= max_aln_cov", default=97)
+    parser.add_argument('--min-len-diff', type=int,
+                        help="Keep a-tig if the length different > min_len_diff", default=500)
+    parser.add_argument('--min-seq-len', type=int,
+                        help="Branches with length less than this threshold will always be deduplicated.", default=2000)
+    parser.add_argument('--ploidy', type=int,
+                        help="For a diplid genome, 2 branches per SV are expected. This parameter limits the number of pairwise comparison. If <= 0, this threshold is not applied.", default=2)
+    parser.add_argument('--a-ctg-all', type=str,
+                        help="Input set of all associate contigs for deduplication.", default="a_ctg_all.fa")
+
+    args = parser.parse_args(argv[1:])
+    return args
+
+def main(argv=sys.argv):
+    args = parse_args(argv)
+
+    with open_fasta_reader(args.a_ctg_all) as fp_in:
+        run(sys.stdout, fp_in, args.max_idt, args.max_aln_cov, args.min_len_diff, args.min_seq_len, args.ploidy)
+
+if __name__ == "__main__":  # pragma: no cover
+    main(sys.argv)          # pragma: no cover

+ 48 - 0
FALCON/falcon_kit/mains/dedup_a_tp.py

@@ -0,0 +1,48 @@
+
+
+
+from falcon_kit.FastaReader import open_fasta_reader
+import argparse
+import sys
+
+def load_headers(fp_in):
+    """
+    Loads all a_ctg IDs from the a_ctg.fa, which is already deduplicated.
+    """
+    ret = set()
+    for r in fp_in:
+        a_ctg_id = r.name.split()[0]
+        ret.add(a_ctg_id)
+    return ret
+
+def run(fp_out, a_ctg, a_ctg_all_tiling_path):
+    with open_fasta_reader(a_ctg) as fp_in:
+        a_ctg_ids = load_headers(fp_in)
+
+    with open(a_ctg_all_tiling_path, 'r') as fp_in:
+        for line in fp_in:
+            line = line.strip()
+            if len(line) == 0:  # pragma: no cover
+                continue        # pragma: no cover
+            sl = line.split()
+            if sl[0] not in a_ctg_ids:
+                continue
+            fp_out.write('%s\n' % (line))
+
+def parse_args(argv):
+    parser = argparse.ArgumentParser(description='Extracts all tiling paths from a_ctg_all_tiling_paths for which there is a header in a_ctg.fa (which was already deduplicated).',
+                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument('--a-ctg', type=str,
+                        help="Path to the a_ctg.fa file.", default='a_ctg.fa')
+    parser.add_argument('--a-ctg-all-tiling-path', type=str,
+                        help="Path to the a_ctg_all_tiling_path file.", default='a_ctg_all_tiling_path')
+
+    args = parser.parse_args(argv[1:])
+    return args
+
+def main(argv=sys.argv):
+    args = parse_args(argv)
+    run(sys.stdout, **vars(args))
+
+if __name__ == "__main__":  # pragma: no cover
+    main(sys.argv)          # pragma: no cover

+ 249 - 0
FALCON/falcon_kit/mains/fasta2fasta.py

@@ -0,0 +1,249 @@
+"""A pre-processor for DAZZ_DB/fasta2DB.
+
+Since fasta2DB has several constraints
+(a single movie per fasta, limited line-width, actual filenames),
+we write intermediate fasta
+files to disk. To reduce disk I/O, we can also compress them.
+
+Currently, we ignore zmw numbers and instead use a global counter.
+
+Inputs may be compressed, and may be either fasta or fastq.
+(For now, we ignore QVs.)
+"""
+from future.utils import itervalues
+from builtins import object
+from ..util.system import abs_fns
+import argparse
+import glob
+import gzip
+import logging
+import os
+import re
+import sys
+
+log = logging.getLogger()
+
+DNA_BASES = ['A', 'C', 'G', 'T']
+COMPLEMENT = {
+    'A': 'T',
+    'C': 'G',
+    'G': 'C',
+    'T': 'A',
+}
+
+
+def complement(x): return (COMPLEMENT[base] for base in x)
+
+
+zmw_counter = None
+
+
+def WriteSplit(write, seq, split=8000):
+    i = 0
+    while i < len(seq):
+        slice = seq[i:i + split]
+        write(slice)
+        write('\n')
+        i += split
+
+
+def parse_header(header, zmw_counter=None):
+    """
+    >>> parse_header('>mine foo bar', 1)
+    ('mine', 1, 'foo bar', 2)
+    >>> parse_header('>mine/123/5_75 foo bar')
+    ('mine', 123, '5_75 foo bar', None)
+
+    For now, ignore the zmw and instead use a global counter.
+    """
+    if '/' in header:
+        parts = header[1:].split('/')
+    else:
+        parts = header[1:].split(None, 1)
+    movie = parts[0]
+    if zmw_counter is None:
+        zmw = int(parts[1])
+    else:
+        zmw = zmw_counter
+        zmw_counter += 1
+    if len(parts) > 1:
+        extra = parts[-1]
+    else:
+        extra = ''
+    return movie, zmw, extra, zmw_counter
+
+
+re_range = re.compile(r'^(\d+)_(\d+)\s*(.*)$')
+
+
+def process_fasta(ifs, movie2write):
+    header = ifs.readline().strip()
+    if header[0] != '>':
+        raise Exception('{!r} is not a fasta file.'.format(ifs.name))
+    while header:
+        global zmw_counter
+        movie, zmw, extra, zmw_counter = parse_header(header, zmw_counter)
+        write = movie2write[movie]
+        # log.info('header={!r}'.format(header))
+        seq = ''
+        line = ifs.readline().strip()
+        while line and not line.startswith('>'):
+            seq += line.strip()
+            line = ifs.readline().strip()
+        length = len(seq)
+        #log.info('seq:{!r}...({})'.format(seq[:5], length))
+        beg, end = 0, length
+        mo = re_range.search(extra)
+        if mo:
+            beg, end, extra = mo.groups()
+            beg = int(beg)
+            end = int(end)
+            if (end - beg) != length:
+                end = beg + length
+                # Probably never happens tho.
+        if extra:
+            extra = ' ' + extra
+        new_header = '>{movie}/{zmw}/{beg}_{end}{extra}\n'.format(**locals())
+        write(new_header)
+        WriteSplit(write, seq)
+        header = line
+
+
+def process_fastq(ifs, movie2write):
+    header = ifs.readline().strip()
+    if header[0] != '@':
+        raise Exception('{!r} is not a fastq file.'.format(ifs.name))
+    while header:
+        global zmw_counter
+        movie, zmw, extra, zmw_counter = parse_header(header, zmw_counter)
+        write = movie2write[movie]
+        # log.info('header={!r}'.format(header))
+        seq = ifs.readline().strip()
+        header2 = ifs.readline().strip()
+        quals = ifs.readline().strip()
+        length = len(seq)
+        #log.info('seq:{!r}...({})'.format(seq[:5], length))
+        new_header = '>{movie}/{zmw}/0_{length} {extra}\n'.format(**locals())
+        write(new_header)
+        WriteSplit(write, seq)
+        header = ifs.readline().strip()
+
+
+def process_try_both(ifs, movie2write):
+    try:
+        process_fasta(ifs, movie2write)
+    except Exception:
+        log.exception('bad fasta: {!r}; trying as fastq...'.format(ifs.name))
+        process_fastq(ifs, movie2write)
+
+
+def process(ifn, movie2write):
+    root, ext = os.path.splitext(ifn)
+    if ifn.endswith('.gz'):
+        Open = gzip.GzipFile
+        ext = os.path.splitext(root)[1]
+    elif ifn.endswith('.bz2'):
+        import bz2
+        Open = bz2.BZ2File
+        ext = os.path.splitext(root)[1]
+    else:
+        Open = open
+
+    log.info('ext={!r}'.format(ext))
+    if ext in ('.fasta', '.fa'):
+        func = process_fasta
+    elif ext in ('.fastq', '.fq'):
+        func = process_fastq
+    else:
+        func = process_try_both
+
+    with Open(ifn) as ifs:
+        func(ifs, movie2write)
+
+
+class WriterMap(object):
+    def basenames(self):
+        return list(self.__obn2movie.keys())
+
+    def close(self):
+        for ofs in itervalues(self.__movie2ofs):
+            ofs.close()
+
+    def __getitem__(self, movie):
+        """Get or create a 'write' function.
+        """
+        ofs = self.__movie2ofs.get(movie)
+        if ofs is None:
+            obn = self.__basename(movie)
+            self.__obn2movie[obn] = movie
+            if os.path.exists(obn):
+                log.info('Over-writing {!r}'.format(obn))
+            else:
+                log.info('Creating {!r}'.format(obn))
+            ofs = self.__open(obn, mode='w')
+            self.__movie2ofs[movie] = ofs
+        return ofs.write
+
+    def __init__(self, Basename, Open):
+        self.__obn2movie = dict()
+        self.__movie2ofs = dict()
+        self.__basename = Basename
+        self.__open = Open
+
+
+def get_writer(Gzip=False):
+    if Gzip:
+        def Basename(movie): return movie + '.fasta.gz'
+        import functools
+        Open = functools.partial(gzip.GzipFile, compresslevel=1)
+        # A little better, a little slower:
+        #import bz2
+        #Open = bz2.BZ2File
+        #Basename = lambda movie: movie + '.fasta.bz2'
+    else:
+        def Basename(movie): return movie + '.fasta'
+        Open = open
+    movie2write = WriterMap(Basename, Open)
+    return movie2write
+
+
+def fixall(ifns, Gzip=False):
+    """Given an iterator of input absolute filenames (fasta or fastq),
+    return a list of output basenames of resulting .fasta(.gz) files, relative to CWD.
+    """
+    if Gzip:
+        open = gzip.GzipFile
+    movie2write = get_writer(Gzip)
+    for ifn in ifns:
+        process(ifn, movie2write)
+    movie2write.close()
+    return movie2write.basenames()
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--gzip',
+                        action='store_true',
+                        help='Compress intermediate fasta with gzip. (Not currently implemented.)')
+    parser.add_argument('--zmw-start',
+                        type=int,
+                        help='Ignore the zmw number in the fasta header. Instead, use a global counter, starting at this number.')
+    # parser.add_argument('--clean',
+    #    action='store_true',
+    #    help='Remove intermediate fasta when done.')
+    # parser.add_argument('--fofn',
+    #    help='Dump intermediate FOFN. This can be used directly by "fasta2DB foo -ffofn" if fasta are uncompressed.')
+    # parser.add_argument('--fasta2DB',
+    #    help='Pass these arguments along to fasta2DB. These should exclude fasta inputs.')
+    #global ARGS
+    ARGS = parser.parse_args()
+    global zmw_counter
+    zmw_counter = ARGS.zmw_start
+    for obn in fixall(abs_fns(sys.stdin, os.getcwd()), Gzip=ARGS.gzip):
+        sys.stdout.write('{}\n'.format(os.path.abspath(obn)))
+
+
+if __name__ == "__main__":
+    logging.basicConfig()
+    log.setLevel(logging.DEBUG)
+    main()
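A minimal sketch of how the pieces above fit together (this mirrors fixall(); the input paths and movie names are hypothetical):

    movie2write = get_writer(Gzip=False)          # movie name -> write() callable
    for ifn in ['/data/movieA.subreads.fasta', '/data/movieB.subreads.fastq.gz']:
        process(ifn, movie2write)                 # dispatches on extension (fasta/fastq, .gz/.bz2)
    movie2write.close()
    print(movie2write.basenames())                # e.g. ['movieA.fasta', 'movieB.fasta']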

+ 384 - 0
FALCON/falcon_kit/mains/fasta_filter.py

@@ -0,0 +1,384 @@
+import falcon_kit.FastaReader as FastaReader
+
+import os
+import re
+import sys
+import argparse
+import collections
+import itertools
+import logging
+import contextlib
+import json
+
+LOG = logging.getLogger()
+
+ZMWTuple = collections.namedtuple('ZMWTuple', ['movie_name', 'zmw_id', 'subread_start', 'subread_end', 'seq_len', 'subread_record', 'subread_header', 'subread_id'])
+
+re_ccs = re.compile(r'^([^/]*/[^/]*)/ccs\b(.*)$')
+
+def str_name(name, seqlen):
+    """In lieu of FastaReader.__str__() to replace "/ccs".
+    >>> str_name('m/123/0_99 FOO=BAR', 7)
+    'm/123/0_99 FOO=BAR'
+    >>> str_name('m/123/ccs FOO=BAR', 7)
+    'm/123/0_7 FOO=BAR'
+    """
+    if 'ccs' not in name:
+        # Avoid regex when not needed.
+        return name
+    match = re_ccs.search(name)
+    if match:
+        name = re_ccs.sub(r'\1/0_{}\2'.format(seqlen), name, count=1)
+    return name
+
+def write_record(fp, record):
+    fp.write('>{}\n'.format(str_name(record.name, record.length)))
+    fp.write(FastaReader.wrap(record.sequence, FastaReader.FastaRecord.COLUMNS))
+    fp.write('\n')
+
+def check_in_whitelist(whitelist_set, movie_name, zmw_id):
+    """
+    >>> check_in_whitelist([], 'foo', '1')
+    False
+    >>> check_in_whitelist(['bar/1'], 'foo', '1')
+    False
+    >>> check_in_whitelist(['foo/1'], 'foo', '1')
+    True
+    """
+    movie_zmw = '{}/{}'.format(movie_name, zmw_id)
+    return movie_zmw in whitelist_set
+
+def tokenize_header(seq_header):
+    """
+    >>> tokenize_header('foo/123/0_100')
+    ('foo', '123', 0, 100)
+    """
+    try:
+        rid = seq_header.split()[0]
+        movie_name, zmw_id, subread_pos = rid.split('/')
+        subread_start, subread_end = [int(val) for val in subread_pos.split('_')]
+        return movie_name, zmw_id, subread_start, subread_end
+    except Exception as exc:
+        LOG.exception("Trapped exception. Raising new one.")
+        msg = "Error tokenizing FASTA header:\n{!r}".format(seq_header)
+        #raise ValueError(msg) from exc # python3
+        raise ValueError(msg)
+
+def yield_record_and_tokenized_headers(whitelist_set, records):
+    """For each record, yield (record, tokens)
+    but only if whitelisted.
+
+    records: iterable
+    whitelist_set: has __contains__, but empty means use everything.
+    """
+    for record in records:
+        tokens = tokenize_header(record.name)
+        movie_name, zmw_id, subread_start, subread_end = tokens
+        if whitelist_set and not check_in_whitelist(whitelist_set, movie_name, zmw_id):
+            continue
+        yield record, tokens
+
+def yield_record(whitelist_set, records):
+    """Yield each record,
+    but only if whitelisted.
+
+    This is an optimized version of yield_record_and_tokenized_headers(),
+    to avoid tokenizing when we have no whitelist.
+
+    records: iterable
+    whitelist_set: has __contains__, but empty means use everything.
+    """
+    for record in records:
+        if not whitelist_set:
+            # no need to tokenize
+            yield record
+            continue
+        tokens = tokenize_header(record.name)
+        movie_name, zmw_id, subread_start, subread_end = tokens
+        if not check_in_whitelist(whitelist_set, movie_name, zmw_id):
+            continue
+        yield record
+
+def longest_zmw_subread(zmw_subreads):
+    """Return subread_record with longest seq_len.
+    zmw_subreads is a list of ZMWTuple.
+    """
+    assert len(zmw_subreads) != 0
+
+    return max(zmw_subreads, key = lambda x: x.seq_len)
+
+def median_zmw_subread(zmw_subreads):
+    """Return subread_record with median seq_len.
+    zmw_subreads is a list of ZMWTuple.
+    """
+    assert len(zmw_subreads) != 0
+
+    sorted_subreads = sorted(zmw_subreads, key = lambda x: x.seq_len)
+
+    # Not really a median since we round to the floor, and the definition of
+    # median value would be the average of the two middle elements.
+    # However, we need a concrete subread associated with the median value.
+    median_id = len(sorted_subreads) // 2
+
+    return sorted_subreads[median_id]
+
+def internal_median_zmw_subread(zmw_subreads):
+    """Returns a single subread based on the following selection criteria:
+    - If the ZMW has < 3 subreads, the maximum one is output.
+    - If the ZMW has >= 3 subreads, the median one is selected only from the internal
+      ones (ignoring the first and the last subread)
+    This is intended to prevent the impact of very short first and last subread on the
+    median selection, since the polymerase can start/stop in the middle of the insert.
+    zmw_subreads is a list of ZMWTuple.
+    """
+    assert len(zmw_subreads) != 0
+
+    selected_subread = None
+
+    if len(zmw_subreads) < 3:
+        sorted_subreads = sorted(zmw_subreads, key = lambda x: x.seq_len)
+        selected_subread = sorted_subreads[-1]
+    else:
+        sorted_by_pos = sorted(zmw_subreads, key = lambda x: x.subread_start)
+        sorted_subreads = sorted(sorted_by_pos[1:-1], key = lambda x: x.seq_len)
+        median_id = len(sorted_subreads) // 2
+        selected_subread = sorted_subreads[median_id]
+
+    return selected_subread
+
+##############################
+### Streamed-median filter ###
+##############################
+def yield_zmwtuple(records, whitelist_set, store_record):
+    subread_id = 0
+    for (record, tokens) in yield_record_and_tokenized_headers(whitelist_set, records):
+        movie_name, zmw_id, subread_start, subread_end = tokens
+        record_to_store = record if store_record else None
+        zrec = ZMWTuple(movie_name=movie_name, zmw_id=zmw_id,
+                        subread_start=subread_start, subread_end=subread_end,
+                        seq_len=len(record.sequence), subread_record=record_to_store,
+                        subread_header=record.name, subread_id=subread_id)
+        subread_id += 1
+        yield zrec
+
+def write_streamed(fp_out, yield_zmwtuple_func, zmw_filter_func):
+    for zmw_id, zmw_subreads in itertools.groupby(yield_zmwtuple_func(store_record=True), lambda x: x.zmw_id):
+        zrec = zmw_filter_func(list(zmw_subreads))
+        write_record(fp_out, zrec.subread_record)
+
+def run_streamed_median_filter(fp_in, fp_out, whitelist_set, zmw_filter_func=median_zmw_subread):
+    def yield_zmwtuple_func(store_record=True):
+        fasta_records = FastaReader.yield_fasta_record(fp_in, log=LOG.info)
+        return yield_zmwtuple(fasta_records, whitelist_set, store_record)
+    write_streamed(fp_out, yield_zmwtuple_func, zmw_filter_func)
+
+##############################
+### Longest filter.
+##############################
+def run_streamed_longest_filter(fp_in, fp_out, whitelist_set):
+    def yield_zmwtuple_func(store_record=True):
+        fasta_records = FastaReader.yield_fasta_record(fp_in, log=LOG.info)
+        return yield_zmwtuple(fasta_records, whitelist_set, store_record)
+    write_streamed(fp_out, yield_zmwtuple_func, zmw_filter_func=longest_zmw_subread)
+
+##############################
+### Pass filter.           ###
+##############################
+def run_pass_filter(fp_in, fp_out, whitelist_set):
+    for record in yield_record(whitelist_set, FastaReader.yield_fasta_record(fp_in, log=LOG.info)):
+        write_record(fp_out, record)
+
+##################################
+### Double-pass median filter. ###
+##################################
+def write_doublepass_median(fp_out, yield_zmwtuple_func, zmw_filter_func=median_zmw_subread):
+    # Stores all subreads for a ZMW.
+    zmw_dict = collections.defaultdict(list)
+
+    # First pass, collect all ZMW info.
+    for zrec in yield_zmwtuple_func(store_record=False):
+        # Store None instead of the actual record to free the memory after yield.
+        zmw_id = zrec.zmw_id
+        zmw_dict[zmw_id].append(zrec)
+
+    # For each ZMW, keep only one particular subread.
+    selected = collections.defaultdict(int)
+    for zmw_id, zmw_subreads in zmw_dict.items():
+        median_zrec = zmw_filter_func(list(zmw_subreads))
+        selected[zmw_id] = median_zrec.subread_id
+
+    # Second pass, yield selected sequences.
+    # This must be exactly the same order, so we end up with the same computed subread_id for each record.
+    for zrec in yield_zmwtuple_func(store_record=True):
+        zmw_id = zrec.zmw_id
+        subread_id = zrec.subread_id
+        if selected[zmw_id] == subread_id:
+            write_record(fp_out, zrec.subread_record)
+
+def run_median_filter(fp_in, fp_out, whitelist_set, zmw_filter_func=median_zmw_subread):
+    # Needed to jump back for the second pass.
+    # Expect an actual file, not a pipe.
+    try:
+        fp_in_start = fp_in.tell()
+        fp_in.seek(fp_in_start)
+    except Exception as e:
+        msg = 'fileobj.tell()/seek() failed. Cannot rewind, so cannot do multi-pass. {}\n{}'.format(
+            type(fp_in), dir(fp_in))
+        raise AssertionError(msg, e)
+
+    def yield_zmwtuple_func(store_record=True):
+        # Rewind.
+        fp_in.seek(fp_in_start, os.SEEK_SET)
+
+        fasta_records = FastaReader.yield_fasta_record(fp_in, log=LOG.info)
+        return yield_zmwtuple(fasta_records, whitelist_set, store_record)
+
+    write_doublepass_median(fp_out, yield_zmwtuple_func, zmw_filter_func)
+
+###############################
+### Internal median filter. ###
+###############################
+def run_internal_median_filter(fp_in, fp_out, whitelist_set):
+    run_median_filter(fp_in, fp_out, whitelist_set, zmw_filter_func=internal_median_zmw_subread)
+
+#######################################
+### Streamed internal median filter ###
+#######################################
+def run_streamed_internal_median_filter(fp_in, fp_out, whitelist_set):
+    run_streamed_median_filter(fp_in, fp_out, whitelist_set=whitelist_set, zmw_filter_func=internal_median_zmw_subread)
+
+##############################
+### Main and cmds.         ###
+##############################
+@contextlib.contextmanager
+def open_input_stream(input_path):
+    """stdin if '-'
+    """
+    if input_path == '-':
+        yield sys.stdin
+    else:
+        with open(input_path) as stream:
+            yield stream
+
+def load_zmw_whitelist(zmw_whitelist_fn):
+    """Read from json filename, or do nothing if null fn.
+    Return as a set, empty-set if empty.
+    Raise if missing non-null fn.
+    """
+    # ret = set()
+    if not zmw_whitelist_fn:
+        return set()
+    with open(zmw_whitelist_fn, 'r') as fp_in:
+        try:
+            return set(json.loads(fp_in.read()))
+        except ValueError:
+            LOG.error('Failed to parse "{}" as JSON. Assuming empty whitelist.'.format(zmw_whitelist_fn))
+            return set()
+
+def cmd_run_pass_filter(args):
+    whitelist_set = load_zmw_whitelist(args.zmw_whitelist_fn)
+    with open_input_stream(args.input_path) as fp_in:
+        run_pass_filter(fp_in, sys.stdout, whitelist_set)
+
+def cmd_run_streamed_longest_filter(args):
+    whitelist_set = load_zmw_whitelist(args.zmw_whitelist_fn)
+    with open_input_stream(args.input_path) as fp_in:
+        run_streamed_longest_filter(fp_in, sys.stdout, whitelist_set)
+
+def cmd_run_streamed_median_filter(args):
+    whitelist_set = load_zmw_whitelist(args.zmw_whitelist_fn)
+    with open_input_stream(args.input_path) as fp_in:
+        run_streamed_median_filter(fp_in, sys.stdout, whitelist_set)
+
+def cmd_run_median_filter(args):
+    whitelist_set = load_zmw_whitelist(args.zmw_whitelist_fn)
+    # Don't allow '-' for the double-pass median filter.
+    with open(args.input_path, 'r') as fp_in:
+        run_median_filter(fp_in, sys.stdout, whitelist_set)
+
+def cmd_run_internal_median_filter(args):
+    whitelist_set = load_zmw_whitelist(args.zmw_whitelist_fn)
+    with open(args.input_path, 'r') as fp_in:
+        run_internal_median_filter(fp_in, sys.stdout, whitelist_set)
+
+def cmd_run_streamed_internal_median_filter(args):
+    whitelist_set = load_zmw_whitelist(args.zmw_whitelist_fn)
+    with open_input_stream(args.input_path) as fp_in:
+        run_streamed_internal_median_filter(fp_in, sys.stdout, whitelist_set)
+
+class HelpF(argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter):
+    pass
+
+def parse_args(argv):
+    description = 'Filters the input FASTA file according to one of the selected filters.'
+    parser = argparse.ArgumentParser(
+        description=description,
+        formatter_class=HelpF,
+    )
+
+    parser.add_argument(
+        '--zmw-whitelist-fn', default='',
+        help='A JSON file containing a list of "movie_name/zmw" IDs to retain. If the file is an empty list, all ZMWs will be used; otherwise, only the listed ones will be whitelisted. (Applies to all filters.)',
+        required = False,
+    )
+
+    subparsers = parser.add_subparsers(help='sub-command help')
+
+    help_pass = 'The no-op filter - passes every FASTA record to stdout. If input_path is "-", input is read from stdin.'
+    help_streamed_longest = 'Selects the longest read in each ZMW by running a single pass over the data (i.e. "streamed"). The input subreads should be grouped by ZMW. If input_path is "-", input is read from stdin.'
+    help_median = 'Applies the median-length ZMW filter by running two passes over the data. Only one subread per ZMW is output, based on median-length selection. The input_path needs to be a file.'
+    help_streamed_median = 'Applies the median-length ZMW filter by running a single pass over the data. The input subreads should be grouped by ZMW. If input_path is "-", input is read from stdin.'
+    help_internal_median = 'Applies the median-length ZMW filter only on internal subreads (ZMWs with >= 3 subreads) by running two passes over the data. For ZMWs with < 3 subreads, the maximum-length one is selected. The input_path needs to be a file.'
+    help_streamed_internal_median = 'Applies the median-length ZMW filter only on internal subreads (ZMWs with >= 3 subreads) by running a single pass over the data. The input subreads should be grouped by ZMW. For ZMWs with < 3 subreads, the maximum-length one is selected. If input_path is "-", input is read from stdin.'
+
+    parser_pass = subparsers.add_parser('pass',
+            formatter_class=HelpF,
+            description=help_pass,
+            help=help_pass)
+    parser_pass.set_defaults(func=cmd_run_pass_filter)
+
+    parser_streamed_longest = subparsers.add_parser('streamed-longest',
+            formatter_class=HelpF,
+            description=help_streamed_longest,
+            help=help_streamed_longest)
+    parser_streamed_longest.set_defaults(func=cmd_run_streamed_longest_filter)
+
+    parser_median = subparsers.add_parser('median',
+            formatter_class=HelpF,
+            description=help_median,
+            help=help_median)
+    parser_median.set_defaults(func=cmd_run_median_filter)
+
+    parser_streamed_median = subparsers.add_parser('streamed-median',
+            formatter_class=HelpF,
+            description=help_streamed_median,
+            help=help_streamed_median)
+    parser_streamed_median.set_defaults(func=cmd_run_streamed_median_filter)
+
+    parser_internal_median = subparsers.add_parser('internal-median',
+            formatter_class=HelpF,
+            description=help_internal_median,
+            help=help_internal_median)
+    parser_internal_median.set_defaults(func=cmd_run_internal_median_filter)
+
+    parser_streamed_internal_median = subparsers.add_parser('streamed-internal-median',
+            formatter_class=HelpF,
+            description=help_streamed_internal_median,
+            help=help_streamed_internal_median)
+    parser_streamed_internal_median.set_defaults(func=cmd_run_streamed_internal_median_filter)
+
+    parser.add_argument('input_path', help='Input PacBio FASTA file')
+
+    args = parser.parse_args(argv[1:])
+    return args
+
+def main(argv=sys.argv):
+    args = parse_args(argv)
+    logging.basicConfig(level=logging.INFO)
+    LOG.info("====>[{}],\n====>args={}".format(args.func, args))
+    args.func(args)
+
+if __name__ == "__main__":  # pragma: no cover
+    main(sys.argv)          # pragma: no cover
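To see how the three selection functions above differ, here is a sketch on one synthetic ZMW with five subreads (all values invented); the comments show what each selector returns:

    lengths_and_starts = [(100, 0), (2000, 100), (2500, 2100), (3000, 4600), (200, 7600)]
    subs = [ZMWTuple(movie_name='m', zmw_id='7', subread_start=s, subread_end=s + l,
                     seq_len=l, subread_record=None,
                     subread_header='m/7/{}_{}'.format(s, s + l), subread_id=i)
            for i, (l, s) in enumerate(lengths_and_starts)]
    longest_zmw_subread(subs).seq_len           # 3000
    median_zmw_subread(subs).seq_len            # 2000 (middle of 100, 200, 2000, 2500, 3000)
    internal_median_zmw_subread(subs).seq_len   # 2500 (median of the internal 2000, 2500, 3000)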

+ 220 - 0
FALCON/falcon_kit/mains/fasta_subsample.py

@@ -0,0 +1,220 @@
+"""
+Performs a single pass over an input FASTA/FOFN, and collects
+all ZMWs. For each ZMW it calculates the expected molecular size by picking
+the internal median subread length.
+The script outputs a JSON file with a whitelist of ZMWs selected by a given
+strategy (random, longest, etc.) and desired coverage of a genome.
+Author: Ivan Sovic
+"""
+from falcon_kit.mains.fasta_filter import ZMWTuple
+from falcon_kit.util.system import set_random_seed
+
+import falcon_kit.FastaReader as FastaReader
+import falcon_kit.mains.fasta_filter as fasta_filter
+import falcon_kit.io as io
+
+import os
+import sys
+import argparse
+import logging
+import contextlib
+import itertools
+import random
+import json
+import copy
+
+LOG = logging.getLogger()
+
+STRATEGY_RANDOM = 'random'
+STRATEGY_LONGEST = 'longest'
+
+def strategy_func_random(zmws):
+    """
+    >>> random.seed(12345); strategy_func_random([])
+    []
+    >>> random.seed(12345); strategy_func_random([('synthetic/1', 9)])
+    [('synthetic/1', 9)]
+    >>> random.seed(12345); strategy_func_random([('synthetic/1', 9), ('synthetic/2', 21), ('synthetic/3', 9), ('synthetic/4', 15), ('synthetic/5', 20)])
+    [('synthetic/5', 20), ('synthetic/3', 9), ('synthetic/2', 21), ('synthetic/1', 9), ('synthetic/4', 15)]
+    """
+    ret = copy.deepcopy(zmws)
+    random.shuffle(ret)
+    return ret
+
+def strategy_func_longest(zmws):
+    """
+    >>> strategy_func_longest([])
+    []
+    >>> strategy_func_longest([('synthetic/1', 9)])
+    [('synthetic/1', 9)]
+    >>> strategy_func_longest([('synthetic/1', 9), ('synthetic/2', 21), ('synthetic/3', 9), ('synthetic/4', 15), ('synthetic/5', 20)])
+    [('synthetic/2', 21), ('synthetic/5', 20), ('synthetic/4', 15), ('synthetic/1', 9), ('synthetic/3', 9)]
+    """
+    return sorted(zmws, key = lambda x: x[1], reverse = True)
+
+STRATEGY_TYPE_TO_FUNC = {   STRATEGY_RANDOM: strategy_func_random,
+                            STRATEGY_LONGEST: strategy_func_longest
+                        }
+
+def get_strategy_func(strategy_type):
+    """
+    >>> get_strategy_func(STRATEGY_RANDOM) == strategy_func_random
+    True
+    >>> get_strategy_func(STRATEGY_LONGEST) == strategy_func_longest
+    True
+    >>> try:
+    ...     get_strategy_func('nonexistent_strategy')
+    ...     print('False')
+    ... except:
+    ...     print('True')
+    True
+    """
+    assert strategy_type in STRATEGY_TYPE_TO_FUNC, 'Unknown strategy type: "{}"'.format(str(strategy_type))
+    return STRATEGY_TYPE_TO_FUNC[strategy_type]
+
+def select_zmws(zmws, min_requested_bases):
+    """
+    >>> select_zmws([], 0)
+    ([], 0)
+    >>> select_zmws([], 10)
+    ([], 0)
+    >>> select_zmws([('zmw/1', 1), ('zmw/2', 2), ('zmw/3', 5), ('zmw/4', 7), ('zmw/5', 10), ('zmw/6', 15)], 10)
+    (['zmw/1', 'zmw/2', 'zmw/3', 'zmw/4'], 15)
+    >>> select_zmws([('zmw/1', 1), ('zmw/2', 2), ('zmw/3', 5), ('zmw/4', 7), ('zmw/5', 10), ('zmw/6', 15)], 20)
+    (['zmw/1', 'zmw/2', 'zmw/3', 'zmw/4', 'zmw/5'], 25)
+    >>> select_zmws([('zmw/1', 1), ('zmw/1', 2), ('zmw/1', 5), ('zmw/1', 7), ('zmw/1', 10), ('zmw/1', 15)], 20)
+    (['zmw/1', 'zmw/1', 'zmw/1', 'zmw/1', 'zmw/1'], 25)
+    """
+    # Select the first N ZMWs which sum up to the desired coverage.
+    num_bases = 0
+    subsampled_zmws = []
+    for zmw_name, seq_len in zmws:
+        num_bases += seq_len
+        subsampled_zmws.append(zmw_name)
+        if num_bases >= min_requested_bases:
+            break
+    return subsampled_zmws, num_bases
+
+def calc_stats(total_unique_molecular_bases, total_bases, output_bases, genome_size, coverage):
+    """
+    >>> calc_stats(0, 0, 0, 0, 0) == \
+    {'genome_size': 0, 'coverage': 0, 'total_bases': 0, 'total_unique_molecular_bases': 0, \
+    'output_bases': 0, 'unique_molecular_avg_cov': 0.0, 'output_avg_cov': 0.0, 'total_avg_cov': 0.0}
+    True
+    >>> calc_stats(10000, 100000, 2000, 1000, 2) == \
+    {'genome_size': 1000, 'coverage': 2, 'total_bases': 100000, 'total_unique_molecular_bases': 10000, \
+    'output_bases': 2000, 'unique_molecular_avg_cov': 10.0, 'output_avg_cov': 2.0, 'total_avg_cov': 100.0}
+    True
+    """
+    unique_molecular_avg_cov = 0.0 if genome_size == 0 else float(total_unique_molecular_bases) / float(genome_size)
+    total_avg_cov = 0.0 if genome_size == 0 else float(total_bases) / float(genome_size)
+    output_avg_cov = 0.0 if genome_size == 0 else float(output_bases) / float(genome_size)
+    ret = {}
+    ret['genome_size'] = genome_size
+    ret['coverage'] = coverage
+    ret['total_bases'] = total_bases
+    ret['total_unique_molecular_bases'] = total_unique_molecular_bases
+    ret['output_bases'] = output_bases
+    ret['total_avg_cov'] = total_avg_cov
+    ret['unique_molecular_avg_cov'] = unique_molecular_avg_cov
+    ret['output_avg_cov'] = output_avg_cov
+    return ret
+
+def collect_zmws(yield_zmwtuple_func):
+    """
+    >>> collect_zmws([])
+    ([], 0, 0)
+    >>> collect_zmws([\
+        ZMWTuple(movie_name='test' , zmw_id='1', subread_start=0, subread_end=1000, seq_len=1000, subread_record=None, subread_header='test/1/0_1000', subread_id=0), \
+    ])
+    ([('test/1', 1000)], 1000, 1000)
+    >>> collect_zmws([\
+        ZMWTuple(movie_name='test' , zmw_id='1', subread_start=123, subread_end=456, seq_len=1000, subread_record=None, subread_header='test/1/0_1000', subread_id=0), \
+        ZMWTuple(movie_name='test' , zmw_id='1', subread_start=123, subread_end=456, seq_len=2000, subread_record=None, subread_header='test/1/1000_3000', subread_id=0), \
+        ZMWTuple(movie_name='test' , zmw_id='1', subread_start=123, subread_end=456, seq_len=3000, subread_record=None, subread_header='test/1/3000_6000', subread_id=0), \
+    ])
+    ([('test/1', 2000)], 2000, 6000)
+    >>> collect_zmws([\
+        ZMWTuple(movie_name='test' , zmw_id='1', subread_start=123, subread_end=456, seq_len=1000, subread_record=None, subread_header='test/1/0_1000', subread_id=0), \
+        ZMWTuple(movie_name='test' , zmw_id='1', subread_start=123, subread_end=456, seq_len=2000, subread_record=None, subread_header='test/1/1000_3000', subread_id=1), \
+        ZMWTuple(movie_name='test' , zmw_id='1', subread_start=123, subread_end=456, seq_len=3000, subread_record=None, subread_header='test/1/3000_6000', subread_id=2), \
+        ZMWTuple(movie_name='test' , zmw_id='2', subread_start=123, subread_end=456, seq_len=10000, subread_record=None, subread_header='header2', subread_id=3), \
+    ])
+    ([('test/1', 2000), ('test/2', 10000)], 12000, 16000)
+    """
+    zmws = []
+    unique_molecular_size = 0
+    total_size = 0
+    for zmw_id, zmw_subreads in itertools.groupby(yield_zmwtuple_func, lambda x: x.zmw_id):
+        zmw_subreads_list = list(zmw_subreads)
+        zrec = fasta_filter.internal_median_zmw_subread(zmw_subreads_list)
+        movie_zmw = zrec.movie_name + '/' + zrec.zmw_id
+        unique_molecular_size += zrec.seq_len
+        total_size += sum([zmw.seq_len for zmw in zmw_subreads_list])
+        zmws.append((movie_zmw, zrec.seq_len))
+    return zmws, unique_molecular_size, total_size
+
+def yield_record(input_files):
+    for input_fn in input_files:
+        with open(input_fn, 'r') as fp_in:
+            fasta_records = FastaReader.yield_fasta_record(fp_in, log=LOG.info)
+            for record in fasta_records:
+                yield record
+
+def run(yield_zmw_tuple_func, coverage, genome_size, strategy_func):
+    zmws, total_unique_molecular_bases, total_bases = collect_zmws(yield_zmw_tuple_func)
+    zmws = strategy_func(zmws)
+    subsampled_zmws, output_bases = select_zmws(zmws, coverage * genome_size)
+    stats_dict = calc_stats(total_unique_molecular_bases, total_bases, output_bases, genome_size, coverage)
+    return subsampled_zmws, zmws, stats_dict
+
+def parse_args(argv):
+    parser = argparse.ArgumentParser(description="Produces a list of ZMWs whose estimated unique molecular "\
+                                        "lengths add up to the desired coverage of the given genome size.",
+                                        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument('--strategy', type=str, default='random',
+                        help='Subsampling strategy: random, longest')
+    parser.add_argument('--coverage', type=float, default=60,
+                        help='Desired coverage for subsampling.')
+    parser.add_argument('--genome-size', type=float, default=0,
+                        help='Genome size estimate of the input dataset.', required=True)
+    parser.add_argument('--random-seed', type=int, default=12345,
+                        help='Seed value used for the random generator.', required=False)
+    parser.add_argument('input_fn', type=str, default='input.fofn',
+                        help='Input FASTA files or a FOFN. (Streaming is not allowed).')
+    parser.add_argument('out_prefix', type=str, default='input.cov',
+                        help='Prefix of the output files to generate.')
+    args = parser.parse_args(argv[1:])
+    return args
+
+def main(argv=sys.argv):
+    args = parse_args(argv)
+    logging.basicConfig(level=logging.INFO)
+
+    strategy_func = get_strategy_func(args.strategy)
+    LOG.info('Using subsampling strategy: "{strategy}"'.format(strategy=args.strategy))
+
+    set_random_seed(args.random_seed)
+
+    input_files = list(io.yield_abspath_from_fofn(args.input_fn))
+
+    zmws_whitelist, zmws_all, stats_dict  = run(
+            fasta_filter.yield_zmwtuple(yield_record(input_files), None, False), args.coverage, args.genome_size, strategy_func)
+
+    out_zmw_whitelist = args.out_prefix + '.whitelist.json'
+    out_all_zmws = args.out_prefix + '.all.json'
+    out_zmw_stats = args.out_prefix + '.stats.json'
+
+    with open(out_zmw_whitelist, 'w') as fp_out_whitelist, \
+         open(out_all_zmws, 'w') as fp_out_all_zmws, \
+         open(out_zmw_stats, 'w') as fp_out_stats:
+
+        # Write out the whitelist.
+        fp_out_whitelist.write(json.dumps(zmws_whitelist))
+        # Write the entire list of ZMWs and lengths, might be very informative.
+        fp_out_all_zmws.write(json.dumps(zmws_all))
+        # Write out the stats.
+        fp_out_stats.write(json.dumps(stats_dict))
+
+if __name__ == "__main__":  # pragma: no cover
+    main(sys.argv)          # pragma: no cover
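To make the subsampling arithmetic concrete: select_zmws() keeps adding ZMWs until their summed lengths reach coverage * genome_size. A small sketch with invented values, assuming the functions above are in scope:

    # Ask for 2x coverage of a 1000-bp "genome" using the longest-first strategy.
    zmws = strategy_func_longest([('m/1', 900), ('m/2', 700), ('m/3', 1200), ('m/4', 300)])
    picked, bases = select_zmws(zmws, 2 * 1000)
    # picked == ['m/3', 'm/1'], bases == 2100: selection stops once 2000 bases are reached.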

+ 158 - 0
FALCON/falcon_kit/mains/fetch_reads.py

@@ -0,0 +1,158 @@
+
+
+
+
+
+from falcon_kit.FastaReader import open_fasta_reader
+import argparse
+import contextlib
+import os
+import glob
+import sys
+import re
+
+
+def fetch_ref_and_reads(base_dir, fofn, ctg_id, out_dir, min_ctg_lenth):
+    read_fofn = fofn
+    if out_dir is None:
+        out_dir = os.path.join(base_dir, '3-unzip/reads')
+
+    ctg_fa = os.path.join(base_dir, '2-asm-falcon/p_ctg.fa')
+    read_map_dir = os.path.join(base_dir, '2-asm-falcon/read_maps')
+
+    rawread_id_file = os.path.join(
+        read_map_dir, 'dump_rawread_ids', 'rawread_ids')
+    pread_id_file = os.path.join(read_map_dir, 'dump_pread_ids', 'pread_ids')
+
+    rid_to_oid = open(rawread_id_file).read().split(
+        '\n')  # daligner raw read id to the original ids
+    pid_to_fid = open(pread_id_file).read().split(
+        '\n')  # daligner pread id to the fake ids
+    assert rid_to_oid, 'Empty rid_to_oid. Maybe empty {!r}?'.format(
+        rawread_id_file)
+    assert pid_to_fid, 'Empty pid_to_fid. Maybe empty {!r}?'.format(
+        pread_id_file)
+
+    def pid_to_oid(pid):
+        fid = pid_to_fid[int(pid)]
+        rid = int(fid.split('/')[1]) // 10
+        return rid_to_oid[int(rid)]
+
+    with open_fasta_reader(ctg_fa) as ref_fasta:
+        all_ctg_ids = set()
+        for s in ref_fasta:
+            s_id = s.name.split()[0]
+            if ctg_id != 'all' and s_id != ctg_id:
+                continue
+
+            if len(s.sequence) < min_ctg_lenth:
+                continue
+
+            if ctg_id != 'all':
+                ref_out = open(os.path.join(
+                    out_dir, '%s_ref.fa' % ctg_id), 'w')
+            else:
+                ref_out = open(os.path.join(out_dir, '%s_ref.fa' % s_id), 'w')
+
+            print('>%s' % s_id, file=ref_out)
+            print(s.sequence, file=ref_out)
+            all_ctg_ids.add(s_id)
+            ref_out.close()
+
+    read_set = {}
+    ctg_id_hits = {}
+
+    map_fn = os.path.join(read_map_dir, 'rawread_to_contigs')
+    with open(map_fn, 'r') as f:
+        for row in f:
+            row = row.strip().split()
+            hit_ctg = row[1]
+            hit_ctg = hit_ctg.split('-')[0]
+            if int(row[3]) == 0:
+                o_id = rid_to_oid[int(row[0])]
+                read_set[o_id] = hit_ctg
+                ctg_id_hits[hit_ctg] = ctg_id_hits.get(hit_ctg, 0) + 1
+    assert read_set, 'Empty read_set. Maybe empty {!r}?'.format(map_fn)
+    map_fn = os.path.join(read_map_dir, 'pread_to_contigs')
+    with open(map_fn, 'r') as f:
+        for row in f:
+            row = row.strip().split()
+            hit_ctg = row[1]
+            hit_ctg = hit_ctg.split('-')[0]
+            if hit_ctg not in read_set and int(row[3]) == 0:
+                o_id = pid_to_oid(row[0])
+                read_set[o_id] = hit_ctg
+                ctg_id_hits[hit_ctg] = ctg_id_hits.get(hit_ctg, 0) + 1
+
+    with open(os.path.join(out_dir, 'ctg_list'), 'w') as f:
+        for ctg_id in sorted(list(all_ctg_ids)):
+            if ctg_id_hits.get(ctg_id, 0) < 5:
+                continue
+            # ignore small circle contigs, they need different approach
+            if ctg_id[-1] not in ['F', 'R']:
+                continue
+            print(ctg_id, file=f)
+
+    read_out_files = {}
+
+    @contextlib.contextmanager
+    def reopened_fasta_out(ctg_id):
+        # A convenient closure, with a contextmanager.
+        if ctg_id not in read_out_files:
+            read_out = open(os.path.join(out_dir, '%s_reads.fa' % ctg_id), 'w')
+            read_out_files[ctg_id] = 1
+        else:
+            read_out = open(os.path.join(out_dir, '%s_reads.fa' % ctg_id), 'a')
+        yield read_out
+        read_out.close()
+
+    with open(read_fofn, 'r') as f:
+        for r_fn in f:
+            r_fn = r_fn.strip()
+            # will soon handle .dexta too
+            with open_fasta_reader(r_fn) as read_fa_file:
+                for r in read_fa_file:
+                    rid = r.name.split()[0]
+                    if rid not in read_set:
+                        ctg_id = 'unassigned'
+                    else:
+                        ctg_id = read_set[rid]
+
+                    if ctg_id == 'NA' or ctg_id not in all_ctg_ids:
+                        ctg_id = 'unassigned'
+
+                    with reopened_fasta_out(ctg_id) as read_out:
+                        print('>' + rid, file=read_out)
+                        print(r.sequence, file=read_out)
+
+
+def parse_args(argv):
+    parser = argparse.ArgumentParser(
+        description='Use the read-to-contig mapping data to partition the reads, grouped by contig.')
+    parser.add_argument('--base_dir', type=str, default='./',
+                        help='the base working dir of a falcon assembly')
+    parser.add_argument('--fofn', type=str, default='./input.fofn',
+                        help='path to the file of the list of raw read fasta files')
+    parser.add_argument('--ctg_id', type=str, default='all',
+                        help='contig identifier in the contig fasta file')
+    parser.add_argument('--out_dir', default=None, type=str,
+                        help='the output base_dir, default to `base_dir/3-unzip/reads` directory')
+    parser.add_argument('--min_ctg_lenth', default=20000, type=int,
+                        help='the minimum length of the contig for the outputs, default=20000')
+    #parser.add_argument('--ctg_fa', type=str, default='./2-asm-falcon/p_ctg.fa', help='path to the contig fasta file')
+    #parser.add_argument('--read_map_dir', type=str, default='./2-asm-falcon/read_maps', help='path to the read-contig map directory')
+    # we can run this in parallel mode in the future
+    # parser.add_argument('--n_core', type=int, default=4,
+    #                    help='number of processes used for generating consensus')
+
+    args = parser.parse_args(argv[1:])
+    return args
+
+
+def main(argv=sys.argv):
+    args = parse_args(argv)
+    fetch_ref_and_reads(**vars(args))
+
+
+if __name__ == '__main__':
+    main()
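The pid_to_oid() closure above relies on the fake pread naming convention in which the second field of the fake id, divided by 10, indexes the raw-read id table. A standalone sketch with invented tables (the 'falcon/...' ids and read names are placeholders):

    rid_to_oid = ['orig/read0', 'orig/read1', 'orig/read2']            # hypothetical originals
    pid_to_fid = ['falcon/000000000/0_500', 'falcon/000000020/0_800']  # hypothetical fake ids

    def pid_to_oid(pid):
        fid = pid_to_fid[int(pid)]
        rid = int(fid.split('/')[1]) // 10
        return rid_to_oid[int(rid)]

    print(pid_to_oid(1))   # 'orig/read2' (fake id 000000020 // 10 -> raw read 2)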

+ 29 - 0
FALCON/falcon_kit/mains/gen_gfa_v1.py

@@ -0,0 +1,29 @@
+import argparse
+import os
+import sys
+
+from falcon_kit.fc_asm_graph import AsmGraph
+from falcon_kit.FastaReader import FastaReader
+from falcon_kit.gfa_graph import *
+
+def run(fp_out, collected_gfa):
+    with open(collected_gfa, 'r') as fp_in:
+        gfa_graph = deserialize_gfa(fp_in)
+
+    gfa_graph.write_gfa_v1(fp_out)
+
+def parse_args(argv):
+    parser = argparse.ArgumentParser(description="Generates GFA output (on stdout) from FALCON's assembly.",
+                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument('collected_gfa', type=str, default='asm.gfa.json',
+                        help='Path to the file with collected and formatted data for generating the GFA')
+    args = parser.parse_args(argv[1:])
+    return args
+
+def main(argv=sys.argv):
+    args = parse_args(argv)
+
+    run(sys.stdout, **vars(args))
+
+if __name__ == '__main__':  # pragma: no cover
+    main()

+ 29 - 0
FALCON/falcon_kit/mains/gen_gfa_v2.py

@@ -0,0 +1,29 @@
+import argparse
+import os
+import sys
+
+from falcon_kit.fc_asm_graph import AsmGraph
+from falcon_kit.FastaReader import FastaReader
+from falcon_kit.gfa_graph import *
+
+def run(fp_out, collected_gfa):
+    with open(collected_gfa, 'r') as fp_in:
+        gfa_graph = deserialize_gfa(fp_in)
+
+    gfa_graph.write_gfa_v2(fp_out)
+
+def parse_args(argv):
+    parser = argparse.ArgumentParser(description="Generates GFA output (on stdout) from FALCON's assembly.",
+                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument('collected_gfa', type=str, default='asm.gfa.json',
+                        help='Path to the file with collected and formatted data for generating the GFA')
+    args = parser.parse_args(argv[1:])
+    return args
+
+def main(argv=sys.argv):
+    args = parse_args(argv)
+
+    run(sys.stdout, **vars(args))
+
+if __name__ == '__main__':  # pragma: no cover
+    main()
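gen_gfa_v1.py and gen_gfa_v2.py differ only in the output flavor: both deserialize the collected JSON and print GFA to stdout. A usage sketch (the filename matches the parser default and is only a placeholder):

    from falcon_kit.mains import gen_gfa_v1, gen_gfa_v2
    gen_gfa_v1.main(['gen_gfa_v1', 'asm.gfa.json'])   # GFA v1 on stdout
    gen_gfa_v2.main(['gen_gfa_v2', 'asm.gfa.json'])   # GFA v2 on stdout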

+ 97 - 0
FALCON/falcon_kit/mains/generate_read_to_ctg_map.py

@@ -0,0 +1,97 @@
+
+
+
+
+
+import argparse
+import logging
+import sys
+from ..util import io
+from ..fc_asm_graph import AsmGraph
+
+def run(rawread_id_fn, pread_id_fn, sg_edges_list_fn, utg_data_fn, ctg_paths_fn, output_fn):
+    read_to_contig_map = output_fn
+    pread_did_to_rid = open(pread_id_fn).read().split('\n')
+    rid_to_oid = open(rawread_id_fn).read().split('\n')
+
+    asm_G = AsmGraph(sg_edges_list_fn,
+                     utg_data_fn,
+                     ctg_paths_fn)
+
+    pread_to_contigs = {}
+
+    with open(read_to_contig_map, 'w') as f:
+        for ctg in asm_G.ctg_data:
+            if ctg[-1] == 'R':
+                continue
+            ctg_g = asm_G.get_sg_for_ctg(ctg)
+            for n in ctg_g.nodes():
+                pid = int(n.split(':')[0])
+
+                rid = pread_did_to_rid[pid].split('/')[1]
+                rid = int(int(rid) // 10)
+                oid = rid_to_oid[rid]
+                k = (pid, rid, oid)
+                pread_to_contigs.setdefault(k, set())
+                pread_to_contigs[k].add(ctg)
+
+        for k in pread_to_contigs:
+            pid, rid, oid = k
+            for ctg in list(pread_to_contigs[k]):
+                print('%09d %09d %s %s' % (pid, rid, oid, ctg), file=f)
+
+
+class HelpF(argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter):
+    pass
+
+
+def parse_args(argv):
+    description = 'Generate read_to_ctg_map from rawread_id file and pread_id file'
+    epilog = ''
+    parser = argparse.ArgumentParser(
+        description=description,
+        epilog=epilog,
+        formatter_class=HelpF,
+    )
+    parser.add_argument(
+        '--rawread-id-fn',
+        required=True,
+        help='From TASK_DUMP_RAWREAD_IDS_SCRIPT',
+    )
+    parser.add_argument(
+        '--pread-id-fn',
+        required=True,
+        help='From TASK_DUMP_PREAD_IDS_SCRIPT',
+    )
+    parser.add_argument(
+        '--sg-edges-list-fn',
+        required=True,
+        help='From Falcon stage 2-asm-falcon',
+    )
+    parser.add_argument(
+        '--utg-data-fn',
+        required=True,
+        help='From Falcon stage 2-asm-falcon',
+    )
+    parser.add_argument(
+        '--ctg-paths-fn',
+        required=True,
+        help='From Falcon stage 2-asm-falcon',
+    )
+    parser.add_argument(
+        '--output-fn',
+        required=True,
+        help='read-to-ctg-map',
+    )
+    args = parser.parse_args(argv[1:])
+    return args
+
+
+def main(argv=sys.argv):
+    args = parse_args(argv)
+    logging.basicConfig(level=logging.INFO)
+    run(**vars(args))
+
+
+if __name__ == '__main__':  # pragma: no cover
+    main()
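The output written by run() above is a flat four-column text map produced by the '%09d %09d %s %s' format; a hypothetical line (ids and names invented):

    print('%09d %09d %s %s' % (12, 3, 'm54026/3/0_11000', '000000F'))
    # -> 000000012 000000003 m54026/3/0_11000 000000F
    #    (pread id, raw-read id, original read name, contig)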

+ 68 - 0
FALCON/falcon_kit/mains/generic_gather.py

@@ -0,0 +1,68 @@
+
+
+import argparse
+import logging
+import os
+import sys
+from .. import io
+
+LOG = logging.getLogger()
+
+
+def run(gathered_fn, scattered_fn):
+    thatdir = os.path.dirname(scattered_fn)
+    thisdir = os.path.dirname(gathered_fn)
+    scattered = io.deserialize(scattered_fn)
+    gathered = dict()
+    for job in scattered:
+        job_output = dict()
+        #job_output['wildcards'] = dict()
+        fn_dict = dict(job['output'])
+        for key in list(fn_dict.keys()):
+            # Fix path to be relative to gathered_fn.
+            fn = fn_dict[key]
+            if not os.path.isabs(fn):
+                thatfn = os.path.join(thatdir, fn)
+            else:
+                thatfn = fn
+            thisfn = os.path.relpath(thatfn, thisdir)
+            fn_dict[key] = thisfn
+        job_output['fns'] = fn_dict
+        wildcards = job['wildcards']
+        wildkey = ','.join('{}={}'.format(k,v) for k,v in sorted(wildcards.items()))
+        gathered[wildkey] = job_output
+    io.serialize(gathered_fn, gathered)
+
+
+class HelpF(argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter):
+    pass
+
+
+def parse_args(argv):
+    description = 'Gather the outputs of scattered jobs into a single serialized dict, keyed by wildcards.'
+    epilog = 'The scattered file is expected to be a list of job records, each with "wildcards" and "output" dicts.'
+    parser = argparse.ArgumentParser(
+        description=description,
+        epilog=epilog,
+        formatter_class=HelpF,
+    )
+    parser.add_argument(
+        '--scattered-fn',
+        help='Input: result of scattering',
+    )
+    parser.add_argument(
+        '--gathered-fn',
+        help='Output: serialized something-or-other',
+    )
+    args = parser.parse_args(argv[1:])
+    return args
+
+
+def main(argv=sys.argv):
+    args = parse_args(argv)
+    logging.basicConfig(level=logging.INFO)
+    run(**vars(args))
+
+
+if __name__ == '__main__':  # pragma: no cover
+    main()
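A sketch of the path rebasing and wildcard keying that run() performs, applied to one in-memory job record (directories and key names are hypothetical):

    import os
    scattered_fn = 'scatter/scattered.json'   # hypothetical locations
    gathered_fn = 'gather/gathered.json'
    job = {'wildcards': {'chunk_id': '000'}, 'output': {'las_fn': 'uow-000/out.las'}}
    thatdir, thisdir = os.path.dirname(scattered_fn), os.path.dirname(gathered_fn)
    thatfn = os.path.join(thatdir, job['output']['las_fn'])
    print(os.path.relpath(thatfn, thisdir))   # ../scatter/uow-000/out.las
    print(','.join('{}={}'.format(k, v) for k, v in sorted(job['wildcards'].items())))
    # chunk_id=000  <- the key under which {'fns': {...}} is stored in the gathered dict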

+ 117 - 0
FALCON/falcon_kit/mains/generic_run_units_of_work.py

@@ -0,0 +1,117 @@
+
+
+import argparse
+import collections
+import glob
+import logging
+import os
+import sys
+import pypeflow.do_task
+from .. import io
+
+LOG = logging.getLogger()
+
+
+# Here is some stuff basically copied from pypeflow.sample_tasks.py.
+def validate(bash_template, inputs, outputs, parameterss):
+    LOG.info('bash_script_from_template({}\n\tinputs={!r},\n\toutputs={!r})'.format(
+        bash_template, inputs, outputs))
+    def validate_dict(mydict):
+        "Python identifiers are illegal as keys."
+        try:
+            collections.namedtuple('validate', list(mydict.keys()))
+        except ValueError as exc:
+            LOG.exception('Bad key name in task definition dict {!r}'.format(mydict))
+            raise
+    validate_dict(inputs)
+    validate_dict(outputs)
+    validate_dict(parameterss)
+
+def update_values_rel_to(things, dn):
+    for key, val in list(things.items()):
+        try:
+            if not os.path.isabs(val):
+                things[key] = os.path.normpath(os.path.join(dn, val))
+        except Exception:
+            # Probably not a path-like string (e.g. an int); leave the value as-is.
+            pass
+
+def run(bash_template_fn, units_of_work_fn, nproc,
+        results_fn):
+    units_of_work_fn = os.path.realpath(units_of_work_fn)
+    uows = io.deserialize(units_of_work_fn)
+    uow_dirs = list()
+    results = list()
+    for i, uow in enumerate(uows):
+        uow_dir = 'uow-{:02d}'.format(i)
+        rel_units_of_work_dn = os.path.normpath(os.path.relpath(os.path.dirname(units_of_work_fn), uow_dir))
+        job = uow
+        inputs = job['input']
+        update_values_rel_to(inputs, rel_units_of_work_dn)
+        outputs = job['output'] # assumed to be relative to run-dir
+        params = dict(job['params'])
+        params['pypeflow_nproc'] = nproc
+        # We could also verify that any nproc from a splitter (which was a hint for splitting)
+        # matches pypeflow_nproc.
+
+        #params.update({k: v for (k, v) in viewitems(job['wildcards'])}) # include expanded wildcards
+        LOG.info('INPUT:{}'.format(inputs))
+        LOG.info('OUTPUT:{}'.format(outputs))
+        LOG.info('PARAMS:{}'.format(params))
+        uow_dirs.append(uow_dir)
+        io.rmdir(uow_dir)
+        io.mkdirs(uow_dir)
+        script = open(bash_template_fn).read()
+        with io.cd(uow_dir):
+            pypeflow.do_task.run_bash(script, inputs, outputs, params)
+            resolved_outputs = {k: os.path.abspath(v) for k,v in list(outputs.items())}
+        results.append({k: os.path.join('.', os.path.relpath(v)) for k,v in list(resolved_outputs.items())})
+        # Must be relative to this dir.
+        # (We assume outputs are under the current directory.)
+        # The reason for the './' prefix? So we can substitute in CWD later,
+        # in case we ran in /tmp. This also helps the pbsmrtpipe "gatherer".
+
+        #wildcards_str = '_'.join(w for w in itervalues(job['wildcards']))
+        #job_name = 'job{}'.format(wildcards_str)
+        #for (output_name, output_fn) in viewitems(outputs):
+        #    giname = '{}_{}'.format(job_name, output_name)
+        #    gather_inputs[giname] = output_fn
+    io.serialize(results_fn, results)
+
+
+class HelpF(argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter):
+    pass
+
+
+def parse_args(argv):
+    description = 'Run a bash script once for each unit-of-work, in its own sub-dir.'
+    epilog = 'For now, runs will be in series, since we do not know how many processors we can use.'
+    parser = argparse.ArgumentParser(
+        description=description,
+        epilog=epilog,
+        formatter_class=HelpF,
+    )
+    parser.add_argument(
+        '--nproc',
+        help='Number of processors to be used.')
+    parser.add_argument(
+        '--bash-template-fn',
+        help='Input. Template of bash script to run on each unit-of-work, with snakemake-style substitutions.')
+    parser.add_argument(
+        '--units-of-work-fn',
+        help='Input. JSON list of records of unit-of-work. Each record is a dict of input, output, and params (snakemake-style).')
+    parser.add_argument(
+        '--results-fn',
+        help='Output. JSON list of result records.')
+    args = parser.parse_args(argv[1:])
+    return args
+
+
+def main(argv=sys.argv):
+    args = parse_args(argv)
+    logging.basicConfig(level=logging.INFO)
+    run(**vars(args))
+
+
+if __name__ == '__main__':  # pragma: no cover
+    main()
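For reference, the file passed as --units-of-work-fn is a JSON list of snakemake-style records with 'input', 'output', and 'params' dicts; the inner key names in this sketch are purely illustrative:

    uows = [
        {'input':  {'db': '../build/raw_reads.db'},
         'output': {'las': 'uow.las'},
         'params': {'pread_aln': 0}},
    ]
    # run() executes the bash template once per record, each in its own uow-NN/ subdir,
    # and writes the per-record output paths (relative to the run dir) into --results-fn.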

+ 89 - 0
FALCON/falcon_kit/mains/generic_scatter_one_uow.py

@@ -0,0 +1,89 @@
+"""
+This must not run in a tmpdir. The 'inputs' paths will
+end up relative to the rundir.
+"""
+
+
+import argparse
+import collections
+import glob
+import logging
+import os
+import sys
+from .. import io
+
+LOG = logging.getLogger()
+
+
+# Here is some stuff basically copied from pypeflow.sample_tasks.py.
+def validate(bash_template, inputs, outputs, parameterss):
+    LOG.info('bash_script_from_template({}\n\tinputs={!r},\n\toutputs={!r})'.format(
+        bash_template, inputs, outputs))
+    def validate_dict(mydict):
+        "Python identifiers are illegal as keys."
+        try:
+            collections.namedtuple('validate', list(mydict.keys()))
+        except ValueError as exc:
+            LOG.exception('Bad key name in task definition dict {!r}'.format(mydict))
+            raise
+    validate_dict(inputs)
+    validate_dict(outputs)
+    validate_dict(parameterss)
+
+
+def run(all_uow_list_fn, split_idx, one_uow_list_fn):
+    all_uows = io.deserialize(all_uow_list_fn)
+    all_dn = os.path.abspath(os.path.dirname(all_uow_list_fn))
+    one_dn = os.path.abspath(os.path.dirname(one_uow_list_fn))
+    rel_dn = os.path.relpath(all_dn, one_dn)
+    one_uow = all_uows[split_idx]
+
+    def fixpath(rel):
+        try:
+            if not os.path.isabs(rel):
+                return os.path.join('.', os.path.normpath(os.path.join(rel_dn, rel)))
+        except Exception:
+            # in case of non-string?
+            pass
+        return rel
+    if isinstance(one_uow, dict):
+        input_dict = one_uow['input']
+        for k, v in list(input_dict.items()):
+            input_dict[k] = fixpath(v)
+
+    io.serialize(one_uow_list_fn, [one_uow])
+
+
+class HelpF(argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter):
+    pass
+
+
+def parse_args(argv):
+    description = 'Scatter a single unit-of-work from many units-of-work.'
+    epilog = ''
+    parser = argparse.ArgumentParser(
+        description=description,
+        epilog=epilog,
+        formatter_class=HelpF,
+    )
+    parser.add_argument(
+        '--all-uow-list-fn',
+        help='Input. JSON list of all units of work.')
+    parser.add_argument(
+        '--split-idx', type=int,
+        help='Input. Index into the all-uow-list for our single unit-of-work.')
+    parser.add_argument(
+        '--one-uow-list-fn',
+        help='Output. JSON list of a single unit-of-work.')
+    args = parser.parse_args(argv[1:])
+    return args
+
+
+def main(argv=sys.argv):
+    args = parse_args(argv)
+    logging.basicConfig(level=logging.INFO)
+    run(**vars(args))
+
+
+if __name__ == '__main__':  # pragma: no cover
+    main()

+ 82 - 0
FALCON/falcon_kit/mains/generic_scatter_uows.py

@@ -0,0 +1,82 @@
+
+
+import argparse
+import collections
+import glob
+import logging
+import os
+import sys
+from .. import io
+
+LOG = logging.getLogger()
+
+
+def yield_uows(n, all_uows):
+    # Note: 'n' is currently unused; every unit-of-work becomes its own chunk.
+    for uow in all_uows:
+        yield [uow]
+
+
+def run(all_uow_list_fn, pattern, nchunks_max):
+    all_uows = io.deserialize(all_uow_list_fn)
+    n = min(nchunks_max, len(all_uows))
+    LOG.info('Num chunks = {} (<= {})'.format(n, nchunks_max))
+    all_dn = os.path.abspath(os.path.dirname(all_uow_list_fn))
+
+    for i, uows in enumerate(yield_uows(n, all_uows)):
+        key = '{:02d}'.format(i)
+        fn = pattern.replace('%', key)
+        LOG.info('Writing {} units-of-work to "{}" ({}).'.format(len(uows), fn, key))
+
+        one_dn = os.path.abspath(os.path.dirname(fn))
+        rel_dn = os.path.relpath(all_dn, one_dn)
+        def fixpath(rel):
+            try:
+                if not os.path.isabs(rel):
+                    return os.path.join('.', os.path.normpath(os.path.join(rel_dn, rel)))
+            except Exception:
+                # in case of non-string?
+                pass
+            return rel
+        for one_uow in uows:
+            if isinstance(one_uow, dict):
+                input_dict = one_uow['input']
+                for k, v in list(input_dict.items()):
+                    input_dict[k] = fixpath(v)
+
+        io.serialize(fn, uows)
+
+
+class HelpF(argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter):
+    pass
+
+
+def parse_args(argv):
+    description = 'Scatter a single unit-of-work from many units-of-work.'
+    epilog = ''
+    parser = argparse.ArgumentParser(
+        description=description,
+        epilog=epilog,
+        formatter_class=HelpF,
+    )
+    parser.add_argument(
+        '--all-uow-list-fn',
+        help='Input. JSON list of all units of work.')
+    parser.add_argument(
+        '--nchunks-max', type=int,
+        help='Input. Maximum number of output files.')
+    parser.add_argument(
+        '--pattern',
+        help='Output. The "%" will be replaced by a zero-padded number. (Probably should be ".json")')
+    args = parser.parse_args(argv[1:])
+    return args
+
+
+def main(argv=sys.argv):
+    args = parse_args(argv)
+    logging.basicConfig(level=logging.INFO)
+    run(**vars(args))
+
+
+if __name__ == '__main__':  # pragma: no cover
+    main()

+ 101 - 0
FALCON/falcon_kit/mains/generic_scatter_uows_tar.py

@@ -0,0 +1,101 @@
+
+
+import argparse
+import collections
+import glob
+import logging
+import os
+import sys
+from .. import io
+
+LOG = logging.getLogger()
+
+
+def yield_uows(n, all_uows):
+    """
+    >>> list(yield_uows(2, [0,1,2,3,4]))
+    [[0, 1, 2], [3, 4]]
+    """
+    # yield exactly n sublists.
+    total = len(all_uows)
+    remaining = total
+    it = iter(all_uows)
+    while n and remaining:
+        taken = (remaining + n - 1) // n
+        to_yield = [next(it) for _ in range(taken)]
+        yield to_yield
+        remaining -= taken
+        n -= 1
+
+
+def move_into_tar(dn, fns):
+    # Create directory 'dn'.
+    # Move files (or dir-trees) into directory 'dn', and tar it.
+    # By convention, for tar-file "foo.tar", we first move everything into a directory named "foo".
+    io.mkdirs(dn)
+    for fn in fns:
+        cmd = 'mv {} {}'.format(fn, dn)
+        io.syscall(cmd)
+    tar_fn = '{}.tar'.format(dn)
+    #with tarfile.TarFile(tar_fn, 'w', dereference=False, ignore_zeros=True, errorlevel=2) as tf:
+    #    tf.add(dn)
+    cmd = 'tar cvf {} {}'.format(tar_fn, dn)
+    io.syscall(cmd)
+    io.rmdirs(dn)
+
+
+def dir_from_tar(tar_fn):
+    return os.path.splitext(os.path.basename(tar_fn))[0]
+
+
+def run(all_uows_tar_fn, pattern, nchunks_max):
+    cmd = 'tar -xvf {}'.format(all_uows_tar_fn)
+    io.syscall(cmd)
+    all_uows_dn = dir_from_tar(all_uows_tar_fn)
+    all_uows = list(sorted(glob.glob('{}/uow-*'.format(all_uows_dn))))
+    n = min(nchunks_max, len(all_uows))
+    LOG.info('Num chunks = {} (<= {})'.format(n, nchunks_max))
+
+    for i, uows in enumerate(yield_uows(n, all_uows)):
+        key = '{:02d}'.format(i)
+        fn = pattern.replace('%', key)
+        LOG.info('Writing {} units-of-work to "{}" ({}).'.format(len(uows), fn, key))
+        dn = dir_from_tar(fn)
+        move_into_tar(dn, uows)
+    cmd = 'rmdir {}'.format(all_uows_dn)
+    io.syscall(cmd)
+
+
+class HelpF(argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter):
+    pass
+
+
+def parse_args(argv):
+    description = 'Scatter a single unit-of-work from many units-of-work.'
+    epilog = ''
+    parser = argparse.ArgumentParser(
+        description=description,
+        epilog=epilog,
+        formatter_class=HelpF,
+    )
+    parser.add_argument(
+        '--all-uows-tar-fn',
+        help='Input. Tarfile of all units of work directories.')
+    parser.add_argument(
+        '--nchunks-max', type=int,
+        help='Input. Maximum number of output files.')
+    parser.add_argument(
+        '--pattern',
+        help='Output. The "%" will be replaced by a zero-padded number. (Probably should be ".tar")')
+    args = parser.parse_args(argv[1:])
+    return args
+
+
+def main(argv=sys.argv):
+    args = parse_args(argv)
+    logging.basicConfig(level=logging.INFO)
+    run(**vars(args))
+
+
+if __name__ == '__main__':  # pragma: no cover
+    main()
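One more worked example of the chunking in yield_uows() above, showing how an uneven remainder is spread (each chunk takes the ceiling of remaining/chunks-left):

    list(yield_uows(3, list(range(7))))
    # -> [[0, 1, 2], [3, 4], [5, 6]]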

+ 74 - 0
FALCON/falcon_kit/mains/generic_tar_uows.py

@@ -0,0 +1,74 @@
+
+
+import argparse
+import collections
+import glob
+import logging
+import os
+import sys
+import pypeflow.do_task
+from .. import io
+
+LOG = logging.getLogger()
+
+def tar_uows(fn, uows):
+    # Operate in a subdir. (Named, so not thread-safe.)
+    subdir = os.path.splitext(fn)[0]
+    io.mkdirs(subdir) # permissions?
+    with io.cd(subdir):
+        # We could include other files here, or at least symlinks, but not today.
+        # Soon, we will construct the uow-subdirs here, but we must consider clobbering.
+        io.serialize('some-units-of-work.json', uows)
+    cmd = 'tar -cf {} {}'.format(fn, subdir)
+    io.syscall(cmd)
+    io.rmdirs(subdir)
+
+def yield_uows(n, all_uows):
+    # Note: 'n' is currently unused; every unit-of-work becomes its own chunk.
+    for uow in all_uows:
+        yield [uow]
+
+def run(all_uow_list_fn, pattern, nchunks_max):
+    all_uows = io.deserialize(all_uow_list_fn)
+    n = min(nchunks_max, len(all_uows))
+    LOG.info('Num chunks = {} (<= {})'.format(n, nchunks_max))
+    for i, uows in enumerate(yield_uows(n, all_uows)):
+        key = '{:02d}'.format(i)
+        fn = pattern.replace('%', key)
+        LOG.info('Writing {} units-of-work to "{}" ({}).'.format(len(uows), fn, key))
+        tar_uows(fn, uows)
+
+
+class HelpF(argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter):
+    pass
+
+
+def parse_args(argv):
+    description = 'Split a JSON list of units-of-work into up to N files ("chunks"), still as lists of units-of-work.'
+    epilog = ''
+    parser = argparse.ArgumentParser(
+        description=description,
+        epilog=epilog,
+        formatter_class=HelpF,
+    )
+    parser.add_argument(
+        '--all-uow-list-fn',
+        help='Input. JSON list of units of work.')
+    parser.add_argument(
+        '--nchunks_max', type=int,
+        help='Input. Maximum number of output files.')
+    parser.add_argument(
+        '--pattern',
+        help='Output. The "%" will be replace by a zero-padded number. (These will be a tar-files, so it should probably end in ".tar".')
+    args = parser.parse_args(argv[1:])
+    return args
+
+
+def main(argv=sys.argv):
+    args = parse_args(argv)
+    logging.basicConfig(level=logging.INFO)
+    run(**vars(args))
+
+
+if __name__ == '__main__':  # pragma: no cover
+    main()

+ 69 - 0
FALCON/falcon_kit/mains/generic_unsplit.py

@@ -0,0 +1,69 @@
+
+
+import argparse
+import logging
+import os
+import sys
+from .. import io
+
+LOG = logging.getLogger()
+
+
+def run(result_fn_list_fn, gathered_fn):
+    thatdir = os.path.dirname(result_fn_list_fn)
+    thisdir = os.path.dirname(gathered_fn)
+    result_fn_list = io.deserialize(result_fn_list_fn)
+    io.serialize(gathered_fn, result_fn_list)
+    gathered_dn = os.path.dirname(gathered_fn)
+    gathered = list()
+    for result_fn in result_fn_list:
+        some_results = io.deserialize(result_fn)
+        d = os.path.abspath(os.path.dirname(result_fn))
+        def abspath(v):
+            if v.startswith('.'):
+                return os.path.normpath(os.path.relpath(os.path.join(d, v), gathered_dn))
+            else:
+                return v # apparently not a path
+        # By construction, this is a list of dicts of k:output,
+        # where outputs are relative to the location of result_fn.
+        some_abs_results = list()
+        for one in some_results:
+            for v in one.values():
+                assert not v.startswith('/'), '{!r} was expected to be relative'.format(v)
+            abs_one = {k: abspath(v) for k,v in list(one.items())}
+            some_abs_results.append(abs_one)
+        gathered.extend(some_abs_results)
+    io.serialize(gathered_fn, gathered)
+
+
+class HelpF(argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter):
+    pass
+
+
+def parse_args(argv):
+    description = 'Gather the contents of the result-lists into a single gathered-list.'
+    epilog = 'The result-list is known already, so it is really a pseudo output. Its filenames point to the actual, unknown results.'
+    # Question: Do we need to know the wildcards for each result?
+    parser = argparse.ArgumentParser(
+        description=description,
+        epilog=epilog,
+        formatter_class=HelpF,
+    )
+    parser.add_argument(
+        '--result-fn-list-fn',
+        help='Input: Combined list of filenames of results (pseudo output, expected to exist already in our run-dir)')
+    parser.add_argument(
+        '--gathered-fn',
+        help='Output: serialized list of the gathered results, with relative paths re-based to the directory of this file')
+    args = parser.parse_args(argv[1:])
+    return args
+
+
+def main(argv=sys.argv):
+    args = parse_args(argv)
+    logging.basicConfig(level=logging.INFO)
+    run(**vars(args))
+
+
+if __name__ == '__main__':  # pragma: no cover
+    main()
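
The path re-basing done by the abspath() closure in run() can be seen in a small standalone sketch (the directory names are hypothetical): a dot-relative result path is re-expressed relative to the gathered file's directory.

    import os

    gathered_dn = 'gathered'               # dirname of --gathered-fn
    result_fn = 'uow-00/results.json'      # one entry from --result-fn-list-fn
    d = os.path.abspath(os.path.dirname(result_fn))

    def rebase(v):
        if v.startswith('.'):
            return os.path.normpath(os.path.relpath(os.path.join(d, v), gathered_dn))
        return v  # apparently not a path; leave untouched

    print(rebase('./out/preads.fasta'))    # -> ../uow-00/out/preads.fasta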

+ 107 - 0
FALCON/falcon_kit/mains/get_read_ctg_map.py

@@ -0,0 +1,107 @@
+
+
+from .. import pype_tasks
+# pylint: disable=no-name-in-module, import-error, fixme, line-too-long
+from pypeflow.simple_pwatcher_bridge import (PypeProcWatcherWorkflow, MyFakePypeThreadTaskBase,
+                                             makePypeLocalFile, fn, PypeTask)
+PypeThreadTaskBase = MyFakePypeThreadTaskBase
+import argparse
+import glob
+import logging
+import sys
+import subprocess as sp
+import shlex
+import os
+
+LOG = logging.getLogger(__name__)
+
+
+def make_dirs(d):
+    if not os.path.isdir(d):
+        LOG.debug('mkdirs {}'.format(d))
+        os.makedirs(d)
+
+
+def get_read_ctg_map(rawread_dir, pread_dir, asm_dir):
+    read_map_dir = os.path.abspath(os.path.join(asm_dir, 'read_maps'))
+    make_dirs(read_map_dir)
+
+    wf = PypeProcWatcherWorkflow(
+        max_jobs=12,
+    )
+    """
+            job_type=config['job_type'],
+            job_queue=config['job_queue'],
+            sge_option=config.get('sge_option', ''),
+            watcher_type=config['pwatcher_type'],
+            watcher_directory=config['pwatcher_directory'])
+    """
+
+    rawread_db = makePypeLocalFile(os.path.join(rawread_dir, 'raw_reads.db'))
+    rawread_id_file = makePypeLocalFile(os.path.join(
+        read_map_dir, 'dump_rawread_ids', 'rawread_ids'))
+
+    task = PypeTask(
+        inputs={'rawread_db': rawread_db},
+        outputs={'rawread_id_file': rawread_id_file},
+    )
+    wf.addTask(task(pype_tasks.task_dump_rawread_ids))
+
+    pread_db = makePypeLocalFile(os.path.join(pread_dir, 'preads.db'))
+    pread_id_file = makePypeLocalFile(os.path.join(
+        read_map_dir, 'dump_pread_ids', 'pread_ids'))
+
+    task = PypeTask(
+        inputs={'pread_db': pread_db},
+        outputs={'pread_id_file': pread_id_file},
+    )
+    wf.addTask(task(pype_tasks.task_dump_pread_ids))
+
+    wf.refreshTargets()  # block
+
+    sg_edges_list = makePypeLocalFile(os.path.join(asm_dir, 'sg_edges_list'))
+    utg_data = makePypeLocalFile(os.path.join(asm_dir, 'utg_data'))
+    ctg_paths = makePypeLocalFile(os.path.join(asm_dir, 'ctg_paths'))
+
+    inputs = {'rawread_id_file': rawread_id_file,
+              'pread_id_file': pread_id_file,
+              'sg_edges_list': sg_edges_list,
+              'utg_data': utg_data,
+              'ctg_paths': ctg_paths}
+
+    read_to_contig_map = makePypeLocalFile(os.path.join(
+        read_map_dir, 'get_ctg_read_map', 'read_to_contig_map'))
+
+    task = PypeTask(
+        inputs=inputs,
+        outputs={'read_to_contig_map': read_to_contig_map},
+    )
+    wf.addTask(task(pype_tasks.task_generate_read_to_ctg_map))
+
+    wf.refreshTargets()  # block
+
+
+def parse_args(argv):
+    parser = argparse.ArgumentParser(description='generate `2-asm-falcon/read_maps/read_to_contig_map` that contains the \
+information from the chain of mapping: (contig id) -> (internal p-read id) -> (internal raw-read id) -> (original read id)',
+                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument('--basedir', type=str, default='./',
+                        help='the base working dir of a FALCON assembly')
+    args = parser.parse_args(argv[1:])
+    return args
+
+
+def main(argv=sys.argv):
+    logging.basicConfig()
+    args = parse_args(argv)
+    basedir = args.basedir
+    rawread_dir = os.path.abspath(os.path.join(basedir, '0-rawreads'))
+    pread_dir = os.path.abspath(os.path.join(basedir, '1-preads_ovl'))
+    asm_dir = os.path.abspath(os.path.join(basedir, '2-asm-falcon'))
+
+    get_read_ctg_map(rawread_dir=rawread_dir,
+                     pread_dir=pread_dir, asm_dir=asm_dir)
+
+
+if __name__ == '__main__':
+    main()
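
For orientation, the generated read_to_contig_map is a whitespace-separated table; the column order assumed by its consumer in this commit (get_pid_to_ctg() in pr_ctg_track.py) is p-read id, raw-read id, original read id, contig id. A sketch of reading it, using the default layout of this script:

    path = '2-asm-falcon/read_maps/get_ctg_read_map/read_to_contig_map'
    with open(path) as f:
        for row in f:
            pid, rid, oid, ctg = row.split()
            # pid: internal p-read id, rid: internal raw-read id,
            # oid: original read id,   ctg: contig id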

+ 338 - 0
FALCON/falcon_kit/mains/graph_to_contig.py

@@ -0,0 +1,338 @@
+"""
+TODO: (from convo w/ Ivan)
+The issue with this script (I would still like to re-read it to refresh my memory): it loads all edge sequences and tries to do two things at once: create the p_ctg and a_ctg sequences, and align the bubbles using those sequences.
+
+
+If we generate:
+1. All paths first (as tiling paths) for all p_ctg and all a_ctg without loading sequences - this should not consume much space (take a look at the *_tiling_paths files).
+2. Load the first read of each tiling path fully, and only the edge sequences for every transition; then we can generate the output sequences with the same memory/disk consumption.
+3. Align bubbles after that.
+
+Our resource consumption should be the same.
+
+Bubbles?
+It aligns them to produce the identity score
+
+After that the dedup_a_tigs.py script is used to deduplicate fake a_ctg.
+But that script is simple, and only depends on the alignment info that the previous script stored in the a_ctg header.
+"""
+
+
+
+
+from builtins import zip
+from builtins import range
+import argparse
+import logging
+import sys
+import networkx as nx
+from ..FastaReader import open_fasta_reader
+from ..io import open_progress
+
+RCMAP = dict(list(zip("ACGTacgtNn-", "TGCAtgcaNn-")))
+
+def log(msg):
+    sys.stderr.write(msg)
+    sys.stderr.write('\n')
+
+
+def rc(seq):
+    return "".join([RCMAP[c] for c in seq[::-1]])
+
+def reverse_end(node_id):
+    node_id, end = node_id.split(":")
+    new_end = "B" if end == "E" else "E"
+    return node_id + ":" + new_end
+
+
+def yield_first_seq(one_path_edges, seqs):
+    if one_path_edges and one_path_edges[0][0] != one_path_edges[-1][1]:
+        # If non-empty, and non-circular,
+        # prepend the entire first read.
+        (vv, ww) = one_path_edges[0]
+        (vv_rid, vv_letter) = vv.split(":")
+        if vv_letter == 'E':
+            first_seq = seqs[vv_rid]
+        else:
+            assert vv_letter == 'B'
+            first_seq = "".join([RCMAP[c] for c in seqs[vv_rid][::-1]])
+        yield first_seq
+
+def compose_ctg(seqs, edge_data, ctg_id, path_edges, proper_ctg):
+    total_score = 0
+    total_length = 0
+    edge_lines = []
+    sub_seqs = []
+
+    # If required, add the first read to the path sequence.
+    if proper_ctg:
+        sub_seqs = list(yield_first_seq(path_edges, seqs))
+        total_length = 0 if len(sub_seqs) == 0 else len(sub_seqs[0])
+
+    # Splice-in the rest of the path sequence.
+    for vv, ww in path_edges:
+        rid, s, t, aln_score, idt, e_seq = edge_data[(vv, ww)]
+        sub_seqs.append(e_seq)
+        edge_lines.append('%s %s %s %s %d %d %d %0.2f' % (
+            ctg_id, vv, ww, rid, s, t, aln_score, idt))
+        total_length += abs(s - t)
+        total_score += aln_score
+
+    return edge_lines, sub_seqs, total_score, total_length
+
+def run(improper_p_ctg, proper_a_ctg, preads_fasta_fn, sg_edges_list_fn, utg_data_fn, ctg_paths_fn):
+    """improper==True => Neglect the initial read.
+    We used to need that for unzip.
+    """
+    reads_in_layout = set()
+    with open_progress(sg_edges_list_fn) as f:
+        for l in f:
+            l = l.strip().split()
+            """001039799:E 000333411:E 000333411 17524 20167 17524 99.62 G"""
+            v, w, rid, s, t, aln_score, idt, type_ = l
+            if type_ != "G":
+                continue
+            r1 = v.split(":")[0]
+            reads_in_layout.add(r1)
+            r2 = w.split(":")[0]
+            reads_in_layout.add(r2)
+
+    seqs = {}
+    # Load the sequence of every p-read used in the layout into memory.
+    with open_fasta_reader(preads_fasta_fn) as f:
+        for r in f:
+            if r.name not in reads_in_layout:
+                continue
+            seqs[r.name] = r.sequence.upper() # name == rid-string
+
+    edge_data = {}
+    with open_progress(sg_edges_list_fn) as f:
+        for l in f:
+            l = l.strip().split()
+            """001039799:E 000333411:E 000333411 17524 20167 17524 99.62 G"""
+            v, w, rid, s, t, aln_score, idt, type_ = l
+
+            if type_ != "G":
+                continue
+            r1, dir1 = v.split(":")
+            reads_in_layout.add(r1) # redundant, but harmless
+            r2, dir2 = w.split(":")
+            reads_in_layout.add(r2) # redundant, but harmless
+
+            s = int(s)
+            t = int(t)
+            aln_score = int(aln_score)
+            idt = float(idt)
+
+            if s < t:
+                e_seq = seqs[rid][s:t]
+                assert 'E' == dir2
+            else:
+                # t and s were swapped for 'c' alignments in ovlp_to_graph.generate_string_graph():702
+                # They were translated from reverse-dir to forward-dir coordinate system in LA4Falcon.
+                e_seq = "".join([RCMAP[c] for c in seqs[rid][t:s][::-1]])
+                assert 'B' == dir2
+            edge_data[(v, w)] = (rid, s, t, aln_score, idt, e_seq)
+
+    utg_data = {}
+    with open_progress(utg_data_fn) as f:
+        for l in f:
+            l = l.strip().split()
+            s, v, t, type_, length, score, path_or_edges = l
+            if type_ not in ["compound", "simple", "contained"]:
+                continue
+            length = int(length)
+            score = int(score)
+            if type_ in ("simple", "contained"):
+                path_or_edges = path_or_edges.split("~")
+            else:
+                path_or_edges = [tuple(e.split("~"))
+                                 for e in path_or_edges.split("|")]
+            utg_data[(s, v, t)] = type_, length, score, path_or_edges
+
+    p_ctg_out = open("p_ctg.fa", "w")
+    a_ctg_out = open("a_ctg_all.fa", "w")
+    p_ctg_t_out = open("p_ctg_tiling_path", "w")
+    a_ctg_t_out = open("a_ctg_all_tiling_path", "w")
+    layout_ctg = set()
+
+    with open_progress(ctg_paths_fn) as f:
+        for l in f:
+            l = l.strip().split()
+            ctg_id, c_type_, i_utig, t0, length, score, utgs = l
+            ctg_id = ctg_id
+            s0 = i_utig.split("~")[0]
+
+            if (reverse_end(t0), reverse_end(s0)) in layout_ctg:
+                continue
+            else:
+                layout_ctg.add((s0, t0))
+
+            ctg_label = i_utig + "~" + t0
+            length = int(length)
+            utgs = utgs.split("|")
+            one_path = []
+            total_score = 0
+            total_length = 0
+
+            #a_ctg_data = []
+            a_ctg_group = {}
+
+            for utg in utgs:
+                s, v, t = utg.split("~")
+                type_, length, score, path_or_edges = utg_data[(s, v, t)]
+                total_score += score
+                total_length += length
+                if type_ == "simple":
+                    if len(one_path) != 0:
+                        one_path.extend(path_or_edges[1:])
+                    else:
+                        one_path.extend(path_or_edges)
+                if type_ == "compound":
+
+                    c_graph = nx.DiGraph()
+
+                    all_alt_path = []
+                    for ss, vv, tt in path_or_edges:
+                        type_, length, score, sub_path = utg_data[(ss, vv, tt)]
+
+                        v1 = sub_path[0]
+                        for v2 in sub_path[1:]:
+                            c_graph.add_edge(
+                                v1, v2, e_score=edge_data[(v1, v2)][3])
+                            v1 = v2
+
+                    shortest_path = nx.shortest_path(c_graph, s, t, "e_score")
+                    score = nx.shortest_path_length(c_graph, s, t, "e_score")
+                    all_alt_path.append((score, shortest_path))
+
+                    # a_ctg_data.append( (s, t, shortest_path) ) #first path is the same as the one used in the primary contig
+                    while 1:
+                        n0 = shortest_path[0]
+                        for n1 in shortest_path[1:]:
+                            c_graph.remove_edge(n0, n1)
+                            n0 = n1
+                        try:
+                            shortest_path = nx.shortest_path(
+                                c_graph, s, t, "e_score")
+                            score = nx.shortest_path_length(
+                                c_graph, s, t, "e_score")
+                            #a_ctg_data.append( (s, t, shortest_path) )
+                            all_alt_path.append((score, shortest_path))
+
+                        except nx.exception.NetworkXNoPath:
+                            break
+                        # if len(shortest_path) < 2:
+                        #    break
+                    # Is sorting required, if we are appending the shortest paths in order?
+                    all_alt_path.sort()
+                    all_alt_path.reverse()
+                    shortest_path = all_alt_path[0][1]
+                    # The longest branch in the compound unitig is added to the primary path.
+                    if len(one_path) != 0:
+                        one_path.extend(shortest_path[1:])
+                    else:
+                        one_path.extend(shortest_path)
+
+                    a_ctg_group[(s, t)] = all_alt_path
+
+            if len(one_path) == 0:
+                continue
+
+            one_path_edges = list(zip(one_path[:-1], one_path[1:]))
+
+            # Compose the primary contig.
+            p_edge_lines, p_ctg_seq_chunks, p_total_score, p_total_length = compose_ctg(seqs, edge_data, ctg_id, one_path_edges, (not improper_p_ctg))
+
+            # Write out the tiling path.
+            p_ctg_t_out.write('\n'.join(p_edge_lines))
+            p_ctg_t_out.write('\n')
+
+            # Write the sequence.
+            # Using the `total_score` instead of `p_total_score` intentionally. Sum of
+            # edge scores is not identical to sum of unitig scores.
+            p_ctg_out.write('>%s %s %s %d %d\n' % (ctg_id, ctg_label, c_type_, p_total_length, total_score))
+            p_ctg_out.write(''.join(p_ctg_seq_chunks))
+            p_ctg_out.write('\n')
+
+            a_id = 0
+            for v, w in a_ctg_group:
+                atig_output = []
+
+                # Compose the base sequence.
+                for sub_id in range(len(a_ctg_group[(v, w)])):
+                    score, atig_path = a_ctg_group[(v, w)][sub_id]
+                    atig_path_edges = list(zip(atig_path[:-1], atig_path[1:]))
+
+                    a_ctg_id = '%s-%03d-%02d' % (ctg_id, a_id + 1, sub_id)
+                    a_edge_lines, sub_seqs, a_total_score, a_total_length = compose_ctg(
+                        seqs, edge_data, a_ctg_id, atig_path_edges, proper_a_ctg)
+
+                    seq = ''.join(sub_seqs)
+
+                    # Keep the placeholder for these values for legacy purposes, but mark
+                    # them as for deletion.
+                    # The base a_ctg will also be output to the same file, for simplicity.
+                    delta_len = 0
+                    idt = 1.0
+                    cov = 1.0
+                    atig_output.append((v, w, atig_path, a_total_length, a_total_score, seq, atig_path_edges, a_ctg_id, a_edge_lines, delta_len, idt, cov))
+
+                if len(atig_output) == 1:
+                    continue
+
+                for sub_id, data in enumerate(atig_output):
+                    v, w, tig_path, a_total_length, a_total_score, seq, atig_path_edges, a_ctg_id, a_edge_lines, delta_len, a_idt, cov = data
+
+                    # Write out the tiling path.
+                    a_ctg_t_out.write('\n'.join(a_edge_lines))
+                    a_ctg_t_out.write('\n')
+
+                    # Write the sequence.
+                    a_ctg_out.write('>%s %s %s %d %d %d %d %0.2f %0.2f\n' % (a_ctg_id, v, w, a_total_length, a_total_score, len(atig_path_edges), delta_len, a_idt, cov))
+                    a_ctg_out.write(seq)
+                    a_ctg_out.write('\n')
+
+                a_id += 1
+
+    a_ctg_out.close()
+    p_ctg_out.close()
+    a_ctg_t_out.close()
+    p_ctg_t_out.close()
+
+def main(argv=sys.argv):
+    description = 'Generate the primary and alternate contig fasta files and tiling paths, given the string graph.'
+    epilog = """
+We write these:
+
+    p_ctg_out = open("p_ctg.fa", "w")
+    a_ctg_out = open("a_ctg_all.fa", "w")
+    p_ctg_t_out = open("p_ctg_tiling_path", "w")
+    a_ctg_t_out = open("a_ctg_all_tiling_path", "w")
+"""
+    parser = argparse.ArgumentParser(
+            description=description,
+            formatter_class=argparse.RawDescriptionHelpFormatter,
+            epilog=epilog)
+    parser.add_argument('--improper-p-ctg', action='store_true',
+            help='Skip the initial read in each p_ctg path.')
+    parser.add_argument('--proper-a-ctg', action='store_true',
+            help='Skip the initial read in each a_ctg path.')
+    parser.add_argument('--preads-fasta-fn', type=str,
+            default='./preads4falcon.fasta',
+            help='Input. Preads file, required to construct the contigs.')
+    parser.add_argument('--sg-edges-list-fn', type=str,
+            default='./sg_edges_list',
+            help='Input. File containing string graph edges, produced by ovlp_to_graph.py.')
+    parser.add_argument('--utg-data-fn', type=str,
+            default='./utg_data',
+            help='Input. File containing unitig data, produced by ovlp_to_graph.py.')
+    parser.add_argument('--ctg-paths-fn', type=str,
+            default='./ctg_paths',
+            help='Input. File containing contig paths, produced by ovlp_to_graph.py.')
+    args = parser.parse_args(argv[1:])
+    run(**vars(args))
+
+if __name__ == "__main__":
+    logging.basicConfig(level=logging.INFO)
+    main(sys.argv)
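
The edge-sequence extraction rule in run() (forward slice when s < t, reverse-complemented [t:s] slice otherwise) in a tiny standalone form, using a toy read and made-up coordinates:

    RCMAP = dict(zip("ACGTacgtNn-", "TGCAtgcaNn-"))
    read = "AACCTGTT"  # toy p-read sequence

    def edge_seq(s, t):
        if s < t:                                          # forward-strand edge (w ends in ':E')
            return read[s:t]
        return "".join(RCMAP[c] for c in read[t:s][::-1])  # 'c' overlap: slice [t:s], then RC (w ends in ':B')

    assert edge_seq(2, 6) == "CTGT"
    assert edge_seq(6, 2) == "ACAG"  # reverse-complement of the same slice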

+ 181 - 0
FALCON/falcon_kit/mains/graph_to_utgs.py

@@ -0,0 +1,181 @@
+
+
+
+
+from builtins import zip
+from builtins import range
+from falcon_kit import kup, falcon, DWA
+from falcon_kit.fc_asm_graph import AsmGraph
+import networkx as nx
+import sys
+
+RCMAP = dict(list(zip("ACGTacgtNn-", "TGCAtgcaNn-")))
+
+
+def rc(seq):
+    return "".join([RCMAP[c] for c in seq[::-1]])
+
+
+def get_aln_data(t_seq, q_seq):
+    aln_data = []
+    K = 8
+    seq0 = t_seq
+    lk_ptr = kup.allocate_kmer_lookup(1 << (K * 2))
+    sa_ptr = kup.allocate_seq(len(seq0))
+    sda_ptr = kup.allocate_seq_addr(len(seq0))
+    kup.add_sequence(0, K, seq0, len(seq0), sda_ptr, sa_ptr, lk_ptr)
+    q_id = "dummy"
+
+    kmer_match_ptr = kup.find_kmer_pos_for_seq(
+        q_seq, len(q_seq), K, sda_ptr, lk_ptr)
+    kmer_match = kmer_match_ptr[0]
+    aln_range_ptr = kup.find_best_aln_range(kmer_match_ptr, K, K * 5, 12)
+    aln_range = aln_range_ptr[0]
+    x, y = list(zip(* [(kmer_match.query_pos[i], kmer_match.target_pos[i])
+                  for i in range(kmer_match.count)]))
+    kup.free_kmer_match(kmer_match_ptr)
+    s1, e1, s2, e2 = aln_range.s1, aln_range.e1, aln_range.s2, aln_range.e2
+
+    if e1 - s1 > 100:
+
+        alignment = DWA.align(q_seq[s1:e1], e1 - s1,
+                              seq0[s2:e2], e2 - s2,
+                              1500, 1)
+
+        if alignment[0].aln_str_size > 100:
+            aln_data.append((q_id, 0, s1, e1, len(q_seq), s2, e2, len(
+                seq0), alignment[0].aln_str_size, alignment[0].dist))
+            aln_str1 = alignment[0].q_aln_str
+            aln_str0 = alignment[0].t_aln_str
+
+        DWA.free_alignment(alignment)
+
+    kup.free_kmer_lookup(lk_ptr)
+    kup.free_seq_array(sa_ptr)
+    kup.free_seq_addr_array(sda_ptr)
+    return aln_data, x, y
+
+
+def main(argv=sys.argv):
+    G_asm = AsmGraph("sg_edges_list", "utg_data", "ctg_paths")
+    G_asm.load_sg_seq("preads4falcon.fasta")
+
+    utg_out = open("utgs.fa", "w")
+
+    for utg in G_asm.utg_data:
+        s, t, v = utg
+        type_, length, score, path_or_edges = G_asm.utg_data[(s, t, v)]
+        if type_ == "simple":
+            path_or_edges = path_or_edges.split("~")
+            seq = G_asm.get_seq_from_path(path_or_edges)
+            print(">%s~%s~%s-%d %d %d" % (
+                s, v, t, 0, length, score), file=utg_out)
+            print(seq, file=utg_out)
+
+        if type_ == "compound":
+
+            c_graph = nx.DiGraph()
+
+            all_alt_path = []
+            path_or_edges = [c.split("~") for c in path_or_edges.split("|")]
+            for ss, vv, tt in path_or_edges:
+                type_, length, score, sub_path = G_asm.utg_data[(ss, tt, vv)]
+
+                sub_path = sub_path.split("~")
+                v1 = sub_path[0]
+                for v2 in sub_path[1:]:
+                    c_graph.add_edge(
+                        v1, v2, e_score=G_asm.sg_edges[(v1, v2)][1])
+                    v1 = v2
+
+            shortest_path = nx.shortest_path(c_graph, s, t, "e_score")
+            score = nx.shortest_path_length(c_graph, s, t, "e_score")
+            all_alt_path.append((score, shortest_path))
+
+            # a_ctg_data.append( (s, t, shortest_path) ) #first path is the same as the one used in the primary contig
+            while 1:
+                if s == t:
+                    break
+                n0 = shortest_path[0]
+                for n1 in shortest_path[1:]:
+                    c_graph.remove_edge(n0, n1)
+                    n0 = n1
+                try:
+                    shortest_path = nx.shortest_path(c_graph, s, t, "e_score")
+                    score = nx.shortest_path_length(c_graph, s, t, "e_score")
+                    #a_ctg_data.append( (s, t, shortest_path) )
+                    all_alt_path.append((score, shortest_path))
+
+                except nx.exception.NetworkXNoPath:
+                    break
+                # if len(shortest_path) < 2:
+                #    break
+
+            all_alt_path.sort()
+            all_alt_path.reverse()
+            shortest_path = all_alt_path[0][1]
+
+            score, atig_path = all_alt_path[0]
+
+            atig_output = []
+
+            atig_path_edges = list(zip(atig_path[:-1], atig_path[1:]))
+            sub_seqs = []
+            total_length = 0
+            total_score = 0
+            for vv, ww in atig_path_edges:
+                r, aln_score, idt, typs_ = G_asm.sg_edges[(vv, ww)]
+                e_seq = G_asm.sg_edge_seqs[(vv, ww)]
+                rid, ss, tt = r
+                sub_seqs.append(e_seq)
+                total_length += abs(ss - tt)
+                total_score += aln_score
+
+            base_seq = "".join(sub_seqs)
+            atig_output.append(
+                (s, t, atig_path, total_length, total_score, base_seq, atig_path_edges, 1, 1))
+
+            duplicated = True
+            for score, atig_path in all_alt_path[1:]:
+                atig_path_edges = list(zip(atig_path[:-1], atig_path[1:]))
+                sub_seqs = []
+                total_length = 0
+                total_score = 0
+                for vv, ww in atig_path_edges:
+                    r, aln_score, idt, type_ = G_asm.sg_edges[(vv, ww)]
+                    e_seq = G_asm.sg_edge_seqs[(vv, ww)]
+                    rid, ss, tt = r
+                    sub_seqs.append(e_seq)
+                    total_length += abs(ss - tt)
+                    total_score += aln_score
+
+                seq = "".join(sub_seqs)
+
+                aln_data, x, y = get_aln_data(base_seq, seq)
+                if len(aln_data) != 0:
+                    idt = 1.0 - 1.0 * aln_data[-1][-1] / aln_data[-1][-2]
+                    cov = 1.0 * (aln_data[-1][3] -
+                                 aln_data[-1][2]) / aln_data[-1][4]
+                    if idt < 0.96 or cov < 0.98:
+                        duplicated = False
+                        atig_output.append(
+                            (s, t, atig_path, total_length, total_score, seq, atig_path_edges, idt, cov))
+                else:
+                    duplicated = False
+                    atig_output.append(
+                        (s, t, atig_path, total_length, total_score, seq, atig_path_edges, 0, 0))
+
+            # if len(atig_output) == 1:
+            #    continue
+
+            sub_id = 0
+            for data in atig_output:
+                v0, w0, tig_path, total_length, total_score, seq, atig_path_edges, a_idt, cov = data
+                print(">%s~%s~%s-%d %d %d" % (
+                    v0, "NA", w0, sub_id,  total_length, total_score), file=utg_out)
+                print(seq, file=utg_out)
+                sub_id += 1
+
+
+if __name__ == "__main__":
+    main(sys.argv)
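
The duplicate test above reduces to two ratios taken from the last tuple returned by get_aln_data(), whose layout is (q_id, 0, s1, e1, q_len, s2, e2, t_len, aln_str_size, dist); a sketch with made-up numbers:

    rec = ("dummy", 0, 100, 5100, 5200, 80, 5080, 5300, 5150, 120)  # hypothetical alignment record

    idt = 1.0 - 1.0 * rec[-1] / rec[-2]     # 1 - dist/aln_str_size   ~ 0.977
    cov = 1.0 * (rec[3] - rec[2]) / rec[4]  # aligned query span / query length  ~ 0.962
    # An alternate path is treated as a duplicate (and not written) only when
    # idt >= 0.96 and cov >= 0.98; this one fails the cov test, so it is kept.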

+ 413 - 0
FALCON/falcon_kit/mains/hgap4_adapt.py

@@ -0,0 +1,413 @@
+"""Given a full HGAP4 run,
+
+generate directories and symlinks to make it look like
+a pypeflow run.
+
+Then, fc_run/fc_unzip/fc_quiver can operate on it. In fact, fc_run
+should be already satisfied.
+
+One caveat: At the moment, parts of falcon-unzip actually write into the
+falcon dirs. We should fix that. But for now, we create writable run-dirs.
+We do *not* write into the HGAP4 run-dir.
+"""
+
+
+
+from future.utils import viewitems
+from ..util.system import (cd, touch, make_dirs)
+import argparse
+import contextlib
+import glob
+import json
+import logging
+import os
+import sys
+
+LOG = logging.getLogger(__name__)
+
+"""
+Note that, even though this program is designed to let unzip run,
+it has nothing to do with unzip. It merely mimics falcon, so that
+the falcon jobs appear as fully satisfied to pypeflow. That is why
+this program is in this repo rather than in FALCON_unzip.
+
+However, if HGAP4/pbsmrtpipe-tasks change, then this would need to
+be updated.
+"""
+
+"""Post-FALCON steps:
+
+pbcoretools.tasks.fasta2referenceset-0
+pbalign.tasks.pbalign-0                 *******
+pbreports.tasks.summarize_coverage-0
+genomic_consensus.tasks.variantcaller-0 *******
+pbreports.tasks.coverage_report_hgap-0
+genomic_consensus.tasks.gff2bed-0
+pbcoretools.tasks.contigset2fasta-0
+pbreports.tasks.polished_assembly-0
+pbreports.tasks.mapping_stats_hgap-0
+falcon_ns.tasks.task_report_preassembly_yield-0
+
+Or with chunking:
+
+pbcoretools.tasks.fasta2referenceset-0
+pbcoretools.tasks.subreadset_align_scatter-1
+pbalign.tasks.pbalign-2
+pbalign.tasks.pbalign-1
+.pbcoretools.tasks.subreadset_align_scatter-b473df0f-c3d5-46ab-8c45-be5054ea0dbd-gathered-pipeline.chunks.json
+pbcoretools.tasks.gather_alignmentset-1
+pbreports.tasks.summarize_coverage-0
+pbcoretools.tasks.alignment_contig_scatter-1
+pbreports.tasks.coverage_report_hgap-0
+genomic_consensus.tasks.variantcaller-2
+genomic_consensus.tasks.variantcaller-1
+.pbcoretools.tasks.alignment_contig_scatter-7eda161b-3ed9-4891-97f3-300dd975407a-gathered-pipeline.chunks.json
+pbcoretools.tasks.gather_gff-1
+pbreports.tasks.mapping_stats_hgap-0
+pbcoretools.tasks.gather_fastq-1
+pbcoretools.tasks.gather_contigset-1
+pbcoretools.tasks.gather_vcf-1
+genomic_consensus.tasks.gff2bed-0
+pbreports.tasks.polished_assembly-0
+pbcoretools.tasks.contigset2fasta-0
+"""
+
+@contextlib.contextmanager
+def mkcd(newdir):
+    make_dirs(newdir)
+    with cd(newdir):
+        yield
+
+
+def symlink(jo):
+    """Caller should first cd into link-dir.
+    """
+    def assert_exists(path):
+        assert os.path.exists(path), 'File does not exist: {!r}'.format(path)
+
+    def assert_dir(path):
+        assert os.path.isdir(path), 'Not a directory: {!r}'.format(path)
+    assert_dir(jo)
+    taskdir = os.path.join(jo, 'tasks')
+    assert_dir(taskdir)
+
+    def touch_done():
+        """Standard pypeflow convention.
+        """
+        touch('run.sh.done')
+
+    def abstdir(basetdir):
+        return os.path.abspath(os.path.join(jo, 'tasks', basetdir))
+
+    def link(targetdir, basename, linkname=None):
+        if not linkname:
+            linkname = basename
+        reldir = os.path.relpath(targetdir)
+        target = os.path.join(reldir, basename)
+        assert_exists(os.path.abspath(target))
+        if os.path.lexists(linkname):
+            if os.readlink(linkname) == target:
+                return
+            os.unlink(linkname)
+        LOG.info('link {!r} to {}/{}'.format(linkname, reldir, basename))
+        os.symlink(target, linkname)
+
+    # Define task symlinkers
+
+    def task_make_fofn_abs_raw():
+        """deprecated
+        "input_fofn" from cfg
+        """
+        rdir = abstdir('falcon_ns2.tasks.task_falcon_make_fofn_abs-0')
+        with mkcd('0-rawreads/raw-fofn-abs/'):
+            link(rdir, 'file.fofn', 'input.fofn')
+            # touch('input.fofn')
+            touch_done()
+
+    def task_build_rdb():
+        rdir = abstdir('falcon_ns2.tasks.task_falcon0_dazzler_build_raw-0')
+        with mkcd('0-rawreads/build/'):
+            #touch('length_cutoff', 'rdb_build_done', 'run_jobs.sh', 'raw_reads.db')
+            link(rdir, 'raw_reads.db')
+            link(rdir, '.raw_reads.bps')
+            link(rdir, '.raw_reads.idx')
+            link(rdir, '.raw_reads.dust.data')
+            link(rdir, '.raw_reads.dust.anno')
+            touch_done()
+
+    def task_tan_split():
+        rdir = abstdir('falcon_ns2.tasks.task_falcon0_dazzler_tan_split-0')
+        with mkcd('0-rawreads/tan-split/'):
+            #link(rdir, 'split.json', 'tan-uows.json')
+            with open('tan-uows.json', 'w') as stream:
+                data = dict()
+                stream.write(json.dumps(data))
+            link(rdir, 'bash_template.txt', 'bash_template.sh')
+            touch_done()
+
+    def task_tan_gathered():
+        with mkcd('0-rawreads/tan-gathered/'):
+            touch_done()
+
+    def task_tan_combine():
+        rdir = abstdir('falcon_ns2.tasks.task_falcon0_dazzler_tan_combine-0')
+        with mkcd('0-rawreads/tan-combine/'):
+            link(rdir, 'raw_reads.db')
+            link(rdir, '.raw_reads.bps')
+            link(rdir, '.raw_reads.idx')
+            link(rdir, '.raw_reads.dust.data')
+            link(rdir, '.raw_reads.dust.anno')
+            link(rdir, '.raw_reads.tan.data')
+            link(rdir, '.raw_reads.tan.anno')
+            touch_done()
+
+    def task_daligner_split():
+        rdir = abstdir('falcon_ns2.tasks.task_falcon0_dazzler_daligner_split-0')
+        with mkcd('0-rawreads/daligner-split/'):
+            #link(rdir, 'split.json', 'all-units-of-work.json')
+            with open('all-units-of-work.json', 'w') as stream:
+                data = dict()
+                stream.write(json.dumps(data))
+            link(rdir, 'bash_template.txt', 'bash_template.sh')
+            touch_done()
+
+    def task_daligner_gathered():
+        with mkcd('0-rawreads/daligner-gathered/'):
+            touch_done()
+
+    def task_daligner_combine():
+        rdir = abstdir('falcon_ns2.tasks.task_falcon0_dazzler_daligner_combine-0')
+        with mkcd('0-rawreads/daligner-combine/'):
+            link(rdir, 'las_paths.json', 'gathered-las.json')
+            touch_done()
+
+    def task_lamerge_split():
+        rdir = abstdir('falcon_ns2.tasks.task_falcon0_dazzler_lamerge_split-0')
+        with mkcd('0-rawreads/las-merge-split/'):
+            #link(rdir, 'split.json', 'all-units-of-work.json')
+            with open('all-units-of-work.json', 'w') as stream:
+                data = dict()
+                stream.write(json.dumps(data))
+            link(rdir, 'bash_template.txt', 'las-merge-bash-template.sh')
+            touch_done()
+
+    def task_lamerge_gathered():
+        with mkcd('0-rawreads/las-merge-gathered/'):
+            touch_done()
+
+    def task_lamerge_combine():
+        # falcon_unzip/rr_hctg_track.py looks at las-merge-combine/las_paths.json, with abspaths
+        rdir = abstdir(
+            'falcon_ns2.tasks.task_falcon0_dazzler_lamerge_combine-0')
+        with mkcd('0-rawreads/las-merge-combine/'):
+            link(rdir, 'las_paths.json', 'las_fofn.json') # unzip/quiver, for now
+            link(rdir, 'las_paths.json')
+            link(rdir, 'block2las.json')
+            touch_done()
+
+    def task_cns_split():
+        rdir = abstdir('falcon_ns2.tasks.task_falcon0_dazzler_cns_split-0')
+        with mkcd('0-rawreads/cns-split/'):
+            #link(rdir, 'split.json', 'all-units-of-work.json')
+            with open('split.json', 'w') as stream:
+                data = dict()
+                stream.write(json.dumps(data))
+            #link(rdir, 'bash_template.txt', 'bash_template.sh')
+            touch_done()
+
+    def task_cns_gather():
+        #rdir = abstdir('falcon_ns2.tasks.task_falcon0_dazzler_cns_split-0')
+        #rdir = abstdir('falcon_ns2.tasks.task_falcon0_run_cns_post_gather-0')
+        with mkcd('0-rawreads/cns-gather/'):
+            touch_done()
+
+    #def task_cns_combine():
+    #    rdir = abstdir('falcon_ns2.tasks.task_falcon0_dazzler_cns_combine-0')
+    #    with mkcd('0-rawreads/cns-combine/'):
+    #        touch_done()
+
+    def task_preads():
+        rdir = abstdir('falcon_ns2.tasks.task_falcon0_run_cns_post_gather-0')
+        with mkcd('0-rawreads/preads/'):
+            link(rdir, 'input-preads.fofn', 'input_preads.fofn')
+            touch_done()
+
+    def task_report_pre_assembly():
+        """
+        """
+        with mkcd('0-rawreads/report/'):
+            # touch('pre_assembly_stats.json')
+            touch_done()
+
+    def task_build_pdb():
+        """
+        """
+        rdir = abstdir('falcon_ns2.tasks.task_falcon1_dazzler_build_p-0')
+        with mkcd('1-preads_ovl/build/'):
+            #touch('pdb_build_done', 'run_jobs.sh', 'preads.db')
+            link(rdir, 'preads.db')
+            link(rdir, '.preads.bps')
+            link(rdir, '.preads.idx')
+            link(rdir, '.preads.dust.data')
+            link(rdir, '.preads.dust.anno')
+            touch_done()
+
+    def task_daligner_split1():
+        #rdir = abstdir('falcon_ns2.tasks.task_falcon1_dazzler_daligner_split-0')
+        with mkcd('1-preads_ovl/daligner-split/'):
+            #link(rdir, 'split.json', 'all-units-of-work.json')
+            with open('all-units-of-work.json', 'w') as stream:
+                data = dict()
+                stream.write(json.dumps(data))
+            #link(rdir, 'bash_template.txt', 'bash_template.sh')
+            touch_done()
+
+    def task_daligner_gathered1():
+        with mkcd('1-preads_ovl/daligner-gathered/'):
+            touch_done()
+
+    def task_daligner_combine1():
+        rdir = abstdir('falcon_ns2.tasks.task_falcon1_dazzler_daligner_combine-0')
+        with mkcd('1-preads_ovl/daligner-combine/'):
+            link(rdir, 'las_paths.json', 'gathered-las.json')
+            touch_done()
+
+    def task_lamerge_split1():
+        #rdir = abstdir('falcon_ns2.tasks.task_falcon1_dazzler_lamerge_split-0')
+        with mkcd('1-preads_ovl/las-merge-split/'):
+            #link(rdir, 'split.json', 'all-units-of-work.json')
+            with open('all-units-of-work.json', 'w') as stream:
+                data = dict()
+                stream.write(json.dumps(data))
+            #link(rdir, 'bash_template.txt', 'las-merge-bash-template.sh')
+            touch_done()
+
+    def task_lamerge_gathered1():
+        with mkcd('1-preads_ovl/las-merge-gathered/'):
+            touch_done()
+
+    def task_lamerge_combine1():
+        rdir = abstdir(
+            'falcon_ns2.tasks.task_falcon1_dazzler_lamerge_combine-0')
+        with mkcd('1-preads_ovl/las-merge-combine/'):
+            link(rdir, 'las_paths.json', 'las_fofn.json') # unzip/quiver, for now
+            link(rdir, 'las_paths.json', 'las_paths.json')
+            link(rdir, 'block2las.json', 'block2las.json')
+            touch_done()
+
+    def task_run_db2falcon():
+        rdir = abstdir('falcon_ns2.tasks.task_falcon1_run_db2falcon-0')
+        with mkcd('1-preads_ovl/db2falcon/'):
+            link(rdir, 'preads4falcon.fasta')
+            touch_done()
+
+    def task_run_falcon_asm():
+        rdir = abstdir('falcon_ns2.tasks.task_falcon2_run_falcon_asm-0')
+        with mkcd('2-asm-falcon/'):
+            # workflow depends on:
+            touch('falcon_asm_done')
+            # get_read_ctg_map needs:
+            link(rdir, 'sg_edges_list')
+            link(rdir, 'utg_data')
+            link(rdir, 'ctg_paths')
+            # fetch_reads needs:
+            link(rdir, 'p_ctg.fa')
+            link(rdir, 'a_ctg.fa')
+            link(rdir, 'p_ctg_tiling_path')
+            link(rdir, 'a_ctg_tiling_path')
+
+            touch_done()
+
+    #task_make_fofn_abs_raw()
+    task_build_rdb()
+    task_tan_split()
+    task_tan_gathered()
+    task_tan_combine()
+    task_daligner_split()
+    task_daligner_gathered()
+    task_daligner_combine()
+    task_lamerge_split()
+    task_lamerge_gathered()
+    task_lamerge_combine()
+    task_cns_split()
+    task_cns_gather()
+    #task_cns_combine()
+    task_preads()
+    task_report_pre_assembly()
+    task_build_pdb()
+    task_daligner_split1()
+    task_daligner_gathered1()
+    task_daligner_combine1()
+    task_lamerge_split1()
+    task_lamerge_gathered1()
+    task_lamerge_combine1()
+    task_run_db2falcon()
+    task_run_falcon_asm()
+
+    def dump_fc_run(fn):
+        input_fofn = os.path.join(abstdir('falcon_ns2.tasks.task_falcon_make_fofn_abs-0'), 'file.fofn')
+        length_cutoff = int(open(os.path.join(abstdir('falcon_ns2.tasks.task_falcon0_dazzler_build_raw-0'), 'length_cutoff.txt')).read())
+        with open(fn, 'w') as stream:
+            p = lambda x: stream.write(x + '\n')
+            p('[General]')
+            p('input_fofn = {}'.format(input_fofn))
+            p('length_cutoff = {}'.format(length_cutoff))
+            p('[Unzip]')
+            p('input_fofn = {}'.format(input_fofn))
+            p('# You need to find this!')
+            p('input_bam_fofn = {}'.format('input_bam.fofn'))
+            p('[job.defaults]')
+            p('pwatcher_type = blocking')
+            #p('submit = /bin/bash -c "${JOB_SCRIPT}"')
+            p('submit = /bin/bash -c "${JOB_SCRIPT}" > "${JOB_STDOUT}" 2> "${JOB_STDERR}"')
+
+    dump_fc_run('fc_run.generated.cfg')
+
+
+def get_parser():
+    class HelpF(argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter):
+        pass
+    description = 'Given a full HGAP4 run, generate directories and symlinks to make it look like a pypeflow run.'
+    epilog = """
+Typically:
+    mkdir mydir/
+    cd mydir/
+    python3 -m falcon_kit.mains.hgap4_adapt --job-output-dir=../job_output/
+
+    fc_run fc_run.cfg          -- (A)
+    fc_unzip.py fc_unzip.cfg   -- (B)
+    fc_quiver.py fc_unzip.cfg  -- (C)
+
+You need to create/modify the .cfg files.
+
+(A) This should be a no-op, and you do not need to run this at all. Just a sanity check.
+It will tell you that everything is already satisfied. But it
+cannot work unless you provide `input.fofn` (which can be empty) and set it to `input_fofn`
+in your .cfg.
+
+(B)/(C) These will need both `input_fofn` and `input_bam_fofn`. The latter
+should name actual BAM files to use for Quiver (also for partitioning for pbalign).
+
+For more help on .cfg files, see
+* https://github.com/PacificBiosciences/FALCON/wiki
+* https://github.com/PacificBiosciences/FALCON_unzip/wiki
+* https://github.com/PacificBiosciences/FALCON-integrate/wiki
+"""
+
+    parser = argparse.ArgumentParser(
+        description=description,
+        epilog=epilog,
+        formatter_class=HelpF)
+    parser.add_argument('--job-output-dir', default='job_output',
+                        help='Directory of HGAP4 job_output. (A symlink or relative path is fine.) Task-dirs are under here in "tasks/"')
+    return parser
+
+
+def main(argv=sys.argv):
+    args = get_parser().parse_args(argv[1:])
+    symlink(args.job_output_dir)
+
+
+if __name__ == '__main__':
+    logging.basicConfig(level=logging.INFO)
+    main()
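
The link() helper above is idempotent: an existing symlink that already points at the wanted relative target is left alone, and anything else at that name is replaced. A standalone sketch of that rule (toy paths):

    import os

    def relink(target, linkname):
        if os.path.lexists(linkname):
            if os.readlink(linkname) == target:
                return              # already correct; nothing to do
            os.unlink(linkname)     # stale or different link: replace it
        os.symlink(target, linkname)

    relink('../tasks/sometask-0/raw_reads.db', 'raw_reads.db')
    relink('../tasks/sometask-0/raw_reads.db', 'raw_reads.db')  # second call is a no-op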

+ 46 - 0
FALCON/falcon_kit/mains/las_write_empty.py

@@ -0,0 +1,46 @@
+import argparse, logging, struct, sys
+
+LOG = logging.getLogger()
+
+
+def write_empty_las(stream):
+    """Empty las means novl=0, tspace=0.
+    stream should be opened for binary writing.
+    """
+    data = struct.pack('qi', 0, 1000)
+    stream.write(data)
+
+
+def run(las_fn):
+    LOG.info('Writing empty las file {!r}.'.format(las_fn))
+    with open(las_fn, 'wb') as stream:
+        write_empty_las(stream)
+
+
+class HelpF(argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter):
+    pass
+
+
+def parse_args(argv):
+    description = 'Write an empty .las file.'
+    epilog = 'The point is to pretend that we ran daligner and found no overlaps.'
+    parser = argparse.ArgumentParser(
+        description=description,
+        epilog=epilog,
+        formatter_class=HelpF,
+    )
+    parser.add_argument(
+        'las_fn',
+        help='Output. A single las file, empty. (12 bytes, actually.)')
+    args = parser.parse_args(argv[1:])
+    return args
+
+
+def main(argv=sys.argv):
+    args = parse_args(argv)
+    logging.basicConfig(level=logging.INFO)
+    run(**vars(args))
+
+
+if __name__ == '__main__':  # pragma: no cover
+    main()
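
A quick self-check of the header written above, assuming the usual 64-bit layout where struct.calcsize('qi') == 12 and that falcon_kit is importable:

    import io
    import struct
    from falcon_kit.mains.las_write_empty import write_empty_las

    buf = io.BytesIO()
    write_empty_las(buf)
    novl, tspace = struct.unpack('qi', buf.getvalue())
    assert (novl, tspace, len(buf.getvalue())) == (0, 1000, 12)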

+ 325 - 0
FALCON/falcon_kit/mains/ovlp_filter.py

@@ -0,0 +1,325 @@
+
+
+
+from builtins import range
+from falcon_kit.multiproc import Pool
+import falcon_kit.util.io as io
+import argparse
+import os
+import sys
+
+Reader = io.CapturedProcessReaderContext
+
+
+def run_filter_stage1(db_fn, fn, max_diff, max_ovlp, min_ovlp, min_len):
+    cmd = "LA4Falcon -mo %s %s" % (db_fn, fn)
+    reader = Reader(cmd)
+    with reader:
+        return fn, filter_stage1(reader.readlines, max_diff, max_ovlp, min_ovlp, min_len)
+
+
+def filter_stage1(readlines, max_diff, max_ovlp, min_ovlp, min_len):
+    def ignore(overlap_data):
+        left_count = overlap_data["5p"]
+        right_count = overlap_data["3p"]
+        if (abs(left_count - right_count) > max_diff) or \
+           (left_count > max_ovlp) or (right_count > max_ovlp) or \
+           (left_count < min_ovlp) or (right_count < min_ovlp):
+            return True
+
+    ignore_rtn = []
+    current_q_id = None
+    ave_idt = 0.0
+    all_over_len = 0.0
+    overlap_data = {"5p": 0, "3p": 0}
+    q_id = None
+    for l in readlines():
+        l = l.strip().split()
+        q_id, t_id = l[:2]
+
+        if q_id != current_q_id:
+            if current_q_id is not None:
+                if ignore(overlap_data):
+                    ignore_rtn.append(current_q_id)
+            overlap_data = {"5p": 0, "3p": 0}
+            current_q_id = q_id
+            ave_idt = 0.0
+            all_over_len = 0.0
+
+        overlap_len = -int(l[2])
+        idt = float(l[3])
+        q_s, q_e, q_l = int(l[5]), int(l[6]), int(l[7])
+        t_s, t_e, t_l = int(l[9]), int(l[10]), int(l[11])
+
+        if idt < 90.0:
+            continue
+        if q_l < min_len or t_l < min_len:
+            continue
+        if l[-1] in ("contains", "overlap"):
+            ave_idt += idt * overlap_len
+            all_over_len += overlap_len
+        if q_s == 0:
+            overlap_data["5p"] += 1
+        if q_e == q_l:
+            overlap_data["3p"] += 1
+    if q_id is not None:
+        if ignore(overlap_data):
+            ignore_rtn.append(current_q_id)
+    return ignore_rtn
+
+
+def run_filter_stage2(db_fn, fn, max_diff, max_ovlp, min_ovlp, min_len, ignore_set):
+    cmd = "LA4Falcon -mo %s %s" % (db_fn, fn)
+    reader = Reader(cmd)
+    with reader:
+        return fn, filter_stage2(reader.readlines, max_diff, max_ovlp, min_ovlp, min_len, ignore_set)
+
+
+def filter_stage2(readlines, max_diff, max_ovlp, min_ovlp, min_len, ignore_set):
+    contained_id = set()
+    for l in readlines():
+        l = l.strip().split()
+        q_id, t_id = l[:2]
+
+        q_s, q_e, q_l = int(l[5]), int(l[6]), int(l[7])
+        t_s, t_e, t_l = int(l[9]), int(l[10]), int(l[11])
+
+        idt = float(l[3])
+        if idt < 90:
+            continue
+
+        if q_l < min_len or t_l < min_len:
+            continue
+
+        if q_id in ignore_set:
+            continue
+        if t_id in ignore_set:
+            continue
+        if l[-1] == "contained":
+            contained_id.add(q_id)
+        if l[-1] == "contains":
+            contained_id.add(t_id)
+    return contained_id
+
+
+def run_filter_stage3(db_fn, fn, max_diff, max_ovlp, min_ovlp, min_len, ignore_set, contained_set, bestn):
+    cmd = "LA4Falcon -mo %s %s" % (db_fn, fn)
+    reader = Reader(cmd)
+    with reader:
+        return fn, filter_stage3(reader.readlines, max_diff, max_ovlp, min_ovlp, min_len, ignore_set, contained_set, bestn)
+
+
+def filter_stage3(readlines, max_diff, max_ovlp, min_ovlp, min_len, ignore_set, contained_set, bestn):
+    ovlp_output = []
+    overlap_data = {"5p": [], "3p": []}
+    current_q_id = None
+    for l in readlines():
+        l = l.strip().split()
+        q_id, t_id = l[:2]
+
+        if current_q_id is None:
+            current_q_id = q_id
+            overlap_data = {"5p": [], "3p": []}
+
+        elif q_id != current_q_id:
+
+            left = overlap_data["5p"]
+            right = overlap_data["3p"]
+            left.sort()
+            right.sort()
+
+            for i in range(len(left)):
+                score, m_range, ovlp = left[i]
+                ovlp_output.append(ovlp)
+                # print " ".join(ovlp), read_end_data[current_q_id]
+                if i >= bestn and m_range > 1000:
+                    break
+
+            for i in range(len(right)):
+                score, m_range, ovlp = right[i]
+                ovlp_output.append(ovlp)
+                # print " ".join(ovlp), read_end_data[current_q_id]
+                if i >= bestn and m_range > 1000:
+                    break
+
+            overlap_data = {"5p": [], "3p": []}
+            current_q_id = q_id
+
+        if q_id in contained_set:
+            continue
+        if t_id in contained_set:
+            continue
+        if q_id in ignore_set:
+            continue
+        if t_id in ignore_set:
+            continue
+
+        overlap_len = -int(l[2])
+        idt = float(l[3])
+        q_s, q_e, q_l = int(l[5]), int(l[6]), int(l[7])
+        t_s, t_e, t_l = int(l[9]), int(l[10]), int(l[11])
+
+        if idt < 90:
+            continue
+        if q_l < min_len or t_l < min_len:
+            continue
+
+        if q_s == 0:
+            overlap_data["5p"].append((-overlap_len,  t_l - (t_e - t_s),  l))
+        elif q_e == q_l:
+            overlap_data["3p"].append((-overlap_len, t_l - (t_e - t_s), l))
+
+    left = overlap_data["5p"]
+    right = overlap_data["3p"]
+    left.sort()
+    right.sort()
+
+    for i in range(len(left)):
+        score, m_range, ovlp = left[i]
+        ovlp_output.append(ovlp)
+        # print " ".join(ovlp), read_end_data[current_q_id]
+        if i >= bestn and m_range > 1000:
+            break
+
+    for i in range(len(right)):
+        score, m_range, ovlp = right[i]
+        ovlp_output.append(ovlp)
+        # print " ".join(ovlp), read_end_data[current_q_id]
+        if i >= bestn and m_range > 1000:
+            break
+
+    return ovlp_output
+
+
+def run_ovlp_filter(outs, exe_pool, file_list, max_diff, max_cov, min_cov, min_len, bestn, db_fn):
+    io.LOG('preparing filter_stage1')
+    io.logstats()
+    inputs = []
+    for fn in file_list:
+        if len(fn) != 0:
+            inputs.append((run_filter_stage1, db_fn, fn,
+                           max_diff, max_cov, min_cov, min_len))
+
+    ignore_all = []
+    for res in exe_pool.imap(io.run_func, inputs):
+        ignore_all.extend(res[1])
+
+    io.LOG('preparing filter_stage2')
+    io.logstats()
+    inputs = []
+    ignore_all = set(ignore_all)
+    for fn in file_list:
+        if len(fn) != 0:
+            inputs.append((run_filter_stage2, db_fn, fn, max_diff,
+                           max_cov, min_cov, min_len, ignore_all))
+    contained = set()
+    for res in exe_pool.imap(io.run_func, inputs):
+        contained.update(res[1])
+        # print res[0], len(res[1]), len(contained)
+
+    # print "all", len(contained)
+    io.LOG('preparing filter_stage3')
+    io.logstats()
+    inputs = []
+    ignore_all = set(ignore_all)
+    for fn in file_list:
+        if len(fn) != 0:
+            inputs.append((run_filter_stage3, db_fn, fn, max_diff,
+                           max_cov, min_cov, min_len, ignore_all, contained, bestn))
+    for res in exe_pool.imap(io.run_func, inputs):
+        for l in res[1]:
+            outs.write(" ".join(l) + "\n")
+    io.logstats()
+
+
+def try_run_ovlp_filter(out_fn, n_core, fofn, max_diff, max_cov, min_cov, min_len, bestn, db_fn):
+    io.LOG('starting ovlp_filter')
+    file_list = io.validated_fns(fofn)
+    io.LOG('fofn %r: %r' % (fofn, file_list))
+    n_core = min(n_core, len(file_list))
+    exe_pool = Pool(n_core)
+    tmp_out_fn = out_fn + '.tmp'
+    try:
+        with open(tmp_out_fn, 'w') as outs:
+            run_ovlp_filter(outs, exe_pool, file_list, max_diff, max_cov,
+                            min_cov, min_len, bestn, db_fn)
+            outs.write('---\n')
+        os.rename(tmp_out_fn, out_fn)
+        io.LOG('finished ovlp_filter')
+    except:
+        io.LOG('terminating ovlp_filter workers...')
+        exe_pool.terminate()
+        raise
+
+
+def ovlp_filter(out_fn, n_core, las_fofn, max_diff, max_cov, min_cov, min_len, bestn, db_fn, debug, silent, stream):
+    if debug:
+        n_core = 0
+        silent = False
+    if silent:
+        io.LOG = io.write_nothing
+    if stream:
+        global Reader
+        Reader = io.StreamedProcessReaderContext
+    try_run_ovlp_filter(out_fn, n_core, las_fofn, max_diff, max_cov,
+                        min_cov, min_len, bestn, db_fn)
+
+
+def parse_args(argv):
+    epilog = """Output consists of selected lines from LA4Falcon -mo, e.g.
+000000047 000000550 -206 100.00 0 0 206 603 1 0 206 741 overlap
+"""
+
+    class HelpF(argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter):
+        pass
+    parser = argparse.ArgumentParser(
+        description='a simple multi-process LAS overlap data filter',
+        epilog=epilog,
+        formatter_class=HelpF)
+    parser.add_argument(
+        '--out-fn', default='preads.ovl',
+        help='Output filename')
+    parser.add_argument(
+        '--n-core', type=int, default=4,
+        help='number of worker processes; 0 for main process only')
+    parser.add_argument(
+        '--las-fofn', type=str,
+        help='file containing the paths of all LAS files to be processed in parallel')
+    parser.add_argument(
+        '--db', type=str, dest='db_fn',
+        help='read db file path')
+    parser.add_argument(
+        '--max-diff', type=int,
+        help="max difference of 5' and 3' coverage")
+    parser.add_argument(
+        '--max-cov', type=int,
+        help="max coverage of 5' or 3' coverage")
+    parser.add_argument(
+        '--min-cov', type=int,
+        help="min coverage of 5' or 3' coverage")
+    parser.add_argument(
+        '--min-len', type=int, default=2500,
+        help="min length of the reads")
+    parser.add_argument(
+        '--bestn', type=int, default=10,
+        help="output at least best n overlaps on 5' or 3' ends if possible")
+    parser.add_argument(
+        '--stream', action='store_true',
+        help='stream from LA4Falcon, instead of slurping all at once; can save memory for large data')
+    parser.add_argument(
+        '--debug', '-g', action='store_true',
+        help="single-threaded, plus other aids to debugging")
+    parser.add_argument(
+        '--silent', action='store_true',
+        help="suppress cmd reporting on stderr")
+    args = parser.parse_args(argv[1:])
+    return args
+
+
+def main(argv=sys.argv):
+    args = parse_args(argv)
+    ovlp_filter(**vars(args))
+
+
+if __name__ == "__main__":
+    main(sys.argv)
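
For reference, the fields each filter stage pulls out of an LA4Falcon -mo record, shown on the epilog's example line (the indices match the l[...] accesses above):

    l = "000000047 000000550 -206 100.00 0 0 206 603 1 0 206 741 overlap".split()
    q_id, t_id = l[:2]                                 # query id, target id
    overlap_len = -int(l[2])                           # 206
    idt = float(l[3])                                  # 100.00
    q_s, q_e, q_l = int(l[5]), int(l[6]), int(l[7])    # 0, 206, 603
    t_s, t_e, t_l = int(l[9]), int(l[10]), int(l[11])  # 0, 206, 741
    tag = l[-1]                                        # "overlap", "contains", or "contained"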

+ 142 - 0
FALCON/falcon_kit/mains/ovlp_stats.py

@@ -0,0 +1,142 @@
+
+
+
+
+from falcon_kit.multiproc import Pool
+import falcon_kit.util.io as io
+import argparse
+import shlex
+import subprocess as sp
+import sys
+import traceback
+
+Reader = io.CapturedProcessReaderContext
+
+
+def filter_stats(readlines, min_len):
+    current_q_id = None
+    ave_idt = 0.0
+    all_over_len = 0.0
+    overlap_data = {"5p": 0, "3p": 0}
+    q_id = None
+    rtn_data = []
+    q_l = 0
+    for l in readlines():
+        l = l.strip().split()
+        q_id, t_id = l[:2]
+
+        if q_id != current_q_id:
+            left_count = overlap_data["5p"]
+            right_count = overlap_data["3p"]
+            if (current_q_id is not None and
+                    (left_count > 0 or right_count > 0)):
+                rtn_data.append((current_q_id, q_l, left_count, right_count))
+            overlap_data = {"5p": 0, "3p": 0}
+            current_q_id = q_id
+            ave_idt = 0.0
+            all_over_len = 0.0
+
+        overlap_len = -int(l[2])
+        idt = float(l[3])
+        q_s, q_e, q_l = int(l[5]), int(l[6]), int(l[7])
+        t_s, t_e, t_l = int(l[9]), int(l[10]), int(l[11])
+
+        if q_l < min_len or t_l < min_len:
+            continue
+
+        if idt < 90:
+            continue
+
+        if l[-1] in ("contains", "overlap"):
+            ave_idt += idt * overlap_len
+            all_over_len += overlap_len
+        if q_s == 0:
+            overlap_data["5p"] += 1
+        if q_e == q_l:
+            overlap_data["3p"] += 1
+
+    if q_id is not None:
+        left_count = overlap_data["5p"]
+        right_count = overlap_data["3p"]
+        if (left_count > 0 or right_count > 0):
+            rtn_data.append((q_id, q_l, left_count, right_count))
+
+    return rtn_data
+
+
+def run_filter_stats(db_fn, fn, min_len):
+    try:
+        cmd = "LA4Falcon -mo {} {}".format(db_fn, fn)
+        reader = Reader(cmd)
+        with reader:
+            return fn, filter_stats(reader.readlines, min_len)
+    except Exception:
+        stack = traceback.format_exc()
+        io.LOG(stack)
+        raise
+
+def run_ovlp_stats(exe_pool, db_fn, file_list, min_len):
+    inputs = []
+    for fn in file_list:
+        if len(fn) != 0:
+            inputs.append((run_filter_stats, db_fn, fn, min_len))
+    for res in exe_pool.imap(io.run_func, inputs):
+        for l in res[1]:
+            print(" ".join([str(c) for c in l]))
+
+
+def try_run_ovlp_stats(n_core, db_fn, fofn, min_len):
+    io.LOG('starting ovlp_stats')
+    file_list = io.validated_fns(fofn)
+    io.LOG('fofn {!r}: {}'.format(fofn, file_list))
+    io.LOG('db {!r}; n_core={}'.format(db_fn, n_core))
+    n_core = min(n_core, len(file_list))
+    exe_pool = Pool(n_core)
+    try:
+        run_ovlp_stats(exe_pool, db_fn, file_list, min_len)
+        io.LOG('finished ovlp_stats')
+    except KeyboardInterrupt:
+        io.LOG('terminating ovlp_stats workers...')
+        exe_pool.terminate()
+
+
+def ovlp_stats(db_fn, fofn, min_len, n_core, stream, debug, silent):
+    if debug:
+        n_core = 0
+        silent = False
+    if silent:
+        io.LOG = io.write_nothing
+    if stream:
+        global Reader
+        Reader = io.StreamedProcessReaderContext
+    try_run_ovlp_stats(n_core, db_fn, fofn, min_len)
+
+
+def parse_args(argv):
+    parser = argparse.ArgumentParser(description='a simple multi-process LAS overlap statistics tool',
+                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument('--n-core', type=int, default=4,
+                        help='number of worker processes; '
+                        '0 for main process only')
+    parser.add_argument('--fofn', type=str, required=True,
+                        help='file containing the paths of all LAS files to be processed in parallel')
+    parser.add_argument('--min-len', type=int, default=2500,
+                        help="min length of the reads")
+    parser.add_argument('--db-fn', default='./1-preads_ovl/preads.db',
+                        help="DAZZLER DB of preads")
+    parser.add_argument('--stream', action='store_true',
+                        help='stream from LA4Falcon, instead of slurping all at once; can save memory for large data')
+    parser.add_argument('--debug', '-g', action='store_true',
+                        help="single-threaded, plus other aids to debugging")
+    parser.add_argument('--silent', action='store_true',
+                        help="suppress cmd reporting on stderr")
+    return parser.parse_args(argv[1:])
+
+
+def main(argv=sys.argv):
+    args = parse_args(argv)
+    ovlp_stats(**vars(args))
+
+
+if __name__ == "__main__":
+    main(sys.argv)

File diff suppressed because it is too large
+ 1600 - 0
FALCON/falcon_kit/mains/ovlp_to_graph.py


+ 178 - 0
FALCON/falcon_kit/mains/pr_ctg_track.py

@@ -0,0 +1,178 @@
+
+
+
+
+from falcon_kit.multiproc import Pool
+import falcon_kit.util.io as io
+import argparse
+import sys
+import glob
+import os
+from heapq import heappush, heappop, heappushpop
+
+Reader = io.CapturedProcessReaderContext
+
+
+def get_pid_to_ctg(fn):
+    pid_to_ctg = {}
+    with open(fn) as f:
+        for row in f:
+            row = row.strip().split()
+            pid, rid, oid, ctg = row
+            pid_to_ctg.setdefault(pid, set())
+            pid_to_ctg[pid].add(ctg)
+    return pid_to_ctg
+
+
+def run_tr_stage1(db_fn, fn, min_len, bestn, pid_to_ctg):
+    cmd = "LA4Falcon -mo %s %s" % (db_fn, fn)
+    reader = Reader(cmd)
+    with reader:
+        return fn, tr_stage1(reader.readlines, min_len, bestn, pid_to_ctg)
+
+
+def tr_stage1(readlines, min_len, bestn, pid_to_ctg):
+    """
+    For each read in the b-read column of the LAS files, keep the top `bestn`
+    hits, using a priority queue over all overlaps.
+    """
+    rtn = {}
+    for l in readlines():
+        l = l.strip().split()
+        q_id, t_id = l[:2]
+        overlap_len = -int(l[2])
+        idt = float(l[3])
+        q_s, q_e, q_l = int(l[5]), int(l[6]), int(l[7])
+        t_s, t_e, t_l = int(l[9]), int(l[10]), int(l[11])
+        if t_l < min_len:
+            continue
+        if q_id not in pid_to_ctg:
+            continue
+        rtn.setdefault(t_id, [])
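+        # Bounded min-heap of size `bestn`: once full, heappushpop() pushes the new
+        # (overlap_len, q_id) and pops the smallest element, so only the `bestn`
+        # largest overlap_len values per t_id survive.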
+        if len(rtn[t_id]) < bestn:
+            heappush(rtn[t_id], (overlap_len, q_id))
+        else:
+            heappushpop(rtn[t_id], (overlap_len, q_id))
+
+    return rtn
+
+
+def run_track_reads(exe_pool, base_dir, file_list, min_len, bestn, db_fn):
+    io.LOG('preparing tr_stage1')
+    io.logstats()
+    asm_dir = os.path.abspath(os.path.join(base_dir, '2-asm-falcon'))
+    pid_to_ctg = get_pid_to_ctg(os.path.join(
+        asm_dir, 'read_maps', 'get_ctg_read_map', 'read_to_contig_map'))
+    io.LOG('len(pid_to_ctg) == {}'.format(len(pid_to_ctg)))
+    assert pid_to_ctg, 'Empty pid_to_ctg. Maybe empty {!r}?'.format(file_list)
+    inputs = []
+    for fn in file_list:
+        inputs.append((run_tr_stage1, db_fn, fn, min_len, bestn, pid_to_ctg))
+
+    """
+    Aggregate hits from each individual LAS file and keep the best n hits.
+    Note that this does not guarantee that the final result is globally the best n hits, especially
+    when `bestn` is too small. In that case, if there are more hits in a single LAS
+    file, we will miss some good hits.
+    """
+
+    bread_to_areads = {}
+    for fn, res in exe_pool.imap(io.run_func, inputs):
+        for k in res:
+            bread_to_areads.setdefault(k, [])
+            for item in res[k]:
+                if len(bread_to_areads[k]) < bestn:
+                    heappush(bread_to_areads[k], item)
+                else:
+                    heappushpop(bread_to_areads[k], item)
+    assert bread_to_areads, 'No bread_to_areads found. Is there any point in continuing?'
+
+    with open(os.path.join(asm_dir, "read_maps/pread_to_contigs"), "w") as out_f:
+        for bread in bread_to_areads:
+
+            ctg_score = {}
+            for s, pid in bread_to_areads[bread]:
+                if pid not in pid_to_ctg:
+                    continue
+
+                ctgs = pid_to_ctg[pid]
+                for ctg in ctgs:
+                    ctg_score.setdefault(ctg, [0, 0])
+                    ctg_score[ctg][0] += -s
+                    ctg_score[ctg][1] += 1
+
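+            # ctg_score[ctg] accumulates [sum of -overlap_len, hit count]; with the
+            # negated sums, the contig with the largest total overlap_len gets the
+            # smallest score and sorts to rank 0 below.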
+            ctg_score = list(ctg_score.items())
+            ctg_score.sort(key=lambda k: k[1][0])
+            rank = 0
+
+            for ctg, score_count in ctg_score:
+                if bread in pid_to_ctg and ctg in pid_to_ctg[bread]:
+                    in_ctg = 1
+                else:
+                    in_ctg = 0
+                score, count = score_count
+                print(bread, ctg, count, rank, score, in_ctg, file=out_f)
+                rank += 1
+
+
+def try_run_track_reads(n_core, base_dir, min_len, bestn):
+    io.LOG('starting track_reads')
+    pread_dir = os.path.abspath(os.path.join(base_dir, "1-preads_ovl"))
+    file_list = glob.glob(os.path.join(pread_dir, "m*/preads.*.las"))
+    io.LOG('file list: %r' % file_list)
+    db_fn = os.path.join(pread_dir, "preads.db")
+    n_core = min(n_core, len(file_list))
+    exe_pool = Pool(n_core)
+    try:
+        run_track_reads(exe_pool, base_dir, file_list, min_len, bestn, db_fn)
+        io.LOG('finished track_reads')
+    except:
+        io.LOG('terminating track_reads workers...')
+        exe_pool.terminate()
+        raise
+
+
+def track_reads(n_core, base_dir, min_len, bestn, debug, silent, stream):
+    if debug:
+        n_core = 0
+        silent = False
+    if silent:
+        io.LOG = io.write_nothing
+    if stream:
+        global Reader
+        Reader = io.StreamedProcessReaderContext
+    try_run_track_reads(n_core, base_dir, min_len, bestn)
+
+
+def parse_args(argv):
+    parser = argparse.ArgumentParser(description='scan the pread overlap information to identify the best hits from the preads \
+to the contigs, using the read_to_contig_map generated by `fc_get_read_ctg_map` in `2-asm-falcon/read_maps/get_ctg_read_map/read_to_contig_map`',
+                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument('--n_core', type=int, default=4,
+                        help='number of processes used for tracking reads; '
+                        '0 for main process only')
+    #parser.add_argument('--fofn', type=str, help='file contains the path of all LAS file to be processed in parallel')
+    #parser.add_argument('--db', type=str, dest='db_fn', help='read db file path')
+    parser.add_argument('--base_dir', type=str, default="./",
+                        help='the base working dir of a FALCON assembly')
+    parser.add_argument('--min_len', type=int, default=2500,
+                        help="min length of the reads")
+    parser.add_argument('--stream', action='store_true',
+                        help='stream from LA4Falcon, instead of slurping all at once; can save memory for large data')
+    parser.add_argument('--debug', '-g', action='store_true',
+                        help="single-threaded, plus other aids to debugging")
+    parser.add_argument('--silent', action='store_true',
+                        help="suppress cmd reporting on stderr")
+    parser.add_argument('--bestn', type=int, default=40,
+                        help="keep best n hits")
+    args = parser.parse_args(argv[1:])
+    return args
+
+
+def main(argv=sys.argv):
+    args = parse_args(argv)
+    track_reads(**vars(args))
+
+
+if __name__ == "__main__":
+    main()

+ 63 - 0
FALCON/falcon_kit/mains/reduce_preads.py

@@ -0,0 +1,63 @@
+"""
+Creates a reduced version of preads4falcon.fasta file by writing only the preads
+which are incident with 'G' edges in the final assembly graph.
+"""
+
+
+
+import argparse
+import logging
+import sys
+from ..FastaReader import open_fasta_reader
+from ..io import open_progress
+
+default_sg_edges_list_fns = ['./sg_edges_list']
+
+def run(fp_out, preads_fasta_fn, sg_edges_list_fns):
+    # Work around an argparse issue: argparse does not override
+    # the default argument value when the parameter is
+    # used in append mode, but instead appends to the default
+    # list. https://bugs.python.org/issue16399
+    # Instead, we do not specify a default value and
+    # check whether the list is empty here, so that the user
+    # can specify exactly the paths to the file(s).
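+    # For example (hypothetical): with default=['./sg_edges_list'] and action='append',
+    # passing --sg-edges-list-fns my_edges.txt would yield
+    # ['./sg_edges_list', 'my_edges.txt'] instead of just ['my_edges.txt'].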
+    if not sg_edges_list_fns:
+        sg_edges_list_fns = default_sg_edges_list_fns
+
+    reads_in_layout = set()
+
+    for fn in sg_edges_list_fns:
+        with open_progress(fn) as fp_in:
+            for l in fp_in:
+                l = l.strip().split()
+                """001039799:E 000333411:E 000333411 17524 20167 17524 99.62 G"""
+                v, w, rid, s, t, aln_score, idt, type_ = l
+                if type_ != "G":
+                    continue
+                r1 = v.split(":")[0]
+                reads_in_layout.add(r1)
+                r2 = w.split(":")[0]
+                reads_in_layout.add(r2)
+
+    with open_fasta_reader(preads_fasta_fn) as f:
+        for r in f:
+            if r.name not in reads_in_layout:
+                continue
+            fp_out.write('>{}\n{}\n'.format(r.name, r.sequence.upper()))
+
+def main(argv=sys.argv):
+    description = 'Create a reduced set of preads, with only those used in the final layout. Write to stdout.'
+    parser = argparse.ArgumentParser(
+            description=description,
+            formatter_class=argparse.RawDescriptionHelpFormatter)
+    parser.add_argument('--preads-fasta-fn', type=str,
+            default='preads4falcon.fasta',
+            help='Preads file, required to construct the contigs.')
+    parser.add_argument('--sg-edges-list-fns', action='append',
+            help='One or more files containing string graph edges, produced by ovlp_to_graph.py.')
+    args = parser.parse_args(argv[1:])
+    run(sys.stdout, **vars(args))
+
+if __name__ == "__main__":
+    logging.basicConfig(level=logging.INFO)
+    main(sys.argv)

+ 48 - 0
FALCON/falcon_kit/mains/report_pre_assembly.py

@@ -0,0 +1,48 @@
+from .. import stats_preassembly
+import argparse
+import json
+import logging
+
+log = logging.getLogger()
+
+
+def do_report(db, preads_fofn, genome_length, length_cutoff, out):
+    kwds = dict(
+        i_preads_fofn_fn=preads_fofn,
+        i_raw_reads_db_fn=db,
+        genome_length=genome_length,
+        length_cutoff=length_cutoff,
+    )
+    report_dict = stats_preassembly.calc_dict(**kwds)
+    content = json.dumps(report_dict, sort_keys=True,
+                         indent=4, separators=(',', ': '))
+    with open(out, 'w') as ofs:
+        ofs.write(content)
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--genome-length',
+                        type=int,
+                        required=True,
+                        help='Estimated number of bases in the full genome haplotype.')
+    parser.add_argument('--length-cutoff',
+                        type=int,
+                        required=True,
+                        help='Minimum length of any seed read.')
+    parser.add_argument('--db',
+                        required=True,
+                        help='Path to raw_reads.db (dazzler DB)')
+    parser.add_argument('--preads-fofn',
+                        required=True,
+                        help='Path to FOFN of preads fasta files.')
+    parser.add_argument('--out',
+                        required=True,
+                        help='Path to JSON output file.')
+    ARGS = parser.parse_args()
+    do_report(**vars(ARGS))
+
+
+if __name__ == "__main__":
+    logging.basicConfig()
+    log.setLevel(logging.DEBUG)
+    main()

+ 185 - 0
FALCON/falcon_kit/mains/rr_ctg_track.py

@@ -0,0 +1,185 @@
+
+
+
+from falcon_kit.multiproc import Pool
+import falcon_kit.util.io as io
+import argparse
+import sys
+import glob
+import os
+from heapq import heappush, heappop, heappushpop
+
+Reader = io.CapturedProcessReaderContext
+
+
+def get_rid_to_ctg(fn):
+    rid_to_ctg = {}
+    with open(fn) as f:
+        for row in f:
+            row = row.strip().split()
+            pid, rid, oid, ctg = row
+            rid_to_ctg.setdefault(rid, set())
+            rid_to_ctg[rid].add(ctg)
+    return rid_to_ctg
+
+
+def run_tr_stage1(db_fn, fn, min_len, bestn, rid_to_ctg):
+    cmd = "LA4Falcon -m %s %s" % (db_fn, fn)
+    reader = Reader(cmd)
+    with reader:
+        return fn, tr_stage1(reader.readlines, min_len, bestn, rid_to_ctg)
+
+
+def tr_stage1(readlines, min_len, bestn, rid_to_ctg):
+    """
+    For each read in the b-read column of the LAS files, keep the top `bestn`
+    hits, using a priority queue over all overlaps.
+    """
+    rtn = {}
+    for l in readlines():
+        l = l.strip().split()
+        q_id, t_id = l[:2]
+        overlap_len = -int(l[2])
+        idt = float(l[3])
+        q_s, q_e, q_l = int(l[5]), int(l[6]), int(l[7])
+        t_s, t_e, t_l = int(l[9]), int(l[10]), int(l[11])
+        if t_l < min_len:
+            continue
+        if q_id not in rid_to_ctg:
+            continue
+        rtn.setdefault(t_id, [])
+        if len(rtn[t_id]) < bestn:
+            heappush(rtn[t_id], (overlap_len, q_id))
+        else:
+            heappushpop(rtn[t_id], (overlap_len, q_id))
+
+    return rtn
+
+
+def run_track_reads(exe_pool, base_dir, file_list, min_len, bestn, db_fn):
+    io.LOG('preparing tr_stage1')
+    io.logstats()
+    asm_dir = os.path.abspath(os.path.join(base_dir, '2-asm-falcon'))
+    rid_to_ctg = get_rid_to_ctg(os.path.join(
+        asm_dir, 'read_maps', 'get_ctg_read_map', 'read_to_contig_map'))
+    inputs = []
+    for fn in file_list:
+        inputs.append((run_tr_stage1, db_fn, fn, min_len, bestn, rid_to_ctg))
+    """
+    Aggregate hits from each individual LAS file and keep the best n hits.
+    Note that this does not guarantee that the final result is globally the best n hits, especially
+    when `bestn` is too small. In that case, if there are more hits in a single LAS
+    file, we will miss some good hits.
+    """
+    bread_to_areads = {}
+    for fn, res in exe_pool.imap(io.run_func, inputs):
+        for k in res:
+            bread_to_areads.setdefault(k, [])
+            for item in res[k]:
+                if len(bread_to_areads[k]) < bestn:
+                    heappush(bread_to_areads[k], item)
+                else:
+                    heappushpop(bread_to_areads[k], item)
+
+    #rid_to_oid = open(os.path.join(rawread_dir, 'dump_rawread_ids', 'raw_read_ids')).read().split('\n')
+
+    """
+    For each b-read, we find the best contig mapping through the b-read -> a-read -> contig map.
+    """
+    with open(os.path.join(asm_dir, 'read_maps/rawread_to_contigs'), 'w') as out_f:
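+        # Output columns: rawread_id contig hit_count rank accumulated_score in_ctg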
+        for bread in bread_to_areads:
+
+            ctg_score = {}
+            for s, rid in bread_to_areads[bread]:
+                if rid not in rid_to_ctg:
+                    continue
+
+                ctgs = rid_to_ctg[rid]
+                for ctg in ctgs:
+                    ctg_score.setdefault(ctg, [0, 0])
+                    ctg_score[ctg][0] += -s
+                    ctg_score[ctg][1] += 1
+
+            #oid = rid_to_oid[int(bread)]
+            ctg_score = list(ctg_score.items())
+            ctg_score.sort(key=lambda k: k[1][0])
+            rank = 0
+
+            for ctg, score_count in ctg_score:
+                if bread in rid_to_ctg and ctg in rid_to_ctg[bread]:
+                    in_ctg = 1
+                else:
+                    in_ctg = 0
+                score, count = score_count
+                #print(bread, oid, ctg, count, rank, score, in_ctg, file=out_f)
+                print(bread, ctg, count, rank, score, in_ctg, file=out_f)
+                rank += 1
+
+
+def try_run_track_reads(n_core, base_dir, min_len, bestn):
+    io.LOG('starting track_reads')
+
+    rawread_dir = os.path.abspath(os.path.join(base_dir, "0-rawreads"))
+
+    # TODO: better logic for finding the .las file paths, or move this logic elsewhere (e.g. take a --fofn option?)
+    file_list = glob.glob(os.path.join(rawread_dir, "m*/raw_reads.*.las"))
+    io.LOG('file list: %r' % file_list)
+
+    # TODO: likewise, should this be a parameter?
+    db_fn = os.path.join(rawread_dir, "raw_reads.db")
+    n_core = min(n_core, len(file_list))
+    exe_pool = Pool(n_core)
+    try:
+        run_track_reads(exe_pool, base_dir, file_list, min_len, bestn, db_fn)
+        io.LOG('finished track_reads')
+    except:
+        io.LOG('terminating track_reads workers...')
+        exe_pool.terminate()
+        raise
+
+
+def track_reads(n_core, base_dir, min_len, bestn, debug, silent, stream):
+    if debug:
+        n_core = 0
+        silent = False
+    if silent:
+        io.LOG = io.write_nothing
+    if stream:
+        global Reader
+        Reader = io.StreamedProcessReaderContext
+
+    try_run_track_reads(n_core, base_dir, min_len, bestn)
+
+
+def parse_args(argv):
+    parser = argparse.ArgumentParser(description='scan the raw read overlap information to identify the best hits from the reads \
+to the contigs, using the read_to_contig_map generated by `fc_get_read_ctg_map` in `2-asm-falcon/read_maps/get_ctg_read_map/read_to_contig_map`',
+                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument('--n_core', type=int, default=4,
+                        help='number of processes used for tracking reads; '
+                        '0 for main process only')
+    #parser.add_argument('--fofn', type=str, help='file contains the path of all LAS file to be processed in parallel')
+    #parser.add_argument('--db', type=str, dest='db_fn', help='read db file path')
+    parser.add_argument('--base_dir', type=str, default="./",
+                        help='the base working dir of a FALCON assembly')
+    parser.add_argument('--min_len', type=int, default=2500,
+                        help="min length of the reads")
+    parser.add_argument('--stream', action='store_true',
+                        help='stream from LA4Falcon, instead of slurping all at once; can save memory for large data')
+    parser.add_argument('--debug', '-g', action='store_true',
+                        help="single-threaded, plus other aids to debugging")
+    parser.add_argument('--silent', action='store_true',
+                        help="suppress cmd reporting on stderr")
+    parser.add_argument('--bestn', type=int, default=40,
+                        help="keep best n hits")
+    args = parser.parse_args(argv[1:])
+    return args
+
+
+def main(argv=sys.argv):
+    args = parse_args(argv)
+    track_reads(**vars(args))
+
+
+if __name__ == "__main__":
+    main()

+ 710 - 0
FALCON/falcon_kit/mains/run1.py

@@ -0,0 +1,710 @@
+
+
+from ..pype import (wrap_gen_task as gen_task, gen_parallel_tasks, Dist)
+from .. import run_support
+from .. import bash, pype_tasks, snakemake
+from ..util.system import (only_these_symlinks, lfs_setstripe_maybe)
+from .. import io
+from .. import functional
+# pylint: disable=no-name-in-module, import-error, fixme, line-too-long
+from pypeflow.simple_pwatcher_bridge import (PypeProcWatcherWorkflow, MyFakePypeThreadTaskBase,
+                                             makePypeLocalFile, PypeTask)
+import argparse
+import glob
+import json
+import logging
+import os
+import re
+import sys
+import time
+
+
+LOG = logging.getLogger(__name__)  # default, for remote tasks
+
+def check_general_config(general_config, input_config_fn):
+    if ('pa_daligner_option' not in general_config or
+        'ovlp_daligner_option' not in general_config):
+        msg = '''Missing options.
+We now require both "pa_daligner_option" (stage 0) and "ovlp_daligner_option" (stage 1),
+which are automatically passed along to
+  HPC.daligner
+  HPC.TANmask
+  HPC.REPmask
+
+These can provide additional flags:
+  pa_HPCdaligner_option
+  pa_HPCTANmask_option
+  ovlp_HPCdaligner_option
+  pa_REPmask_code (-g/-c pairs for 3 iterations, e.g. '1,20;5,15;20,10')
+'''
+        raise Exception(msg)
+    required = ('input_fofn', 'genome_size')
+    for name in required:
+        assert name in general_config, 'Missing "{}" in {}.'.format(name, input_config_fn)
+
+def main1(prog_name, input_config_fn, logger_config_fn=None):
+    global LOG
+    LOG = run_support.setup_logger(logger_config_fn)
+    lfs_setstripe_maybe(path='.', stripe=12)
+
+    LOG.info('fc_run started with configuration %s', input_config_fn)
+    try:
+        config = run_support.parse_cfg_file(input_config_fn)
+        import json
+        dumped = json.dumps(config, indent=2, separators=(',', ': '), sort_keys=True)
+        LOG.info('cfg=\n{}'.format(dumped))
+    except Exception:
+        LOG.exception('Failed to parse config "{}".'.format(input_config_fn))
+        raise
+    general_config = config['General']
+    check_general_config(general_config, input_config_fn)
+    input_fofn_fn = general_config['input_fofn']
+    genome_size = int(general_config['genome_size'])
+    squash = 0 < genome_size < 1000000
+    wf = PypeProcWatcherWorkflow(job_defaults=config['job.defaults'],
+                                 squash=squash,
+    )
+    general_config['ver'] = '100'
+    # Store config as JSON, available to many tasks.
+    config_fn = './config.json' # must not be in a task-dir
+    io.serialize(config_fn, config)
+    run(wf, config,
+        os.path.abspath(config_fn),
+        input_fofn_fn=input_fofn_fn,
+        )
+
+
+def add_bam2dexta_tasks(
+            wf,
+            config,
+            input_fofn_fn, rawread_dir):
+        # run bam2dexta
+        bam2dexta_uows_fn = os.path.join(
+            rawread_dir, 'bam2dexta-split', 'bam2dexta-uows.json')
+        bam2dexta_bash_template_fn = os.path.join(
+            rawread_dir, 'bam2dexta-split', 'bash_template.sh')
+        wf.addTask(gen_task(
+            script=pype_tasks.TASK_BAM2DEXTA_SPLIT_SCRIPT,
+            inputs={
+                'bam': input_fofn_fn,
+            },
+            outputs={
+                'split': bam2dexta_uows_fn,
+                'bash_template': bam2dexta_bash_template_fn,
+            },
+            parameters={
+                'wildcards': 'bam2dexta_id',
+            },
+            dist=Dist(local=True),
+        ))
+
+        gathered_fn = os.path.join(rawread_dir, 'bam2dexta-gathered', 'gathered-dexta-files.json')
+        gen_parallel_tasks(
+            wf,
+            bam2dexta_uows_fn, gathered_fn,
+            run_dict=dict(
+                bash_template_fn=bam2dexta_bash_template_fn,
+                script='fubar-TODO', #pype_tasks.TASK_DB_TAN_APPLY_SCRIPT, # for snakemake stuff
+                inputs={
+                    'units_of_work': '0-rawreads/bam2dexta-chunks/{bam2dexta_id}/some-units-of-work.json',
+                },
+                outputs={
+                    'results': '0-rawreads/bam2dexta-runs/{bam2dexta_id}/some-done-files.json',
+                },
+                parameters={},
+
+            ),
+            dist=Dist(NPROC=1, MB=4000, job_dict=config['job.step.da']),
+        )
+
+        input_fofn_fn = os.path.join(rawread_dir, 'bam2dexta-combine', 'input.fofn')
+        wf.addTask(gen_task(
+            script=pype_tasks.TASK_BAM2DEXTA_COMBINE_SCRIPT,
+            inputs={
+                'gathered': gathered_fn,
+            },
+            outputs={
+                'fofn': input_fofn_fn,
+            },
+            parameters={},
+            dist=Dist(local=True),
+        ))
+
+        return input_fofn_fn
+
+
+def run(wf, config,
+        config_fn,
+        input_fofn_fn,
+        ):
+    """
+    Preconditions (for now):
+    * LOG
+    * run_support.logger
+    """
+    parsed_config = io.deserialize(config_fn)
+    if parsed_config != config:
+        msg = 'Config from {!r} != passed config'.format(config_fn)
+        raise Exception(msg)
+    general_config = config['General']
+    general_config_fn = os.path.join(os.path.dirname(config_fn), 'General_config.json')
+    io.serialize(general_config_fn, general_config) # Some tasks use this.
+    rawread_dir = '0-rawreads'
+    pread_dir = '1-preads_ovl'
+    falcon_asm_dir = '2-asm-falcon'
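+    # Stage layout: 0-rawreads (raw-read overlaps and consensus), 1-preads_ovl
+    # (pread overlaps), 2-asm-falcon (string graph and final contigs).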
+
+    for d in (rawread_dir, pread_dir, falcon_asm_dir):
+        run_support.make_dirs(d)
+
+    # These settings only matter for parallel jobs.
+    job_defaults = config['job.defaults']
+    #exitOnFailure = bool(job_defaults.get('stop_all_jobs_on_failure', False))
+    global default_njobs
+    default_njobs = int(job_defaults.get('njobs', 7))
+    wf.max_jobs = default_njobs
+
+    assert general_config['input_type'] in (
+        'raw', 'preads'), 'Invalid input_type=={!r}'.format(general_config['input_type'])
+
+    parameters = {}
+
+    if general_config['input_type'] == 'raw':
+        # Most common workflow: Start with rawreads.
+
+        if input_fofn_fn.endswith('.xml'):
+            input_fofn_fn = add_bam2dexta_tasks(wf, config, input_fofn_fn, rawread_dir)
+
+        # import sequences into daligner DB
+        # calculate length_cutoff (if specified as -1)
+        # split DB
+        # run DBdust
+        r_db_dust_fn = os.path.join(rawread_dir, 'build', 'raw_reads.db')
+        length_cutoff_fn = os.path.join(rawread_dir, 'build', 'length_cutoff')
+        wf.addTask(gen_task(
+            script=pype_tasks.TASK_DB_BUILD_SCRIPT,
+            inputs={
+                'config': general_config_fn,
+                'input_fofn': input_fofn_fn,
+            },
+            outputs={
+                'length_cutoff': length_cutoff_fn,
+                'db': r_db_dust_fn,
+                # Also .raw_reads.*, of course. And dust track.
+            },
+            parameters=dict(
+            ),
+            dist=Dist(NPROC=1, job_dict=config['job.step.dust']),
+        ))
+
+        # run TANmask
+        tan_uows_fn = os.path.join(
+            rawread_dir, 'tan-split', 'tan-uows.json')
+        tan_bash_template_fn = os.path.join(
+            rawread_dir, 'tan-split', 'bash_template.sh')
+        wf.addTask(gen_task(
+            script=pype_tasks.TASK_DB_TAN_SPLIT_SCRIPT,
+            inputs={
+                'config': general_config_fn,
+                'db': r_db_dust_fn,
+            },
+            outputs={
+                'split': tan_uows_fn,
+                'bash_template': tan_bash_template_fn,
+            },
+            parameters={},
+            dist=Dist(NPROC=1),
+        ))
+
+        gathered_fn = os.path.join(rawread_dir, 'tan-gathered', 'gathered-done-files.json')
+        gen_parallel_tasks(
+            wf,
+            tan_uows_fn, gathered_fn,
+            run_dict=dict(
+                bash_template_fn=tan_bash_template_fn,
+                script='fubar-TODO', #pype_tasks.TASK_DB_TAN_APPLY_SCRIPT, # for snakemake stuff
+                inputs={
+                    'units_of_work': '0-rawreads/tan-chunks/{tan0_id}/some-units-of-work.json',
+                },
+                outputs={
+                    #'job_done': '0-rawreads/{dal0_id}/daligner.done',
+                    'results': '0-rawreads/tan-runs/{tan0_id}/some-done-files.json',
+                },
+                parameters={},
+
+            ),
+            dist=Dist(NPROC=4, MB=4000, job_dict=config['job.step.da']),
+        )
+
+        r_db_tan_fn = os.path.join(rawread_dir, 'tan-combine', 'raw_reads.db')
+        wf.addTask(gen_task(
+            script=pype_tasks.TASK_DB_TAN_COMBINE_SCRIPT,
+            inputs={
+                'config': general_config_fn,
+                'db': r_db_dust_fn,
+                'gathered': gathered_fn,
+            },
+            outputs={
+                'new_db': r_db_tan_fn,
+            },
+            parameters={},
+            dist=Dist(local=True),
+        ))
+
+        #### HPC.REPmask/daligner/LAmerge
+        codes = functional.parse_REPmask_code(general_config['pa_REPmask_code'])
+        LOG.info('Parsed pa_REPmask_code (repa,repb,repc): {!r}'.format(codes))
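+        # e.g. (per the option help above) pa_REPmask_code='1,20;5,15;20,10' is assumed
+        # to parse into three (group_size, coverage_limit) pairs, consumed as
+        # codes[0..2] for REPmask iterations a, b, and c below.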
+
+        ### REPmask tasks (a, b, c)
+        letter = 'a'
+        group_size, coverage_limit = codes[0]
+        i_db_fn = r_db_tan_fn
+        o_db_fn = add_rep_tasks(wf, rawread_dir, config, general_config,
+                general_config_fn, i_db_fn, length_cutoff_fn,
+                letter, group_size, coverage_limit)
+        letter = 'b'
+        group_size, coverage_limit = codes[1]
+        i_db_fn = o_db_fn
+        o_db_fn = add_rep_tasks(wf, rawread_dir, config, general_config,
+                general_config_fn, i_db_fn, length_cutoff_fn,
+                letter, group_size, coverage_limit)
+        letter = 'c'
+        group_size, coverage_limit = codes[2]
+        i_db_fn = o_db_fn
+        o_db_fn = add_rep_tasks(wf, rawread_dir, config, general_config,
+                general_config_fn, i_db_fn, length_cutoff_fn,
+                letter, group_size, coverage_limit)
+        r_db_rep_fn = o_db_fn
+
+        #### basic daligner/LAmerge
+        p_id2las_fn = os.path.join(rawread_dir, 'las-merge-combine', 'p_id2las.json')
+        las_fofn_fn = os.path.join(rawread_dir, 'las-merge-combine', 'las_fofn.json')
+
+        add_daligner_and_merge_tasks(
+            wf,
+            general_config, config['job.step.da'], config['job.step.la'],
+            rawread_dir,
+            general_config_fn, r_db_rep_fn,
+            length_cutoff_fn,
+            p_id2las_fn, las_fofn_fn,
+            daligner_wildcard='dal0_id',
+            lamerge_wildcard='mer0_id',
+            daligner_params={},
+            db_prefix='raw_reads', # TODO: Infer
+            daligner_split_script=pype_tasks.TASK_DB_DALIGNER_SPLIT_SCRIPT,
+        )
+        ####
+
+        if general_config['target'] == 'overlapping':
+            sys.exit(0)
+
+        # Produce new FOFN of preads fasta, based on consensus of overlaps.
+
+        split_fn = os.path.join(
+            rawread_dir, 'cns-split', 'split.json')
+        bash_template_fn = os.path.join(
+            rawread_dir, 'cns-split', 'consensus-bash-template.sh')
+        params = dict(parameters)
+        params['wildcards'] = 'cns0_id,cns0_id2'
+        wf.addTask(gen_task(
+            script=pype_tasks.TASK_CONSENSUS_SPLIT_SCRIPT,
+            inputs={
+                'p_id2las': p_id2las_fn,
+                'raw_reads_db': r_db_rep_fn,
+                'length_cutoff': length_cutoff_fn,
+                'config': general_config_fn,
+            },
+            outputs={
+                'split': split_fn,
+                'bash_template': bash_template_fn,
+            },
+            parameters=params,
+            dist=Dist(local=True),
+        ))
+
+        gathered_fn = os.path.join(rawread_dir, 'cns-gather', 'gathered.json')
+        gen_parallel_tasks(
+            wf,
+            split_fn, gathered_fn,
+            run_dict=dict(
+                bash_template_fn=bash_template_fn,
+                script=pype_tasks.TASK_CONSENSUS_TASK_SCRIPT, # for snakemake only
+                inputs = {
+                    #'las': '0-rawreads/cns-split/{cns0_id}/merged.{cns0_id2}.las',
+                    #'db': r_db_rep_fn,
+                    #'length_cutoff': length_cutoff_fn,
+                    #'config': general_config_fn,
+                    'units_of_work': '0-rawreads/cns-chunks/{cns0_id}/some-units-of-work.json',
+                },
+                outputs = {
+                    #'fasta': '0-rawreads/consensus/{cns0_id}/consensus.{cns0_id2}.fasta',
+                    'results': '0-rawreads/cns-runs/{cns0_id}/some-done-files.json',
+                },
+                parameters={},
+            ),
+            dist=Dist(NPROC=6, job_dict=config['job.step.cns']),
+        )
+
+        preads_fofn_fn = os.path.join(rawread_dir, 'preads', 'input_preads.fofn')
+        wf.addTask(gen_task(
+            script=pype_tasks.TASK_CONSENSUS_GATHER_SCRIPT,
+            inputs={
+                'gathered': gathered_fn,
+                'config': general_config_fn,
+                'raw_reads_db': r_db_rep_fn,
+            },
+            outputs={
+                'preads_fofn': preads_fofn_fn,
+            },
+            parameters=parameters, #{},
+            dist=Dist(local=True),
+        ))
+
+        rdir = os.path.join(rawread_dir, 'report')
+        pre_assembly_report_fn = os.path.join(rdir, 'pre_assembly_stats.json')
+        params = dict(parameters)
+        params['length_cutoff_user'] = general_config['length_cutoff']
+        params['genome_length'] = general_config['genome_size'] # note different name; historical
+        wf.addTask(gen_task(
+            script=pype_tasks.TASK_REPORT_PRE_ASSEMBLY_SCRIPT,
+            inputs={'length_cutoff': length_cutoff_fn,
+                    'raw_reads_db': r_db_rep_fn,
+                    'preads_fofn': preads_fofn_fn,
+                    'config': general_config_fn,
+            },
+            outputs={'pre_assembly_report': pre_assembly_report_fn,
+            },
+            parameters=params,
+            dist=Dist(local=True),
+        ))
+
+        if general_config['target'] == 'pre-assembly':
+            wf.refreshTargets()
+            LOG.info('Quitting after stage-0 for General.target=pre-assembly')
+            return
+
+    # build pread database
+    if general_config['input_type'] == 'preads':
+        LOG.info('General.input_type=preads, so we skip stage 0-rawreads.')
+        preads_fofn_fn = general_config['input_fofn']
+        assert os.path.exists(preads_fofn_fn), '{!r} does not exist.'.format(preads_fofn_fn)
+
+    pdb_build_done = os.path.join(pread_dir, 'pdb_build_done')
+    run_jobs_fn = os.path.join(pread_dir, 'run_jobs.sh')
+    preads_db_fn = os.path.join(pread_dir, 'build', 'preads.db')
+    length_cutoff_pr_fn = os.path.join(pread_dir, 'build', 'length_cutoff')
+
+    wf.addTask(gen_task(
+        script=pype_tasks.TASK_DB_BUILD_SCRIPT,
+        inputs={
+            'config': general_config_fn,
+            'input_fofn': preads_fofn_fn,
+        },
+        outputs={
+            'length_cutoff': length_cutoff_pr_fn,
+            'db': preads_db_fn,
+            # Also .preads.*, of course.
+        },
+        parameters=dict(
+        ),
+        dist=Dist(NPROC=1, job_dict=config['job.step.dust']),
+    ))
+
+    ####
+    p_id2las_fn = os.path.join(pread_dir, 'las-merge-combine', 'block2las.json')
+    las_fofn_fn = os.path.join(pread_dir, 'las-merge-combine', 'las_fofn.json')
+
+    add_daligner_and_merge_tasks(
+        wf,
+        general_config, config['job.step.pda'], config['job.step.pla'],
+        pread_dir,
+        general_config_fn, preads_db_fn, # no tan-mask for preads
+        length_cutoff_pr_fn,
+        p_id2las_fn, las_fofn_fn,
+        daligner_wildcard='dal1_id',
+        lamerge_wildcard='mer1_id',
+        daligner_params={},
+        db_prefix='preads', # TODO: Infer
+        daligner_split_script=pype_tasks.TASK_DB_DALIGNER_SPLIT_SCRIPT,
+    )
+    ####
+
+    db2falcon_dir = os.path.join(pread_dir, 'db2falcon')
+    db2falcon_done_fn = os.path.join(db2falcon_dir, 'db2falcon_done')
+    preads4falcon_fn = os.path.join(db2falcon_dir, 'preads4falcon.fasta')
+    wf.addTask(gen_task(
+        script=pype_tasks.TASK_RUN_DB_TO_FALCON_SCRIPT,
+        inputs={'p_id2las': p_id2las_fn,
+                'preads_db': preads_db_fn,
+                },
+        outputs={'job_done': db2falcon_done_fn,
+                 'preads4falcon': preads4falcon_fn,
+                 },
+        parameters={},
+        dist=Dist(NPROC=4, job_dict=config['job.step.asm']),
+    ))
+
+    falcon_asm_done_fn = os.path.join(falcon_asm_dir, 'falcon_asm_done')
+    for key in ('overlap_filtering_setting', 'length_cutoff_pr', 'fc_ovlp_to_graph_option'):
+        parameters[key] = general_config[key]
+    wf.addTask(gen_task(
+        script=pype_tasks.TASK_RUN_FALCON_ASM_SCRIPT,
+        inputs={'db2falcon_done': db2falcon_done_fn, 'db_file': preads_db_fn,
+                'preads4falcon_fasta': preads4falcon_fn,
+                'las_fofn': las_fofn_fn,
+                'config': general_config_fn,
+                },
+        outputs={'falcon_asm_done': falcon_asm_done_fn},
+        parameters=parameters,
+        dist=Dist(NPROC=4, job_dict=config['job.step.asm']),
+    ))
+    wf.refreshTargets()
+
+    with io.cd('0-rawreads'):
+        # for backwards-compatibility
+        io.symlink('las-merge-combine', 'las-gather')
+
+    #return falcon_asm_done
+
+
+def add_daligner_and_merge_tasks(
+        wf,
+        general_config, daligner_job_config, merge_job_config,
+        super_dir,
+        general_config_fn, db_fn,
+        length_cutoff_fn, # not always needed (refactor later)
+        p_id2las_fn, las_fofn_fn,
+        daligner_wildcard, #='dal0_id',
+        lamerge_wildcard, #='mer0_id',
+        daligner_params=dict(),
+        db_prefix='raw_reads',
+        daligner_split_script=pype_tasks.TASK_DB_DALIGNER_SPLIT_SCRIPT,
+    ):
+    """
+    Results:
+      p_id2las_fn, las_fofn_fn
+    """
+    parameters = dict()
+
+    # run daligner
+    daligner_all_units_fn = os.path.join(
+        super_dir, 'daligner-split', 'all-units-of-work.json')
+    daligner_bash_template_fn = os.path.join(
+        super_dir, 'daligner-split', 'daligner_bash_template.sh')
+    params = dict(daligner_params)
+    params['skip_checks'] = int(general_config.get('skip_checks', 0))
+    params['wildcards'] = daligner_wildcard
+    wf.addTask(gen_task(
+        script=daligner_split_script,
+        inputs={
+            'config': general_config_fn,
+            'db': db_fn,
+            'length_cutoff': length_cutoff_fn,
+        },
+        outputs={
+            'split': daligner_all_units_fn,
+            'bash_template': daligner_bash_template_fn
+        },
+        parameters=params,
+        dist=Dist(local=True, NPROC=4), # really, NPROC=1, but we need to know the max
+    ))
+
+    gathered_fn = os.path.join(super_dir, 'daligner-gathered', 'gathered-done-files.json')
+    gen_parallel_tasks(
+        wf,
+        daligner_all_units_fn, gathered_fn,
+        run_dict=dict(
+            bash_template_fn=daligner_bash_template_fn,
+            script=pype_tasks.TASK_DB_DALIGNER_APPLY_SCRIPT, # for snakemake stuff
+            inputs={
+                'units_of_work': os.path.join(super_dir, 'daligner-chunks/{%s}/some-units-of-work.json'%daligner_wildcard),
+            },
+            outputs={
+                'results': os.path.join(super_dir, 'daligner-runs/{%s}/some-done-files.json'%daligner_wildcard),
+            },
+            parameters={},
+        ),
+        dist=Dist(NPROC=4, MB=4000, job_dict=daligner_job_config),
+    )
+
+    gathered_las_fn = os.path.join(super_dir, 'daligner-combine', 'gathered-las.json')
+    wf.addTask(gen_task(
+        script=pype_tasks.TASK_DB_DALIGNER_COMBINE_SCRIPT,
+        inputs={
+            'config': general_config_fn,
+            'db': db_fn,
+            'gathered': gathered_fn,
+        },
+        outputs={
+            'las_paths': gathered_las_fn,
+        },
+        parameters={},
+        #dist=Dist(NPROC=1, MB=4000, job_dict=daligner_job_config)
+        dist=Dist(local=True),
+    ))
+
+    # Merge .las files.
+    las_merge_all_units_fn = os.path.join(super_dir, 'las-merge-split', 'all-units-of-work.json')
+    bash_template_fn = os.path.join(super_dir, 'las-merge-split', 'las-merge-bash-template.sh')
+    params = dict(parameters)
+    params['db_prefix'] = db_prefix
+    params['wildcards'] = lamerge_wildcard
+    wf.addTask(gen_task(
+        script=pype_tasks.TASK_DB_LAMERGE_SPLIT_SCRIPT,
+        inputs={
+            'config': general_config_fn,
+            'las_paths': gathered_las_fn,
+        },
+        outputs={
+            'split': las_merge_all_units_fn,
+            'bash_template': bash_template_fn,
+        },
+        parameters=params,
+        dist=Dist(local=True),
+    ))
+
+    gathered_fn = os.path.join(super_dir, 'las-merge-gathered', 'gathered.json')
+    gen_parallel_tasks(
+        wf,
+        las_merge_all_units_fn, gathered_fn,
+        run_dict=dict(
+            bash_template_fn=bash_template_fn,
+            script=pype_tasks.TASK_DB_LAMERGE_APPLY_SCRIPT, # for snakemake
+            inputs={
+                'units_of_work': os.path.join(super_dir, 'las-merge-chunks/{%s}/some-units-of-work.json'%lamerge_wildcard),
+            },
+            outputs={
+                'results': os.path.join(super_dir, 'las-merge-runs/{%s}/some-las-paths.json'%lamerge_wildcard),
+            },
+            parameters={},
+        ),
+        dist=Dist(NPROC=1, job_dict=merge_job_config),
+    )
+
+    wf.addTask(gen_task(
+        script=pype_tasks.TASK_DB_LAMERGE_COMBINE_SCRIPT,
+        inputs={
+            'config': general_config_fn,
+            'gathered': gathered_fn,
+        },
+        outputs={
+            'block2las': p_id2las_fn,
+            'las_paths': las_fofn_fn,
+        },
+        parameters={},
+        dist=Dist(local=True),
+    ))
+
+
+def add_rep_tasks(
+        wf,
+        rawread_dir, config, general_config,
+        general_config_fn, i_db_fn, length_cutoff_fn,
+        letter, group_size, coverage_limit,
+        ):
+        """
+        Add daligner/lamerge/REPmask parallel tasks for one iteration of repeat-masking.
+        TODO: Make the tasks no-ops if the codes are zero (or something like that).
+        """
+        name = 'rep{}'.format(letter)
+        rep_dir = os.path.join(rawread_dir, name)
+        o_db_rep_fn = os.path.join(rep_dir, 'rep-combine', 'raw_reads.db')
+
+        p_id2las_fn = os.path.join(rep_dir, 'las-merge-combine', 'p_id2las.json')
+        las_fofn_fn = os.path.join(rep_dir, 'las-merge-combine', 'las_fofn.json')
+
+        rep_daligner_params = dict(
+            group_size=group_size, coverage_limit=coverage_limit,
+        )
+        add_daligner_and_merge_tasks(
+            wf,
+            general_config, config['job.step.da'], config['job.step.la'],
+            rep_dir,
+            general_config_fn, i_db_fn,
+            length_cutoff_fn,
+            p_id2las_fn, las_fofn_fn,
+            daligner_wildcard='dal0{}_id'.format(letter),
+            lamerge_wildcard='mer0{}_id'.format(letter),
+            daligner_params=rep_daligner_params,
+            db_prefix='raw_reads', # TODO: Infer
+            daligner_split_script=pype_tasks.TASK_DB_REP_DALIGNER_SPLIT_SCRIPT,
+        )
+
+        ### REPmask
+        # rep-split
+        # We assume that daligner/LAmerge have already run.
+        # Instead of using the REP.mask calls from rep-jobs.05.MASK,
+        # we construct our own.
+        rep_uows_fn = os.path.join(
+            rep_dir, 'rep-split', 'rep-uows.json')
+        rep_bash_template_fn = os.path.join(
+            rep_dir, 'rep-split', 'bash_template.sh')
+        wf.addTask(gen_task(
+            script=pype_tasks.TASK_DB_REP_SPLIT_SCRIPT,
+            inputs={
+                'config': general_config_fn,
+                'db': i_db_fn,
+                'las_paths': las_fofn_fn,
+            },
+            outputs={
+                'split': rep_uows_fn,
+                'bash_template': rep_bash_template_fn,
+            },
+            parameters={
+                'group_size': group_size,
+                'coverage_limit': coverage_limit,
+                'wildcards': '{}_id'.format(name),
+            },
+            dist=Dist(NPROC=1),
+        ))
+
+        # rep-apply
+        gathered_fn = os.path.join(rep_dir, 'rep-gathered', 'gathered-done-files.json')
+        gen_parallel_tasks(
+            wf,
+            rep_uows_fn, gathered_fn,
+            run_dict=dict(
+                bash_template_fn=rep_bash_template_fn,
+                script='fubar-TODO', #pype_tasks.TASK_DB_REP_APPLY_SCRIPT, # for snakemake stuff
+                inputs={
+                    'units_of_work': '0-rawreads/%(name)s/rep-chunks/{%(name)s_id}/some-units-of-work.json'%locals(),
+                },
+                outputs={
+                    'results': '0-rawreads/%(name)s/rep-runs/{%(name)s_id}/some-done-files.json'%locals(),
+                },
+                parameters={},
+
+            ),
+            dist=Dist(NPROC=4, MB=4000, job_dict=config['job.step.da']),
+        )
+
+        # rep-combine
+        wf.addTask(gen_task(
+            script=pype_tasks.TASK_DB_REP_COMBINE_SCRIPT,
+            inputs={
+                'config': general_config_fn,
+                'db': i_db_fn,
+                'gathered': gathered_fn,
+            },
+            outputs={
+                'new_db': o_db_rep_fn,
+            },
+            parameters={
+                'group_size': group_size,
+            },
+            dist=Dist(local=True),
+        ))
+
+        return o_db_rep_fn
+
+
+def main(argv=sys.argv):
+    parser = argparse.ArgumentParser()
+    parser.add_argument('config',
+                        help='.cfg/.ini/.json')
+    parser.add_argument('logger',
+                        nargs='?',
+                        help='(Optional)JSON config for standard Python logging module')
+    args = parser.parse_args(argv[1:])
+    main1(argv[0], args.config, args.logger)
+
+
+if __name__ == '__main__':
+    main()

+ 79 - 0
FALCON/falcon_kit/mains/symlink_mapped.py

@@ -0,0 +1,79 @@
+from future.utils import viewitems
+
+import argparse
+import json
+import os
+import sys
+
+def deserialize(fn):
+    with open(fn) as ifs:
+        return json.loads(ifs.read())
+
+def assert_exists(fn):
+    if not os.path.isfile(fn):
+        raise Exception('Does not exist: {!r}'.format(fn))
+
+def mkdir(dirname):
+    if not os.path.isdir(dirname):
+        # Possible race-condition, so dirs must be created serially.
+        os.makedirs(dirname)
+
+def symlink(name, target):
+    msg = '{} -> {}'.format(name, target)
+    assert not os.path.lexists(name), msg
+    #print msg
+    os.symlink(target, name)
+
+def run(special_split_fn, fn_patterns):
+    """
+    Symlink targets will be relative to cwd.
+    For each pattern, each wildcard will be substituted everywhere, e.g.
+        fn_pattern == 'top/{key}/input_{key}.txt'
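+        with wildcards == {'key': 'abc'}, the created symlink name is 'top/abc/input_abc.txt'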
+    """
+    fnkeypattdict = dict(fnkeypatt.split('=') for fnkeypatt in fn_patterns)
+    jobs = deserialize(special_split_fn)
+    mapdir = os.path.normpath(os.path.dirname(os.path.normpath(special_split_fn)))
+    for job in jobs:
+        inputs = job['input']
+        wildcards = job['wildcards']
+        for (fnkey, fn_pattern) in viewitems(fnkeypattdict):
+            val = inputs[fnkey]
+            # val should be relative to the location of the special_split_fn.
+            #assert not os.path.isabs(val), 'mapped input (dynamic output) filename {!r} must be relative (to serialized file location {!r})'.format(
+            #        val, special_split_fn)
+            if not os.path.isabs(val):
+                mapped_input_fn = os.path.join(mapdir, val)
+            else:
+                mapped_input_fn = val
+            assert_exists(mapped_input_fn)
+            try:
+                symlink_name = fn_pattern.format(**wildcards)
+            except Exception as err:
+                import pprint
+                msg = str(err) + ': for pattern {!r} and wildcards\n{!r}'.format(
+                        fn_pattern, pprint.pformat(wildcards))
+                raise Exception(msg)
+            outdir = os.path.normpath(os.path.dirname(symlink_name))
+            mkdir(outdir)
+            target_name = os.path.relpath(mapped_input_fn, outdir)
+            symlink(symlink_name, target_name)
+
+def parse_args(argv):
+    description = 'Create symlinks named after "fn_pattern", targeting values in "mapped_fn".'
+    parser = argparse.ArgumentParser(
+            description=description,
+    )
+    parser.add_argument(
+            '--special-split-fn', required=True,
+            help='Serialized split-file (in our special format), where "mapped_inputs" has a map with key to filename, relative to the directory of this file.')
+    parser.add_argument(
+            'fn_patterns', nargs='+',
+            help='"fnkey=pattern" Can appear multiple times. Each is a pattern for symlinks, to be substituted with keys in special_split_fn. Each fnkey=filename must appear in the input section of each job listed in special-split.')
+    return parser.parse_args(argv[1:])
+
+def main(argv=sys.argv):
+    args = parse_args(argv)
+    run(**vars(args))
+
+if __name__ == "__main__":
+    main()

+ 85 - 0
FALCON/falcon_kit/mains/task_report_pre_assembly.py

@@ -0,0 +1,85 @@
+
+
+import argparse
+import logging
+import os
+import sys
+from .. import io
+from .. import bash
+from .. import run_support
+
+LOG = logging.getLogger()
+
+
+def script_run_report_pre_assembly(i_raw_reads_db_fn, i_preads_fofn_fn, genome_length, length_cutoff, o_json_fn):
+    params = dict()
+    params.update(locals())
+    script = """\
+python3 -m falcon_kit.mains.report_pre_assembly --genome-length {genome_length} --length-cutoff {length_cutoff} --db {i_raw_reads_db_fn} --preads-fofn {i_preads_fofn_fn} --out {o_json_fn}
+"""
+    return script.format(**params)
+
+
+def run(config_fn, length_cutoff_fn, raw_reads_db_fn, preads_fofn_fn, pre_assembly_report_fn):
+    config = io.deserialize(config_fn)
+    genome_length = int(config['genome_size'])
+    length_cutoff_user = int(config['length_cutoff'])
+    # Update length_cutoff if auto-calc (when length_cutoff is negative).
+    # length_cutoff_fn was created long ago, so no filesystem issues.
+    length_cutoff = run_support.get_length_cutoff(
+        length_cutoff_user, length_cutoff_fn)
+    # Hmmm. Actually, I think we now write the user length_cutoff into the length_cutoff file,
+    # if not -1. TODO(CD): Check on that, and simplify here if so.
+
+    script = script_run_report_pre_assembly(
+        raw_reads_db_fn, preads_fofn_fn, genome_length, length_cutoff, pre_assembly_report_fn)
+    script_fn = 'run-report-pre-assembly.sh'
+    job_done_fn = 'job.done'
+    bash.write_script(script, script_fn, job_done_fn)
+    io.syscall('bash -vex {}'.format(script_fn))
+
+
+class HelpF(argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter):
+    pass
+
+
+def parse_args(argv):
+    description = 'Prepare to run the pre-assembly report generator, and run it.'
+    epilog = 'length_cutoff might be cleaned up someday. For now, yeah, it is confusing.'
+    parser = argparse.ArgumentParser(
+        description=description,
+        epilog=epilog,
+        formatter_class=HelpF,
+    )
+    parser.add_argument(
+        '--config-fn',
+        help='Input. JSON configuration. We use "length_cutoff" (if positive) and "genome_size".',
+    )
+    parser.add_argument(
+        '--length-cutoff-fn',
+        help='Input. File of a single number: the length-cutoff for raw reads.',
+    )
+    parser.add_argument(
+        '--raw-reads-db-fn',
+        help='Input. Dazzler DB of raw reads.',
+    )
+    parser.add_argument(
+        '--preads-fofn-fn',
+        help='Input. FOFN of preads las files.',
+    )
+    parser.add_argument(
+        '--pre-assembly-report-fn',
+        help='Output. In JSON format.',
+    )
+    args = parser.parse_args(argv[1:])
+    return args
+
+
+def main(argv=sys.argv):
+    args = parse_args(argv)
+    logging.basicConfig(level=logging.INFO)
+    run(**vars(args))
+
+
+if __name__ == '__main__':  # pragma: no cover
+    main()

+ 40 - 0
FALCON/falcon_kit/mains/tasks.py

@@ -0,0 +1,40 @@
+"""Executable tasks.
+
+To be called by pbsmrtpipe.
+
+pypeFLOW uses its own adaptors instead.
+"""
+
+
+
+from .. import run_support as support
+import sys
+
+
+def help():
+    print("""
+Usage:
+    falcon-task [task] <[task-args]>
+
+tasks:
+    make-fofn-abs
+""")
+    sys.exit(2)
+
+
+def main_make_fofn_abs(i_fofn_fn, o_fofn_fn):
+    support.make_fofn_abs(i_fofn_fn, o_fofn_fn)
+
+
+def main(argv=sys.argv):
+    if len(argv) < 2 or argv[1].startswith('-'):
+        help()
+    task = argv[1]
+    tasks = {
+        'make-fofn-abs': main_make_fofn_abs,
+    }
+    return tasks[task](*argv[2:])
+
+
+if __name__ == "__main__":
+    main(sys.argv)

+ 53 - 0
FALCON/falcon_kit/mains/zmw_collect.py

@@ -0,0 +1,53 @@
+"""
+Performs a single pass over an input FASTA (file or streamed), and collects
+all ZMWs. For each ZMW it calculates the expected molecular size by picking
+the internal median subread length.
+The script writes one line per ZMW indicating:
+movie_zmw median_insert_length total_insert_sum num_passes
+Author: Ivan Sovic
+"""
+from falcon_kit.mains.fasta_filter import ZMWTuple
+
+import falcon_kit.FastaReader as FastaReader
+import falcon_kit.mains.fasta_filter as fasta_filter
+import falcon_kit.io as io
+
+import os
+import sys
+import argparse
+import logging
+import contextlib
+import itertools
+
+LOG = logging.getLogger()
+
+def yield_record(fp_in):
+    fasta_records = FastaReader.yield_fasta_record(fp_in, log=LOG.info)
+    for record in fasta_records:
+        yield record
+
+def run(fp_out, yield_zmw_tuple_func):
+    for zmw_id, zmw_subreads in itertools.groupby(yield_zmw_tuple_func, lambda x: x.zmw_id):
+        zmw_subreads_list = list(zmw_subreads)
+        zrec = fasta_filter.internal_median_zmw_subread(zmw_subreads_list)
+        movie_zmw = zrec.movie_name + '/' + zrec.zmw_id
+        zmw_unique_molecular_size = zrec.seq_len
+        zmw_total_size = sum([zmw.seq_len for zmw in zmw_subreads_list])
+        zmw_num_passes = len(zmw_subreads_list)
+        fp_out.write('{}\t{}\t{}\t{}\n'.format(movie_zmw, zmw_unique_molecular_size, zmw_total_size, zmw_num_passes))
+
+def parse_args(argv):
+    parser = argparse.ArgumentParser(description="For a given streamed FASTA file, it collects all subreads per ZMW, "\
+                                        "calculates the median insert size, and writes out a TSV file with base counts.",
+                                        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    args = parser.parse_args(argv[1:])
+    return args
+
+def main(argv=sys.argv):
+    args = parse_args(argv)
+    logging.basicConfig(level=logging.INFO)
+
+    run(sys.stdout, fasta_filter.yield_zmwtuple(yield_record(sys.stdin), whitelist_set=None, store_record=False))
+
+if __name__ == "__main__":  # pragma: no cover
+    main(sys.argv)          # pragma: no cover

+ 160 - 0
FALCON/falcon_kit/mains/zmw_subsample.py

@@ -0,0 +1,160 @@
+"""
+Takes a CSV file with a list of ZMWs with their corresponding lengths.
+The script outputs a JSON file with a whitelist of ZMWs selected by a given
+strategy (random, longest, etc.) and desired coverage of a genome.
+Author: Ivan Sovic
+"""
+import falcon_kit.util.system as system
+
+import falcon_kit.io as io
+
+import os
+import sys
+import argparse
+import logging
+import contextlib
+import itertools
+import random
+import json
+
+LOG = logging.getLogger()
+
+STRATEGY_RANDOM = 'random'
+STRATEGY_LONGEST = 'longest'
+
+def strategy_func_random(zmws):
+    """
+    >>> random.seed(12345); strategy_func_random([])
+    []
+    >>> random.seed(12345); strategy_func_random([('synthetic/1', 9)])
+    [('synthetic/1', 9)]
+    >>> random.seed(12345); strategy_func_random([('synthetic/1', 9), ('synthetic/2', 21), ('synthetic/3', 9), ('synthetic/4', 15), ('synthetic/5', 20)])
+    [('synthetic/5', 20), ('synthetic/3', 9), ('synthetic/2', 21), ('synthetic/1', 9), ('synthetic/4', 15)]
+    """
+    ret = list(zmws)
+    random.shuffle(ret)
+    return ret
+
+def strategy_func_longest(zmws):
+    """
+    >>> strategy_func_longest([])
+    []
+    >>> strategy_func_longest([('synthetic/1', 9)])
+    [('synthetic/1', 9)]
+    >>> strategy_func_longest([('synthetic/1', 9), ('synthetic/2', 21), ('synthetic/3', 9), ('synthetic/4', 15), ('synthetic/5', 20)])
+    [('synthetic/2', 21), ('synthetic/5', 20), ('synthetic/4', 15), ('synthetic/1', 9), ('synthetic/3', 9)]
+    """
+    return sorted(zmws, key = lambda x: x[1], reverse = True)
+
+STRATEGY_TYPE_TO_FUNC = {   STRATEGY_RANDOM: strategy_func_random,
+                            STRATEGY_LONGEST: strategy_func_longest,
+                        }
+
+def select_zmws(zmws, min_requested_bases):
+    """
+    >>> select_zmws([], 0)
+    ([], 0)
+    >>> select_zmws([], 10)
+    ([], 0)
+    >>> select_zmws([('zmw/1', 1), ('zmw/2', 2), ('zmw/3', 5), ('zmw/4', 7), ('zmw/5', 10), ('zmw/6', 15)], 10)
+    (['zmw/1', 'zmw/2', 'zmw/3', 'zmw/4'], 15)
+    >>> select_zmws([('zmw/1', 1), ('zmw/2', 2), ('zmw/3', 5), ('zmw/4', 7), ('zmw/5', 10), ('zmw/6', 15)], 20)
+    (['zmw/1', 'zmw/2', 'zmw/3', 'zmw/4', 'zmw/5'], 25)
+    >>> select_zmws([('zmw/1', 1), ('zmw/1', 2), ('zmw/1', 5), ('zmw/1', 7), ('zmw/1', 10), ('zmw/1', 15)], 20)
+    (['zmw/1', 'zmw/1', 'zmw/1', 'zmw/1', 'zmw/1'], 25)
+    """
+    # Select the first N ZMWs which sum up to the desired coverage.
+    num_bases = 0
+    subsampled_zmws = []
+    for zmw_name, seq_len in zmws:
+        num_bases += seq_len
+        subsampled_zmws.append(zmw_name)
+        if num_bases >= min_requested_bases:
+            break
+    return subsampled_zmws, num_bases
+
+def calc_stats(total_unique_molecular_bases, total_bases, output_bases, genome_size, coverage):
+    """
+    >>> calc_stats(0, 0, 0, 0, 0) == \
+    {'genome_size': 0, 'coverage': 0, 'total_bases': 0, 'total_unique_molecular_bases': 0, \
+    'output_bases': 0, 'unique_molecular_avg_cov': 0.0, 'output_avg_cov': 0.0, 'total_avg_cov': 0.0}
+    True
+    >>> calc_stats(10000, 100000, 2000, 1000, 2) == \
+    {'genome_size': 1000, 'coverage': 2, 'total_bases': 100000, 'total_unique_molecular_bases': 10000, \
+    'output_bases': 2000, 'unique_molecular_avg_cov': 10.0, 'output_avg_cov': 2.0, 'total_avg_cov': 100.0}
+    True
+    """
+    unique_molecular_avg_cov = 0.0 if genome_size == 0 else float(total_unique_molecular_bases) / float(genome_size)
+    total_avg_cov = 0.0 if genome_size == 0 else float(total_bases) / float(genome_size)
+    output_avg_cov = 0.0 if genome_size == 0 else float(output_bases) / float(genome_size)
+
+    ret = {
+        'genome_size': genome_size,
+        'coverage': coverage,
+        'total_bases': total_bases,
+        'total_unique_molecular_bases': total_unique_molecular_bases,
+        'output_bases': output_bases,
+        'total_avg_cov': total_avg_cov,
+        'unique_molecular_avg_cov': unique_molecular_avg_cov,
+        'output_avg_cov': output_avg_cov,
+    }
+
+    return ret
+
+def collect_zmws(fp_in):
+    zmws = []
+    seen_zmws = set()
+    unique_molecular_size = 0
+    total_size = 0
+    for line in fp_in:
+        sl = line.strip().split()
+        movie_zmw, zmw_median_len, zmw_total_len, zmw_num_passes = sl[0], int(sl[1]), int(sl[2]), int(sl[3])
+        assert movie_zmw not in seen_zmws, 'Duplicate ZMWs detected in the input. Offender: "{}".'.format(movie_zmw)
+        unique_molecular_size += zmw_median_len
+        total_size += zmw_total_len
+        zmws.append((movie_zmw, zmw_median_len))
+        seen_zmws.add(movie_zmw)
+    return zmws, unique_molecular_size, total_size
+
+def run(fp_in, coverage, genome_size, strategy_func):
+    zmws, total_unique_molecular_bases, total_bases = collect_zmws(fp_in)
+    zmws = strategy_func(zmws)
+    subsampled_zmws, output_bases = select_zmws(zmws, coverage * genome_size)
+    stats_dict = calc_stats(total_unique_molecular_bases, total_bases, output_bases, genome_size, coverage)
+    return subsampled_zmws, zmws, stats_dict
+
+def parse_args(argv):
+    parser = argparse.ArgumentParser(description="Produces a whitelist of ZMWs whose median (unique molecular) "
+                                        "subread lengths sum up to the desired coverage of the given genome size, "
+                                        "using the specified subsampling strategy. Input is a TSV passed via stdin; "
+                                        "output is a JSON whitelist written to the given output file.",
+                                        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument('--strategy', type=str, default='random',
+                        help='Subsampling strategy: random, longest')
+    parser.add_argument('--coverage', type=float, default=60,
+                        help='Desired coverage for subsampling.')
+    parser.add_argument('--genome-size', type=float, required=True,
+                        help='Genome size estimate of the input dataset.')
+    parser.add_argument('--random-seed', type=int, default=12345,
+                        help='Seed value used for the random generator.', required=False)
+    parser.add_argument('out_fn', type=str, default='zmw.whitelist.json',
+                        help='Output JSON file with subsampled ZMWs.')
+    args = parser.parse_args(argv[1:])
+    return args
+
+def main(argv=sys.argv):
+    args = parse_args(argv)
+    logging.basicConfig(level=logging.INFO)
+
+    strategy_func = STRATEGY_TYPE_TO_FUNC[args.strategy]
+    LOG.info('Using subsampling strategy: "{}"'.format(args.strategy))
+
+    system.set_random_seed(args.random_seed)
+
+    zmws_whitelist, zmws_all, stats_dict = run(
+            sys.stdin, args.coverage, args.genome_size, strategy_func)
+
+    io.serialize(args.out_fn, zmws_whitelist)
+
+if __name__ == "__main__":  # pragma: no cover
+    main(sys.argv)          # pragma: no cover
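
A minimal sketch of driving the selection pipeline above in-process, assuming falcon_kit is importable; the TSV rows, coverage and genome size are made up:

    import io
    from falcon_kit.mains import zmw_subsample as zs

    tsv = io.StringIO(
        'movie/1\t9000\t27000\t3\n'
        'movie/2\t21000\t21000\t1\n'
        'movie/3\t15000\t45000\t3\n')
    # Longest-first strategy; selection stops once 2 * 10000 bases are reached.
    whitelist, zmws_all, stats = zs.run(
        tsv, coverage=2, genome_size=10000, strategy_func=zs.strategy_func_longest)
    print(whitelist)                 # ['movie/2']
    print(stats['output_avg_cov'])   # 2.1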

+ 36 - 0
FALCON/falcon_kit/multiproc.py

@@ -0,0 +1,36 @@
+"""Job pools for multiprocessing.
+"""
+
+
+from builtins import map
+from builtins import object
+import multiprocessing
+
+
+class FakePool(object):
+    """Fake version of multiprocessing.Pool
+    """
+
+    def map(self, func, iterable, chunksize=None):
+        return list(map(func, iterable))
+
+    def imap(self, func, iterable, chunksize=None):
+        return list(map(func, iterable))
+
+    def terminate(self):
+        pass
+
+    def __init__(self, initializer=None, initargs=(), *args, **kwds):
+        if initializer:
+            initializer(*initargs)
+
+
+def Pool(processes, *args, **kwds):
+    """Pool factory.
+    If 'not processes', return our FakePool;
+    otherwise, a multiprocessing.Pool.
+    """
+    if processes:
+        return multiprocessing.Pool(processes, *args, **kwds)
+    else:
+        return FakePool(*args, **kwds)
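
A minimal usage sketch of the Pool() factory: processes=0 falls back to the serial FakePool, while a positive count builds a real multiprocessing.Pool (the __main__ guard matters for the real pool on spawn-based platforms):

    from falcon_kit import multiproc

    def square(x):
        return x * x

    if __name__ == '__main__':
        serial = multiproc.Pool(0)            # FakePool: map() runs in-process
        print(serial.map(square, range(4)))   # [0, 1, 4, 9]

        real = multiproc.Pool(2)              # multiprocessing.Pool with 2 workers
        print(real.map(square, range(4)))     # [0, 1, 4, 9]
        real.terminate()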

+ 203 - 0
FALCON/falcon_kit/pype.py

@@ -0,0 +1,203 @@
+"""This was copied from falcon_unzip, but we
+needed to modify the TASK SCRIPT to use our copy of
+generic_gather.py (not used now).
+"""
+
+
+import logging
+import os
+from pypeflow.simple_pwatcher_bridge import (PypeTask, Dist)
+from pypeflow.tasks import gen_task as pype_gen_task
+from pypeflow.do_task import wait_for
+from . import io
+
+LOG = logging.getLogger(__name__)
+
+TASK_GENERIC_RUN_UNITS_SCRIPT = """\
+python3 -m falcon_kit.mains.generic_run_units_of_work --nproc={params.pypeflow_nproc} --units-of-work-fn={input.units_of_work} --bash-template-fn={input.bash_template} --results-fn={output.results}
+"""
+TASK_GENERIC_SCATTER_ONE_UOW_SCRIPT = """\
+python3 -m falcon_kit.mains.generic_scatter_one_uow --all-uow-list-fn={input.all} --one-uow-list-fn={output.one} --split-idx={params.split_idx}
+"""
+TASK_GENERIC_UNSPLIT_SCRIPT = """
+python3 -m falcon_kit.mains.generic_unsplit --result-fn-list-fn={output.result_fn_list} --gathered-fn={output.gathered}
+"""
+#TASK_GENERIC_CHUNKING_SCRIPT = """\
+# This is done via pbtag now, I think.
+#python3 -m falcon_kit.mains.generic_chunking split-fn={input.split} --bash-template-temp-fn={input.bash_template_temp} --units-of-work-fn={output.units_of_work} --uow-template-fn={output.uow_template} --split-idx={params.split_idx}
+#"""
+
+
+def wrap_gen_task(script, inputs, outputs, rule_writer=None, parameters=None, dist=None):
+    if parameters is None:
+        parameters = dict()
+    if dist is None:
+        dist = Dist()
+    from future.utils import viewitems
+    rel_inputs = dict()
+    rel_outputs = dict()
+    # Make relative to CWD. (But better if caller does this.)
+    def get_rel(maybe_abs):
+        rel = dict()
+        for (k, v) in viewitems(maybe_abs):
+            try:
+                if os.path.isabs(v):
+                    v = os.path.relpath(v)
+                rel[k] = v
+            except Exception:
+                LOG.exception('Error for {!r}->{!r}'.format(k, v))
+                raise
+        return rel
+    inputs = get_rel(inputs)
+    outputs = get_rel(outputs)
+
+    first_output_dir = os.path.normpath(os.path.dirname(list(outputs.values())[0]))
+    rel_topdir = os.path.relpath('.', first_output_dir) # redundant for rel-inputs, but fine
+    params = dict(parameters)
+    params['topdir'] = rel_topdir
+
+    pt = pype_gen_task(script, inputs, outputs, params, dist)
+
+    # Run pype_gen_task first because it can validate some of its arguments.
+    if rule_writer:
+        rule_writer(inputs, outputs, params, script)
+    return pt
+
+
+def gen_parallel_tasks(
+        wf,
+        split_fn,
+        gathered_fn,
+        run_dict,
+        rule_writer=None,
+        dist=None,
+        run_script=TASK_GENERIC_RUN_UNITS_SCRIPT,
+):
+    """
+    By convention, the first (wildcard) output in run_dict['outputs'] must be the gatherable list,
+    in the same format as the gathered_fn to be generated from them.
+
+    For now, we require a single such output, since we do not yet test for wildcards.
+    """
+    assert 'dist' not in run_dict, 'dist should be a parameter of gen_parallel_tasks(), not of its run_dict'
+    if dist is None:
+        dist = Dist()
+    from future.utils import itervalues
+    #from future.utils import viewitems
+    # run_dict['inputs'] should be patterns to match the inputs in split_fn, by convention.
+
+    #task_parameters = resolved_dict(run_dict.get('parameters', {}))
+    task_parameters = run_dict.get('parameters', {})
+    assert not task_parameters, 'We do not currently support the "parameters" field of a run_dict. {!r}'.format(task_parameters)
+
+    # Write 3 wildcard rules for snakemake, 2 with dynamic.
+    if rule_writer:
+        rule_writer.write_dynamic_rules(
+            rule_name="foo",
+            input_json=split_fn,
+            inputs=dict_rel_paths(run_dict['inputs']),
+            shell_template=run_dict['script'],
+            parameters=task_parameters,
+            wildcard_outputs=dict_rel_paths(run_dict['outputs']),
+            output_json=gathered_fn,
+    )
+
+    #outputs = {k:patt.format(**jobkv) for k,patt in output_patterns}
+    #inputs =  {k:patt.format(**jobkv) for k,patt in input_patterns}
+    #inputs['SPLIT'] = split_fn # presumably ignored by script; might not be needed at all
+    #split_fn = scatter_dict['outputs']['split'] # by convention
+    wf.refreshTargets()
+    max_jobs = wf.max_jobs
+
+    wait_for(split_fn)
+    split = io.deserialize(split_fn)
+    bash_template_fn = run_dict['bash_template_fn']
+
+    def find_wildcard_input(inputs):
+        for k, v in list(inputs.items()):
+            if '{' in v:
+                return v
+        raise Exception('No wildcard inputs among {!r}'.format(inputs))
+
+    LOG.debug('PARALLEL OUTPUTS:{}'.format(run_dict['outputs']))
+    task_results = dict()
+    for split_idx, job in enumerate(split):
+        #inputs = job['input']
+        #outputs = job['output']
+        #params = job['params']
+        #wildcards = job['wildcards']
+        #params.update({k: v for (k, v) in viewitems(job['wildcards'])}) # include expanded wildcards
+        #LOG.warning('OUT:{}'.format(outputs))
+
+        wildcards = job['wildcards']
+        def resolved(v):
+            return v.format(**wildcards)
+        def resolved_dict(d):
+            result = dict(d)
+            LOG.debug(' wildcards={!r}'.format(wildcards))
+            for k,v in list(d.items()):
+                LOG.debug('  k={}, v={!r}'.format(k, v))
+                result[k] = v.format(**wildcards)
+            return result
+        #task_inputs = resolved_dict(run_dict['inputs'])
+        task_outputs = resolved_dict(run_dict['outputs'])
+
+        wild_input = find_wildcard_input(run_dict['inputs'])
+        one_uow_fn = os.path.abspath(wild_input.format(**wildcards))
+
+        wf.addTask(pype_gen_task(
+                script=TASK_GENERIC_SCATTER_ONE_UOW_SCRIPT,
+                inputs={
+                    'all': split_fn,
+                },
+                outputs={
+                    'one': one_uow_fn,
+                },
+                parameters={
+                    'split_idx': split_idx,
+                },
+                dist=Dist(local=True, use_tmpdir=False),
+        ))
+
+        wf.addTask(pype_gen_task(
+                script=run_script, # usually TASK_GENERIC_RUN_UNITS_SCRIPT, unless individual load-time is slow
+                inputs={
+                    'units_of_work': one_uow_fn,
+                    'bash_template': bash_template_fn,
+                },
+                outputs=task_outputs, # TASK_GENERIC_RUN_UNITS_SCRIPT expects only 1, called 'results'
+                parameters={}, # some are substituted from 'dist'
+                dist=dist,
+        ))
+        wildcards_str = '_'.join(w for w in itervalues(job['wildcards']))
+        job_name = 'job{}'.format(wildcards_str)
+        task_results[job_name] = list(task_outputs.values())[0]
+
+    gather_inputs = dict(task_results)
+    ## An implicit "gatherer" simply takes the output filenames and combines their contents.
+    gathered_dn = os.path.dirname(gathered_fn)
+    result_fn_list_fn = os.path.join(gathered_dn, 'result-fn-list.json')
+    # Dump (with rel-paths) into next task-dir before next task starts.
+    io.serialize(result_fn_list_fn, [os.path.relpath(v, gathered_dn) for v in list(task_results.values())])
+    #assert 'result_fn_list' not in gather_inputs
+    #gather_inputs['result_fn_list'] = result_fn_list_fn # No! pseudo output, since it must exist in a known directory
+    LOG.debug('gather_inputs:{!r}'.format(gather_inputs))
+    wf.addTask(pype_gen_task(
+        script=TASK_GENERIC_UNSPLIT_SCRIPT,
+        inputs=gather_inputs,
+        outputs={
+            'gathered': gathered_fn,
+            'result_fn_list': result_fn_list_fn,
+        },
+        parameters={},
+        dist=Dist(local=True, use_tmpdir=False),
+    ))
+    wf.max_jobs = dist.job_dict.get('njobs', max_jobs)
+    wf.refreshTargets()
+    wf.max_jobs = max_jobs
+
+
+def dict_rel_paths(dict_paths):
+    from future.utils import viewitems
+    return {k: os.path.relpath(v) for (k, v) in viewitems(dict_paths)}
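
To make the conventions in gen_parallel_tasks() concrete: the deserialized split file is a list of job dicts carrying a 'wildcards' mapping, and the wildcard patterns in run_dict['inputs']/['outputs'] are resolved with str.format. A sketch with hypothetical keys and paths (real split files come from the *-split tasks):

    split = [
        {'wildcards': {'chunk_id': '0000'}},
        {'wildcards': {'chunk_id': '0001'}},
    ]
    run_dict_outputs = {'results': '0-rawreads/cns-runs/{chunk_id}/results.json'}

    wildcards = split[1]['wildcards']
    print(run_dict_outputs['results'].format(**wildcards))
    # -> 0-rawreads/cns-runs/0001/results.json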

+ 195 - 0
FALCON/falcon_kit/pype_tasks.py

@@ -0,0 +1,195 @@
+
+
+
+
+from future.utils import viewitems
+from future.utils import itervalues
+# PypeTask functions now need to be module-level.
+from . import run_support as support
+from . import bash  # for scattering
+# from pypeflow.simple_pwatcher_bridge import fn # not really needed
+import collections
+import json
+import logging
+import os
+import types
+LOG = logging.getLogger(__name__)
+
+
+TASK_BAM2DEXTA_SPLIT_SCRIPT = """\
+python3 -m falcon_kit.mains.bam2dexta split  --wildcards={params.wildcards} --bam={input.bam} --split-fn={output.split} --bash-template-fn={output.bash_template}
+"""
+TASK_BAM2DEXTA_APPLY_SCRIPT = """\
+python3 -m falcon_kit.mains.bam2dexta apply  --bam-fn={input.bam} --dexta-fn={output.dexta}
+"""
+TASK_BAM2DEXTA_COMBINE_SCRIPT = """\
+python3 -m falcon_kit.mains.bam2dexta combine  --gathered-fn={input.gathered} --dexta-fofn-fn={output.fofn}
+"""
+TASK_CONSENSUS_SPLIT_SCRIPT = """\
+python3 -m falcon_kit.mains.consensus_split --wildcards={params.wildcards} --p-id2las-fn={input.p_id2las} --db-fn={input.raw_reads_db} --length-cutoff-fn={input.length_cutoff} --config-fn={input.config} --split-fn={output.split} --bash-template-fn={output.bash_template}
+"""
+TASK_CONSENSUS_TASK_SCRIPT = """\
+python3 -m falcon_kit.mains.consensus_task --nproc={params.pypeflow_nproc} --las-fn={input.las} --db-fn={input.db} --length-cutoff-fn={input.length_cutoff} --config-fn={input.config} --fasta-fn={output.fasta}
+"""
+TASK_CONSENSUS_GATHER_SCRIPT = """\
+python3 -m falcon_kit.mains.consensus_gather_fasta_fofn --gathered-fn={input.gathered} --db-fn={input.raw_reads_db} --config-fn={input.config} --preads-fofn-fn={output.preads_fofn}
+"""
+TASK_REPORT_PRE_ASSEMBLY_SCRIPT = """\
+python3 -m falcon_kit.mains.task_report_pre_assembly --config-fn={input.config} --length-cutoff-fn={input.length_cutoff} --raw-reads-db-fn={input.raw_reads_db} --preads-fofn-fn={input.preads_fofn} --pre-assembly-report-fn={output.pre_assembly_report}
+"""
+
+TASK_DB_BUILD_SCRIPT = """\
+python3 -m falcon_kit.mains.dazzler --config-fn={input.config} --db-fn={output.db}  build --input-fofn-fn={input.input_fofn} --length-cutoff-fn={output.length_cutoff}
+# TODO: Verify that db exists.
+#ln -sf {output.length_cutoff} length_cutoff
+"""
+TASK_DB_TAN_SPLIT_SCRIPT = """\
+python3 -m falcon_kit.mains.dazzler --config={input.config} --db={input.db}  tan-split --split={output.split} --bash-template={output.bash_template}
+"""
+TASK_DB_TAN_APPLY_SCRIPT = """\
+python3 -m falcon_kit.mains.dazzler --config={input.config} --db={input.db}  tan-apply --script={input.script} --job-done={output.job_done}
+"""
+TASK_DB_TAN_COMBINE_SCRIPT = """\
+python3 -m falcon_kit.mains.dazzler --config={input.config} --db={input.db}  tan-combine --gathered={input.gathered} --new-db={output.new_db}
+"""
+TASK_DB_REP_SPLIT_SCRIPT = """\
+python3 -m falcon_kit.mains.dazzler --config={input.config} --db={input.db}  rep-split --las-paths-fn={input.las_paths} --wildcards={params.wildcards} -g{params.group_size} -c{params.coverage_limit} --split={output.split} --bash-template={output.bash_template}
+"""
+TASK_DB_REP_APPLY_SCRIPT = """\
+python3 -m falcon_kit.mains.dazzler --config={input.config} --db={input.db}  rep-apply --script={input.script} --job-done={output.job_done}
+"""
+TASK_DB_REP_COMBINE_SCRIPT = """\
+python3 -m falcon_kit.mains.dazzler --config={input.config} --db={input.db}  rep-combine -g{params.group_size} --gathered={input.gathered} --new-db={output.new_db}
+"""
+TASK_DB_REP_DALIGNER_SPLIT_SCRIPT = """\
+python3 -m falcon_kit.mains.dazzler --config={input.config} --db={input.db} --nproc={params.pypeflow_nproc}  rep-daligner-split --wildcards={params.wildcards} --group-size={params.group_size} --coverage-limit={params.coverage_limit} --split-fn={output.split} --bash-template-fn={output.bash_template}
+"""
+TASK_DB_DALIGNER_SPLIT_SCRIPT = """\
+python3 -m falcon_kit.mains.dazzler --config={input.config} --db={input.db} --nproc={params.pypeflow_nproc}  daligner-split --wildcards={params.wildcards} --length-cutoff-fn={input.length_cutoff} --split-fn={output.split} --bash-template-fn={output.bash_template}
+"""
+TASK_DB_DALIGNER_APPLY_SCRIPT = """\
+python3 -m falcon_kit.mains.dazzler --config={input.config} --db={input.db}  daligner-apply --script={input.script} --job-done={output.job_done}
+"""
+TASK_DB_DALIGNER_COMBINE_SCRIPT = """\
+python3 -m falcon_kit.mains.dazzler --config={input.config} --db={input.db}  daligner-combine --gathered={input.gathered} --las-paths-fn={output.las_paths}
+"""
+TASK_DB_LAMERGE_SPLIT_SCRIPT = """\
+python3 -m falcon_kit.mains.dazzler --config={input.config}                  merge-split --db-prefix={params.db_prefix} --las-paths={input.las_paths} --wildcards={params.wildcards} --split-fn={output.split} --bash-template-fn={output.bash_template}
+"""
+TASK_DB_LAMERGE_APPLY_SCRIPT = """\
+python3 -m falcon_kit.mains.dazzler --config={input.config}                  merge-apply --las-paths={input.las_paths} --las-fn={output.las_fn}
+"""
+TASK_DB_LAMERGE_COMBINE_SCRIPT = """\
+python3 -m falcon_kit.mains.dazzler --config={input.config}                  merge-combine --gathered={input.gathered} --las-paths-fn={output.las_paths} --block2las-fn={output.block2las}
+"""
+
+TASK_DUMP_RAWREAD_IDS_SCRIPT = """\
+DBshow -n {input.rawread_db} | tr -d '>' | LD_LIBRARY_PATH= awk '{{print $1}}' > {output.rawread_id_file}
+"""
+TASK_DUMP_PREAD_IDS_SCRIPT = """\
+DBshow -n {input.pread_db} | tr -d '>' | LD_LIBRARY_PATH= awk '{{print $1}}' > {output.pread_id_file}
+"""
+TASK_GENERATE_READ_TO_CTG_MAP_SCRIPT = """\
+python3 -m falcon_kit.mains.generate_read_to_ctg_map --rawread-id={input.rawread_id_file} --pread-id={input.pread_id_file} --sg-edges-list={input.sg_edges_list} --utg-data={input.utg_data} --ctg-paths={input.ctg_paths} --output={output.read_to_contig_map}
+"""
+TASK_RUN_DB_TO_FALCON_SCRIPT = """\
+# Given preads.db,
+# write preads4falcon.fasta (implicitly) in CWD.
+time DB2Falcon -U {input.preads_db}
+[ -f {output.preads4falcon} ] || exit 1
+touch {output.job_done}
+"""
+TASK_RUN_FALCON_ASM_SCRIPT = """\
+# Given, las_fofn.json,
+# write preads.ovl:
+
+# mobs uses binwrappers, so it does not see our "entry-points".
+# So, after dropping "src/py_scripts/*.py", we can call these via python3 -m:
+
+time python3 -m falcon_kit.mains.ovlp_filter --db {input.db_file} --las-fofn {input.las_fofn} {params.overlap_filtering_setting} --min-len {params.length_cutoff_pr} --out-fn preads.ovl
+
+ln -sf {input.preads4falcon_fasta} ./preads4falcon.fasta
+
+# Given preads.ovl,
+# write sg_edges_list, c_path, utg_data, ctg_paths.
+time python3 -m falcon_kit.mains.ovlp_to_graph {params.fc_ovlp_to_graph_option} --overlap-file preads.ovl >| fc_ovlp_to_graph.log
+
+# Given sg_edges_list, utg_data, ctg_paths, preads4falcon.fasta,
+# write p_ctg.fa and a_ctg_all.fa,
+# plus a_ctg_base.fa, p_ctg_tiling_path, a_ctg_tiling_path, a_ctg_base_tiling_path:
+time python3 -m falcon_kit.mains.graph_to_contig
+
+# Given a_ctg_all.fa, write a_ctg.fa:
+time python3 -m falcon_kit.mains.dedup_a_tigs >| a_ctg.fa
+
+# Given a_ctg.fa and a_ctg_all_tiling_path, write a_ctg_tiling_path:
+time python3 -m falcon_kit.mains.dedup_a_tp >| a_ctg_tiling_path
+
+# Collect all info needed to format the GFA-1 and GFA-2 representations of
+# the assembly graphs.
+time python3 -m falcon_kit.mains.collect_pread_gfa >| asm.gfa.json
+time python3 -m falcon_kit.mains.collect_pread_gfa --add-string-graph >| sg.gfa.json
+time python3 -m falcon_kit.mains.collect_contig_gfa >| contig.gfa.json
+
+# Output the assembly pread graph.
+time python3 -m falcon_kit.mains.gen_gfa_v1 asm.gfa.json >| asm.gfa
+time python3 -m falcon_kit.mains.gen_gfa_v2 asm.gfa.json >| asm.gfa2
+
+# Output the string graph.
+time python3 -m falcon_kit.mains.gen_gfa_v1 sg.gfa.json >| sg.gfa
+time python3 -m falcon_kit.mains.gen_gfa_v2 sg.gfa.json >| sg.gfa2
+
+# Output the contig graph with associated contigs attached to each primary contig.
+time python3 -m falcon_kit.mains.gen_gfa_v2 contig.gfa.json >| contig.gfa2
+
+#rm -f ./preads4falcon.fasta
+
+touch {output.falcon_asm_done}
+"""
+
+
+def fn(p): return p
+
+
+def system(call, check=False):
+    LOG.debug('$(%s)' % repr(call))
+    rc = os.system(call)
+    msg = 'Call %r returned %d.' % (call, rc)
+    if rc:
+        LOG.warning(msg)
+        if check:
+            raise Exception(msg)
+    else:
+        LOG.debug(msg)
+    return rc
+
+
+def task_dump_rawread_ids(self):
+    rawread_db = fn(self.rawread_db)
+    rawread_id_file = fn(self.rawread_id_file)
+    # A bare object() cannot take attribute assignment; use a namespace that
+    # the script template can dereference as {input.rawread_db} etc.
+    input = types.SimpleNamespace()
+    input.rawread_db = rawread_db
+    output = types.SimpleNamespace()
+    output.rawread_id_file = rawread_id_file
+    system(TASK_DUMP_RAWREAD_IDS_SCRIPT.format(**locals()))
+
+
+def task_dump_pread_ids(self):
+    pread_db = fn(self.pread_db)
+    pread_id_file = fn(self.pread_id_file)
+    # See the note above: template formatting needs attribute-bearing namespaces.
+    input = types.SimpleNamespace()
+    input.pread_db = pread_db
+    output = types.SimpleNamespace()
+    output.pread_id_file = pread_id_file
+    system(TASK_DUMP_PREAD_IDS_SCRIPT.format(**locals()))
+
+
+def task_generate_read_to_ctg_map(self):
+    # See the note above: template formatting needs attribute-bearing namespaces.
+    input = types.SimpleNamespace()
+    input.rawread_id_file = fn(self.rawread_id_file)
+    input.pread_id_file = fn(self.pread_id_file)
+    input.sg_edges_list = fn(self.sg_edges_list)
+    input.utg_data = fn(self.utg_data)
+    input.ctg_paths = fn(self.ctg_paths)
+    output = types.SimpleNamespace()
+    output.read_to_contig_map = fn(self.read_to_contig_map)
+    system(TASK_GENERATE_READ_TO_CTG_MAP_SCRIPT.format(**locals()))
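
The TASK_*_SCRIPT templates above are plain str.format templates that dereference attributes on objects named input and output (and sometimes params). A minimal rendering sketch, with hypothetical paths:

    import types
    from falcon_kit.pype_tasks import TASK_DUMP_RAWREAD_IDS_SCRIPT

    input = types.SimpleNamespace(rawread_db='0-rawreads/build/raw_reads.db')
    output = types.SimpleNamespace(rawread_id_file='0-rawreads/dump-ids/rawread_ids')
    print(TASK_DUMP_RAWREAD_IDS_SCRIPT.format(input=input, output=output))
    # -> DBshow -n 0-rawreads/build/raw_reads.db | tr -d '>' | ... > 0-rawreads/dump-ids/rawread_ids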

+ 586 - 0
FALCON/falcon_kit/run_support.py

@@ -0,0 +1,586 @@
+from future.utils import viewitems
+
+from . import bash, functional
+from .functional import cfg_tobool
+from .io import NativeIO
+from .util.system import (make_fofn_abs, make_dirs, cd)
+import json
+import logging
+import logging.config
+import os
+import re
+import io
+import sys
+import tempfile
+import time
+import uuid
+
+logger = logging.getLogger(__name__)
+
+from configparser import ConfigParser
+
+
+def _prepend_env_paths(content, names):
+    """
+    E.g.
+      names = ['PATH', 'PYTHONPATH']
+      content =
+        echo hi
+      =>
+        export PATH=current:path:${PATH}
+        export PYTHONPATH=current:path:${PYTHONPATH}
+        echo hi
+    """
+    export_env_vars = ['export %(k)s=%(v)s:${%(k)s}' % dict(
+        k=name, v=os.environ.get(name, '')) for name in names]
+    return '\n'.join(export_env_vars + [content])
+
+
+def update_env_in_script(fn, names):
+    """Modify fn using on prepend_env_paths().
+    """
+    with open(fn) as ifs:
+        content = ifs.read()
+    content = _prepend_env_paths(content, names)
+    with open(fn, 'w') as ofs:
+        ofs.write(content)
+
+
+def use_tmpdir_for_files(basenames, src_dir, link_dir):
+    """NOT USED. Kept only for reference. This will be done in pypeFLOW.
+
+    Generate script to copy db files to tmpdir (for speed).
+    - Choose tmp_dir, based on src_dir name.
+    - rsync basenames into tmp_dir  # after 'flock', per file
+    - symlink from link_dir into tmp_dir.
+    Return list of script lines, sans linefeed.
+    """
+    script = list()
+    unique = os.path.abspath(src_dir).replace('/', '_')
+    root = tempfile.gettempdir()
+    tmp_dir = os.path.join(root, 'falcon', unique)
+    script.append('mkdir -p %s' % tmp_dir)
+    for basename in basenames:
+        src = os.path.join(src_dir, basename)
+        dst = os.path.join(tmp_dir, basename)
+        rm_cmd = 'rm -f %s' % basename
+        # Wait on lock for up to 10 minutes, in case of very large files.
+        rsync_cmd = "flock -w 600 %s.lock -c 'rsync -av %s %s'" % (
+            dst, src, dst)
+        ln_cmd = 'ln -sf %s %s' % (dst, basename)
+        script.extend([rm_cmd, rsync_cmd, ln_cmd])
+    return script
+
+
+def make_job_data(url, script_fn):
+    """Choose defaults.
+    Run in same directory as script_fn.
+    Base job_name on script_fn.
+    """
+    wd = os.path.dirname(script_fn)
+    job_name = '{0}-{1}-{2}'.format(
+        os.path.basename(script_fn),
+        url.split("/")[-1],
+        str(uuid.uuid4())[:8],
+    )
+    job_data = {"job_name": job_name,
+                "cwd": wd,
+                "script_fn": script_fn}
+    return job_data
+
+
+def check_HPCdaligner_option(option):
+    msg = ''
+    if '-dal' in option:
+        msg += 'HPC.daligner option "-dal" has changed to "-B".\n'
+    if '-deg' in option:
+        msg += 'HPC.daligner option "-deg" has changed to "-D".\n'
+    if msg:
+        raise Exception(msg)
+
+
+def clean_falcon_options(fc):
+    """Update some values in fc.
+    Replace _ with - in a couple places.
+    """
+    keys = ('falcon_sense_option', 'overlap_filtering_setting', 'fc_ovlp_to_graph_option',
+    )
+    for key in keys:
+        update_dash_flags(fc, key)
+    for dk in ('pa_HPCdaligner_option', 'ovlp_HPCdaligner_option'):
+        if dk in fc:
+            check_HPCdaligner_option(fc[dk])
+
+
+def get_config(config):
+    """
+    This is only for the call from pbsmrtpipe:
+       support.get_config(support.parse_config(fn))
+    We have changed parse_config() to return a dict.
+    So this is a no-op.
+    """
+    cfg = dict(config) # already a dict now
+    return cfg
+
+
+def dict2config(jdict, section):
+    config = ConfigParser()
+    if not config.has_section(section):
+        config.add_section(section)
+    for (k, v) in viewitems(jdict):
+        config.set(section, k, str(v))
+    return config
+
+
+def parse_config(config_fn):
+    """Deprecated.
+    Called from pbsmrtpipe, for now.
+    """
+    return parse_cfg_file(config_fn)
+
+def parse_cfg_file(config_fn):
+    """Return as dict.
+    """
+    with open(config_fn) as stream:
+        ext = os.path.splitext(config_fn)[1]
+        if ext in ('.json', '.js'):
+            config = json.loads(stream.read())
+        else:
+            # Parse sections (and case-sensitively), into sub-dicts.
+            config = parse_cfg_with_sections(stream)
+    update_defaults(config['General'])
+    # Copy General section to top, for now.
+    #for key, val in config['General'].items():
+    #    config[key] = val
+    ##cfg.update(config.get('General', {}))
+    check_config_sections(config) # Ensure that the right sections exist.
+    update_job_sections(config)
+    return config
+
+def process_job_defaults(job_defaults):
+    key = 'use_tmpdir'
+    use_tmpdir = job_defaults.get(key, '')
+    if '/' in use_tmpdir:
+        tempfile.tempdir = use_tmpdir
+        os.environ['TMPDIR'] = use_tmpdir
+    else:
+        if use_tmpdir.lower().startswith('t'):
+            use_tmpdir = tempfile.gettempdir()
+        else:
+            use_tmpdir = False
+        job_defaults[key] = use_tmpdir
+
+def update_job_defaults_section(config):
+    """For backwards compatibility with stuff from 'General' section.
+    """
+    General = config['General']
+    job_defaults = config['job.defaults']
+
+    if 'njobs' in General:
+        logger.warning('"njobs" belongs in the [job.defaults] section.')
+    if 'pwatcher_type' in General:
+        logger.warning('Please specify "pwatcher_type" only in the [job.defaults] section, not in [General].')
+    if 'job_type' in General:
+        logger.warning('Please specify "job_type" only in the [job.defaults] section, not in [General].')
+    if 'stop_all_jobs_on_failure' in General:
+        logger.warning('Please specify "stop_all_jobs_on_failure" only in the [job.defaults] section, not in [General].')
+    if 'use_tmpdir' in General:
+        logger.warning('Please specify "use_tmpdir" only in the [job.defaults] section, not in [General].')
+    if 'job_name_style' in General:
+        logger.warning('Please specify "job_name_style" only in the [job.defaults] section, not in [General].')
+    if 'job_queue' in General:
+        logger.warning('Please specify "JOB_QUEUE" only in the [job.defaults] section, not as "job_queue" in [General].')
+    if 'sge_option' in General:
+        logger.warning('Please specify "JOB_OPTS" in the [job.defaults] section, not as "sge_option" in [General].')
+
+    pwatcher_type = General.get('pwatcher_type', 'fs_based') #, config.get('pwatcher_type')))
+    job_type = job_defaults.get('job_type', General.get('job_type', '')).lower()
+    job_queue = General.get('job_queue', '')
+    sge_option = General.get('sge_option', '')
+
+    if 'pwatcher_type' not in job_defaults:
+        job_defaults['pwatcher_type'] = pwatcher_type
+    else:
+        pwatcher_type = job_defaults['pwatcher_type']
+    if 'submit' not in config['job.defaults']:
+        if 'blocking' == pwatcher_type:
+            if not job_queue or ' ' not in job_queue:
+                raise Exception('pwatcher_type=blocking, but "submit" is not in [job.defaults] section.')
+            config['job.defaults']['submit'] = job_queue
+            logger.warning('Please set "submit" in [job.defaults] section. (For now, we will use "job_queue" from [General], which was a hack.)')
+        elif 'fs_based' == pwatcher_type or 'network_based' == pwatcher_type:
+            if not job_type:
+                logger.error('job.defaults.submit is not set; pwatcher_type={}; but job_type is not set. Maybe try "job_type=local" first.'.format(pwatcher_type))
+                job_type = 'local'
+                job_defaults['job_type'] = job_type
+            allowed_job_types = ['sge', 'pbs', 'torque', 'slurm', 'lsf', 'local']
+            assert job_type in allowed_job_types, 'job_type={} not in {}'.format(
+                    job_type, allowed_job_types)
+            if job_queue and 'JOB_QUEUE' not in config['job.defaults']:
+                job_defaults['JOB_QUEUE'] = job_queue
+        else:
+            raise Exception('Unknown pwatcher_type={}'.format(pwatcher_type))
+    #assert 'submit' in config['job.defaults'], repr(config)
+    if sge_option and 'JOB_OPTS' not in config['job.defaults']:
+        job_defaults['JOB_OPTS'] = sge_option
+    if 'njobs' not in job_defaults:
+        config['job.defaults']['njobs'] = int(General.get('default_concurrent_jobs', 8)) # GLOBAL DEFAULT CONCURRENCY
+        msg = 'Please supply a default for "njobs" (aka concurrency) in section [job.defaults]. For now, we will use {}'.format(
+                config['job.defaults']['njobs'])
+        logger.warning(msg)
+    def update_if_if(key):
+        if key not in job_defaults:
+            if key in General:
+                job_defaults[key] = General[key]
+                logger.warning('Found "{}" from [General] section; should be in [job.defaults] instead.'.format(key))
+    update_if_if('job_name_style')
+    update_if_if('stop_all_jobs_on_failure')
+    update_if_if('use_tmpdir')
+
+    legacy_names = [
+            'pwatcher_type', 'pwatcher_directory',
+            'job_type', 'job_queue', 'job_name_style',
+            'use_tmpdir',
+    ]
+    def update_if_missing(name, sub_dict):
+        if General.get(name) and name not in sub_dict:
+            sub_dict[name] = General[name]
+    for name in legacy_names:
+        update_if_missing(name, config['job.defaults'])
+    process_job_defaults(job_defaults)
+
+def update_job_sections(config):
+    """More for backwards compatibility with stuff from 'General' section.
+    """
+    update_job_defaults_section(config)
+    General = config['General']
+
+    # Update a few where the names change and the section is non-default.
+    def update_step_job_opts(name):
+        if General.get('sge_option_'+name) and 'JOB_OPTS' not in config['job.step.'+name]:
+            config['job.step.'+name]['JOB_OPTS'] = General['sge_option_'+name]
+    def update_step_njobs(name):
+        if General.get(name+'_concurrent_jobs') and 'njobs' not in config['job.step.'+name]:
+            config['job.step.'+name]['njobs'] = int(General[name+'_concurrent_jobs'])
+    for name in ['bd', 'da', 'la', 'pda', 'pla', 'cns', 'fc', 'asm']:
+        update_step_job_opts(name)
+        update_step_njobs(name)
+    # Prefer 'asm' to 'fc'.
+    asm = dict(config['job.step.asm'])
+    config['job.step.asm'] = config['job.step.fc']
+    del config['job.step.fc']
+    config['job.step.asm'].update(asm)
+
+def parse_cfg_with_sections(stream):
+    """Return as dict of dict of ...
+    """
+    #Experimental:
+    """
+    ConfigParser sections become sub-sub sections when separated by dots.
+
+        [foo.bar]
+        baz = 42
+
+    is equivalent to JSON
+
+        {"foo": {"bar": {"baz": 42}}}
+    """
+    content = stream.read()
+    result = dict()
+    try:
+        return json.loads(content)
+    except ValueError:
+        pass  # Not JSON; fall through to INI-style parsing.
+    config = ConfigParser(strict=False)
+    config.optionxform = str
+    config.read_file(NativeIO(content))
+    for sec in config.sections():
+        result[sec] = dict(config.items(sec))
+    return result
+
+
+def check_config_sections(cfg):
+    """And ensure these all exist.
+    """
+    allowed_sections = set(['General',
+            'job.step.dust',
+            'job.step.da', 'job.step.pda',
+            'job.step.la', 'job.step.pla',
+            'job.step.cns', 'job.step.fc',
+            'job.step.asm',
+            'job.defaults',
+    ])
+    all_sections = set(k for k,v in list(cfg.items()) if isinstance(v, dict))
+    unexpected = all_sections - allowed_sections
+    if unexpected:
+        msg = 'You have {} unexpected cfg sections: {}'.format(
+            len(unexpected), unexpected)
+        raise Exception(msg)
+    # Guarantee they all exist.
+    for sec in allowed_sections:
+        if sec not in cfg:
+            cfg[sec] = dict()
+
+def update_dash_flags(cfg, key):
+    if key not in cfg:
+        return
+    val = cfg[key]
+    cfg[key] = new_val = functional.dash_flags(cfg[key])
+    if val != new_val:
+        msg = '''\
+Option contains flags with "_":
+ "{key}={val}". Those should be "-", as in
+ "{key}={new_val}". Auto-replaced.'''.format(**locals())
+        logger.warning(msg)
+
+
+TEXT_FILE_BUSY = 'avoid_text_file_busy'
+
+def update_defaults(cfg):
+    """cfg is probably the General sub-dict.
+    """
+    def set_default(key, val):
+        if key not in cfg:
+            cfg[key] = val
+    set_default('input_type', 'raw')
+    set_default('overlap_filtering_setting', '--max-diff 1000 --max-cov 1000 --min-cov 2')
+    #set_default('pa_daligner_option', '-e.70 -s100 -t16') # TODO: -t is a dumb default
+    #set_default('ovlp_daligner_option', '-e.96 -s1000 -h60 -t32') # TODO: -t is a dumb default
+    set_default('pa_HPCdaligner_option', '-v -D24')
+    set_default('ovlp_HPCdaligner_option', '-v -D24 -l500')
+    set_default('pa_HPCTANmask_option', '-l500') # daligner defaults to -l1000
+    #set_default('ovlp_HPCTANmask_option', '-l500')
+    set_default('pa_REPmask_code', '0,300/0,300/0,300')
+    set_default('pa_DBsplit_option', '-x500 -s200 -a')
+    set_default('skip_checks', False)
+    set_default('pa_DBdust_option', '') # Gene recommends the defaults. I have tried -w128 -t2.5 -m20
+    set_default('pa_fasta_filter_option', 'streamed-internal-median')
+    set_default('pa_subsample_coverage', 0)
+    set_default('pa_subsample_strategy', 'random')
+    set_default('pa_subsample_random_seed', 12345)
+    set_default('dazcon', False)
+    set_default('pa_dazcon_option', '-j 4 -x -l 500')
+    set_default('ovlp_DBdust_option', '')
+    set_default('ovlp_DBsplit_option', '-x500 -s200 -a')
+    set_default('falcon_sense_option', '--output-multi --min-idt 0.70 --min-cov 2 --max-n-read 1800')
+    set_default('falcon_sense_skip_contained', False)
+    set_default('falcon_sense_greedy', False)
+    set_default('LA4Falcon_preload', '')
+    set_default('fc_ovlp_to_graph_option', '')
+    set_default('genome_size', 0)
+    set_default('seed_coverage', 20)
+    set_default('length_cutoff', -1)
+    set_default('length_cutoff_pr', 0)
+    set_default('bestn', 12)
+    set_default('target', 'assembly')
+    set_default(TEXT_FILE_BUSY, bash.BUG_avoid_Text_file_busy)
+
+    for bool_key in ('skip_checks', 'dazcon', 'falcon_sense_skip_contained', 'falcon_sense_greedy', 'LA4Falcon_preload', TEXT_FILE_BUSY):
+        cfg[bool_key] = functional.cfg_tobool(cfg.get(bool_key, False))
+
+    if 'dust' in cfg:
+        logger.warning(
+            "The 'dust' option is deprecated and ignored. We always run DBdust now. Use ovlp_/pa_DBdust_option to override DBdust default arguments.")
+
+    bash.BUG_avoid_Text_file_busy = cfg[TEXT_FILE_BUSY]
+
+    clean_falcon_options(cfg)
+
+    falcon_sense_option = cfg['falcon_sense_option']
+    if 'local_match_count' in falcon_sense_option or 'output_dformat' in falcon_sense_option:
+        raise Exception('Please remove obsolete "--local_match_count_*" or "--output_dformat"' +
+                        ' from "falcon_sense_option" in your cfg: %s' % repr(falcon_sense_option))
+    genome_size = int(cfg['genome_size'])
+    length_cutoff = int(cfg['length_cutoff'])
+    if length_cutoff < 0 and genome_size < 1:
+        raise Exception(
+            'Must specify either length_cutoff>0 or genome_size>0')
+    pa_subsample_strategy = cfg['pa_subsample_strategy']
+    pa_subsample_random_seed = int(cfg['pa_subsample_random_seed'])
+    pa_subsample_coverage = int(cfg['pa_subsample_coverage'])
+    if pa_subsample_coverage > 0:
+        if genome_size < 1:
+            raise Exception(
+                'Must specify genome_size > 0 for subsampling.')
+
+    # This one depends on length_cutoff_pr for its default.
+    fc_ovlp_to_graph_option = cfg['fc_ovlp_to_graph_option']
+    if '--min_len' not in fc_ovlp_to_graph_option and '--min-len' not in fc_ovlp_to_graph_option:
+        length_cutoff_pr = cfg['length_cutoff_pr']
+        fc_ovlp_to_graph_option += ' --min-len {}'.format(length_cutoff_pr)
+        cfg['fc_ovlp_to_graph_option'] = fc_ovlp_to_graph_option
+
+    target = cfg['target']
+    if target not in ["overlapping", "pre-assembly", "assembly"]:
+        msg = """ Target has to be "overlapping", "pre-assembly" or "assembly" in this verison. You have an unknown target {!r} in the configuration file.  """.format(target)
+        raise Exception(msg)
+
+    possible_extra_keys = [
+            'sge_option', 'default_concurrent_jobs',
+            'pwatcher_type', 'pwatcher_directory',
+            'job_type', 'job_queue', 'job_name_style',
+            'use_tmpdir',
+    ]
+    for step in ['dust', 'da', 'la', 'pda', 'pla', 'fc', 'cns', 'asm']:
+        sge_option_key = 'sge_option_' + step
+        possible_extra_keys.append(sge_option_key)
+        concurrent_jobs_key = step + '_concurrent_jobs'
+        possible_extra_keys.append(concurrent_jobs_key)
+    extra = list()
+    for key in possible_extra_keys:
+        if key in cfg:
+            extra.append(key)
+    if extra:
+        extra.sort()
+        msg = 'You have several old-style options. These should be provided in the `[job.defaults]` or `[job.step.*]` sections, and possibly renamed. See https://github.com/PacificBiosciences/FALCON/wiki/Configuration\n {}'.format(extra)
+        logger.warning(msg)
+
+    check_unexpected_keys(cfg)
+
+def check_unexpected_keys(cfg):
+    # Warn on unused variables.
+    expected = (TEXT_FILE_BUSY,
+        'input_fofn',
+        'input_type',
+        'genome_size',
+        'seed_coverage',
+        'length_cutoff',
+        'length_cutoff_pr',
+        'dazcon',
+        'pa_dazcon_option',
+        'pa_DBdust_option',
+        'pa_fasta_filter_option',
+        'pa_subsample_coverage',
+        'pa_subsample_strategy',
+        'pa_subsample_random_seed',
+        'pa_DBsplit_option',
+        'pa_HPCTANmask_option',
+        'pa_HPCREPmask_option',
+        'pa_REPmask_code',
+        'pa_daligner_option',
+        'pa_HPCdaligner_option',
+        'ovlp_DBdust_option',
+        'ovlp_DBsplit_option',
+        #'ovlp_HPCTANmask_option',
+        'ovlp_daligner_option',
+        'ovlp_HPCdaligner_option',
+        'skip_checks',
+        'falcon_sense_option',
+        'falcon_sense_skip_contained',
+        'falcon_sense_greedy',
+        'LA4Falcon_preload',
+        'LA4Falcon_pre', # hidden
+        'LA4Falcon_post', # hidden
+        'LA4Falcon_dbdir', # hidden
+        'overlap_filtering_setting',
+        'fc_ovlp_to_graph_option',
+        'bestn',
+        'target',
+    )
+    unused = set(cfg.keys()) - set(expected)
+    if unused:
+        logger.warning("Unexpected keys in input config: {}".format(unused))
+
+
+default_logging_config = """
+[loggers]
+keys=root
+
+[handlers]
+keys=stream,file_all
+
+[formatters]
+keys=form01,form02
+
+[logger_root]
+level=NOTSET
+handlers=stream,file_all
+
+[handler_stream]
+class=StreamHandler
+level=DEBUG
+formatter=form02
+args=(sys.stderr,)
+
+[handler_file_all]
+class=FileHandler
+level=DEBUG
+formatter=form01
+args=('all.log', 'w')
+
+[formatter_form01]
+format=%(asctime)s - %(name)s:%(lineno)d - %(levelname)s - %(message)s
+
+[formatter_form02]
+format=[%(levelname)s]%(message)s
+"""
+
+
+def _setup_logging(logging_config_fn):
+    """See https://docs.python.org/2/library/logging.config.html
+    """
+    logging.Formatter.converter = time.gmtime  # cannot be done in .ini
+
+    if logging_config_fn:
+        if logging_config_fn.endswith('.json'):
+            logging.config.dictConfig(
+                json.loads(open(logging_config_fn).read()))
+            # print repr(logging.Logger.manager.loggerDict) # to debug
+            return
+        logger_fileobj = open(logging_config_fn)
+    else:
+        logger_fileobj = NativeIO(default_logging_config)
+    defaults = {
+    }
+    logging.config.fileConfig(
+        logger_fileobj, defaults=defaults, disable_existing_loggers=False)
+
+
+def setup_logger(logging_config_fn):
+    global logger
+    try:
+        _setup_logging(logging_config_fn)
+        logger = logging.getLogger("fc_run")
+        logger.info('Setup logging from file "{}".'.format(logging_config_fn))
+    except Exception:
+        logging.basicConfig()
+        logger = logging.getLogger()
+        logger.exception(
+            'Failed to setup logging from file "{}". Using basicConfig().'.format(logging_config_fn))
+    try:
+        import logging_tree
+        logger.info(logging_tree.format.build_description())
+    except ImportError:
+        pass
+
+    return logger
+
+
+def get_length_cutoff(length_cutoff, fn):
+    if length_cutoff < 0:
+        length_cutoff = int(open(fn).read().strip())
+        logger.info('length_cutoff=%d from %r' % (length_cutoff, fn))
+    return length_cutoff  # possibly updated
+
+
+def logger_record(func):
+    def wrapper(*args, **kwargs):
+        logger.info("====>Begin executing function: {}".format(func.__name__))
+        if args:
+            logger.info("====>args={}".format(args))
+        if kwargs:
+            logger.info("====>kwargs={}".format(kwargs))
+
+        # function execution (propagate the wrapped function's return value)
+        t_start = time.time()
+        result = func(*args, **kwargs)
+        t_end = time.time()
+
+        logger.info("====>End executing function: {}, time cost: {} seconds."
+                    .format(func.__name__, (t_end - t_start)))
+        return result
+    return wrapper
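
A minimal sketch of what update_defaults() does to a bare 'General' dict, assuming falcon_kit and its helpers are importable; the genome size and pread cutoff are made up:

    from falcon_kit import run_support

    cfg = {'genome_size': 4600000, 'length_cutoff_pr': 1000}
    run_support.update_defaults(cfg)
    print(cfg['fc_ovlp_to_graph_option'])   # ' --min-len 1000', derived from length_cutoff_pr
    print(cfg['length_cutoff'])             # -1, i.e. the seed cutoff is computed downstream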

+ 250 - 0
FALCON/falcon_kit/snakemake.py

@@ -0,0 +1,250 @@
+"""Exact copy of falcon_unzip/tasks/snakemake.py
+TODO: Consolidate.
+"""
+
+
+
+from future.utils import viewitems
+from future.utils import itervalues
+
+from builtins import object
+import json
+import os
+import re
+
+
+def find_wildcards(pattern):
+    """
+    >>> find_wildcards('{foo}/{bar}')
+    ['bar', 'foo']
+    """
+    re_wildcard = re.compile(r'\{(\w+)\}')
+    found = [mo.group(1) for mo in re_wildcard.finditer(pattern)]
+    return list(sorted(found))
+
+class SnakemakeRuleWriter(object):
+    def legalize(self, rule_name):
+        return self.re_bad_char.sub('_', rule_name, count=0)
+    def unique_rule_name(self, basename):
+        rule_name = basename
+        if rule_name in self.rule_names:
+            i = 1
+            while rule_name in self.rule_names:
+                rule_name = basename + str(i)
+                i += 1
+        self.rule_names.add(rule_name)
+        return rule_name
+    def write_dynamic_rules(self, rule_name, input_json, inputs, shell_template,
+            parameters, wildcard_outputs, output_json):
+        """Lots of conventions.
+        input_json: should have a key 'mapped_inputs', which is a map of key->filename
+          Those filenames will be symlinked here, according to the patterns in wildcard_inputs.
+        shell_template: for the parallel task
+        output_json: This will contain only key->filename, based on wildcard_outputs.
+        inputs: These include non-wildcards too.
+        (For now, we assume inputs/outputs is just one per parallel task.)
+        """
+        # snakemake does not like paths starting with './'; they can lead to mismatches.
+        # So we run normpath everywhere.
+        input_json = os.path.normpath(input_json)
+        output_json = os.path.normpath(output_json)
+
+        # snakemake cannot use already-generated files as dynamic outputs (the wildcard_inputs for the parallel task),
+        # so we rename them and plan to symlink.
+        wildcard_inputs = dict(inputs)
+        nonwildcard_inputs = dict()
+        for (key, fn) in list(viewitems(wildcard_inputs)):
+            if '{' not in fn:
+                del wildcard_inputs[key]
+                nonwildcard_inputs[key] = fn
+                continue
+            dn, bn = os.path.split(wildcard_inputs[key])
+            wildcard_inputs[key] = os.path.join(dn + '.symlink', bn)
+        rule_name = self.unique_rule_name(rule_name)
+        dynamic_output_kvs = ', '.join("%s=dynamic('%s')"%(k, os.path.normpath(v)) for (k, v) in viewitems(wildcard_inputs))
+        dynamic_input_kvs =  ', '.join("%s=ancient(dynamic('%s'))"%(k, os.path.normpath(v)) for (k, v) in viewitems(wildcard_outputs))
+        rule_parameters = {k: v for (k, v) in viewitems(parameters) if not k.startswith('_')}
+        params = ','.join('\n        %s="%s"'%(k,v) for (k, v) in viewitems(rule_parameters))
+        pattern_kv_list = list()
+        for (name, wi) in viewitems(wildcard_inputs):
+            fn_pattern = wi
+            fn_pattern = fn_pattern.replace('{', '{{')
+            fn_pattern = fn_pattern.replace('}', '}}')
+            pattern_kv_list.append('%s="%s"' %(name, fn_pattern))
+        wi_pattern_kvs = ' '.join(pattern_kv_list)
+
+        rule = """
+rule dynamic_%(rule_name)s_split:
+    input:  %(input_json)r
+    output: %(dynamic_output_kvs)s
+    shell: 'python3 -m falcon_kit.mains.copy_mapped --special-split={input} %(wi_pattern_kvs)s'
+"""%(locals())
+        self.write(rule)
+
+        input_wildcards = set() # Not sure yet whether input must match output wildcards.
+        for wi_fn in itervalues(wildcard_inputs):
+            found = find_wildcards(wi_fn)
+            input_wildcards.update(found)
+        wildcards = list(sorted(input_wildcards))
+        params_plus_wildcards = {k: '{%s}'%k for k in wildcards}
+        params_plus_wildcards.update(parameters)
+        # The parallel script uses all inputs, not just wildcards.
+        all_inputs = dict(wildcard_inputs)
+        all_inputs.update(nonwildcard_inputs)
+        self.write_script_rule(all_inputs, wildcard_outputs, params_plus_wildcards, shell_template, rule_name=None)
+
+        wo_str_lists_list = ['%s=[str(i) for i in input.%s]' %(name, name) for name in list(wildcard_outputs.keys())]
+        wo_pattern_kv_list = ['%s="%s"' %(name, os.path.normpath(patt)) for (name, patt) in viewitems(wildcard_outputs)]
+        wo_str_lists_kvs = ',\n              '.join(wo_str_lists_list)
+        wo_pattern_kvs =   ',\n              '.join(wo_pattern_kv_list)
+
+        wildcards = list()
+        for wi_fn in itervalues(wildcard_outputs):
+            found = find_wildcards(wi_fn)
+            if wildcards:
+                assert wildcards == found, 'snakemake requires all outputs (and inputs?) to have the same wildcards'
+            else:
+                wildcards = found
+        wildcards_comma_sep = ', '.join('"%s"' %k for k in wildcards)
+
+        rule = '''
+rule dynamic_%(rule_name)s_merge:
+    input:  %(dynamic_input_kvs)s
+    output: %(output_json)r
+    run:
+        snake_merge_multi_dynamic(output[0],
+            dict(
+              %(wo_str_lists_kvs)s
+            ),
+            dict(
+              %(wo_pattern_kvs)s
+            ),
+            [%(wildcards_comma_sep)s] # all wildcards
+        )
+'''%(locals())
+        self.write(rule)
+    def write_script_rule(self, inputs, outputs, parameters, shell_template, rule_name):
+        assert '_bash_' not in parameters
+        first_output_name, first_output_fn = list(outputs.items())[0] # for rundir, since we cannot sub wildcards in shell
+        if not rule_name:
+            rule_name = os.path.dirname(first_output_fn)
+        rule_name = self.unique_rule_name(self.legalize(rule_name))
+        wildcard_rundir = os.path.normpath(os.path.dirname(first_output_fn)) # unsubstituted
+        # We use snake_string_path b/c normpath drops leading ./, but we do NOT want abspath.
+        input_kvs = ', '.join('%s=%s'%(k, snake_string_path(v)) for k,v in
+                sorted(viewitems(inputs)))
+        output_kvs = ', '.join('%s=%s'%(k, snake_string_path(v)) for k,v in
+                sorted(viewitems(outputs)))
+        rule_parameters = {k: v for (k, v) in viewitems(parameters) if not k.startswith('_')}
+        #rule_parameters['reltopdir'] = os.path.relpath('.', wildcard_rundir) # in case we need this later
+        params = ','.join('\n        %s="%s"'%(k,v) for (k, v) in viewitems(rule_parameters))
+        shell = snake_shell(shell_template, wildcard_rundir)
+        # cd $(dirname '{output.%(first_output_name)s}')
+        rule = """
+rule static_%(rule_name)s:
+    input:  %(input_kvs)s
+    output: %(output_kvs)s
+    params:%(params)s
+    shell:
+        '''
+outdir=$(dirname {output[0]})
+#mkdir -p ${{outdir}}
+cd ${{outdir}}
+date
+
+%(shell)s
+date
+'''
+"""%(locals())
+        self.write(rule)
+    def __call__(self, inputs, outputs, parameters, shell_template, rule_name=None):
+        self.write_script_rule(inputs, outputs, parameters, shell_template, rule_name)
+    def __init__(self, writer):
+        self.write = writer.write
+        self.rule_names = set() # to ensure uniqueness
+        self.re_bad_char = re.compile(r'\W')
+        self.write("""
+# THIS IS CURRENTLY BROKEN.
+import json
+import os
+#import snakemake.utils
+
+def snake_merge_dynamic_dict(reldir, input_fns, pattern, wildcards):
+        '''Assume each wildcard appears at most once in the pattern.
+        '''
+        for k in wildcards:
+            pattern = pattern.replace('{%s}' %k, '(?P<%s>\w+)' %k)
+        re_dynamic = re.compile(pattern)
+        mapped = list()
+        for fn in input_fns:
+            mo = re_dynamic.search(fn)
+            assert mo, '{!r} did not match {!r}'.format(fn, re_dynamic.pattern)
+            file_description = dict()
+            file_description['wildcards'] = dict(mo.groupdict())
+            file_description['fn'] = os.path.relpath(fn, reldir)
+            mapped.append(file_description)
+        return mapped
+
+def snake_merge_multi_dynamic(output_fn, dict_of_input_fns, dict_of_patterns, wildcards):
+        outdir = os.path.normpath(os.path.dirname(output_fn))
+        if not os.path.isdir(outdir):
+            os.makedirs(outdir)
+        assert list(sorted(dict_of_input_fns.keys())) == list(sorted(dict_of_patterns.keys()))
+        all_mapped = dict()
+        for i in dict_of_patterns.keys():
+            input_fns = dict_of_input_fns[i]
+            pattern = dict_of_patterns[i]
+            mapped = snake_merge_dynamic_dict(outdir, input_fns, pattern, wildcards)
+            all_mapped[i] = mapped
+        all_grouped = dict()
+        for i, mapped in all_mapped.items():
+            #print(i, mapped)
+            for file_description in mapped:
+                #print(file_description)
+                #print(file_description['wildcards'])
+                #print(list(sorted(file_description['wildcards'].items())))
+                wildkey = ','.join('{}={}'.format(k,v) for k,v in sorted(file_description['wildcards'].items()))
+                if wildkey not in all_grouped:
+                    new_group = dict(
+                        wildcards=dict(file_description['wildcards']),
+                        fns=dict(),
+                    )
+                    all_grouped[wildkey] = new_group
+                group = all_grouped[wildkey]
+                wildcards = file_description['wildcards']
+                assert wildcards == group['wildcards'], '{!r} should match {!r} by snakemake convention'.format(
+                    wildcards, group['wildcards'])
+                fn = file_description['fn']
+                group['fns'][i] = fn
+        ser = json.dumps(all_grouped, indent=2, separators=(',', ': ')) + '\\n'
+        with open(output_fn, 'w') as out:
+            out.write(ser)
+""")
+        prefix = """
+shell.prefix('''
+# Add -e vs. in falcon_unzip.
+set -vex
+hostname
+pwd
+''')
+"""
+        self.write(prefix)
+class SnakemakeDynamic(object):
+    """Not currently used."""
+    def __init__(self, path):
+        self.path = path
+def snake_string_path(p):
+    """normpath drops leading ./
+    """
+    if isinstance(p, SnakemakeDynamic):
+        return "dynamic('{}')".format(
+                os.path.normpath(p.path))
+    else:
+        return "'{}'".format(
+                os.path.normpath(p))
+def snake_shell(template, rundir):
+    reltopdir = os.path.relpath('.', rundir)
+    def makerel(mo):
+        return os.path.join(reltopdir, mo.group(0))
+    re_inout = re.compile(r'{(?:input|output)')
+    return re_inout.sub(makerel, template, count =0)
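
A minimal sketch of emitting a static rule with SnakemakeRuleWriter: it only needs an object exposing .write(), so the generated Snakefile text can be captured in memory. The paths and shell template here are hypothetical:

    import io
    from falcon_kit.snakemake import SnakemakeRuleWriter

    buf = io.StringIO()
    writer = SnakemakeRuleWriter(buf)   # writes the helper preamble immediately
    writer.write_script_rule(
        inputs={'config': 'General_config.json'},
        outputs={'db': '0-rawreads/build/raw_reads.db'},
        parameters={},
        shell_template='python3 -m falcon_kit.mains.dazzler --config-fn={input.config} --db-fn={output.db} build',
        rule_name='build_rdb',
    )
    print(buf.getvalue())   # preamble + 'rule static_build_rdb: ...'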

+ 272 - 0
FALCON/falcon_kit/stats_preassembly.py

@@ -0,0 +1,272 @@
+""" PreAssembly Report.
+
+See FALCON-pbsmrtpipe/pbfalcon/report_preassembly.py for XML version.
+"""
+# Copied from
+#   http://swarm/files/depot/branches/springfield/S2.3/software/smrtanalysis/bioinformatics/tools/pbreports/pbreports/report/preassembly.py
+
+
+
+
+from future.utils import viewitems
+from builtins import object
+from .FastaReader import open_fasta_reader
+from .util.io import syscall
+from . import functional
+import collections
+import glob
+import itertools
+import logging
+import os
+import pprint
+import re
+
+log = logging.getLogger(__name__)
+__version__ = '0.1'
+
+Stats = collections.namedtuple(
+    'FastaStats', ['nreads', 'total', 'n50', 'p95', 'esize'])
+
+# Copied from pbreports/util.py
+# We want to avoid a dependency on pbreports b/c it needs matplotlib.
+
+
+def get_fasta_readlengths(fasta_file):
+    """
+    Get a sorted list of contig lengths
+    :return: (tuple)
+    """
+    lens = []
+    with open_fasta_reader(fasta_file) as f:
+        for record in f:
+            lens.append(len(record.sequence))
+    lens.sort()
+    return lens
+
+
+def get_db_readlengths(fn):
+    """Use DBdump on a DAZZ_DB.
+    If DBsplit was run, then we see the filtered reads only, since we do not provide '-u' to DBdump.
+    """
+    call = 'DBdump -h {}'.format(fn)
+    return list(functional.parsed_readlengths_from_dbdump_output(syscall(call)))
+
+
+class FastaContainer(object):
+
+    def __init__(self, nreads, total, file_name):
+        self.nreads = nreads
+        self.total = total
+        self.file_name = file_name
+
+    @staticmethod
+    def from_file(file_name):
+        #        nreads, total = _compute_values(file_name)
+        read_lens = get_fasta_readlengths(file_name)
+        nreads = len(read_lens)
+        total = sum(read_lens)
+        return FastaContainer(nreads, total, file_name)
+
+    def __str__(self):
+        return "N {n} Total {t} File: {f}".format(n=self.nreads, t=self.total, f=self.file_name)
+
+
+def _validate_file(file_name):
+    if os.path.isfile(file_name):
+        return os.path.abspath(file_name)
+    else:
+        msg = "Unable to find {f}".format(f=file_name)
+        log.error(msg)
+        raise IOError(msg)
+
+
+def cutoff_reads(read_lens, min_read_len):
+    return [rl for rl in read_lens if rl >= min_read_len]
+
+
+def read_len_above(read_lens, threshold):
+    subtotal = 0
+    # Reverse-order calculation is faster.
+    for irev, rl in enumerate(reversed(read_lens)):
+        subtotal += rl
+        if subtotal >= threshold:
+            return rl
+
+
+def percentile(read_lens, p):
+    # TODO: Fix this when p=1.0
+    return read_lens[int(len(read_lens) * p)]
+
+
+def stats_from_sorted_readlengths(read_lens):
+    nreads = len(read_lens)
+    total = sum(read_lens)
+    sum_squares = sum(r * r for r in read_lens)
+    n50 = read_len_above(read_lens, int(total * 0.50))
+    p95 = percentile(read_lens, 0.95)
+    esize = sum_squares / total
+    #alt_n50 = pbreports.util.compute_n50(read_lens)
+    # log.info('our n50=%s, pbreports=%s' %(n50, alt_n50)) # Ours is more correct when median is between 2 reads.
+    return Stats(nreads=nreads, total=total, n50=n50, p95=p95, esize=esize)
+
+
+def read_lens_from_fofn(fofn_fn):
+    """Return sorted list.
+    """
+    fns = [fn.strip() for fn in open(fofn_fn) if fn.strip()]
+    # get_fasta_readlengths() returns sorted, so sorting the chain is roughly linear.
+    return list(sorted(itertools.chain.from_iterable(get_fasta_readlengths(fn) for fn in fns)))
+
+
+def read_lens_from_db(db_fn):
+    """Return sorted read-lengths from a DAZZ_DB.
+    """
+    return list(sorted(get_db_readlengths(db_fn)))
+
+
+def abs_filenames(fofn_fn):
+    fofn_dir = os.path.dirname(fofn_fn)
+
+    def abs_fn(fn):
+        if os.path.isabs(fn):
+            return fn
+        return os.path.join(fofn_dir, fn)
+    fns = [abs_fn(fn.strip()) for fn in open(fofn_fn) if fn.strip()]
+    return fns
+
+
+def metric_fragmentation(preads_fofn):
+    # https://jira.pacificbiosciences.com/browse/SAT-105
+    # sed -nr 's;>prolog/([0-9]*)[0-9]/.*;\1;p' %s/*.fasta | sort | uniq -c | awk '{print $1}' | sort | uniq -c
+    fastas = abs_filenames(preads_fofn)
+    assert fastas, 'No fasta found in {!r}'.format(preads_fofn)
+    call = r"""perl -e 'while (<>) { if ( m{>[^/]+/(\d+)\d/} ) { $id{$1}++; } }; while (my ($k, $v) = each %%id) { $counts{$v}++; }; while (my ($k, $v) = each %%counts) { print "$v $k\n"; };' %s""" % (' '.join(fastas))
+    counts = syscall(call)
+    return functional.calc_metric_fragmentation(counts)
+
+
+def metric_truncation(db, preads_fofn):
+    # https://jira.pacificbiosciences.com/browse/SAT-105
+    fastas = abs_filenames(preads_fofn)
+    assert fastas, 'No fasta found in {!r}'.format(preads_fofn)
+    call = r"""perl -e 'while (<>) { if ( m{>[^/]+/0*(\d+)\d/(\d+)_(\d+)} ) { $lengths{(1 + $1)} += ($3 - $2); } }; while (my ($k, $v) = each %%lengths) { print "$k $v\n"; };' %s""" % (' '.join(fastas))
+    # The +1 is because DBdump read-ids start at 1, but these start at 0.
+    length_pairs_output = syscall(call)
+    call = 'DBdump -rh {}'.format(db)
+    dbdump_output = syscall(call)
+    return functional.calc_metric_truncation(dbdump_output, length_pairs_output)
+
+
+def stats_dict(stats_raw_reads, stats_seed_reads, stats_corrected_reads, genome_length, length_cutoff,
+               fragmentation, truncation):
+    """All inputs are paths to fasta files.
+    genome_length and length_cutoff can be None.
+    """
+    log.info('stats for raw reads:       %s' % repr(stats_raw_reads))
+    log.info('stats for seed reads:      %s' % repr(stats_seed_reads))
+    log.info('stats for corrected reads: %s' % repr(stats_corrected_reads))
+
+    kwds = {}
+    genome_length = -1 if not genome_length else genome_length
+    kwds['genome_length'] = genome_length
+    kwds['length_cutoff'] = 0 if length_cutoff is None else length_cutoff
+    kwds['raw_reads'] = stats_raw_reads.nreads
+    kwds['raw_bases'] = stats_raw_reads.total
+    kwds['raw_mean'] = stats_raw_reads.total / stats_raw_reads.nreads
+    kwds['raw_n50'] = stats_raw_reads.n50
+    kwds['raw_p95'] = stats_raw_reads.p95
+    kwds['raw_coverage'] = stats_raw_reads.total / genome_length
+    kwds['raw_esize'] = stats_raw_reads.esize
+    kwds['seed_reads'] = stats_seed_reads.nreads
+    kwds['seed_bases'] = stats_seed_reads.total
+    kwds['seed_mean'] = stats_seed_reads.total / stats_seed_reads.nreads
+    kwds['seed_n50'] = stats_seed_reads.n50
+    kwds['seed_p95'] = stats_seed_reads.p95
+    kwds['seed_coverage'] = stats_seed_reads.total / genome_length
+    kwds['seed_esize'] = stats_seed_reads.esize
+    kwds['preassembled_reads'] = stats_corrected_reads.nreads
+    kwds['preassembled_bases'] = stats_corrected_reads.total
+    kwds['preassembled_mean'] = stats_corrected_reads.total / \
+        stats_corrected_reads.nreads
+    kwds['preassembled_n50'] = stats_corrected_reads.n50
+    kwds['preassembled_p95'] = stats_corrected_reads.p95
+    kwds['preassembled_coverage'] = stats_corrected_reads.total / genome_length
+    kwds['preassembled_esize'] = stats_corrected_reads.esize
+    kwds['preassembled_yield'] = stats_corrected_reads.total / \
+        stats_seed_reads.total
+    kwds['preassembled_seed_fragmentation'] = fragmentation
+    kwds['preassembled_seed_truncation'] = truncation
+
+    def round_if_float(v):
+        return v if type(v) is not float else round(v, 3)
+    result = {k: round_if_float(v) for (k, v) in viewitems(kwds)}
+    return result
+
+# DEPRECATED
+
+
+def make_dict(
+    i_preads_fofn_fn,
+    i_raw_reads_fofn_fn,
+    genome_length,
+    length_cutoff,
+    fragmentation=-1,
+    truncation=-1,
+):
+    raw_reads = read_lens_from_fofn(i_raw_reads_fofn_fn)
+    stats_raw_reads = stats_from_sorted_readlengths(raw_reads)
+
+    seed_reads = cutoff_reads(raw_reads, length_cutoff)
+    stats_seed_reads = stats_from_sorted_readlengths(seed_reads)
+
+    preads = read_lens_from_fofn(i_preads_fofn_fn)
+    stats_preads = stats_from_sorted_readlengths(preads)
+    report_dict = stats_dict(
+        stats_raw_reads=stats_raw_reads,
+        stats_seed_reads=stats_seed_reads,
+        stats_corrected_reads=stats_preads,
+        genome_length=genome_length,
+        length_cutoff=length_cutoff,
+        fragmentation=fragmentation,
+        truncation=truncation,
+    )
+    return report_dict
+
+
+def calc_dict(
+    i_preads_fofn_fn,
+    i_raw_reads_db_fn,
+    genome_length,
+    length_cutoff,
+):
+    try:
+        frag = metric_fragmentation(i_preads_fofn_fn)
+    except:
+        frag = -1.0
+        log.exception('Using arbitrary fragmentation metric: {}'.format(frag))
+    try:
+        trunc = metric_truncation(i_raw_reads_db_fn, i_preads_fofn_fn)
+    except:
+        trunc = -1.0
+        log.exception('Using arbitrary truncation metric: {}'.format(trunc))
+
+    raw_reads = read_lens_from_db(i_raw_reads_db_fn)
+    stats_raw_reads = stats_from_sorted_readlengths(raw_reads)
+
+    seed_reads = cutoff_reads(raw_reads, length_cutoff)
+    stats_seed_reads = stats_from_sorted_readlengths(seed_reads)
+
+    preads = read_lens_from_fofn(i_preads_fofn_fn)
+    stats_preads = stats_from_sorted_readlengths(preads)
+    report_dict = stats_dict(
+        stats_raw_reads=stats_raw_reads,
+        stats_seed_reads=stats_seed_reads,
+        stats_corrected_reads=stats_preads,
+        genome_length=genome_length,
+        length_cutoff=length_cutoff,
+        fragmentation=frag,
+        truncation=trunc,
+    )
+    log.info('Calculated pre-assembly stats:\n{}'.format(
+        pprint.pformat(report_dict)))
+    return report_dict
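The report above reduces to stats_from_sorted_readlengths(), which returns read count, total bases, N50, the 95th-percentile read length, and the expected size (sum of squared lengths divided by total bases). A tiny worked example, assuming falcon_kit and its dependencies are importable; the read lengths are made up:

    from falcon_kit.stats_preassembly import (
        cutoff_reads, stats_from_sorted_readlengths)

    read_lens = sorted([2000, 3000, 5000, 10000])
    stats = stats_from_sorted_readlengths(read_lens)
    # total=20000; n50=10000 (the longest read alone covers 50% of the bases);
    # p95=read_lens[int(4 * 0.95)]=10000; esize=138000000/20000=6900.0
    print(stats)  # FastaStats(nreads=4, total=20000, n50=10000, p95=10000, esize=6900.0)

    seed_lens = cutoff_reads(read_lens, 5000)  # [5000, 10000], i.e. the seed reads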

+ 0 - 0
FALCON/falcon_kit/testkit/__init__.py


+ 61 - 0
FALCON/falcon_kit/testkit/test_assembly.py

@@ -0,0 +1,61 @@
+"""
+This is meant to replace pysiv2.custom.test_assembly.
+In your testkit_cfg.json, instead of
+    "pysiv2.custom": [
+you will now use
+    "falcon_kit.testkit": [
+The values *might* need to be updated slightly, since we rely on the DB
+rather than on datasets now.
+"""
+import os, re, unittest
+from falcon_kit.util.io import system
+from falcon_kit.io import capture
+from falcon_kit import functional
+
+# Someday, we might simplify this. For now, we
+# use the current 'test_values.json'.
+from pysiv2.custom.base import TestStatisticsBase, TestReportStatistics # pylint: disable=no-name-in-module, import-error
+
+
+class TestPreAssembly(TestReportStatistics):
+    REPORT_ID = "preassembly"
+    TEST_ID = "preassembly"
+    METRIC_IDS = [
+        "raw_reads",
+        "raw_mean",
+        "raw_n50",
+        "raw_bases",
+        "preassembled_reads",
+        "preassembled_mean",
+        "preassembled_n50",
+        "preassembled_bases",
+        "preassembled_yield"
+    ]
+
+
+class TestPolishedAssembly(TestReportStatistics):
+    """
+    Test metrics in the output of pbreports.report.polished_assembly
+    """
+    REPORT_ID = "polished_assembly"
+    TEST_ID = "polished_assembly"
+    METRIC_IDS = [
+        "polished_contigs",
+        "max_contig_length",
+        "n_50_contig_length",
+        "sum_contig_lengths"
+    ]
+
+
+class TestFalconAssembly(TestStatisticsBase):
+    JSON_SCOPE = 'falcon_kit'
+    TEST_ID = 'filter_subreads'
+    METRIC_IDS = ['number_of_filtered_subreads']
+    DEFAULT_VALUES = {}
+
+    @classmethod
+    def getMetrics(cls):
+        db_fn = os.path.join(cls.job_dir, 'tasks', 'falcon_ns2.tasks.task_falcon0_dazzler_build_raw-0', 'raw_reads.db')
+        #system('which DBdump', check=True)
+        dump = capture('DBdump {}'.format(db_fn))
+        cls.metric_dict['number_of_filtered_subreads'] = functional.dazzler_num_reads(dump)

+ 36 - 0
FALCON/falcon_kit/testkit/test_foo.py

@@ -0,0 +1,36 @@
+import os, unittest
+from falcon_kit.util.io import system
+
+# Also in pbcommand/testkit/base_utils.py
+def pb_requirements(*reqs):
+    """
+    Method decorator for specifying linked JIRA issues.
+    """
+    def decorator(test_item):
+        test_item.__pb_requirements__ = list(reqs)
+        return test_item
+    return decorator
+
+class TestBase(unittest.TestCase):
+    _is_test = True
+    job_dir = None
+    service_access_layer = None
+    job_id = None
+
+@unittest.skip('FOO')
+class TestMe(TestBase):
+    @unittest.skip('would fail')
+    def test_excfunc(self):
+        raise Exception('FAIL HERE')
+    @unittest.skip('would fail')
+    def test_assertfunc(self):
+        assert 0 > 1, 'no way'
+    @unittest.skip('would fail')
+    def test_failfunc(self):
+        msg = 'In {} dir'.format(self.job_dir)
+        self.fail(msg)
+    def test_goodfunc(self):
+        pass
+    @pb_requirements('TAGT-000')
+    def test_myprd_func(self):
+        pass

+ 199 - 0
FALCON/falcon_kit/tiling_path.py

@@ -0,0 +1,199 @@
+class TilingPathEdge(object):
+
+    def __init__(self, split_line = None):
+        self.ctg_id, self.v, self.w, self.wrid, self.b, self.e, self.score, self.identity = \
+                        None, None, None, None, None, None, None, None
+        self.parsed = False
+        if split_line:                          # pragma: no cover
+            self.set_from(split_line)
+
+    def set_from(self, split_line):
+        assert(len(split_line) >= 8)
+        self.parsed = False
+        self.ctg_id = split_line[0]
+        self.v = split_line[1]
+        self.w = split_line[2]
+        self.wrid = split_line[3]
+        self.b = int(split_line[4])
+        self.e = int(split_line[5])
+        self.score = int(split_line[6])
+        self.identity = float(split_line[7])
+        self.parsed = True
+
+    def get_split_line(self):
+        return [str(val) for val in [self.ctg_id, self.v, self.w, self.wrid, self.b, self.e, self.score, self.identity]]
+
+    # def __str__(self):
+    #     return ' '.join(self.get_split_line())
+
+class TilingPath(object):
+    def __init__(self, tiling_edge_list, contig_sequence_len = None):
+        self.edges = tiling_edge_list  # These are TilingPathEdge objects.
+        self.v_to_edge = {}
+        self.w_to_edge = {}
+        self.coords = {}
+        self.contig_len = 0
+        self.first_node_offset = 0
+
+        for i in range(1, len(tiling_edge_list)):
+            assert(tiling_edge_list[i-1].w == tiling_edge_list[i].v)
+
+        # If the total contig sequence len is known, use that to
+        # calculate the length of the first read (in case proper
+        # contigs are specified). This is needed to offset the coordinates
+        # which can be calculated from the tiling path.
+        if contig_sequence_len != None:
+            _, tiling_len = calc_node_coords(tiling_edge_list)
+            msg = 'contig_sequence_len < tiling_len ({} < {})'.format(contig_sequence_len, tiling_len)
+            assert contig_sequence_len >= tiling_len, msg
+            self.first_node_offset = contig_sequence_len - tiling_len   # This is the length of the first node.
+
+        # The self.coords is a dict: self.coords[v] = coordinate_on_contig
+        self.coords, self.contig_len = calc_node_coords(tiling_edge_list, self.first_node_offset)
+
+        # Sanity check.
+        assert(contig_sequence_len == None or self.contig_len == contig_sequence_len)
+
+        # Create a lookup of node to edge.
+        self.v_to_edge = {}
+        self.w_to_edge = {}
+        for i in range(len(self.edges)):
+            e = self.edges[i]
+            self.v_to_edge[e.v] = i
+            self.w_to_edge[e.w] = i
+
+    def dump_as_split_lines(self):
+        return [e.get_split_line() for e in self.edges]
+
+    def get_subpath(self, start_coord, end_coord):
+        """
+        Given a TilingPath object, this method (`TilingPath.get_subpath()`) will
+        attempt to extract the part of the tiling path which covers the specified
+        coordinates.
+        For example, user could specify alignment start and end positions, and
+        provide the coordinates to this method, and the result would be a list
+        of tiling path edges which correspond to the tiling in between the two
+        coordinates (most likely slightly longer on both ends).
+
+        Both start and end coordinates can be < 0 if the input contig was improper.
+
+        Returns a tuple (new_path, new_start_coord, new_end_coord), where new_path
+        is a list of split_line tiling path edges (not TilingPathEdge objects).
+        """
+        assert(self.edges)
+        assert(start_coord <= end_coord)
+
+        # end_coord -= 1  # Make the end inclusive.
+        # start_node = None
+        # end_node = None
+        start_edge = None
+        end_edge = None
+        if start_coord < self.coords[self.edges[0].v]:
+            start_edge = 0
+        if end_coord <= self.coords[self.edges[0].v]:
+            end_edge = 1
+        for i in range(len(self.edges)):
+            e = self.edges[i]
+            if start_coord >= self.coords[e.v] and start_coord < self.coords[e.w]:
+                start_edge = i
+            if end_coord > self.coords[e.v] and end_coord <= self.coords[e.w]:
+                end_edge = i + 1
+        if end_coord >= self.coords[self.edges[-1].w]:
+            end_edge = len(self.edges)
+        assert(start_edge != None and end_edge != None)
+
+        # Since the start_coord and end_coord can fall within an edge,
+        # we also return their positions relative to the extracted subpath.
+        new_start_coord = start_coord - self.coords[self.edges[start_edge].v]
+        new_end_coord = end_coord - self.coords[self.edges[start_edge].v]
+        new_path = self.edges[start_edge:end_edge]
+        new_path = [val.get_split_line() for val in new_path]
+        return new_path, new_start_coord, new_end_coord
+
+def calc_node_coords(tiling_edge_list, first_node_offset=0):
+    """
+    For a single tiling path (tiling_edge_list is a list
+    of edges for a particular contig) calculates the
+    genomic coordinate of every node in the path.
+    In case there are cycles in the tiling path,
+    the existing node's coordinate will be overwritten.
+    `first_node_offset` is the length of the first node. If it is not
+    specified, the contig length will not include the length of the first node.
+    """
+    if not tiling_edge_list:
+        return {}, 0
+    coord_map = {}
+    contig_len = 0
+    edge0 = tiling_edge_list[0]
+    coord_map[edge0.v] = first_node_offset
+    for edge in tiling_edge_list:
+        if edge.v not in coord_map:
+            raise Exception(
+                'Tiling path is not in sorted order. Node "{v!r}" does not yet have an assigned coordinate.'.format(v=edge.v))
+        coord = coord_map[edge.v]
+        coord += abs(int(edge.b) - int(edge.e))
+        coord_map[edge.w] = coord
+        contig_len = max(contig_len, coord)
+    return coord_map, contig_len
+
+def yield_split_line(fp_in):
+    for line in fp_in:     # Example row: "0 000000007:B 000000005:B 000000005 9 0 1980 99.95"
+        line = line.strip()
+        if len(line) == 0: continue
+        sl = line.split()
+        yield sl
+
+def load_tiling_paths(tp_file, contig_lens=None, whitelist_seqs=None):
+    with open(tp_file) as fp_in:
+        ret = load_tiling_paths_from_stream(fp_in, contig_lens=contig_lens, whitelist_seqs=whitelist_seqs)
+    return ret
+
+def load_tiling_paths_from_stream(fp_in, contig_lens=None, whitelist_seqs=None):
+    split_lines = list(yield_split_line(fp_in))
+    return load_tiling_paths_from_split_lines(split_lines, contig_lens=contig_lens, whitelist_seqs=whitelist_seqs)
+
+def load_tiling_paths_from_split_lines(split_lines, contig_lens=None, whitelist_seqs=None):
+    """
+    Parameters:
+        contig_lens -   if a dict with contig sequence lengths is specified, the difference between the
+                        contig len and the length obtained from the tiling path will be used to offset
+                        the tiling path coordinates.
+        whitelist_seqs - a dict or a set object containing contig IDs to load. If None, no filter will be applied.
+    """
+    tiling_path_edges = {}
+    for sl in split_lines:     # Example row: "0 000000007:B 000000005:B 000000005 9 0 1980 99.95"
+        new_edge = TilingPathEdge(sl)
+        ctg_id = new_edge.ctg_id
+        if whitelist_seqs is not None and ctg_id not in whitelist_seqs:
+            continue
+        tiling_path_edges.setdefault(ctg_id, [])
+        tiling_path_edges[ctg_id].append(new_edge)
+
+    # Convert the flat lists to TilingPath objects.
+    # These keep track of node coordinates and node-to-edge lookups per contig.
+    tiling_paths = {}
+    for ctg_id, edges in tiling_path_edges.items():
+        ctg_len = None
+        if contig_lens != None and ctg_id in contig_lens:
+            ctg_len = contig_lens[ctg_id]
+        tiling_paths[ctg_id] = TilingPath(edges, ctg_len)
+
+    return tiling_paths
+
+def find_a_ctg_placement(p_paths, a_paths):
+    """
+    Determines placement coordinates for each a_ctg in a given a_paths dict of
+    TilingPaths, and returns a dict of:
+        placement[p_ctg_id][a_ctg_id] = (start, end, p_ctg_id, a_ctg_id, first_node, last_node)
+    """
+    placement = {}
+    for a_ctg_id, a_tp in a_paths.items():
+        if len(a_tp.edges) == 0: continue       # pragma: no cover
+        first_node = a_tp.edges[0].v
+        last_node = a_tp.edges[-1].w
+        p_ctg_id = a_ctg_id.split('-')[0].split('_')[0]
+        p_tp = p_paths[p_ctg_id]
+        start, end = p_tp.coords[first_node], p_tp.coords[last_node]
+        placement.setdefault(p_ctg_id, {})
+        placement[p_ctg_id][a_ctg_id] = (start, end, p_ctg_id, a_ctg_id, first_node, last_node)
+    return placement
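Each tiling-path row is a whitespace-separated edge of the form 'ctg_id v w wrid b e score identity' (see the example row in the comments above). A short sketch of loading two chained edges and reading back node coordinates; the contig ID, node IDs, and numbers are invented for illustration:

    from falcon_kit.tiling_path import load_tiling_paths_from_split_lines

    rows = [
        '000000F 000000001:E 000000002:E 000000002 100 0 5000 99.9'.split(),
        '000000F 000000002:E 000000003:E 000000003 120 0 4800 99.8'.split(),
    ]
    tp = load_tiling_paths_from_split_lines(rows)['000000F']
    # Without contig_lens, the first node contributes 0 and each edge adds abs(b - e).
    print(tp.coords)      # {'000000001:E': 0, '000000002:E': 100, '000000003:E': 220}
    print(tp.contig_len)  # 220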

+ 0 - 0
FALCON/falcon_kit/util/__init__.py


+ 72 - 0
FALCON/falcon_kit/util/dataset_split.py

@@ -0,0 +1,72 @@
+"""A simple script to scatter a (filtered) subreadset into units of input files.
+"""
+from pbcore.io import (SubreadSet, ExternalResource) # pylint: disable=import-error
+import argparse
+import logging
+import os
+import sys
+import copy
+
+log = logging.getLogger(__name__)
+
+def split_dataset(subreadset, out_prefix):
+    """
+    Takes an input dataset, and for each entry generates one separate dataset
+    file, while maintaining all the filters.
+    Returns a FOFN of the generated datasets.
+
+    To create an example filtered dataset for testing:
+    dataset create --type SubreadSet test.subreadset.xml subreads1.bam subreads2.bam
+    dataset filter test.subreadset.xml test.filtered.subreadset.xml 'length>1000'
+    """
+    out_prefix_abs = os.path.abspath(out_prefix)
+
+    dset = SubreadSet(subreadset, strict=True, skipCounts=True)
+    fns = dset.toFofn()
+
+    log.info('resources in {!r}:\n{}'.format(subreadset, '\n'.join(fns)))
+
+    fofn = []
+    for i, bam_fn in enumerate(fns):
+        out_fn = '{}.{:05}.subreadset.xml'.format(out_prefix_abs, i)
+        new_dataset = SubreadSet(bam_fn, skipCounts=True)
+        new_dataset.newUuid()
+        new_dataset._filters = copy.deepcopy(dset._filters)
+        new_dataset.write(out_fn)
+        fofn.append(out_fn)
+
+    return fofn
+
+def run_split_dataset(subreadset, out_prefix):
+    out_prefix_abs = os.path.abspath(out_prefix)
+
+    fofn = split_dataset(subreadset, out_prefix_abs)
+
+    out_fofn_fn = '{}.fofn'.format(out_prefix_abs)
+    with open(out_fofn_fn, 'w') as ofs:
+        for fn in fofn:
+            ofs.write('{}\n'.format(fn))
+
+    log.info('Wrote {} chunks into "{}"'.format(len(fofn), out_fofn_fn))
+
+def main(argv=sys.argv):
+    description = """Scatter subreadsets in units of one input file.
+"""
+    epilog = """
+"""
+    parser = argparse.ArgumentParser(
+        description=description,
+        epilog=epilog,
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument('subreadset',
+        help='Input subreadset XML filename. Can be filtered.')
+    parser.add_argument('out_prefix',
+        help='Prefix of the output sub-datasets.')
+    args = parser.parse_args(argv[1:])
+
+    run_split_dataset(args.subreadset, args.out_prefix)
+
+if __name__ == "__main__":
+    logging.basicConfig()
+    logging.getLogger().setLevel(logging.DEBUG)
+    main(sys.argv)
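split_dataset() writes one <out_prefix>.NNNNN.subreadset.xml per external resource, copying the parent dataset's filters into each, and run_split_dataset() lists them in <out_prefix>.fofn. A hedged usage sketch; the dataset path and prefix are placeholders, and pbcore plus real BAM-backed datasets are required:

    from falcon_kit.util.dataset_split import run_split_dataset

    # See the docstring above for creating a filtered dataset to test with.
    run_split_dataset('test.filtered.subreadset.xml', 'chunk')
    # Writes chunk.00000.subreadset.xml, chunk.00001.subreadset.xml, ...
    # and records them, one per line, in chunk.fofn.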

+ 273 - 0
FALCON/falcon_kit/util/io.py

@@ -0,0 +1,273 @@
+"""I/O utilities
+Not specific to FALCON.
+"""
+
+
+#from builtins import str
+from builtins import object
+import contextlib
+import os
+import resource
+import shlex
+import shutil
+import subprocess as sp
+import sys
+import tempfile
+import traceback
+from ..io import deserialize
+
+
+def write_nothing(*args):
+    """
+    To use,
+      LOG = write_nothing
+    """
+
+
+def write_with_pid(*args):
+    msg = '[%d]%s\n' % (os.getpid(), ' '.join(args))
+    sys.stderr.write(msg)
+
+
+LOG = write_with_pid
+
+
+def logstats():
+    """This is useful 'atexit'.
+    """
+    LOG('maxrss:%9d' % (resource.getrusage(resource.RUSAGE_SELF).ru_maxrss))
+
+
+def reprarg(arg):
+    """Reduce the size of repr()
+    """
+    if isinstance(arg, str):
+        if len(arg) > 100:
+            return '{}...({})'.format(arg[:100], len(arg))
+    elif (isinstance(arg, set) or isinstance(arg, list)
+            or isinstance(arg, tuple) or isinstance(arg, dict)):
+        if len(arg) > 9:
+            return '%s(%d elem)' % (type(arg).__name__, len(arg))
+        else:
+            return '<' + ', '.join(reprarg(a) for a in arg) + '>'
+    return repr(arg)
+
+
+def run_func(args):
+    """Wrap multiprocessing.Pool calls.
+    Usage:
+        pool.imap(run_func, [(func, arg0, arg1, ...), ...])
+    """
+    func = args[0]
+    try:
+        func_name = func.__name__
+    except:
+        # but since it must be pickle-able, this should never happen.
+        func_name = repr(func)
+    args = args[1:]
+    try:
+        LOG('starting %s(%s)' % (func_name, ', '.join(reprarg(a) for a in args)))
+        logstats()
+        ret = func(*args)
+        logstats()
+        LOG('finished %s(%s)' % (func_name, ', '.join(reprarg(a) for a in args)))
+        return ret
+    except Exception:
+        raise Exception(traceback.format_exc())
+    except:  # KeyboardInterrupt, SystemExit
+        LOG('interrupted %s(%s)' %
+            (func_name, ', '.join(reprarg(a) for a in args)))
+        return
+
+
+def system(call, check=False):
+    LOG('$(%s)' % repr(call))
+    rc = os.system(call)
+    msg = "Call %r returned %d." % (call, rc)
+    if rc:
+        LOG("WARNING: " + msg)
+        if check:
+            raise Exception(msg)
+    else:
+        LOG(msg)
+    return rc
+
+
+def syscall(cmd):
+    """Return stdout, fully captured.
+    Wait for subproc to finish.
+    Log a warning if output is empty.
+    Raise on non-zero exit-code.
+    """
+    LOG('$ {!r} >'.format(cmd))
+    output = sp.check_output(shlex.split(cmd), encoding='ascii')
+    if not output:
+        msg = '%r failed to produce any output.' % cmd
+        LOG('WARNING: %s' % msg)
+    return output
+
+
+def slurplines(cmd):
+    return syscall(cmd).splitlines()
+
+
+def streamlines(cmd):
+    """Stream stdout from cmd.
+    Let stderr fall through.
+    The returned reader will stop yielding when the subproc exits.
+    Note: We do not detect a failure in the underlying process.
+    """
+    LOG('$ %s |' % cmd)
+    proc = sp.Popen(shlex.split(cmd), stdout=sp.PIPE, encoding='ascii')
+    return proc.stdout
+
+
+class DataReaderContext(object):
+    def readlines(self):
+        output = self.data.strip()
+        for line in output.splitlines():
+            yield line
+
+    def __enter__(self):
+        pass
+
+    def __exit__(self, *args):
+        self.returncode = 0
+
+    def __init__(self, data):
+        self.data = data
+
+
+class ProcessReaderContext(object):
+    """Prefer this to slurplines() or streamlines().
+    """
+    def readlines(self):
+        """Generate lines of native str.
+        """
+        # In py2, not unicode.
+        raise NotImplementedError()
+
+    def __enter__(self):
+        LOG('{!r}'.format(self.cmd))
+        self.proc = sp.Popen(shlex.split(self.cmd), stdout=sp.PIPE, universal_newlines=True, encoding='ascii')
+
+    def __exit__(self, etype, evalue, etb):
+        if etype is None:
+            self.proc.wait()
+        else:
+            # Exception was raised in "with-block".
+            # We cannot wait on proc b/c it might never finish!
+            pass
+        self.returncode = self.proc.returncode
+        if self.returncode:
+            msg = "%r <- %r" % (self.returncode, self.cmd)
+            raise Exception(msg)
+        del self.proc
+
+    def __init__(self, cmd):
+        self.cmd = cmd
+
+
+def splitlines_iter(text):
+    """This is the same as splitlines, but with a generator.
+    """
+    # https://stackoverflow.com/questions/3054604/iterate-over-the-lines-of-a-string
+    assert isinstance(text, str)
+    prevnl = -1
+    while True:
+        nextnl = text.find('\n', prevnl + 1) # u'\n' would force unicode
+        if nextnl < 0:
+            break
+        yield text[prevnl + 1:nextnl]
+        prevnl = nextnl
+    if (prevnl + 1) != len(text):
+        yield text[prevnl + 1:]
+
+
+class CapturedProcessReaderContext(ProcessReaderContext):
+    def readlines(self):
+        """Usage:
+
+            cmd = 'ls -l'
+            reader = CapturedProcessReaderContext(cmd)
+            with reader:
+                for line in reader.readlines():
+                    print(line)
+
+        Any exception within the 'with-block' is propagated.
+        Otherwise, after all lines are read, if 'cmd' failed, Exception is raised.
+        """
+        output, _ = self.proc.communicate()
+        # Process has terminated by now, so we can iterate without keeping it alive.
+        #for line in splitlines_iter(str(output, 'utf-8')):
+        for line in splitlines_iter(output):
+            yield line
+
+
+class StreamedProcessReaderContext(ProcessReaderContext):
+    def readlines(self):
+        """Usage:
+
+            cmd = 'ls -l'
+            reader = StreamedProcessReaderContext(cmd)
+            with reader:
+                for line in reader.readlines():
+                    print(line)
+
+        Any exception within the 'with-block' is propagated.
+        Otherwise, after all lines are read, if 'cmd' failed, Exception is raised.
+        """
+        for line in self.proc.stdout:
+            # We expect unicode from py3 but raw-str from py2, given
+            # universal_newlines=True.
+            # Based on source-code in 'future/types/newstr.py',
+            # it seems that str(str(x)) has no extra penalty,
+            # and it should not crash either. Anyway,
+            # our tests would catch it.
+            #yield str(line, 'utf-8').rstrip()
+            yield line.rstrip()
+
+
+def filesize(fn):
+    """In bytes.
+    Raise if fn does not exist.
+    """
+    statinfo = os.stat(fn)
+    return statinfo.st_size
+
+
+def validated_fns(fofn):
+    return list(yield_validated_fns(fofn))
+
+
+def yield_validated_fns(fofn):
+    """Return list of filenames from fofn, either abs or relative to CWD instead of dir of fofn.
+    Assert none are empty/non-existent.
+    """
+    dirname = os.path.normpath(os.path.dirname(os.path.realpath(fofn))) # normpath makes '' become '.'
+    try:
+        fns = deserialize(fofn)
+    except:
+        #LOG('las fofn {!r} does not seem to be JSON or msgpack; try to switch, so we can detect truncated files.'.format(fofn))
+        fns = open(fofn).read().strip().split()
+    try:
+        for fn in fns:
+            assert fn
+            if not os.path.isabs(fn):
+                fn = os.path.normpath(os.path.relpath(os.path.join(dirname, fn)))
+            assert os.path.isfile(fn), 'File {!r} is not a file.'.format(fn)
+            assert filesize(fn), '{!r} has size {}'.format(fn, filesize(fn))
+            yield fn
+    except Exception:
+        sys.stderr.write('Failed to validate FOFN {!r}\n'.format(fofn))
+        raise
+
+
+@contextlib.contextmanager
+def TemporaryDirectory():
+    name = tempfile.mkdtemp()
+    LOG('TemporaryDirectory={!r}'.format(name))
+    try:
+        yield name
+    finally:
+        shutil.rmtree(name)
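run_func() lets a multiprocessing.Pool fan out arbitrary (function, arg, ...) tuples while logging the start, finish, and maxrss of each call. A minimal sketch of that pattern; the worker and filenames are hypothetical:

    import multiprocessing
    from falcon_kit.util.io import run_func

    def count_lines(fn):
        # Hypothetical worker; must be a picklable top-level function.
        with open(fn) as f:
            return sum(1 for _ in f)

    if __name__ == '__main__':
        work = [(count_lines, fn) for fn in ('a.txt', 'b.txt')]  # placeholder files
        with multiprocessing.Pool(2) as pool:
            for result in pool.imap(run_func, work):
                print(result)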

+ 72 - 0
FALCON/falcon_kit/util/ordered_set.py

@@ -0,0 +1,72 @@
+# http://code.activestate.com/recipes/576694/
+import collections.abc
+
+class OrderedSet(collections.abc.MutableSet):
+
+    def __init__(self, iterable=None):
+        self.end = end = [] 
+        end += [None, end, end]         # sentinel node for doubly linked list
+        self.map = {}                   # key --> [key, prev, next]
+        if iterable is not None:
+            self |= iterable
+
+    def __len__(self):
+        return len(self.map)
+
+    def __contains__(self, key):
+        return key in self.map
+
+    def update(self, other):
+        for i in other:
+            self.add(i)
+
+    def add(self, key):
+        if key not in self.map:
+            end = self.end
+            curr = end[1]
+            curr[2] = end[1] = self.map[key] = [key, curr, end]
+
+    def discard(self, key):
+        if key in self.map:        
+            key, prev, next = self.map.pop(key)
+            prev[2] = next
+            next[1] = prev
+
+    def __iter__(self):
+        end = self.end
+        curr = end[2]
+        while curr is not end:
+            yield curr[0]
+            curr = curr[2]
+
+    def __reversed__(self):
+        end = self.end
+        curr = end[1]
+        while curr is not end:
+            yield curr[0]
+            curr = curr[1]
+
+    def pop(self, last=True):
+        if not self:
+            raise KeyError('set is empty')
+        key = self.end[1][0] if last else self.end[2][0]
+        self.discard(key)
+        return key
+
+    def __repr__(self):
+        if not self:
+            return '%s()' % (self.__class__.__name__,)
+        return '%s(%r)' % (self.__class__.__name__, list(self))
+
+    def __eq__(self, other):
+        if isinstance(other, OrderedSet):
+            return len(self) == len(other) and list(self) == list(other)
+        return set(self) == set(other)
+
+            
+if __name__ == '__main__':
+    s = OrderedSet('abracadaba')
+    t = OrderedSet('simsalabim')
+    print((s | t))
+    print((s & t))
+    print((s - t))

+ 126 - 0
FALCON/falcon_kit/util/system.py

@@ -0,0 +1,126 @@
+
+
+
+from future.utils import viewitems
+from pypeflow.io import cd, capture
+import fnmatch
+import logging
+import os
+import pprint
+import random
+import time
+
+log = logging.getLogger(__name__)
+
+
+def only_these_symlinks(dir2paths):
+    """Create symlinks, and delete all other symlinks for each directory.
+      dir2paths := {dir: [paths]}
+    ('paths' is usually a list of 1.)
+    Use relative symlink targets.
+    Possibly, log everything we do, as one log statement to avoid user complaints.
+    """
+    log.info('Symlink .las files for further merging:\n{}'.format(
+        pprint.pformat(dict(dir2paths))))
+    for (d, paths) in viewitems(dir2paths):
+        bases = [os.path.basename(path) for path in paths]
+        base2rel = {os.path.basename(path): os.path.relpath(
+            path, d) for path in paths}
+        assert len(base2rel) == len(
+            bases), 'Non-unique basename in {}'.format(repr(paths))
+        for existing_base in os.listdir(d):
+            existing_path = os.path.join(d, existing_base)
+            if os.path.islink(existing_path):
+                if existing_base in base2rel:
+                    if os.readlink(existing_path) != base2rel[existing_base]:
+                        # Wrong target (or non-relative) so remove it.
+                        os.unlink(existing_path)
+                    else:
+                        del base2rel[existing_base]  # Just keep it.
+                else:
+                    os.unlink(existing_path)  # Old? Remove it for safety.
+        for (base, rel) in viewitems(base2rel):
+            path = os.path.join(d, base)
+            os.symlink(rel, path)
+
+
+def lfs_setstripe_maybe(path='.', stripe=12):
+    path = os.path.abspath(path)
+    cmd = 'lfs setstripe -c {:d} {!s}'.format(stripe, path)
+    try:
+        capture(cmd)
+        log.info('Lustre filesystem detected. This lfs stripe ({}) should propagate to subdirs of {!r}.'.format(
+            stripe, path))
+    except Exception as exc:
+        log.info('Apparently {!r} is not in lustre filesystem, which is fine.'.format(
+            path))
+
+
+def find_files(root_path, pattern):
+    """
+    Finds all files with filenames formatted as the
+    given pattern, descending down from root_path.
+    raise Exception if root_path is not a directory.
+    """
+    if not os.path.isdir(root_path):
+        raise Exception('Not a directory: {!r}'.format(root_path))
+    for root, dirs, files in os.walk(root_path):
+        dirs.sort()
+        for filename in sorted(fnmatch.filter(files, pattern)):
+            yield os.path.join(root, filename)
+
+
+def abs_fns(ifofns, idir=None):
+    """Yield absolute filenames from a streamed file-of-filenames.
+    """
+    log.info('Absolutizing FOFN in dir={!r}'.format(os.path.abspath(idir)))
+    for line in ifofns.read().split():
+        ifn = line.strip()
+        if not ifn:
+            continue
+        if not os.path.isabs(ifn):
+            ifn = os.path.abspath(os.path.join(idir, ifn))
+        yield ifn
+
+
+def make_fofn_abs(i_fofn_fn, o_fofn_fn):
+    """Copy i_fofn to o_fofn, but with relative filenames expanded for the dir of i_fofn.
+    """
+    assert os.path.abspath(o_fofn_fn) != os.path.abspath(
+        i_fofn_fn), '{!r} != {!r}'.format(o_fofn_fn, i_fofn_fn)
+    with open(i_fofn_fn) as ifs, open(o_fofn_fn, 'w') as ofs:
+        for fn in abs_fns(ifs, os.path.dirname(os.path.realpath(i_fofn_fn))):
+            ofs.write(fn + '\n')
+    # return o_fofn_fn
+
+
+def make_dirs(d):
+    if not os.path.isdir(d):
+        log.debug('mkdir -p {!r}'.format(d))
+        os.makedirs(d)
+
+
+def touch(*paths):
+    """touch a file.
+    """
+    msg = 'touch {!r}'.format(paths)
+    log.debug(msg)
+    for path in paths:
+        if os.path.exists(path):
+            os.utime(path, None)
+        else:
+            open(path, 'a').close()
+
+
+def make_executable(path):
+    """http://stackoverflow.com/questions/12791997/how-do-you-do-a-simple-chmod-x-from-within-python
+    """
+    mode = os.stat(path).st_mode
+    mode |= (mode & 0o444) >> 2    # copy R bits to X
+    os.chmod(path, mode)
+
+
+def set_random_seed(seed):
+    seed = seed if seed else int(time.time())
+    random.seed(seed)
+    log.info('Random seed: {}'.format(seed))
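only_these_symlinks() reconciles a directory against a desired set of links: wanted symlinks are created with relative targets, existing ones with the wrong target are replaced, and any other symlink in that directory is removed. A hedged sketch with invented paths, assuming pypeFLOW is installed (the module imports it); basenames within one directory must be unique:

    from falcon_kit.util.system import make_dirs, only_these_symlinks

    make_dirs('0-rawreads/m_00001')  # os.listdir() requires the directory to exist
    only_these_symlinks({
        '0-rawreads/m_00001': [
            '0-rawreads/job_0000/raw_reads.1.las',
            '0-rawreads/job_0001/raw_reads.2.las',
        ],
    })
    # m_00001/raw_reads.1.las -> ../job_0000/raw_reads.1.las
    # m_00001/raw_reads.2.las -> ../job_0001/raw_reads.2.las
    # Any other symlink previously inside m_00001/ is unlinked.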

+ 97 - 0
FALCON/setup.py

@@ -0,0 +1,97 @@
+#!/usr/bin/env python3.7
+
+from setuptools import setup, Extension
+
+
+# requires.txt
+# networkx>=1.9.1
+# python-edlib
+# python-msgpack
+# future>=0.16.0
+# pypeFLOW>=2.0.0
+install_requires = [
+    "networkx >=1.9.1",
+    # "python-edlib",
+    # "python-msgpack",
+    # "pb-dazzler",
+    "future >= 0.16.0",
+    #"pypeFLOW >= 2.0.0",
+
+]
+
+# https://docs.python.org/3/extending/building.html
+# module1 = Extension('demo',
+#                     define_macros = [('MAJOR_VERSION', '1'),
+#                                      ('MINOR_VERSION', '0')],
+#                     include_dirs = ['/usr/local/include'],
+#                     libraries = ['tcl83'],
+#                     library_dirs = ['/usr/local/lib'],
+#                     sources = ['demo.c'])
+ext_falcon_module = Extension(
+    'ext_falcon',
+    sources=['src/c/ext_falcon.c', 'src/c/DW_banded.c', 'src/c/kmer_lookup.c', 'src/c/falcon.c'],
+    extra_link_args=[],
+    extra_compile_args=['-fPIC', '-O3',
+                        '-fno-omit-frame-pointer'],# '-fno-omit-frame-pointer' can help with gperftools.
+    # libraries=['profiler'],
+    # library_dirs=['/home/cdunn/local/lib'],
+    # language="c++", # c for now
+    # export_symbols=['generate_consensus'], # for windows?
+)
+
+scripts = []
+
+# PKG-INFO
+# Metadata-Version: 2.1
+# Name: falcon-kit
+# Version: 1.4.2
+# Summary: a small toolkit for DNA sequence alignment, overlapping, and assembly
+# Home-page: UNKNOWN
+# Author: Jason Chin
+# Author-email: jchin@pacificbiosciences.com
+# License: UNKNOWN
+# Description: UNKNOWN
+# Platform: UNKNOWN
+# Provides-Extra: falcon-task
+setup(name='falcon_kit',
+      version='1.4.2',
+      description='a small toolkit for DNA sequence alignment, overlapping, and assembly',
+      author='Jason Chin',
+      author_email='jchin@pacificbiosciences.com',
+      packages=['falcon_kit',
+                'falcon_kit.mains',
+                'falcon_kit.util',
+                ],
+      package_dir={'falcon_kit': 'falcon_kit/'},
+      ext_modules=[ext_falcon_module],
+      entry_points={'console_scripts': [
+            'falcon-task = falcon_kit.mains.tasks:main',
+            'fc_actg_coordinate = falcon_kit.mains.actg_coordinate:main',
+            'fc_calc_cutoff = falcon_kit.mains.calc_cutoff:main',
+            'fc_consensus = falcon_kit.mains.consensus:main',
+            'fc_contig_annotate = falcon_kit.mains.contig_annotate:main',
+            'fc_ctg_link_analysis = falcon_kit.mains.ctg_link_analysis:main',
+            'fc_dedup_a_tigs = falcon_kit.mains.dedup_a_tigs:main',
+            'fc_fasta2fasta = falcon_kit.mains.fasta2fasta:main',
+            'fc_fetch_reads = falcon_kit.mains.fetch_reads:main',
+            'fc_gen_gfa_v1 = falcon_kit.mains.gen_gfa_v1:main',
+            'fc_get_read_ctg_map = falcon_kit.mains.get_read_ctg_map:main',
+            'fc_graph_to_contig = falcon_kit.mains.graph_to_contig:main',
+            'fc_graph_to_utgs = falcon_kit.mains.graph_to_utgs:main',
+            'fc_ovlp_filter = falcon_kit.mains.ovlp_filter:main',
+            'fc_ovlp_stats = falcon_kit.mains.ovlp_stats:main',
+            'fc_ovlp_to_graph = falcon_kit.mains.ovlp_to_graph:main',
+            'fc_pr_ctg_track = falcon_kit.mains.pr_ctg_track:main',
+            'fc_rr_ctg_track = falcon_kit.mains.rr_ctg_track:main',
+            'fc_run = falcon_kit.mains.run1:main',
+            'fc_run.py = falcon_kit.mains.run1:main',
+            'fc_run1 = falcon_kit.mains.run1:main',
+        ],
+      },
+      extras_require={
+          'falcon-task':  ['falcon_kit'],
+      },
+      scripts=scripts,
+      zip_safe=False,
+      install_requires=install_requires,
+)

+ 337 - 0
FALCON/src/c/DW_banded.c

@@ -0,0 +1,337 @@
+
+/*
+ * =====================================================================================
+ *
+ *       Filename:  DW_banded.c
+ *
+ *    Description:  A banded version for the O(ND) greedy sequence alignment algorithm
+ *
+ *        Version:  0.1
+ *        Created:  07/20/2013 17:00:00
+ *       Revision:  none
+ *       Compiler:  gcc
+ *
+ *         Author:  Jason Chin,
+ *        Company:
+ *
+ * =====================================================================================
+
+ #################################################################################$$
+ # Copyright (c) 2011-2014, Pacific Biosciences of California, Inc.
+ #
+ # All rights reserved.
+ #
+ # Redistribution and use in source and binary forms, with or without
+ # modification, are permitted (subject to the limitations in the
+ # disclaimer below) provided that the following conditions are met:
+ #
+ #  * Redistributions of source code must retain the above copyright
+ #  notice, this list of conditions and the following disclaimer.
+ #
+ #  * Redistributions in binary form must reproduce the above
+ #  copyright notice, this list of conditions and the following
+ #  disclaimer in the documentation and/or other materials provided
+ #  with the distribution.
+ #
+ #  * Neither the name of Pacific Biosciences nor the names of its
+ #  contributors may be used to endorse or promote products derived
+ #  from this software without specific prior written permission.
+ #
+ # NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+ # GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+ # BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+ # WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ # DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+ # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+ # USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ # OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ # SUCH DAMAGE.
+ #################################################################################$$
+
+
+*/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <limits.h>
+#include <stdbool.h>
+#include "common.h"
+
+int compare_d_path(const void * a, const void * b)
+{
+    const d_path_data2 * arg1 = a;
+    const d_path_data2 * arg2 = b;
+    if (arg1->d - arg2->d == 0) {
+        return  arg1->k - arg2->k;
+    } else {
+        return arg1->d - arg2->d;
+    }
+}
+
+
+void d_path_sort( d_path_data2 * base, unsigned long max_idx) {
+    qsort(base, max_idx, sizeof(d_path_data2), compare_d_path);
+}
+
+d_path_data2 * get_dpath_idx( seq_coor_t d, seq_coor_t k, unsigned long max_idx, d_path_data2 * base) {
+    d_path_data2 d_tmp;
+    d_path_data2 *rtn;
+    d_tmp.d = d;
+    d_tmp.k = k;
+    rtn = (d_path_data2 *)  bsearch( &d_tmp, base, max_idx, sizeof(d_path_data2), compare_d_path);
+    //printf("dp %ld %ld %ld %ld %ld %ld %ld\n", (rtn)->d, (rtn)->k, (rtn)->x1, (rtn)->y1, (rtn)->x2, (rtn)->y2, (rtn)->pre_k);
+
+    return rtn;
+
+}
+
+void print_d_path(  d_path_data2 * base, unsigned long max_idx) {
+    unsigned long idx;
+    for (idx = 0; idx < max_idx; idx++){
+        printf("dp %ld %d %d %d %d %d %d %d\n",idx, (base+idx)->d, (base+idx)->k, (base+idx)->x1, (base+idx)->y1, (base+idx)->x2, (base+idx)->y2, (base+idx)->pre_k);
+    }
+}
+
+void* my_calloc(int nitems, size_t size, char const* msg, int lineno) {
+    if (nitems < 0) {
+        fprintf(stderr, "CRITICAL ERROR: %s=calloc(%d, %zu) cannot take a negative value at line %d.\n",
+                msg, nitems, size, lineno);
+        abort();
+    }
+    void* result = calloc((size_t)nitems, size);
+    if (NULL == result) {
+        fprintf(stderr, "CRITICAL ERROR: %s=calloc(%d, %zu) returned 0.\n",
+                msg, nitems, size, lineno);
+        abort();
+    }
+    return result;
+}
+
+alignment * align(char * query_seq, seq_coor_t q_len,
+                  char * target_seq, seq_coor_t t_len,
+                  seq_coor_t band_tolerance,
+                  int get_aln_str) {
+    seq_coor_t * V;
+    seq_coor_t * U;  // array of matched bases for each "k"
+    seq_coor_t k_offset;
+    seq_coor_t d;
+    seq_coor_t k, k2;
+    seq_coor_t best_m;  // the best "matches" for each d
+    seq_coor_t min_k, new_min_k;
+    seq_coor_t max_k, new_max_k;
+    seq_coor_t pre_k;
+    seq_coor_t x, y;
+    seq_coor_t cd;
+    seq_coor_t ck;
+    seq_coor_t cx, cy, nx, ny;
+    seq_coor_t max_d;
+    seq_coor_t band_size;
+    unsigned long d_path_idx = 0;
+    unsigned long max_idx = 0;
+
+    d_path_data2 * d_path;
+    d_path_data2 * d_path_aux;
+    path_point * aln_path;
+    seq_coor_t aln_path_idx;
+    alignment * align_rtn;
+    seq_coor_t aln_pos;
+    seq_coor_t i;
+    bool aligned = false;
+
+    //printf("debug: %ld %ld\n", q_len, t_len);
+    //printf("%s\n", query_seq);
+
+    max_d = (int) (0.3*(q_len + t_len));
+
+    band_size = band_tolerance * 2;
+
+    V = my_calloc(max_d * 2 + 1, sizeof(seq_coor_t), "V", __LINE__);
+    U = my_calloc(max_d * 2 + 1, sizeof(seq_coor_t), "U", __LINE__);
+
+    k_offset = max_d;
+
+    if ((size_t)INT_MAX < ((size_t)max_d * (size_t)(band_size + 1) * 2ULL)) {
+        fprintf(stderr, "CRITICAL ERROR: q_len=%d and t_len=%d => max_d=%d, and band_size=%d. Those lens are too big.\n", q_len, t_len, max_d, band_size);
+        abort();
+    }
+    // We should probably use a hashmap to store the backtracking information to save memory-allocation time.
+    // This O(MN) block allocation scheme is convenient for now, but it is slower for very long sequences.
+    d_path = my_calloc(max_d * (band_size + 1 ) * 2 + 1, sizeof(d_path_data2), "d_path", __LINE__);
+
+    aln_path = my_calloc(q_len + t_len + 1, sizeof(path_point), "aln_path", __LINE__);
+
+    align_rtn = my_calloc(1, sizeof(alignment), "align_rtn", __LINE__);
+    align_rtn->t_aln_str = my_calloc(q_len + t_len + 1, sizeof(char), "align_rtn->t_aln_str", __LINE__);
+    align_rtn->q_aln_str = my_calloc(q_len + t_len + 1, sizeof(char), "align_rtn->q_aln_str", __LINE__);
+    align_rtn->aln_str_size = 0;
+    align_rtn->aln_q_s = 0;
+    align_rtn->aln_q_e = 0;
+    align_rtn->aln_t_s = 0;
+    align_rtn->aln_t_e = 0;
+
+    //printf("max_d: %lu, band_size: %lu\n", max_d, band_size);
+    best_m = -1;
+    min_k = 0;
+    max_k = 0;
+    d_path_idx = 0;
+    max_idx = 0;
+    for (d = 0; d < max_d; d ++ ) {
+        if (max_k - min_k > band_size) {
+            break;
+        }
+
+        for (k = min_k; k <= max_k;  k += 2) {
+
+            if ( (k == min_k) || ((k != max_k) && (V[ k - 1 + k_offset ] < V[ k + 1 + k_offset])) ) {
+                pre_k = k + 1;
+                x = V[ k + 1 + k_offset];
+            } else {
+                pre_k = k - 1;
+                x = V[ k - 1 + k_offset] + 1;
+            }
+            y = x - k;
+            d_path[d_path_idx].d = d;
+            d_path[d_path_idx].k = k;
+            d_path[d_path_idx].x1 = x;
+            d_path[d_path_idx].y1 = y;
+
+            while ( x < q_len && y < t_len && query_seq[x] == target_seq[y] ){
+                x++;
+                y++;
+            }
+
+            d_path[d_path_idx].x2 = x;
+            d_path[d_path_idx].y2 = y;
+            d_path[d_path_idx].pre_k = pre_k;
+            d_path_idx ++;
+
+            V[ k + k_offset ] = x;
+            U[ k + k_offset ] = x + y;
+
+            if ( x + y > best_m) {
+                best_m = x + y;
+            }
+
+            if ( x >= q_len || y >= t_len) {
+                aligned = true;
+                max_idx = d_path_idx;
+                break;
+            }
+        }
+
+        // For banding
+        new_min_k = max_k;
+        new_max_k = min_k;
+
+        for (k2 = min_k; k2 <= max_k;  k2 += 2) {
+            if (U[ k2 + k_offset] >= best_m - band_tolerance ) {
+                if ( k2 < new_min_k ) {
+                    new_min_k = k2;
+                }
+                if ( k2 > new_max_k ) {
+                    new_max_k = k2;
+                }
+            }
+        }
+
+        max_k = new_max_k + 1;
+        min_k = new_min_k - 1;
+
+        // For no banding
+        // max_k ++;
+        // min_k --;
+
+        // For debugging
+        // printf("min_max_k,d, %ld %ld %ld\n", min_k, max_k, d);
+
+        if (aligned == true) {
+            align_rtn->aln_q_e = x;
+            align_rtn->aln_t_e = y;
+            align_rtn->dist = d;
+            align_rtn->aln_str_size = (x + y + d) / 2;
+            align_rtn->aln_q_s = 0;
+            align_rtn->aln_t_s = 0;
+
+            d_path_sort(d_path, max_idx);
+            //print_d_path(d_path, max_idx);
+
+            if (get_aln_str > 0) {
+                cd = d;
+                ck = k;
+                aln_path_idx = 0;
+                while (cd >= 0 && aln_path_idx < q_len + t_len + 1) {
+                    d_path_aux = (d_path_data2 *) get_dpath_idx( cd, ck, max_idx, d_path);
+                    aln_path[aln_path_idx].x = d_path_aux -> x2;
+                    aln_path[aln_path_idx].y = d_path_aux -> y2;
+                    aln_path_idx ++;
+                    aln_path[aln_path_idx].x = d_path_aux -> x1;
+                    aln_path[aln_path_idx].y = d_path_aux -> y1;
+                    aln_path_idx ++;
+                    ck = d_path_aux -> pre_k;
+                    cd -= 1;
+                }
+                aln_path_idx --;
+                cx = aln_path[aln_path_idx].x;
+                cy = aln_path[aln_path_idx].y;
+                align_rtn->aln_q_s = cx;
+                align_rtn->aln_t_s = cy;
+                aln_pos = 0;
+                while ( aln_path_idx > 0 ) {
+                    aln_path_idx --;
+                    nx = aln_path[aln_path_idx].x;
+                    ny = aln_path[aln_path_idx].y;
+                    if (cx == nx && cy == ny){
+                        continue;
+                    }
+                    if (nx == cx && ny != cy){ //advance in y
+                        for (i = 0; i <  ny - cy; i++) {
+                            align_rtn->q_aln_str[aln_pos + i] = '-';
+                        }
+                        for (i = 0; i <  ny - cy; i++) {
+                            align_rtn->t_aln_str[aln_pos + i] = target_seq[cy + i];
+                        }
+                        aln_pos += ny - cy;
+                    } else if (nx != cx && ny == cy){ //advance in x
+                        for (i = 0; i <  nx - cx; i++) {
+                            align_rtn->q_aln_str[aln_pos + i] = query_seq[cx + i];
+                        }
+                        for (i = 0; i <  nx - cx; i++) {
+                            align_rtn->t_aln_str[aln_pos + i] = '-';
+                        }
+                        aln_pos += nx - cx;
+                    } else {
+                        for (i = 0; i <  nx - cx; i++) {
+                            align_rtn->q_aln_str[aln_pos + i] = query_seq[cx + i];
+                        }
+                        for (i = 0; i <  ny - cy; i++) {
+                            align_rtn->t_aln_str[aln_pos + i] = target_seq[cy + i];
+                        }
+                        aln_pos += ny - cy;
+                    }
+                    cx = nx;
+                    cy = ny;
+                }
+                align_rtn->aln_str_size = aln_pos;
+            }
+            break;
+        }
+    }
+
+    free(V);
+    free(U);
+    free(d_path);
+    free(aln_path);
+    return align_rtn;
+}
+
+
+void free_alignment(alignment * aln) {
+    free(aln->q_aln_str);
+    free(aln->t_aln_str);
+    free(aln);
+}

+ 20 - 0
FALCON/src/c/Makefile

@@ -0,0 +1,20 @@
+DW_align.so: DW_banded.c common.h
+	gcc DW_banded.c -O3 -shared -fPIC -o DW_align.so
+
+kmer_lookup.so: kmer_lookup.c common.h
+	gcc kmer_lookup.c -O3 -shared -fPIC -o kmer_lookup.so
+
+#falcon: DW_banded.c common.h kmer_lookup.c falcon.c 
+#	gcc DW_banded.c kmer_lookup.c falcon.c -O4 -o falcon -fPIC 
+
+falcon.so: falcon.c common.h DW_banded.c kmer_lookup.c
+	gcc DW_banded.c kmer_lookup.c falcon.c -O3 -shared -fPIC -o falcon.so 
+
+#falcon2.so: falcon.c common.h DW_banded_2.c kmer_lookup.c
+#	gcc DW_banded_2.c kmer_lookup.c falcon.c -O3 -shared -fPIC -o falcon2.so 
+
+clean:
+	rm  *.so
+
+all: DW_align.so kmer_lookup.so falcon.so
+

+ 16 - 0
FALCON/src/c/Makefile.osx

@@ -0,0 +1,16 @@
+DW_align.so: DW_banded.c common.h
+	gcc DW_banded.c -O3 -shared -fPIC -o DW_align.so -I/Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX10.8.sdk/usr/include/ -L/Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX10.8.sdk/usr/lib
+
+kmer_lookup.so: kmer_lookup.c common.h
+	gcc kmer_lookup.c -O3 -shared -fPIC -o kmer_lookup.so -I/Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX10.8.sdk/usr/include/ -L/Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX10.8.sdk/usr/lib
+
+falcon: DW_banded.c common.h kmer_lookup.c falcon.c 
+	gcc DW_banded.c kmer_lookup.c falcon.c -O4 -o falcon -fPIC -I/Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX10.8.sdk/usr/include/ -L/Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX10.8.sdk/usr/lib
+
+falcon.so: falcon.c common.h DW_banded.c kmer_lookup.c
+	gcc DW_banded.c kmer_lookup.c falcon.c -O3 -shared -fPIC -o falcon.so -I/Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX10.8.sdk/usr/include/ -L/Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX10.8.sdk/usr/lib
+
+
+
+all: DW_align.so kmer_lookup.so falcon.so falcon
+

+ 178 - 0
FALCON/src/c/common.h

@@ -0,0 +1,178 @@
+
+/*
+ * =====================================================================================
+ *
+ *       Filename:  common.h
+ *
+ *    Description:  Common declarations for the code base
+ *
+ *        Version:  0.1
+ *        Created:  07/16/2013 07:46:23 AM
+ *       Revision:  none
+ *       Compiler:  gcc
+ *
+ *         Author:  Jason Chin, 
+ *        Company:  
+ *
+ * =====================================================================================
+
+ #################################################################################$$
+ # Copyright (c) 2011-2014, Pacific Biosciences of California, Inc.
+ #
+ # All rights reserved.
+ #
+ # Redistribution and use in source and binary forms, with or without
+ # modification, are permitted (subject to the limitations in the
+ # disclaimer below) provided that the following conditions are met:
+ #
+ #  * Redistributions of source code must retain the above copyright
+ #  notice, this list of conditions and the following disclaimer.
+ #
+ #  * Redistributions in binary form must reproduce the above
+ #  copyright notice, this list of conditions and the following
+ #  disclaimer in the documentation and/or other materials provided
+ #  with the distribution.
+ #
+ #  * Neither the name of Pacific Biosciences nor the names of its
+ #  contributors may be used to endorse or promote products derived
+ #  from this software without specific prior written permission.
+ #
+ # NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+ # GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+ # BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+ # WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ # DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+ # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+ # USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ # OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ # SUCH DAMAGE.
+ #################################################################################$$
+ */
+
+typedef int seq_coor_t; 
+
+typedef struct {    
+    seq_coor_t aln_str_size ;
+    seq_coor_t dist ;
+    seq_coor_t aln_q_s;
+    seq_coor_t aln_q_e;
+    seq_coor_t aln_t_s;
+    seq_coor_t aln_t_e;
+    char * q_aln_str;
+    char * t_aln_str;
+
+} alignment;
+
+
+typedef struct {
+    seq_coor_t pre_k;
+    seq_coor_t x1;
+    seq_coor_t y1;
+    seq_coor_t x2;
+    seq_coor_t y2;
+} d_path_data;
+
+typedef struct {
+    seq_coor_t d;
+    seq_coor_t k;
+    seq_coor_t pre_k;
+    seq_coor_t x1;
+    seq_coor_t y1;
+    seq_coor_t x2;
+    seq_coor_t y2;
+} d_path_data2;
+
+typedef struct {
+    seq_coor_t x;
+    seq_coor_t y;
+} path_point;
+
+typedef struct {    
+    seq_coor_t start;
+    seq_coor_t last;
+    seq_coor_t count;
+} kmer_lookup;
+
+typedef unsigned char base;
+typedef base * seq_array;
+typedef seq_coor_t seq_addr;
+typedef seq_addr * seq_addr_array;
+
+
+typedef struct {
+    seq_coor_t count;
+    seq_coor_t * query_pos;
+    seq_coor_t * target_pos;
+} kmer_match;
+
+
+typedef struct {
+    seq_coor_t s1;
+    seq_coor_t e1;
+    seq_coor_t s2;
+    seq_coor_t e2;
+    long int score;
+} aln_range;
+
+
+typedef struct {
+    char * sequence;
+    int * eqv;
+} consensus_data;
+
+kmer_lookup * allocate_kmer_lookup (seq_coor_t);
+void init_kmer_lookup ( kmer_lookup *,  seq_coor_t );
+void free_kmer_lookup(kmer_lookup *);
+
+seq_array allocate_seq(seq_coor_t);
+void init_seq_array( seq_array, seq_coor_t);
+void free_seq_array(seq_array);
+
+seq_addr_array allocate_seq_addr(seq_coor_t size); 
+
+void free_seq_addr_array(seq_addr_array);
+
+
+aln_range *  find_best_aln_range(kmer_match *, 
+                              seq_coor_t, 
+                              seq_coor_t, 
+                              seq_coor_t); 
+
+void free_aln_range( aln_range *);
+
+kmer_match * find_kmer_pos_for_seq( char *, 
+                                    seq_coor_t, 
+                                    unsigned int K, 
+                                    seq_addr_array, 
+                                    kmer_lookup * );
+
+void free_kmer_match( kmer_match * ptr);
+void free_kmer_lookup(kmer_lookup * );
+
+
+
+void add_sequence ( seq_coor_t, 
+                    unsigned int, 
+                    char *, 
+                    seq_coor_t,
+                    seq_addr_array, 
+                    seq_array, 
+                    kmer_lookup *); 
+
+void mask_k_mer(seq_coor_t, kmer_lookup *, seq_coor_t);
+
+alignment * align(char *, seq_coor_t,
+                  char *, seq_coor_t,
+                  seq_coor_t,
+                  int); 
+
+void free_alignment(alignment *);
+
+
+void free_consensus_data(consensus_data *);
+

+ 84 - 0
FALCON/src/c/ext_falcon.c

@@ -0,0 +1,84 @@
+//https://docs.python.org/zh-cn/3/howto/cporting.html
+
+#include "Python.h"
+
+struct module_state {
+    PyObject *error;
+};
+
+#if PY_MAJOR_VERSION >= 3
+#define GETSTATE(m) ((struct module_state*)PyModule_GetState(m))
+#else
+#define GETSTATE(m) (&_state)
+static struct module_state _state;
+#endif
+
+static PyObject *
+error_out(PyObject *m) {
+    struct module_state *st = GETSTATE(m);
+    PyErr_SetString(st->error, "something bad happened");
+    return NULL;
+}
+
+static PyMethodDef ext_falcon_methods[] = {
+    {"error_out", (PyCFunction)error_out, METH_NOARGS, NULL},
+    {NULL, NULL}
+};
+
+#if PY_MAJOR_VERSION >= 3
+
+static int ext_falcon_traverse(PyObject *m, visitproc visit, void *arg) {
+    Py_VISIT(GETSTATE(m)->error);
+    return 0;
+}
+
+static int ext_falcon_clear(PyObject *m) {
+    Py_CLEAR(GETSTATE(m)->error);
+    return 0;
+}
+
+
+static struct PyModuleDef moduledef = {
+        PyModuleDef_HEAD_INIT,
+        "ext_falcon",
+        NULL,
+        sizeof(struct module_state),
+        ext_falcon_methods,
+        NULL,
+        ext_falcon_traverse,
+        ext_falcon_clear,
+        NULL
+};
+
+#define INITERROR return NULL
+
+PyMODINIT_FUNC
+PyInit_ext_falcon(void)
+
+#else
+#define INITERROR return NULL
+
+void
+initext_falcon(void)
+#endif
+{
+#if PY_MAJOR_VERSION >= 3
+    PyObject *module = PyModule_Create(&moduledef);
+#else
+    PyObject *module = Py_InitModule("ext_falcon", ext_falcon_methods);
+#endif
+
+    if (module == NULL)
+        INITERROR;
+    struct module_state *st = GETSTATE(module);
+
+    st->error = PyErr_NewException("ext_falcon.Error", NULL, NULL);
+    if (st->error == NULL) {
+        Py_DECREF(module);
+        INITERROR;
+    }
+
+#if PY_MAJOR_VERSION >= 3
+    return module;
+#endif
+}
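
ext_falcon as committed exports only error_out; the C routines appear to be reached through the shared objects built above rather than through hand-written bindings. Purely as a hedged illustration (none of this is in the commit, and the name echo_seq is a placeholder), one extra entry point could be registered in ext_falcon_methods using only CPython calls that work under both the Python 2 and Python 3 branches above:

    /* hypothetical wrapper -- name and behaviour are placeholders */
    static PyObject *
    echo_seq(PyObject *self, PyObject *args) {
        const char *seq;
        if (!PyArg_ParseTuple(args, "s", &seq))   /* one C-string argument */
            return NULL;
        /* a real binding would call into falcon.c / kmer_lookup.c here */
        return Py_BuildValue("s", seq);           /* placeholder: echo the input back */
    }

    /* plus one extra row in the method table:
     *   {"echo_seq", (PyCFunction)echo_seq, METH_VARARGS, NULL},
     */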

+ 841 - 0
FALCON/src/c/falcon.c

@@ -0,0 +1,841 @@
+/*
+ * =====================================================================================
+ *
+ *       Filename:  fastcon.c
+ *
+ *    Description:
+ *
+ *        Version:  0.1
+ *        Created:  07/20/2013 17:00:00
+ *       Revision:  none
+ *       Compiler:  gcc
+ *
+ *         Author:  Jason Chin,
+ *        Company:
+ *
+ * =====================================================================================
+
+ #################################################################################$$
+ # Copyright (c) 2011-2014, Pacific Biosciences of California, Inc.
+ #
+ # All rights reserved.
+ #
+ # Redistribution and use in source and binary forms, with or without
+ # modification, are permitted (subject to the limitations in the
+ # disclaimer below) provided that the following conditions are met:
+ #
+ #  * Redistributions of source code must retain the above copyright
+ #  notice, this list of conditions and the following disclaimer.
+ #
+ #  * Redistributions in binary form must reproduce the above
+ #  copyright notice, this list of conditions and the following
+ #  disclaimer in the documentation and/or other materials provided
+ #  with the distribution.
+ #
+ #  * Neither the name of Pacific Biosciences nor the names of its
+ #  contributors may be used to endorse or promote products derived
+ #  from this software without specific prior written permission.
+ #
+ # NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+ # GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+ # BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+ # WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ # DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+ # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+ # USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ # OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ # SUCH DAMAGE.
+ #################################################################################$$
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <limits.h>
+#include <string.h>
+#include <assert.h>
+#include <stdint.h>
+#include "common.h"
+
+typedef struct {
+    seq_coor_t t_pos;
+    uint8_t delta;
+    char q_base;
+    seq_coor_t p_t_pos;   // the tag position of the previous base
+    uint8_t p_delta; // the tag delta of the previous base
+    char p_q_base;        // the previous base
+    unsigned q_id;
+} align_tag_t;
+
+typedef struct {
+    seq_coor_t len;
+    align_tag_t * align_tags;
+} align_tags_t;
+
+
+typedef struct {
+    uint16_t size;
+    uint16_t n_link;
+    seq_coor_t * p_t_pos;   // the tag position of the previous base
+    uint8_t * p_delta; // the tag delta of the previous base
+    char * p_q_base;        // the previous base
+    uint16_t * link_count;
+    uint16_t count;
+    seq_coor_t best_p_t_pos;
+    uint8_t best_p_delta;
+    uint8_t best_p_q_base; // encoded base
+    double score;
+} align_tag_col_t;
+
+typedef struct {
+    align_tag_col_t * base;
+} msa_base_group_t;
+
+typedef struct {
+    uint8_t size;
+    uint8_t max_delta;
+    msa_base_group_t * delta;
+} msa_delta_group_t;
+
+typedef msa_delta_group_t * msa_pos_t;
+
+align_tags_t * get_align_tags( char * aln_q_seq,
+                               char * aln_t_seq,
+                               seq_coor_t aln_seq_len,
+                               aln_range * range,
+                               unsigned q_id,
+                               seq_coor_t t_offset) {
+    char p_q_base;
+    align_tags_t * tags;
+    seq_coor_t i, j, jj, k, p_j, p_jj;
+
+    tags = calloc( 1, sizeof(align_tags_t) );
+    tags->len = aln_seq_len;
+    tags->align_tags = calloc( aln_seq_len + 1, sizeof(align_tag_t) );
+    i = range->s1 - 1;
+    j = range->s2 - 1;
+    jj = 0;
+    p_j = -1;
+    p_jj = 0;
+    p_q_base = '.';
+
+    for (k = 0; k < aln_seq_len; k++) {
+        if (aln_q_seq[k] != '-') {
+            i ++;
+            jj ++;
+        }
+        if (aln_t_seq[k] != '-') {
+            j ++;
+            jj = 0;
+        }
+        //printf("t %d %d %d %c %c\n", q_id, j, jj, aln_t_seq[k], aln_q_seq[k]);
+
+
+        if ( j + t_offset >= 0 && jj < UINT8_MAX && p_jj < UINT8_MAX) {
+            (tags->align_tags[k]).t_pos = j + t_offset;
+            (tags->align_tags[k]).delta = jj;
+            (tags->align_tags[k]).p_t_pos = p_j + t_offset;
+            (tags->align_tags[k]).p_delta = p_jj;
+            (tags->align_tags[k]).p_q_base = p_q_base;
+            (tags->align_tags[k]).q_base = aln_q_seq[k];
+            (tags->align_tags[k]).q_id = q_id;
+
+            p_j = j;
+            p_jj = jj;
+            p_q_base = aln_q_seq[k];
+        } else {
+            break; // stop extending the tag string when the alignment gap exceeds UINT8_MAX
+        }
+    }
+    // sentinel at the end
+    //k = aln_seq_len;
+    tags->len = k;
+    (tags->align_tags[k]).t_pos = UINT_MAX;
+    (tags->align_tags[k]).delta = UINT8_MAX;
+    (tags->align_tags[k]).q_base = '.';
+    (tags->align_tags[k]).q_id = UINT_MAX;
+    return tags;
+}
+
+void free_align_tags( align_tags_t * tags) {
+    free( tags->align_tags );
+    free( tags );
+}
+
+
+void allocate_aln_col( align_tag_col_t * col) {
+    col->p_t_pos = ( seq_coor_t * ) calloc(col->size, sizeof( seq_coor_t ));
+    col->p_delta = ( uint8_t * ) calloc(col->size, sizeof( uint8_t ));
+    col->p_q_base = ( char * )calloc(col->size, sizeof( char ));
+    col->link_count = ( uint16_t * ) calloc(col->size, sizeof( uint16_t ));
+}
+
+void realloc_aln_col( align_tag_col_t * col ) {
+    col->p_t_pos = (seq_coor_t *) realloc( col->p_t_pos, (col->size) * sizeof( seq_coor_t ));
+    col->p_delta = ( uint8_t *)  realloc( col->p_delta, (col->size) * sizeof( uint8_t ));
+    col->p_q_base = (char *) realloc( col->p_q_base, (col->size) * sizeof( char ));
+    col->link_count = ( uint16_t *) realloc( col->link_count, (col->size) * sizeof( uint16_t ));
+}
+
+void free_aln_col( align_tag_col_t * col) {
+    free(col->p_t_pos);
+    free(col->p_delta);
+    free(col->p_q_base);
+    free(col->link_count);
+}
+
+
+void allocate_delta_group( msa_delta_group_t * g) {
+    int i,j;
+    g->max_delta = 0;
+    g->delta = (msa_base_group_t *) calloc( g->size, sizeof(msa_base_group_t));
+    for (i = 0; i< g->size; i++) {
+        g->delta[i].base = ( align_tag_col_t * ) calloc( 5, sizeof(align_tag_col_t ) );
+        for (j = 0; j < 5; j++ ) {
+             g->delta[i].base[j].size = 8;
+             allocate_aln_col(&(g->delta[i].base[j]));
+        }
+    }
+}
+
+void realloc_delta_group( msa_delta_group_t * g, uint16_t new_size ) {
+    int i, j, bs, es;
+    bs = g->size;
+    es = new_size;
+    g->delta = (msa_base_group_t *) realloc(g->delta, new_size * sizeof(msa_base_group_t));
+    for (i=bs; i < es; i++) {
+        g->delta[i].base = ( align_tag_col_t *) calloc( 5, sizeof(align_tag_col_t ) );
+        for (j = 0; j < 5; j++ ) {
+             g->delta[i].base[j].size = 8;
+             allocate_aln_col(&(g->delta[i].base[j]));
+        }
+    }
+    g->size = new_size;
+}
+
+void free_delta_group( msa_delta_group_t * g) {
+    // free every base column of each delta slot before freeing the group itself
+    int i, j;
+    for (i = 0; i < g->size; i++) {
+        for (j = 0; j < 5; j++) {
+            free_aln_col( &(g->delta[i].base[j]) );
+        }
+        free(g->delta[i].base);
+    }
+    free(g->delta);
+}
+
+void update_col( align_tag_col_t * col, seq_coor_t p_t_pos, uint8_t p_delta, char p_q_base) {
+    int updated = 0;
+    int kk;
+    col->count += 1;
+    for (kk = 0; kk < col->n_link; kk++) {
+        if ( p_t_pos == col->p_t_pos[kk] &&
+             p_delta == col->p_delta[kk] &&
+             p_q_base == col->p_q_base[kk] ) {
+            col->link_count[kk] ++;
+            updated = 1;
+            break;
+        }
+    }
+    if (updated == 0) {
+        if (col->n_link + 1 > col->size) {
+            if (col->size < (UINT16_MAX >> 1)-1) {
+                col->size *= 2;
+            } else {
+                col->size += 256;
+            }
+            assert( col->size < UINT16_MAX-1 );
+            realloc_aln_col(col);
+        }
+        kk = col->n_link;
+
+        col->p_t_pos[kk] = p_t_pos;
+        col->p_delta[kk] = p_delta;
+        col->p_q_base[kk] = p_q_base;
+        col->link_count[kk] = 1;
+        col->n_link++;
+    }
+}
+
+
+msa_pos_t * get_msa_working_sapce(unsigned int max_t_len) {
+    msa_pos_t * msa_array;
+    unsigned int i;
+    msa_array = calloc(max_t_len, sizeof(msa_pos_t *));
+    for (i = 0; i < max_t_len; i++) {
+        msa_array[i] = calloc(1, sizeof(msa_delta_group_t));
+        msa_array[i]->size = 8;
+        allocate_delta_group(msa_array[i]);
+    }
+    return msa_array;
+}
+
+void clean_msa_working_space( msa_pos_t * msa_array, unsigned int max_t_len) {
+    unsigned int i,j,k;
+    align_tag_col_t * col;
+    for (i = 0; i < max_t_len; i++) {
+        for (j =0; j < msa_array[i]->max_delta + 1; j++) {
+            for (k = 0; k < 5; k++ ) {
+                col = msa_array[i]->delta[j].base + k;
+                /*
+                for (c =0; c < col->size; c++) {
+                    col->p_t_pos[c] = 0;
+                    col->p_delta[c] = 0;
+                    col->p_q_base[c] = 0;
+                    col->link_count[c] =0;
+                }
+                */
+                col->n_link = 0;
+                col->count = 0;
+                col->best_p_t_pos = 0;
+                col->best_p_delta = 0;
+                col->best_p_q_base = 0;
+                col->score = 0;
+            }
+        }
+        msa_array[i]->max_delta = 0;
+    }
+}
+
+#define STATIC_ALLOCATE
+//#undef STATIC_ALLOCATE
+
+consensus_data * get_cns_from_align_tags( align_tags_t ** tag_seqs,
+                                          unsigned n_tag_seqs,
+                                          unsigned t_len,
+                                          unsigned min_cov ) {
+
+    seq_coor_t i, j;
+    seq_coor_t t_pos = 0;
+    unsigned int * coverage;
+    unsigned int * local_nbase;
+
+    consensus_data * consensus;
+    //char * consensus;
+    align_tag_t * c_tag;
+
+    coverage = calloc( t_len, sizeof(unsigned int) );
+    local_nbase = calloc( t_len, sizeof(unsigned int) );
+
+#ifndef STATIC_ALLOCATE
+
+    msa_pos_t * msa_array = NULL; // For more efficiency, this should be injected.
+    msa_array = calloc(t_len, sizeof(msa_pos_t *));
+
+    for (i = 0; i < t_len; i++) {
+        msa_array[i] = calloc(1, sizeof(msa_delta_group_t));
+        msa_array[i]->size = 8;
+        allocate_delta_group(msa_array[i]);
+    }
+
+#else
+
+    static msa_pos_t * msa_array = NULL;
+    if ( msa_array == NULL) {
+        msa_array = get_msa_working_sapce( 100000 );
+    }
+
+    assert(t_len < 100000);
+
+#endif
+
+
+    // loop through every alignment
+    //printf("XX %d\n", n_tag_seqs);
+    for (i = 0; i < n_tag_seqs; i++) {
+
+        // for each alignment position, insert the alignment tag to msa_array
+        for (j = 0; j < tag_seqs[i]->len; j++) {
+            c_tag = tag_seqs[i]->align_tags + j;
+            unsigned int delta;
+            delta = c_tag->delta;
+            if (delta == 0) {
+                t_pos = c_tag->t_pos;
+                coverage[ t_pos ] ++;
+            }
+            // Assume t_pos was set on earlier iteration.
+            // (Otherwise, use its initial value, which might be an error. ~cd)
+            if (delta > msa_array[t_pos]->max_delta) {
+                msa_array[t_pos]->max_delta = delta;
+                if (msa_array[t_pos]->max_delta + 4 > msa_array[t_pos]->size ) {
+                    realloc_delta_group(msa_array[t_pos], msa_array[t_pos]->max_delta + 8);
+                }
+            }
+
+            unsigned int base = -1;
+            switch (c_tag->q_base) {
+                case 'A': base = 0; break;
+                case 'C': base = 1; break;
+                case 'G': base = 2; break;
+                case 'T': base = 3; break;
+                case '-': base = 4; break;
+            }
+            // Note: On bad input, base may be -1.
+            update_col( &(msa_array[t_pos]->delta[delta].base[base]), c_tag->p_t_pos, c_tag->p_delta, c_tag->p_q_base);
+            local_nbase[ t_pos ] ++;
+        }
+    }
+
+    // propagate scores through the alignment links and set up backtracking information
+    align_tag_col_t * g_best_aln_col = 0;
+    unsigned int g_best_ck = 0;
+    seq_coor_t g_best_t_pos = 0;
+    {
+        int kk;
+        int ck;
+        // char base;
+        int best_i;
+        int best_j;
+        int best_b;
+        int best_ck = -1;
+        double score;
+        double best_score;
+        double g_best_score;
+        // char best_mark;
+
+        align_tag_col_t * aln_col;
+
+        g_best_score = -1;
+
+        for (i = 0; i < t_len; i++) {  //loop through every template base
+            //printf("max delta: %d %d\n", i, msa_array[i]->max_delta);
+            for (j = 0; j <= msa_array[i]->max_delta; j++) { // loop through every delta position
+                for (kk = 0; kk < 5; kk++) {  // loop through the different bases at the same delta position
+                    /*
+                    switch (kk) {
+                        case 0: base = 'A'; break;
+                        case 1: base = 'C'; break;
+                        case 2: base = 'G'; break;
+                        case 3: base = 'T'; break;
+                        case 4: base = '-'; break;
+                    }
+                    */
+                    aln_col = msa_array[i]->delta[j].base + kk;
+                    if (aln_col->count >= 0) {
+                        best_score = -1;
+                        best_i = -1;
+                        best_j = -1;
+                        best_b = -1;
+
+                        for (ck = 0; ck < aln_col->n_link; ck++) { // loop through the different links to the previous column
+                            int pi;
+                            int pj;
+                            int pkk;
+                            pi = aln_col->p_t_pos[ck];
+                            pj = aln_col->p_delta[ck];
+                            switch (aln_col->p_q_base[ck]) {
+                                case 'A': pkk = 0; break;
+                                case 'C': pkk = 1; break;
+                                case 'G': pkk = 2; break;
+                                case 'T': pkk = 3; break;
+                                case '-': pkk = 4; break;
+                                default: pkk = 4;
+                            }
+
+                            if (aln_col->p_t_pos[ck] == -1) {
+                                score =  (double) aln_col->link_count[ck] - (double) coverage[i] * 0.5;
+                            } else {
+                                score = msa_array[pi]->delta[pj].base[pkk].score +
+                                        (double) aln_col->link_count[ck] - (double) coverage[i] * 0.5;
+                            }
+                            // best_mark = ' ';
+                            if (score > best_score) {
+                                best_score = score;
+                                aln_col->best_p_t_pos = best_i = pi;
+                                aln_col->best_p_delta = best_j = pj;
+                                aln_col->best_p_q_base = best_b = pkk;
+                                best_ck = ck;
+                                // best_mark = '*';
+                            }
+                            /*
+                            printf("X %d %d %d %c %d %d %d %c %d %lf %c\n", coverage[i], i, j, base, aln_col->count,
+                                                                  aln_col->p_t_pos[ck],
+                                                                  aln_col->p_delta[ck],
+                                                                  aln_col->p_q_base[ck],
+                                                                  aln_col->link_count[ck],
+                                                                  score, best_mark);
+                            */
+                        }
+                        aln_col->score = best_score;
+                        if (best_score > g_best_score) {
+                            g_best_score = best_score;
+                            g_best_aln_col = aln_col;
+                            g_best_ck = best_ck;
+                            g_best_t_pos = i;
+                            //printf("GB %d %d %d %d\n", i, j, ck, g_best_aln_col);
+                        }
+                    }
+                }
+            }
+        }
+        assert(g_best_score != -1);
+    }
+
+    // reconstruct the sequences
+    unsigned int index;
+    char bb = '$';
+    int ck;
+    char * cns_str;
+    int * eqv;
+    double score0;
+
+    consensus = calloc( 1, sizeof(consensus_data) );
+    consensus->sequence = calloc( t_len * 2 + 1, sizeof(char) );
+    consensus->eqv = calloc( t_len * 2 + 1, sizeof(unsigned int) );
+    cns_str = consensus->sequence;
+    eqv =  consensus->eqv;
+
+    index = 0;
+    ck = g_best_ck;
+    i = g_best_t_pos;
+
+    while (1) {
+        if (coverage[i] > min_cov) {
+            switch (ck) {
+                case 0: bb = 'A'; break;
+                case 1: bb = 'C'; break;
+                case 2: bb = 'G'; break;
+                case 3: bb = 'T'; break;
+                case 4: bb = '-'; break;
+            }
+        } else {
+            switch (ck) {
+                case 0: bb = 'a'; break;
+                case 1: bb = 'c'; break;
+                case 2: bb = 'g'; break;
+                case 3: bb = 't'; break;
+                case 4: bb = '-'; break;
+            }
+        }
+        // Note: On bad input, bb will keep previous value, possibly '$'.
+
+        score0 = g_best_aln_col->score;
+        i = g_best_aln_col->best_p_t_pos;
+        if (i == -1 || index >= t_len * 2) break;
+        j = g_best_aln_col->best_p_delta;
+        ck = g_best_aln_col->best_p_q_base;
+        g_best_aln_col = msa_array[i]->delta[j].base + ck;
+
+        if (bb != '-') {
+            cns_str[index] = bb;
+            eqv[index] = (int) score0 - (int) g_best_aln_col->score;
+            //printf("C %d %d %c %lf %d %d\n", i, index, bb, g_best_aln_col->score, coverage[i], eqv[index] );
+            index ++;
+        }
+    }
+
+    // reverse the sequence
+    for (i = 0; i < index/2; i++) {
+        cns_str[i] = cns_str[i] ^ cns_str[index-i-1];
+        cns_str[index-i-1] = cns_str[i] ^ cns_str[index-i-1];
+        cns_str[i] = cns_str[i] ^ cns_str[index-i-1];
+        eqv[i] = eqv[i] ^ eqv[index-i-1];
+        eqv[index-i-1] = eqv[i] ^ eqv[index-i-1];
+        eqv[i] = eqv[i] ^ eqv[index-i-1];
+    }
+
+    cns_str[index] = 0;
+    //printf("%s\n", cns_str);
+#ifndef STATIC_ALLOCATE
+    for (i = 0; i < t_len; i++) {
+        free_delta_group(msa_array[i]);
+        free(msa_array[i]);
+    }
+
+    free(msa_array);
+#else
+    clean_msa_working_space(msa_array, t_len+1);
+#endif
+
+    free(coverage);
+    free(local_nbase);
+    return consensus;
+}
+
+//const unsigned int K = 8;
+//min_cov = 2
+//min_idt = 0.7
+consensus_data * generate_consensus( char ** input_seq,
+                           unsigned int n_seq,
+                           unsigned min_cov,
+                           unsigned K,
+                           double min_idt) {
+    unsigned int j;
+    unsigned int seq_count;
+    unsigned int aligned_seq_count;
+    kmer_lookup * lk_ptr;
+    seq_array sa_ptr;
+    seq_addr_array sda_ptr;
+    kmer_match * kmer_match_ptr;
+    aln_range * arange;
+    alignment * aln;
+    align_tags_t ** tags_list;
+    //char * consensus;
+    consensus_data * consensus;
+    double max_diff;
+    max_diff = 1.0 - min_idt;//0.3
+
+    seq_count = n_seq;
+    //printf("XX n_seq %d\n", n_seq);
+    //for (j=0; j < seq_count; j++) {
+    //    printf("seq_len: %u %u\n", j, strlen(input_seq[j]));
+    //};
+    fflush(stdout);
+
+    tags_list = calloc( seq_count, sizeof(align_tags_t *) );
+    lk_ptr = allocate_kmer_lookup( 1 << (K * 2) );//1 << (K * 2) = 2^16
+    sa_ptr = allocate_seq( (seq_coor_t) strlen( input_seq[0]) );
+    sda_ptr = allocate_seq_addr( (seq_coor_t) strlen( input_seq[0]) );
+    add_sequence( 0, K, input_seq[0], strlen(input_seq[0]), sda_ptr, sa_ptr, lk_ptr);
+    //mask_k_mer(1 << (K * 2), lk_ptr, 16);
+
+    aligned_seq_count = 0;
+    for (j=1; j < seq_count; j++) {
+
+        //printf("seq_len: %ld %u\n", j, strlen(input_seq[j]));
+
+        kmer_match_ptr = find_kmer_pos_for_seq(input_seq[j], strlen(input_seq[j]), K, sda_ptr, lk_ptr);
+#define INDEL_ALLOWENCE_0 6
+
+        arange = find_best_aln_range(kmer_match_ptr, K, K * INDEL_ALLOWENCE_0, 5);  // narrow band to avoid aligning through big indels
+
+        //printf("1:%ld %ld %ld %ld\n", arange_->s1, arange_->e1, arange_->s2, arange_->e2);
+
+        //arange = find_best_aln_range2(kmer_match_ptr, K, K * INDEL_ALLOWENCE_0, 5);  // narrow band to avoid aligning through big indels
+
+        //printf("2:%ld %ld %ld %ld\n\n", arange->s1, arange->e1, arange->s2, arange->e2);
+
+#define INDEL_ALLOWENCE_1 0.10
+        if (arange->e1 - arange->s1 < 100 || arange->e2 - arange->s2 < 100 ||
+            abs( (arange->e1 - arange->s1 ) - (arange->e2 - arange->s2) ) >
+                   (int) (0.5 * INDEL_ALLOWENCE_1 * (arange->e1 - arange->s1 + arange->e2 - arange->s2))) {
+            free_kmer_match( kmer_match_ptr);
+            free_aln_range(arange);
+            continue;
+        }
+        //printf("%ld %s\n", strlen(input_seq[j]), input_seq[j]);
+        //printf("%ld %s\n\n", strlen(input_seq[0]), input_seq[0]);
+
+
+#define INDEL_ALLOWENCE_2 150
+
+        aln = align(input_seq[j]+arange->s1, arange->e1 - arange->s1 ,
+                    input_seq[0]+arange->s2, arange->e2 - arange->s2 ,
+                    INDEL_ALLOWENCE_2, 1);
+        if (aln->aln_str_size > 500 && ((double) aln->dist / (double) aln->aln_str_size) < max_diff) {
+            tags_list[aligned_seq_count] = get_align_tags( aln->q_aln_str,
+                                                           aln->t_aln_str,
+                                                           aln->aln_str_size,
+                                                           arange, j,
+                                                           0);
+            aligned_seq_count ++;
+        }
+        /***
+        for (k = 0; k < tags_list[j]->len; k++) {
+            printf("%ld %d %c\n", tags_list[j]->align_tags[k].t_pos,
+                                   tags_list[j]->align_tags[k].delta,
+                                   tags_list[j]->align_tags[k].q_base);
+        }
+        ***/
+        free_aln_range(arange);
+        free_alignment(aln);
+        free_kmer_match( kmer_match_ptr);
+    }
+
+    if (aligned_seq_count > 0) {
+        consensus = get_cns_from_align_tags( tags_list, aligned_seq_count, strlen(input_seq[0]), min_cov );
+    } else {
+        // allocate an empty consensus sequence
+        consensus = calloc( 1, sizeof(consensus_data) );
+        consensus->sequence = calloc( 1, sizeof(char) );
+        consensus->eqv = calloc( 1, sizeof(unsigned int) );
+    }
+    //free(consensus);
+    free_seq_addr_array(sda_ptr);
+    free_seq_array(sa_ptr);
+    free_kmer_lookup(lk_ptr);
+    for (j=0; j < aligned_seq_count; j++) {
+        free_align_tags(tags_list[j]);
+    }
+    free(tags_list);
+    return consensus;
+}
+
+consensus_data * generate_utg_consensus( char ** input_seq,
+                           seq_coor_t *offset,
+                           unsigned int n_seq,
+                           unsigned min_cov,
+                           unsigned K,
+                           double min_idt) {
+
+    unsigned int j;
+    unsigned int seq_count;
+    unsigned int aligned_seq_count;
+    aln_range * arange;
+    alignment * aln;
+    align_tags_t ** tags_list;
+    //char * consensus;
+    consensus_data * consensus;
+    double max_diff;
+    seq_coor_t utg_len;
+    seq_coor_t r_len;
+    max_diff = 1.0 - min_idt;
+
+
+    seq_count = n_seq;
+    /***
+    for (j=0; j < seq_count; j++) {
+        printf("seq_len: %u %u\n", j, strlen(input_seq[j]));
+    };
+    fflush(stdout);
+    ***/
+    tags_list = calloc( seq_count+1, sizeof(align_tags_t *) );
+    utg_len =  strlen(input_seq[0]);
+    aligned_seq_count = 0;
+    arange = calloc( 1, sizeof(aln_range) );
+
+    arange->s1 = 0;
+    arange->e1 = strlen(input_seq[0]);
+    arange->s2 = 0;
+    arange->e2 = strlen(input_seq[0]);
+    tags_list[aligned_seq_count] = get_align_tags( input_seq[0], input_seq[0],
+                                                   strlen(input_seq[0]), arange, 0, 0);
+    aligned_seq_count += 1;
+    for (j=1; j < seq_count; j++) {
+        arange->s1 = 0;
+        arange->e1 = strlen(input_seq[j])-1;
+        arange->s2 = 0;
+        arange->e2 = strlen(input_seq[j])-1;
+
+        r_len = strlen(input_seq[j]);
+        //printf("seq_len: %u %u\n", j, r_len);
+        if ( offset[j] < 0) {
+            if ((r_len + offset[j]) < 128) {
+                continue;
+            }
+            if ( r_len + offset[j] < utg_len ) {
+
+                //printf("1: %ld %u %u\n", offset[j], r_len, utg_len);
+                aln = align(input_seq[j] - offset[j], r_len + offset[j] ,
+                            input_seq[0], r_len + offset[j] ,
+                            500, 1);
+            } else {
+                //printf("2: %ld %u %u\n", offset[j], r_len, utg_len);
+                aln = align(input_seq[j] - offset[j], utg_len ,
+                            input_seq[0], utg_len ,
+                            500, 1);
+            }
+            offset[j] = 0;
+
+        } else {
+            if ( offset[j] > utg_len - 128) {
+                continue;
+            }
+            if ( offset[j] + r_len > utg_len ) {
+                //printf("3: %ld %u %u\n", offset[j], r_len, utg_len);
+                aln = align(input_seq[j], utg_len - offset[j] ,
+                            input_seq[0]+offset[j], utg_len - offset[j],
+                            500, 1);
+            } else {
+                //printf("4: %ld %u %u\n", offset[j], r_len, utg_len);
+                aln = align(input_seq[j], r_len ,
+                            input_seq[0]+offset[j], r_len ,
+                            500, 1);
+            }
+        }
+        if (aln->aln_str_size > 500 && ((double) aln->dist / (double) aln->aln_str_size) < max_diff) {
+            tags_list[aligned_seq_count] = get_align_tags( aln->q_aln_str, aln->t_aln_str,
+                                                           aln->aln_str_size, arange, j,
+                                                           offset[j]);
+            aligned_seq_count ++;
+        }
+        free_alignment(aln);
+    }
+    free_aln_range(arange);
+    if (aligned_seq_count > 0) {
+        consensus = get_cns_from_align_tags( tags_list, aligned_seq_count, utg_len, 0 );
+    } else {
+        // allocate an empty consensus sequence
+        consensus = calloc( 1, sizeof(consensus_data) );
+        consensus->sequence = calloc( 1, sizeof(char) );
+        consensus->eqv = calloc( 1, sizeof(unsigned int) );
+    }
+    //free(consensus);
+    for (j=0; j < aligned_seq_count; j++) {
+        free_align_tags(tags_list[j]);
+    }
+    free(tags_list);
+    return consensus;
+}
+
+
+void free_consensus_data( consensus_data * consensus ){
+    free(consensus->sequence);
+    free(consensus->eqv);
+    free(consensus);
+}
+
+/***
+void main() {
+    unsigned int j;
+    char small_buffer[1024];
+    char big_buffer[65536];
+    char ** input_seq;
+    char ** seq_id;
+    int seq_count;
+    char * consensus;
+
+    input_seq = calloc( 501, sizeof(char *));
+    seq_id = calloc( 501, sizeof(char *));
+
+    while(1) {
+        seq_count = 0;
+        while (1) {
+
+            scanf("%s", small_buffer);
+            seq_id[seq_count] = calloc( strlen(small_buffer) + 1, sizeof(char));
+            strcpy(seq_id[seq_count], small_buffer);
+
+            scanf("%s", big_buffer);
+            input_seq[seq_count] = calloc( strlen(big_buffer) + 1 , sizeof(char));
+            strcpy(input_seq[seq_count], big_buffer);
+
+            if (strcmp(seq_id[seq_count], "+") == 0) {
+                break;
+            }
+            if (strcmp(seq_id[seq_count], "-") == 0) {
+                break;
+            }
+            //printf("%s\n", seq_id[seq_count]);
+            seq_count += 1;
+            if (seq_count > 500) break;
+        }
+        //printf("sc: %d\n", seq_count);
+        if (seq_count < 10 && strcmp(seq_id[seq_count], "-") != 0 ) continue;
+        if (seq_count < 10 && strcmp(seq_id[seq_count], "-") == 0 ) break;
+
+            consensus = generate_consensus(input_seq, seq_count, 8, 8);
+        if (strlen(consensus) > 500) {
+            printf(">%s\n%s\n", seq_id[0], consensus);
+        }
+        fflush(stdout);
+        free(consensus);
+        for (j=0; j < seq_count; j++) {
+            free(seq_id[j]);
+            free(input_seq[j]);
+        };
+
+    }
+    for (j=0; j < seq_count; j++) {
+        free(seq_id[j]);
+        free(input_seq[j]);
+    };
+    free(seq_id);
+    free(input_seq);
+}
+***/
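
The commented-out driver above still calls generate_consensus() with an old four-argument form. A hedged sketch of a standalone caller against the current five-argument signature (illustrative only, not part of this commit; compile together with falcon.c, kmer_lookup.c and DW_banded.c):

    #include <stdio.h>
    #include "common.h"

    /* defined in falcon.c but not declared in common.h */
    consensus_data * generate_consensus(char **, unsigned int, unsigned, unsigned, double);

    int main(void) {
        /* reads[0] is the template/backbone; these short literals only show the
         * call shape -- supporting reads need several hundred bases of alignable
         * overlap with the template before they contribute to the consensus */
        char *reads[] = {
            "ACGTACGTTAGCATCAGTACGT",
            "ACGTACGTTAGCATCAGTACGT",
            "ACGTACGATAGCATCAGTACGT",
        };
        /* min_cov=2, K=8, min_idt=0.70, echoing the defaults noted above generate_consensus() */
        consensus_data *cns = generate_consensus(reads, 3, 2, 8, 0.70);
        printf(">consensus\n%s\n", cns->sequence);
        free_consensus_data(cns);
        return 0;
    }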

+ 591 - 0
FALCON/src/c/kmer_lookup.c

@@ -0,0 +1,591 @@
+/*
+ * =====================================================================================
+ *
+ *       Filename:  kmer_count.c
+ *
+ *    Description:
+ *
+ *        Version:  0.1
+ *        Created:  07/20/2013 17:00:00
+ *       Revision:  none
+ *       Compiler:  gcc
+ *
+ *         Author:  Jason Chin,
+ *        Company:
+ *
+ * =====================================================================================
+
+ #################################################################################$$
+ # Copyright (c) 2011-2014, Pacific Biosciences of California, Inc.
+ #
+ # All rights reserved.
+ #
+ # Redistribution and use in source and binary forms, with or without
+ # modification, are permitted (subject to the limitations in the
+ # disclaimer below) provided that the following conditions are met:
+ #
+ #  * Redistributions of source code must retain the above copyright
+ #  notice, this list of conditions and the following disclaimer.
+ #
+ #  * Redistributions in binary form must reproduce the above
+ #  copyright notice, this list of conditions and the following
+ #  disclaimer in the documentation and/or other materials provided
+ #  with the distribution.
+ #
+ #  * Neither the name of Pacific Biosciences nor the names of its
+ #  contributors may be used to endorse or promote products derived
+ #  from this software without specific prior written permission.
+ #
+ # NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+ # GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+ # BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+ # WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ # DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+ # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+ # USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ # OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ # SUCH DAMAGE.
+ #################################################################################$$
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <limits.h>
+#include "common.h"
+
+
+const unsigned int KMERMATCHINC = 10000;
+
+int compare_seq_coor(const void * a, const void * b) {
+    const seq_coor_t * arg1 = a;
+    const seq_coor_t * arg2 = b;
+    return  (* arg1) - (* arg2);
+}
+
+
+kmer_lookup * allocate_kmer_lookup ( seq_coor_t size ) {
+    kmer_lookup * kl;
+
+    //printf("%lu is allocated for kmer lookup\n", size);
+    kl = (kmer_lookup *)  malloc( size * sizeof(kmer_lookup) );
+    init_kmer_lookup( kl, size);
+    return kl;
+}
+
+void init_kmer_lookup ( kmer_lookup * kl,  seq_coor_t size ) {
+    seq_coor_t i;
+    //printf("%lu is allocated for kmer lookup\n", size);
+    for (i=0; i<size; i++) {
+        kl[i].start = INT_MAX;
+        kl[i].last = INT_MAX;
+        kl[i].count = 0;
+    }
+}
+
+
+void free_kmer_lookup( kmer_lookup *  ptr) {
+    free(ptr);
+}
+
+seq_array allocate_seq(seq_coor_t size) {
+    seq_array sa;
+    sa  = (seq_array) malloc( size * sizeof(base) );
+    init_seq_array( sa, size);
+    return sa;
+}
+
+void init_seq_array( seq_array sa, seq_coor_t size) {
+    seq_coor_t i;
+    for (i=0; i<size; i++) {
+        sa[i] = 0xff;
+    }
+}
+
+void free_seq_array( seq_array sa) {
+    free(sa);
+}
+
+seq_addr_array allocate_seq_addr(seq_coor_t size) {
+    return (seq_addr_array) calloc( size, sizeof(seq_addr));
+}
+
+void free_seq_addr_array(seq_addr_array sda) {
+    free(sda);
+}
+
+seq_coor_t get_kmer_bitvector(seq_array sa, unsigned int K) {
+    unsigned int i;
+    seq_coor_t kmer_bv = 0;
+    seq_coor_t kmer_mask;
+
+    kmer_mask = 0;
+    for (i = 0; i < K; i++) {
+        kmer_mask <<= 2;
+        kmer_mask |= 0x00000003;
+    }
+
+    for (i = 0; i < K; i++) {
+        kmer_bv <<= 2;
+        kmer_bv |= (((unsigned int) sa[i]) & 0x03);
+    }
+
+    return kmer_bv;
+}
+
+// seq_coor_t = 0
+// K = 8
+void add_sequence ( seq_coor_t start,
+                    unsigned int K,
+                    char * seq,
+                    seq_coor_t seq_len,
+                    seq_addr_array sda,
+                    seq_array sa,
+                    kmer_lookup * lk ) {
+
+    seq_coor_t i;
+    seq_coor_t kmer_bv;
+    seq_coor_t kmer_mask;
+
+    kmer_mask = 0;
+    for (i = 0; i < K; i++) {
+        kmer_mask <<= 2;
+        kmer_mask |= 0x00000003;
+    }
+
+    for (i = 0; i < seq_len; i++) {
+        switch ( seq[i] ) {
+            case 'A':
+                sa[ start + i ] = 0;
+                break;
+            case 'C':
+                sa[ start + i ] = 1;
+                break;
+            case 'G':
+                sa[ start + i ] = 2;
+                break;
+            case 'T':
+                sa[ start + i ] = 3;
+        }
+    }
+    kmer_bv = get_kmer_bitvector( sa + start, K);
+    for (i = 0; i < seq_len - K;  i++) {
+        //printf("%lu %lu\n", i, kmer_bv);
+        //printf("lk before init: %lu %lu %lu\n", kmer_bv, lk[kmer_bv].start, lk[kmer_bv].last);
+        if (lk[kmer_bv].start == INT_MAX) {
+            lk[kmer_bv].start = start + i;
+            lk[kmer_bv].last = start + i;
+            lk[kmer_bv].count += 1;
+            //printf("lk init: %lu %lu %lu\n", kmer_bv, lk[kmer_bv].start, lk[kmer_bv].last);
+        } else {
+            sda[ lk[kmer_bv].last ] = start + i;
+            lk[kmer_bv].count += 1;
+            lk[kmer_bv].last = start + i;
+            //printf("lk change: %lu %lu %lu\n", kmer_bv, lk[kmer_bv].start, lk[kmer_bv].last);
+        }
+        kmer_bv <<= 2;
+        kmer_bv |= sa[ start + i + K];
+        kmer_bv &= kmer_mask;
+    }
+}
+
+
+void mask_k_mer(seq_coor_t size, kmer_lookup * kl, seq_coor_t threshold) {
+    seq_coor_t i;
+    for (i=0; i<size; i++) {
+        if (kl[i].count > threshold) {
+            kl[i].start = INT_MAX;
+            kl[i].last = INT_MAX;
+            //kl[i].count = 0;
+        }
+    }
+}
+
+
+kmer_match * find_kmer_pos_for_seq( char * seq, seq_coor_t seq_len, unsigned int K,
+                    seq_addr_array sda,
+                    kmer_lookup * lk) {
+    seq_coor_t i;
+    seq_coor_t kmer_bv;
+    seq_coor_t kmer_mask;
+    seq_coor_t kmer_pos;
+    seq_coor_t next_kmer_pos;
+    unsigned int half_K;
+    seq_coor_t kmer_match_rtn_allocation_size = KMERMATCHINC;
+    kmer_match * kmer_match_rtn;
+    base * sa;
+
+    kmer_match_rtn = (kmer_match *) malloc( sizeof(kmer_match) );
+    kmer_match_rtn->count = 0;
+    kmer_match_rtn->query_pos = (seq_coor_t *) calloc( kmer_match_rtn_allocation_size, sizeof( seq_coor_t ) );
+    kmer_match_rtn->target_pos = (seq_coor_t *) calloc( kmer_match_rtn_allocation_size, sizeof( seq_coor_t ) );
+
+    sa = calloc( seq_len, sizeof(base) );
+
+    kmer_mask = 0;
+    for (i = 0; i < K; i++) {
+        kmer_mask <<= 2;
+        kmer_mask |= 0x00000003;
+    }
+
+    for (i = 0; i < seq_len; i++) {
+        switch ( seq[i] ) {
+            case 'A':
+                sa[ i ] = 0;
+                break;
+            case 'C':
+                sa[ i ] = 1;
+                break;
+            case 'G':
+                sa[ i ] = 2;
+                break;
+            case 'T':
+                sa[ i ] = 3;
+        }
+    }
+
+
+    kmer_bv = get_kmer_bitvector(sa, K);
+    half_K = K >> 1;
+    for (i = 0; i < seq_len - K;  i += half_K) {
+        kmer_bv = get_kmer_bitvector(sa + i, K);
+        if (lk[kmer_bv].start == INT_MAX) {  //for high count k-mers
+            continue;
+        }
+        kmer_pos = lk[ kmer_bv ].start;
+        next_kmer_pos = sda[ kmer_pos ];
+        kmer_match_rtn->query_pos[ kmer_match_rtn->count ] = i;
+        kmer_match_rtn->target_pos[ kmer_match_rtn->count ] = kmer_pos;
+        kmer_match_rtn->count += 1;
+        if (kmer_match_rtn->count > kmer_match_rtn_allocation_size - 1000) {
+            kmer_match_rtn_allocation_size += KMERMATCHINC;
+            kmer_match_rtn->query_pos = (seq_coor_t *) realloc( kmer_match_rtn->query_pos,
+                                                                   kmer_match_rtn_allocation_size  * sizeof(seq_coor_t) );
+            kmer_match_rtn->target_pos = (seq_coor_t *) realloc( kmer_match_rtn->target_pos,
+                                                                    kmer_match_rtn_allocation_size  * sizeof(seq_coor_t) );
+        }
+        while ( next_kmer_pos > kmer_pos ){
+            kmer_pos = next_kmer_pos;
+            next_kmer_pos = sda[ kmer_pos ];
+            kmer_match_rtn->query_pos[ kmer_match_rtn->count ] = i;
+            kmer_match_rtn->target_pos[ kmer_match_rtn->count ] = kmer_pos;
+            kmer_match_rtn->count += 1;
+            if (kmer_match_rtn->count > kmer_match_rtn_allocation_size - 1000) {
+                kmer_match_rtn_allocation_size += KMERMATCHINC;
+                kmer_match_rtn->query_pos = (seq_coor_t *) realloc( kmer_match_rtn->query_pos,
+                                                                       kmer_match_rtn_allocation_size  * sizeof(seq_coor_t) );
+                kmer_match_rtn->target_pos = (seq_coor_t *) realloc( kmer_match_rtn->target_pos,
+                                                                        kmer_match_rtn_allocation_size  * sizeof(seq_coor_t) );
+            }
+        }
+    }
+    free(sa);
+    return kmer_match_rtn;
+}
+
+void free_kmer_match( kmer_match * ptr) {
+    free(ptr->query_pos);
+    free(ptr->target_pos);
+    free(ptr);
+}
+
+aln_range* find_best_aln_range(kmer_match * km_ptr,
+                              seq_coor_t K,
+                              seq_coor_t bin_size,
+                              seq_coor_t count_th) {
+    seq_coor_t i;
+    seq_coor_t j;
+    seq_coor_t q_min, q_max, t_min, t_max;
+    seq_coor_t * d_count;
+    seq_coor_t * q_coor;
+    seq_coor_t * t_coor;
+    aln_range * arange;
+
+    long int d, d_min, d_max;
+    long int cur_score;
+    long int max_score;
+    long int max_k_mer_count;
+    long int max_k_mer_bin;
+    seq_coor_t cur_start;
+
+    arange = calloc(1 , sizeof(aln_range));
+
+    q_min = INT_MAX;
+    q_max = 0;
+    t_min = INT_MAX;
+    t_max = 0;
+
+    d_min = INT_MAX;
+    d_max = LONG_MIN;
+
+    for (i = 0; i <  km_ptr->count; i++ ) {
+        if ( km_ptr -> query_pos[i] < q_min) {
+            q_min =  km_ptr->query_pos[i];
+        }
+        if ( km_ptr -> query_pos[i] > q_max) {
+            q_max =  km_ptr->query_pos[i];
+        }
+        if ( km_ptr -> target_pos[i] < t_min) {
+            t_min =  km_ptr->target_pos[i];
+        }
+        if ( km_ptr -> target_pos[i] > t_max) {
+            t_max =  km_ptr->target_pos[i];
+        }
+        d = (long int) km_ptr->query_pos[i] - (long int) km_ptr->target_pos[i];
+        if ( d < d_min ) {
+            d_min = d;
+        }
+        if ( d > d_max ) {
+            d_max = d;
+        }
+    }
+
+    //printf("%lu %ld %ld\n" , km_ptr->count, d_min, d_max);
+    d_count = calloc( (d_max - d_min)/bin_size + 1, sizeof(seq_coor_t) );
+    q_coor = calloc( km_ptr->count, sizeof(seq_coor_t) );
+    t_coor = calloc( km_ptr->count, sizeof(seq_coor_t) );
+
+    for (i = 0; i <  km_ptr->count; i++ ) {
+        d = (long int) (km_ptr->query_pos[i]) - (long int) (km_ptr->target_pos[i]);
+        d_count[ (d - d_min)/ (long int) bin_size ] += 1;
+        q_coor[i] = INT_MAX;
+        t_coor[i] = INT_MAX;
+    }
+
+    j = 0;
+    max_k_mer_count = 0;
+    max_k_mer_bin = INT_MAX;
+    for (i = 0; i <  km_ptr->count; i++ ) {
+        d = (long int) (km_ptr->query_pos[i]) - (long int) (km_ptr->target_pos[i]);
+        if ( d_count[ (d - d_min)/ (long int) bin_size ] > max_k_mer_count) {
+            max_k_mer_count =  d_count[ (d - d_min)/ (long int) bin_size ];
+            max_k_mer_bin = (d - d_min)/ (long int) bin_size;
+        }
+    }
+    //printf("k_mer: %lu %lu\n" , max_k_mer_count, max_k_mer_bin);
+
+    if ( max_k_mer_bin != INT_MAX && max_k_mer_count > count_th ) {
+        for (i = 0; i <  km_ptr->count; i++ ) {
+            d = (long int) (km_ptr->query_pos[i]) - (long int) (km_ptr->target_pos[i]);
+            if ( labs( ( (d - d_min)/ (long int) bin_size ) - max_k_mer_bin ) > 5 ) {
+                continue;
+            }
+            if (d_count[ (d - d_min)/ (long int) bin_size ] > count_th) {
+                q_coor[j] = km_ptr->query_pos[i];
+                t_coor[j] = km_ptr->target_pos[i];
+                //printf("d_count: %lu %lu\n" ,i, d_count[(d - d_min)/ (long int) bin_size]);
+                //printf("coor: %lu %lu\n" , q_coor[j], t_coor[j]);
+                j ++;
+            }
+        }
+    }
+
+    if (j > 1) {
+        arange->s1 = q_coor[0];
+        arange->e1 = q_coor[0];
+        arange->s2 = t_coor[0];
+        arange->e2 = t_coor[0];
+        arange->score = 0;
+
+        max_score = 0;
+        cur_score = 0;
+        cur_start = 0;
+
+        for (i = 1; i < j; i++) {
+            cur_score += 32 - (q_coor[i] - q_coor[i-1]);
+            //printf("deltaD, %lu %ld\n", q_coor[i] - q_coor[i-1], cur_score);
+            if (cur_score < 0) {
+                cur_score = 0;
+                cur_start = i;
+            } else if (cur_score > max_score) {
+                arange->s1 = q_coor[cur_start];
+                arange->s2 = t_coor[cur_start];
+                arange->e1 = q_coor[i];
+                arange->e2 = t_coor[i];
+                max_score = cur_score;
+                arange->score = max_score;
+                //printf("%lu %lu %lu %lu\n", arange.s1, arange.e1, arange.s2, arange.e2);
+            }
+        }
+
+    } else {
+        arange->s1 = 0;
+        arange->e1 = 0;
+        arange->s2 = 0;
+        arange->e2 = 0;
+        arange->score = 0;
+    }
+
+    // printf("free\n");
+
+    free(d_count);
+    free(q_coor);
+    free(t_coor);
+    return arange;
+}
+
+aln_range* find_best_aln_range2(kmer_match * km_ptr,
+                                seq_coor_t K,
+                                seq_coor_t bin_width,
+                                seq_coor_t count_th) {
+
+    seq_coor_t * d_coor;
+    seq_coor_t * hit_score;
+    seq_coor_t * hit_count;
+    seq_coor_t * last_hit;
+    seq_coor_t max_q, max_t;
+    seq_coor_t s, e, max_s, max_e, max_span, d_s, d_e, delta, d_len;
+    seq_coor_t px, py, cx, cy;
+    seq_coor_t max_hit_idx;
+    seq_coor_t max_hit_score, max_hit_count;
+    seq_coor_t i, j;
+    seq_coor_t candidate_idx, max_d, d;
+
+    aln_range * arange;
+
+    arange = calloc(1 , sizeof(aln_range));
+
+    d_coor = calloc( km_ptr->count, sizeof(seq_coor_t) );
+
+    max_q = -1;
+    max_t = -1;
+
+    for (i = 0; i <  km_ptr->count; i++ ) {
+        d_coor[i] = km_ptr->query_pos[i] - km_ptr->target_pos[i];
+        max_q = max_q > km_ptr->query_pos[i] ? max_q : km_ptr->query_pos[i];
+        max_t = max_t > km_ptr->target_pos[i] ? max_t : km_ptr->target_pos[i];
+
+    }
+
+    qsort(d_coor, km_ptr->count, sizeof(seq_coor_t), compare_seq_coor);
+
+
+    s = 0;
+    e = 0;
+    max_s = -1;
+    max_e = -1;
+    max_span = -1;
+    delta = (long int) ( 0.05 * ( max_q + max_t ) );
+    d_len =  km_ptr->count;
+    d_s = -1;
+    d_e = -1;
+    while (1) {
+        d_s = d_coor[s];
+        d_e = d_coor[e];
+        while (d_e < d_s + delta && e < d_len-1) {
+            e += 1;
+            d_e = d_coor[e];
+        }
+        if ( max_span == -1 || e - s > max_span ) {
+            max_span = e - s;
+            max_s = s;
+            max_e = e;
+        }
+        s += 1;
+        if (s == d_len || e == d_len) {
+            break;
+        }
+    }
+
+    if (max_s == -1 || max_e == -1 || max_e - max_s < 32) {
+        arange->s1 = 0;
+        arange->e1 = 0;
+        arange->s2 = 0;
+        arange->e2 = 0;
+        arange->score = 0;
+        free(d_coor);
+        return arange;
+    }
+
+    last_hit = calloc( km_ptr->count, sizeof(seq_coor_t) );
+    hit_score = calloc( km_ptr->count, sizeof(seq_coor_t) );
+    hit_count = calloc( km_ptr->count, sizeof(seq_coor_t) );
+
+    for (i = 0; i <  km_ptr->count; i++ ) {
+        last_hit[i] = -1;
+        hit_score[i] = 0;
+        hit_count[i] = 0;
+    }
+    max_hit_idx = -1;
+    max_hit_score = 0;
+    for (i = 0; i < km_ptr->count; i ++)  {
+        cx = km_ptr->query_pos[i];
+        cy = km_ptr->target_pos[i];
+        d = cx - cy;
+        if ( d < d_coor[max_s] || d > d_coor[max_e] ) continue;
+
+        j = i - 1;
+        candidate_idx = -1;
+        max_d = 65535;
+        while (1) {
+            if ( j < 0 ) break;
+            px = km_ptr->query_pos[j];
+            py = km_ptr->target_pos[j];
+            d = px - py;
+            if ( d < d_coor[max_s] || d > d_coor[max_e] ) {
+                j--;
+                continue;
+            }
+            if (cx - px > 320) break; // this constant controls how large an alignment gap is still considered
+            if (cy > py && cx - px + cy - py < max_d && cy - py <= 320 ) {
+                max_d = cx - px + cy - py;
+                candidate_idx = j;
+            }
+            j--;
+        }
+        if (candidate_idx != -1) {
+            last_hit[i] = candidate_idx;
+            hit_score[i] = hit_score[candidate_idx] + (64 - max_d);
+            hit_count[i] = hit_count[candidate_idx] + 1;
+            if (hit_score[i] < 0) {
+                hit_score[i] = 0;
+                hit_count[i] = 0;
+            }
+        } else {
+            hit_score[i] = 0;
+            hit_count[i] = 0;
+        }
+        if (hit_score[i] > max_hit_score) {
+            max_hit_score = hit_score[i];
+            max_hit_count = hit_count[i];
+            max_hit_idx = i;
+        }
+
+    }
+    if (max_hit_idx == -1) {
+        arange->s1 = 0;
+        arange->e1 = 0;
+        arange->s2 = 0;
+        arange->e2 = 0;
+        arange->score = 0;
+        free(d_coor);
+        free(last_hit);
+        free(hit_score);
+        free(hit_count);
+        return arange;
+    }
+
+    arange->score = max_hit_count + 1;
+    arange->e1 = km_ptr->query_pos[max_hit_idx];
+    arange->e2 = km_ptr->target_pos[max_hit_idx];
+    i = max_hit_idx;
+    while (last_hit[i] != -1) {
+        i = last_hit[i];
+    }
+    arange->s1 = km_ptr->query_pos[i];
+    arange->s2 = km_ptr->target_pos[i];
+
+    free(d_coor);
+    free(last_hit);
+    free(hit_score);
+    free(hit_count);
+    return arange;
+}
+
+void free_aln_range( aln_range * arange) {
+    free(arange);
+}
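
add_sequence() and find_kmer_pos_for_seq() index K-mers by packing each base into two bits (A=0, C=1, G=2, T=3), most-significant base first, so a K-mer addresses one of 1 << (2*K) slots in the lookup table. A small self-contained check of that packing (illustrative only, not part of this commit):

    #include <stdio.h>

    int main(void) {
        unsigned char sa[8] = {0, 1, 2, 3, 0, 1, 2, 3};  /* "ACGTACGT", pre-encoded as in add_sequence() */
        unsigned int bv = 0, i, K = 8;
        for (i = 0; i < K; i++) {
            bv = (bv << 2) | (sa[i] & 0x03u);            /* same shift-and-mask packing as get_kmer_bitvector() */
        }
        printf("k-mer bitvector for ACGTACGT = 0x%X, lookup table size = %u\n",
               bv, 1u << (2 * K));                       /* prints 0x1B1B and 65536 */
        return 0;
    }

For K = 8 the table therefore has 65,536 slots, matching the 1 << (K * 2) allocation in generate_consensus().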

+ 20 - 0
FALCON/src/py_scripts/fc_run.py

@@ -0,0 +1,20 @@
+
+
+import sys
+import logging
+from falcon_kit import run_support
+from falcon_kit.mains.run1 import main
+
+LOG = logging.getLogger()
+
+def clean():
+    import os
+    cmd = "rm -rf 0-rawreads 1-preads_ovl 2-asm-falcon all.log config.json General_config.json foo.snake log.out"
+    rc = os.system(cmd)
+    LOG.info("clean cmd={}, \nresult={}".format(cmd, rc))
+
+
+if __name__ == "__main__":
+    LOG = run_support.setup_logger(None)
+    clean()
+    main(sys.argv)

+ 0 - 0
pypeFlow/pwatcher/__init__.py


+ 512 - 0
pypeFlow/pwatcher/blocking.py

@@ -0,0 +1,512 @@
+"""Blocking process-watcher.
+
+See fs_based.py. Here, delete is a no-op, and run() starts threads, so
+the main program needs to wait for threads to finish somehow.
+
+Typical submission_string:
+
+    qsub -S /bin/bash -sync y -V -q production -N ${JOB_ID} \\\n -o "${STDOUT_FILE}" \\\n -e "${STDERR_FILE}" \\\n -pe smp ${NPROC} -l h_vmem=${MB}M \\\n "${CMD}"
+"""
+try:
+    from shlex import quote
+except ImportError:
+    from pipes import quote
+import collections
+import contextlib
+import copy
+import glob
+import json
+import logging
+import os
+import pprint
+import re
+import signal
+import string
+import subprocess
+import sys
+import threading
+import time
+import traceback
+
+log = logging.getLogger(__name__)
+
+LOCAL_SUBMISSION_STRING = '/bin/bash -C ${CMD} >| ${STDOUT_FILE} 2>| ${STDERR_FILE}' # for job_local override
+STATE_FN = 'state.py'
+Job = collections.namedtuple('Job', ['jobid', 'cmd', 'rundir', 'options'])
+MetaJob = collections.namedtuple('MetaJob', ['job', 'lang_exe'])
+lang_python_exe = sys.executable
+lang_bash_exe = '/bin/bash'
+
+@contextlib.contextmanager
+def cd(newdir):
+    prevdir = os.getcwd()
+    log.debug('CD: %r <- %r' %(newdir, prevdir))
+    os.chdir(os.path.expanduser(newdir))
+    try:
+        yield
+    finally:
+        log.debug('CD: %r -> %r' %(newdir, prevdir))
+        os.chdir(prevdir)
+
+class MetaJobClass(object):
+    ext = {
+        lang_python_exe: '.py',
+        lang_bash_exe: '.bash',
+    }
+    def get_wrapper(self):
+        # Totally by convention, for now.
+        return '%s/run-%s%s' %(self.mj.job.rundir, self.mj.job.jobid, self.ext[self.mj.lang_exe])
+    def get_sentinel(self):
+        return 'exit-%s' %self.mj.job.jobid # in watched dir
+    def get_pid(self):
+        return self.mj.pid
+    def kill(self, pid, sig):
+        stored_pid = self.get_pid()
+        if not pid:
+            pid = stored_pid
+            log.info('Not passed a pid to kill. Using stored pid:%s' %pid)
+        if pid and stored_pid:
+            if pid != stored_pid:
+                log.error('pid:%s != stored_pid:%s' %(pid, stored_pid))
+        os.kill(pid, sig)
+    def __init__(self, mj):
+        self.mj = mj
+class State(object):
+    def notify_threaded(self, jobid):
+        self.jobids_threaded.add(jobid)
+    def notify_started(self, jobid):
+        #state.top['jobids_submitted'].append(jobid)
+        self.jobids_submitted.add(jobid)
+        self.jobids_threaded.remove(jobid)
+        log.debug('Thread notify_started({}).'.format(jobid))
+    def notify_exited(self, jobid, rc):
+        #self.top['jobid2exit'][jobid] = rc
+        self.jobid2exit[jobid] = rc
+        self.jobids_submitted.remove(jobid)
+        log.debug('Thread notify_exited({}->{}).'.format(jobid, rc))
+    def set_job(self, jobid, mjob):
+        # Is this needed? For now, we are not actually saving state, so no.
+        self.top['jobs'][jobid] = mjob
+    def update_jobid2status(self, jobid2status):
+        for jobid in self.jobids_threaded:
+            status = 'THREADED'
+            jobid2status[jobid] = status
+        for jobid in self.jobids_submitted:
+            status = 'RUNNING'
+            # but actually it might not have started yet, or it could be dead, since we have blocking qsub calls
+            jobid2status[jobid] = status
+        for jobid, rc in self.jobid2exit.items():
+            status = 'EXIT {}'.format(rc)
+            jobid2status[jobid] = status
+    def get_running_jobids(self):
+        return list(self.jobids_submitted)
+    def serialize(self):
+        return pprint.pformat(self.top)
+    @staticmethod
+    def deserialize(directory, content):
+        state = State(directory)
+        state.top = eval(content)
+        state.content_prev = content
+        return state
+    @staticmethod
+    def create(directory):
+        state = State(directory)
+        #makedirs(state.get_directory_wrappers())
+        #makedirs(state.get_directory_jobs())
+        return state
+    def __init__(self, directory):
+        self.__directory = os.path.abspath(directory)
+        self.content_prev = ''
+        self.top = dict() # for serialization, when we decide we need it
+        self.top['jobs'] = dict()
+        #self.top['jobids_submitted'] = list()
+        #self.top['jobid2exit'] = dict()
+        self.jobids_threaded = set()
+        self.jobids_submitted = set()
+        self.jobid2exit = dict()
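+
+# Editorial sketch (not part of the original code): how a jobid moves through
+# the State containers above. 'job-0' and the directory are made-up examples.
+def _example_state_lifecycle():
+    state = State('/tmp/pwatcher-example')  # hypothetical directory
+    state.notify_threaded('job-0')   # queued for a thread, not yet submitted
+    state.notify_started('job-0')    # the thread has started the process
+    state.notify_exited('job-0', 0)  # the process finished with rc=0
+    statuses = dict()
+    state.update_jobid2status(statuses)
+    return statuses                  # -> {'job-0': 'EXIT 0'}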
+
+class SafeState(object):
+    """Synchronized State proxy for accessing any
+    data which might be modified in a Thread.
+    """
+    def notify_threaded(self, jobid):
+        with self.lock:
+            self.state.notify_threaded(jobid)
+    def notify_started(self, jobid):
+        with self.lock:
+            self.state.notify_started(jobid)
+    def notify_exited(self, jobid, rc):
+        with self.lock:
+            self.state.notify_exited(jobid, rc)
+    def update_jobid2status(self, table):
+        with self.lock:
+            return self.state.update_jobid2status(table)
+    def get_running_jobids(self):
+        with self.lock:
+            return self.state.get_running_jobids()
+    def serialize(self):
+        with self.lock:
+            return self.state.serialize()
+    def __getattr__(self, name):
+        """For all other methods, just delegate.
+        """
+        return getattr(self.state, name)
+    def __init__(self, state):
+        self.state = state
+        self.lock = threading.Lock()
+
+def get_state(directory):
+    """For now, we never write.
+    """
+    state_fn = os.path.join(directory, STATE_FN)
+    if not os.path.exists(state_fn):
+        return State.create(directory)
+    assert False, 'No state directory needed, for now.'
+    try:
+        return State.deserialize(directory, open(state_fn).read())
+    except Exception:
+        log.exception('Failed to read state "%s". Ignoring (and soon over-writing) current state.'%state_fn)
+        # TODO: Backup previous STATE_FN?
+        return State(directory)
+def State_save(state):
+    # TODO: RW Locks, maybe for runtime of whole program.
+    content = state.serialize()
+    content_prev = state.content_prev
+    if content == content_prev:
+        return
+    fn = state.get_state_fn()
+    open(fn, 'w').write(content)
+    log.debug('saved state to %s' %repr(os.path.abspath(fn)))
+def Job_get_MetaJob(job, lang_exe=lang_bash_exe):
+    return MetaJob(job, lang_exe=lang_exe)
+def MetaJob_wrap(mjob, state):
+    """Write wrapper contents to mjob.wrapper.
+    """
+    metajob_rundir = mjob.job.rundir
+    wdir = metajob_rundir
+
+    bash_template = """#!%(lang_exe)s
+cmd="%(cmd)s"
+rundir="%(rundir)s"
+finish() {
+  echo "finish code: $?"
+}
+trap finish 0
+#printenv
+echo
+set -ex
+while [ ! -d "$rundir" ]; do sleep 1; done
+cd "$rundir"
+eval "$cmd"
+    """
+    mji = MetaJobClass(mjob)
+    wrapper_fn = os.path.join(wdir, mji.get_wrapper())
+    command = mjob.job.cmd
+
+    wrapped = bash_template %dict(
+        lang_exe=mjob.lang_exe,
+        cmd=command,
+        rundir=metajob_rundir,
+    )
+    log.debug('Writing wrapper "%s"' %wrapper_fn)
+    open(wrapper_fn, 'w').write(wrapped)
+    st = os.stat(wrapper_fn)
+    os.chmod(wrapper_fn, st.st_mode | 0o111)
+
+class JobThread(threading.Thread):
+    def run(self):
+        """Propagate environment, plus env_extra.
+        """
+        try:
+            self.notify_start(self.jobname)
+            log.debug('====>hello! started Thread {}'.format(threading.current_thread()))
+            myenv = dict(os.environ)
+            myenv.update(self.env_extra)
+            #log.debug('myenv:\n{}'.format(pprint.pformat(myenv)))
+            log.info("====>Popen: '{}'".format(self.cmd))
+            if not self.cmd:
+                msg = 'Why is self.cmd empty? {} {} {!r}'.format(self, self.jobname, self.cmd)
+                raise Exception(msg)
+            p = subprocess.Popen(self.cmd, env=myenv, shell=True)
+            log.debug("====>pid: {}".format(p.pid))
+            p.wait()
+            rc = p.returncode
+            log.debug("====>rc: {}".format(rc))
+            self.notify_exit(self.jobname, rc)
+        except:
+            log.exception('Failed to submit {}: {!r} Setting rc=42.'.format(self.jobname, self.cmd))
+            self.notify_exit(self.jobname, 42)
+    def __init__(self, jobname, cmd, notify_start, notify_exit, env_extra):
+        super(JobThread, self).__init__()
+        self.jobname = jobname
+        self.cmd = cmd
+        self.notify_start = notify_start
+        self.notify_exit = notify_exit
+        self.env_extra = env_extra
+
+class StringJobSubmitter(object):
+    """Substitute some variables into self.submission_string.
+    Use mains/job_start.sh as the top script. That requires
+    PYPEFLOW_JOB_START_SCRIPT in the environment as the real
+    script to run. This way, we are guaranteed that the top script exists,
+    and we can wait for the rest to appear in the filesystem.
+    """
+    def submit(self, jobname, mjob, state):
+        """Prepare job (based on wrappers) and submit as a new thread.
+        """
+        state.set_job(jobname, mjob)
+        jobname = mjob.job.jobid
+        job_dict = mjob.job.options
+        #nproc = mjob.job.options['NPROC']
+        #mb = mjob.job.options['MB']
+        mji = MetaJobClass(mjob)
+        #script_fn = os.path.join(state.get_directory_wrappers(), mji.get_wrapper())
+        script_fn = mji.get_wrapper()
+        exe = mjob.lang_exe
+
+        state.notify_threaded(jobname)
+        self.start(jobname, state, exe, script_fn, job_dict) # Can raise
+    def get_cmd(self, job_name, script_fn, job_dict):
+        """Vars:
+        (The old ones.) JOB_ID, STDOUT_FILE, STDERR_FILE, NPROC, MB, CMD
+        """
+        # We wrap in a program that waits for the executable to exist, so
+        # the filesystem has time to catch up on the remote machine.
+        # Hopefully, this will allow dependencies to become ready as well.
+        job_start_fn = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'mains/job_start.sh')
+        mapping = dict()
+        stdout = script_fn + '.stdout'
+        stderr = script_fn + '.stderr'
+        run_dir = os.getcwd()
+        mapping = dict(
+                JOB_EXE='/bin/bash',
+                JOB_NAME=job_name, JOB_ID=job_name,
+                #JOB_OPTS=JOB_OPTS,
+                #JOB_QUEUE=job_queue,
+                JOB_SCRIPT=job_start_fn, CMD=job_start_fn,
+                JOB_DIR=run_dir, DIR=run_dir,
+                JOB_STDOUT=stdout, STDOUT_FILE=stdout,
+                JOB_STDERR=stderr, STDERR_FILE=stderr,
+                #MB=pypeflow_mb,
+                #NPROC=pypeflow_nproc,
+        )
+        mapping.update(job_dict)
+        if 'JOB_OPTS' in mapping:
+            # a special two-level mapping: ${JOB_OPTS} is substituted first
+            mapping['JOB_OPTS'] = self.sub(mapping['JOB_OPTS'], mapping)
+        return self.sub(self.submission_string, mapping)
+    @staticmethod
+    def sub(template, mapping):
+        t = string.Template(template)
+        try:
+            return t.substitute(mapping)
+        except KeyError:
+            print(repr(mapping))
+            msg = 'Template substitution failed:\n template={!r}\n mapping={}'.format(
+                    template, pprint.pformat(mapping))
+            log.exception(msg)
+            raise
+    def start(self, jobname, state, exe, script_fn, job_dict):
+        """Run job in thread.
+        Thread will notify state.
+        Can raise.
+        """
+        #cmd = script_fn
+        cmd = self.get_cmd(jobname, script_fn, job_dict)
+        # job_start.sh relies on PYPEFLOW_*
+        env_extra = {
+            "PYPEFLOW_JOB_START_SCRIPT": script_fn,
+            "PYPEFLOW_JOB_START_TIMEOUT": "60",
+        }
+        log.debug('env_extra={}'.format(pprint.pformat(env_extra)))
+        notify_start = state.notify_started
+        notify_exit = state.notify_exited
+        th = JobThread(jobname, cmd, notify_start, notify_exit, env_extra)
+        #th.setDaemon(True)
+        th.start()
+    def __repr__(self):
+        return 'StringJobSubmitter(%s)' %repr(self.submission_string)
+    def __init__(self, submission_string):
+        self.submission_string = submission_string
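+
+# Editorial sketch (not part of the original code): the "two-level mapping" used
+# in get_cmd() above, where ${JOB_OPTS} is itself a template expanded first and
+# then spliced into the submission string. All values here are invented.
+def _example_two_level_job_opts():
+    submitter = StringJobSubmitter('qsub -N ${JOB_NAME} ${JOB_OPTS} "${CMD}"')
+    mapping = dict(
+        JOB_NAME='task-1',
+        JOB_OPTS='-pe smp ${NPROC} -l h_vmem=${MB}M',
+        NPROC='4',
+        MB='4000',
+        CMD='run-task-1.bash',
+    )
+    mapping['JOB_OPTS'] = submitter.sub(mapping['JOB_OPTS'], mapping)
+    return submitter.sub(submitter.submission_string, mapping)
+    # -> 'qsub -N task-1 -pe smp 4 -l h_vmem=4000M "run-task-1.bash"'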
+
+def link_rundir(state_rundir, user_rundir):
+    if user_rundir:
+        link_fn = os.path.join(user_rundir, 'pwatcher.dir')
+        if os.path.lexists(link_fn):
+            os.unlink(link_fn)
+        os.symlink(os.path.abspath(state_rundir), link_fn)
+
+def cmd_run(state, jobids, job_type, job_dict):
+    """
+    Wrap them and run them locally, each in the foreground of a thread.
+    """
+    jobs = dict()
+    submitted = list()
+    result = {'submitted': submitted}
+    if job_type != 'string':
+        log.debug("NOTE: In blocking pwatcher, job_type={!r}, should be 'string'".format(job_type))
+    for jobid, desc in jobids.items():
+        assert 'cmd' in desc
+        cmd = desc['cmd']
+        if 'rundir' in desc:
+            rundir = desc['rundir']
+        else:
+            rundir = os.path.dirname(cmd)
+        # These are all required now.
+        #nproc = desc['job_nproc']
+        #mb = desc['job_mb']
+        local = int(desc['job_local'])
+        options = copy.deepcopy(desc['job_dict']) #dict(NPROC=nproc, MB=mb, local=local)
+        options['local'] = local
+        jobs[jobid] = Job(jobid, cmd, rundir, options)
+    log.debug('jobs:\n%s' %pprint.pformat(jobs))
+    submission_string = job_dict['submit']
+    basic_submitter = StringJobSubmitter(submission_string)
+    local_submitter = StringJobSubmitter(LOCAL_SUBMISSION_STRING)
+    log.debug('Basic submitter: {!r}'.format(basic_submitter))
+    for jobid, job in jobs.items():
+        #desc = jobids[jobid]
+        log.debug(' starting job %s' %pprint.pformat(job))
+        mjob = Job_get_MetaJob(job)
+        MetaJob_wrap(mjob, state)
+        try:
+            #link_rundir(state.get_directory_job(jobid), desc.get('rundir'))
+            if job.options['local']:
+                submitter = local_submitter
+            else:
+                submitter = basic_submitter
+                if not submission_string:
+                    raise Exception('No "submit" key in job_dict:{!r}.'.format(job_dict))
+            submitter.submit(jobid, mjob, state)
+            submitted.append(jobid)
+        except Exception:
+            log.exception('Failed to submit background-job:\n{!r}'.format(
+                submitter))
+            raise
+    return result
+    # The caller is responsible for deciding what to do about job-submission failures. Re-try, maybe?
+
+def system(call, checked=False):
+    log.info('!{}'.format(call))
+    rc = os.system(call)
+    if checked and rc:
+        raise Exception('{} <- {!r}'.format(rc, call))
+    return rc
+
+_warned = dict()
+def warnonce(hashkey, msg):
+    if hashkey in _warned:
+        return
+    log.warning(msg)
+    _warned[hashkey] = True
+
+def cmd_query(state, which, jobids):
+    """Return the state of named jobids.
+    If which=='list', then query jobs listed as jobids.
+    If which=='known', then query all known jobs.
+    If which=='infer', same as 'known' now.
+    """
+    result = dict()
+    jobstats = dict()
+    result['jobids'] = jobstats
+    if which == 'list':
+        for jobid in jobids:
+            jobstats[jobid] = 'UNKNOWN'
+    state.update_jobid2status(jobstats)
+    jobids = set(jobids)
+    if which == 'list':
+        for jobid in list(jobstats.keys()):
+            # TODO: This might remove thousands. We should pass jobids along to update_jobid2status().
+            if jobid not in jobids:
+                del jobstats[jobid]
+    return result
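+# Editorial sketch (not part of the original code): the shape of the value that
+# cmd_query() returns above. The jobids and states are invented examples.
+def _example_query_result():
+    return {'jobids': {'job-0': 'EXIT 0', 'job-1': 'RUNNING', 'job-2': 'THREADED'}}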
+def cmd_delete(state, which, jobids):
+    """Kill designated jobs, including (hopefully) their
+    entire process groups.
+    If which=='list', then kill all jobs listed as jobids.
+    If which=='known', then kill all known jobs.
+    If which=='infer', then kill all jobs with heartbeats.
+    """
+    log.error('Noop. We cannot kill blocked threads. Hopefully, everything will die on SIGTERM.')
+def makedirs(path):
+    if not os.path.isdir(path):
+        os.makedirs(path)
+def readjson(ifs):
+    """Del keys that start with ~.
+    That lets us have trailing commas on all other lines.
+    """
+    content = ifs.read()
+    log.debug('content:%s' %repr(content))
+    jsonval = json.loads(content)
+    #pprint.pprint(jsonval)
+    def striptildes(subd):
+        if not isinstance(subd, dict):
+            return
+        for k,v in list(subd.items()):
+            if k.startswith('~'):
+                del subd[k]
+            else:
+                striptildes(v)
+    striptildes(jsonval)
+    #pprint.pprint(jsonval)
+    return jsonval
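+
+# Editorial sketch (not part of the original code): what readjson() above
+# tolerates. Keys starting with '~' act as comment/trailing-comma absorbers and
+# are stripped recursively. The payload is an invented example.
+def _example_readjson():
+    import io
+    text = '{"run": {"jobids": {"job-0": {"cmd": "ls"}}, "~": "trailing-comma guard"}}'
+    return readjson(io.StringIO(text))
+    # -> {'run': {'jobids': {'job-0': {'cmd': 'ls'}}}}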
+
+class ProcessWatcher(object):
+    def run(self, jobids, job_type, job_defaults_dict):
+        #import traceback; log.debug(''.join(traceback.format_stack()))
+        log.debug('run(jobids={}, job_type={}, job_defaults_dict={})'.format(
+            '<%s>'%len(jobids), job_type, job_defaults_dict))
+        return cmd_run(self.state, jobids, job_type, job_defaults_dict)
+    def query(self, which='list', jobids=[]):
+        log.debug('query(which={!r}, jobids={})'.format(
+            which, '<%s>'%len(jobids)))
+        return cmd_query(self.state, which, jobids)
+    def delete(self, which='list', jobids=[]):
+        log.debug('delete(which={!r}, jobids={})'.format(
+            which, '<%s>'%len(jobids)))
+        return cmd_delete(self.state, which, jobids)
+    def __init__(self, state):
+        # state must be thread-safe
+        self.state = state
+
+def get_process_watcher(directory):
+    state = get_state(directory)
+    state = SafeState(state) # thread-safe proxy
+    #log.debug('state =\n%s' %pprint.pformat(state.top))
+    return ProcessWatcher(state)
+    #State_save(state)
+
+@contextlib.contextmanager
+def process_watcher(directory):
+    """This will (someday) hold a lock, so that
+    the State can be written safely at the end.
+    """
+    state = get_state(directory)
+    state = SafeState(state) # thread-safe proxy
+    #log.debug('state =\n%s' %pprint.pformat(state.top))
+    yield ProcessWatcher(state)
+    #State_save(state)
+
+def main(prog, cmd, state_dir='mainpwatcher', argsfile=None):
+    logging.basicConfig()
+    logging.getLogger().setLevel(logging.NOTSET)
+    log.warning('logging basically configured')
+    log.debug('debug mode on')
+    assert cmd in ['run', 'query', 'delete']
+    ifs = sys.stdin if not argsfile else open(argsfile)
+    argsdict = readjson(ifs)
+    log.info('argsdict =\n%s' %pprint.pformat(argsdict))
+    with process_watcher(state_dir) as watcher:
+        result = getattr(watcher, cmd)(**argsdict)
+        if result is not None:
+            log.info('getattr({!r}, {!r}): {}'.format(
+                watcher, cmd, pprint.pformat(result)))
+        log.info('Waiting for running jobs...')
+        while watcher.state.get_running_jobids():
+            log.info('running: {!s}'.format(watcher.state.get_running_jobids()))
+            time.sleep(1)
+
+if __name__ == "__main__":
+    #import pdb
+    #pdb.set_trace()
+    main(*sys.argv) # pylint: disable=no-value-for-parameter

+ 785 - 0
pypeFlow/pwatcher/fs_based.py

@@ -0,0 +1,785 @@
+"""Filesytem-based process-watcher.
+
+This is meant to be part of a 2-process system. For now, let's call these processes the Definer and the Watcher.
+* The Definer creates a graph of tasks and starts a resolver loop, like pypeflow. It keeps a Waiting list, a Running list, and a Done list. It then communicates with the Watcher.
+* The Watcher has 3 basic functions in its API.
+  1. Spawn jobs.
+  2. Kill jobs.
+  3. Query jobs.
+1. Spawning jobs
+The job definition includes the script, how to run it (locally, qsub, etc.), and maybe some details (unique-id, run-directory). The Watcher then:
+  * wraps the script with something to update a heartbeat-file periodically,
+  * spawns each job (possibly as a background process locally),
+  * and records info (including PID or qsub-name) in a persistent database.
+2. Kill jobs.
+Since it has a persistent database, it can always kill any job, upon request.
+3. Query jobs.
+Whenever requested, it can poll the filesystem for all or any jobs, returning the subset of completed jobs. (For NFS efficiency, all the job-exit sentinel files can be in the same directory, along with the heartbeats.)
+
+The Definer would call the Watcher to spawn tasks, and then periodically to poll them. Because these are both now single-threaded, the Watcher *could* be a function within the Definer, or it could be a blocking call to a separate process. With proper locking on the database, users could also query the same executable as a separate process.
+
+Caching/timestamp-checking would be done in the Definer, flexibly specific to each Task.
+
+Eventually, the Watcher could be in a different programming language. Maybe perl. (In bash, a background heartbeat gets its own process group, so it can be hard to clean up.)
+"""
+
+try:
+    from shlex import quote
+except ImportError:
+    from pipes import quote
+import collections
+import contextlib
+import copy
+import glob
+import json
+import logging
+import os
+import pprint
+import re
+import signal
+import string
+import subprocess
+import sys
+import time
+import traceback
+
+from pypeflow.io import capture, syscall
+
+log = logging.getLogger(__name__)
+
+HEARTBEAT_RATE_S = 10.0
+ALLOWED_SKEW_S = 120.0
+STATE_FN = 'state.py'
+Job = collections.namedtuple('Job', ['jobid', 'cmd', 'rundir', 'options'])
+MetaJob = collections.namedtuple('MetaJob', ['job', 'lang_exe'])
+lang_python_exe = sys.executable
+lang_bash_exe = '/bin/bash'
+
+@contextlib.contextmanager
+def cd(newdir):
+    prevdir = os.getcwd()
+    log.debug('CD: %r <- %r' %(newdir, prevdir))
+    os.chdir(os.path.expanduser(newdir))
+    try:
+        yield
+    finally:
+        log.debug('CD: %r -> %r' %(newdir, prevdir))
+        os.chdir(prevdir)
+
+class MetaJobClass(object):
+    ext = {
+        lang_python_exe: '.py',
+        lang_bash_exe: '.bash',
+    }
+    def get_wrapper(self):
+        return 'run-%s%s' %(self.mj.job.jobid, self.ext[self.mj.lang_exe])
+    def get_sentinel(self):
+        return 'exit-%s' %self.mj.job.jobid # in watched dir
+    def get_heartbeat(self):
+        return 'heartbeat-%s' %self.mj.job.jobid # in watched dir
+    def get_pid(self):
+        return self.mj.pid
+    def kill(self, pid, sig):
+        stored_pid = self.get_pid()
+        if not pid:
+            pid = stored_pid
+            log.info('Not passed a pid to kill. Using stored pid:%s' %pid)
+        if pid and stored_pid:
+            if pid != stored_pid:
+                log.error('pid:%s != stored_pid:%s' %(pid, stored_pid))
+        os.kill(pid, sig)
+    def __init__(self, mj):
+        self.mj = mj
+class State(object):
+    def get_state_fn(self):
+        return os.path.join(self.__directory, STATE_FN)
+    def get_directory(self):
+        return self.__directory
+    def get_directory_wrappers(self):
+        return os.path.join(self.__directory, 'wrappers')
+    def get_directory_heartbeats(self):
+        return os.path.join(self.__directory, 'heartbeats')
+    def get_directory_exits(self):
+        return os.path.join(self.__directory, 'exits')
+    def get_directory_jobs(self):
+        # B/c the other directories can get big, we put most per-job data here, under each jobid.
+        return os.path.join(self.__directory, 'jobs')
+    def get_directory_job(self, jobid):
+        return os.path.join(self.get_directory_jobs(), jobid)
+    def submit_background(self, bjob):
+        """Run job in background.
+        Record in state.
+        """
+        self.top['jobs'][bjob.mjob.job.jobid] = bjob
+        jobid = bjob.mjob.job.jobid
+        mji = MetaJobClass(bjob.mjob)
+        script_fn = os.path.join(self.get_directory_wrappers(), mji.get_wrapper())
+        exe = bjob.mjob.lang_exe
+        run_dir = self.get_directory_job(jobid)
+        makedirs(run_dir)
+        with cd(run_dir):
+            bjob.submit(self, exe, script_fn) # Can raise
+        log.info('Submitted backgroundjob=%s'%repr(bjob))
+        self.top['jobids_submitted'].append(jobid)
+    def get_mji(self, jobid):
+        mjob = self.top['jobs'][jobid].mjob
+        return MetaJobClass(mjob)
+    def get_bjob(self, jobid):
+        return self.top['jobs'][jobid]
+    def get_bjobs(self):
+        return self.top['jobs']
+    def get_mjobs(self):
+        return {jobid: bjob.mjob for jobid, bjob in self.top['jobs'].items()}
+    def add_deleted_jobid(self, jobid):
+        self.top['jobids_deleted'].append(jobid)
+    def serialize(self):
+        return pprint.pformat(self.top)
+    @staticmethod
+    def deserialize(directory, content):
+        state = State(directory)
+        state.top = eval(content)
+        state.content_prev = content
+        return state
+    @staticmethod
+    def create(directory):
+        state = State(directory)
+        makedirs(state.get_directory_wrappers())
+        makedirs(state.get_directory_heartbeats())
+        makedirs(state.get_directory_exits())
+        #system('lfs setstripe -c 1 {}'.format(state.get_directory_heartbeats())) # no improvement noticed
+        makedirs(state.get_directory_jobs())
+        return state
+    def __init__(self, directory):
+        self.__directory = os.path.abspath(directory)
+        self.content_prev = ''
+        self.top = dict()
+        self.top['jobs'] = dict()
+        self.top['jobids_deleted'] = list()
+        self.top['jobids_submitted'] = list()
+
+def get_state(directory):
+    state_fn = os.path.join(directory, STATE_FN)
+    if not os.path.exists(state_fn):
+        return State.create(directory)
+    try:
+        return State.deserialize(directory, open(state_fn).read())
+    except Exception:
+        log.exception('Failed to read state "%s". Ignoring (and soon over-writing) current state.'%state_fn)
+        # TODO: Backup previous STATE_FN?
+        return State(directory)
+def State_save(state):
+    # TODO: RW Locks, maybe for runtime of whole program.
+    content = state.serialize()
+    content_prev = state.content_prev
+    if content == content_prev:
+        return
+    fn = state.get_state_fn()
+    open(fn, 'w').write(content)
+    log.debug('saved state to %s' %repr(os.path.abspath(fn)))
+def Job_get_MetaJob(job, lang_exe=lang_bash_exe):
+    return MetaJob(job, lang_exe=lang_exe)
+def MetaJob_wrap(mjob, state):
+    """Write wrapper contents to mjob.wrapper.
+    """
+    wdir = state.get_directory_wrappers()
+    hdir = state.get_directory_heartbeats()
+    edir = state.get_directory_exits()
+    metajob_rundir = mjob.job.rundir
+
+    bash_template = """#!%(lang_exe)s
+printenv
+echo
+set -x
+%(cmd)s
+    """
+    # We do not bother with 'set -e' here because this script is run either
+    # in the background or via qsub.
+    templates = {
+        lang_python_exe: python_template,
+        lang_bash_exe: bash_template,
+    }
+    mji = MetaJobClass(mjob)
+    wrapper_fn = os.path.join(wdir, mji.get_wrapper())
+    exit_sentinel_fn=os.path.join(edir, mji.get_sentinel())
+    heartbeat_fn=os.path.join(hdir, mji.get_heartbeat())
+    rate = HEARTBEAT_RATE_S
+    command = mjob.job.cmd
+
+    prog = 'heartbeat-wrapper' # missing in mobs
+    prog = 'python3 -m pwatcher.mains.fs_heartbeat'
+    heartbeat_wrapper_template = "{prog} --directory={metajob_rundir} --heartbeat-file={heartbeat_fn} --exit-file={exit_sentinel_fn} --rate={rate} {command} || echo 99 >| {exit_sentinel_fn}"
+    # We write 99 into exit-sentinel if the wrapper fails.
+    wrapped = heartbeat_wrapper_template.format(**locals())
+    log.debug('Wrapped "%s"' %wrapped)
+
+    wrapped = templates[mjob.lang_exe] %dict(
+        lang_exe=mjob.lang_exe,
+        cmd=wrapped,
+    )
+    log.debug('Writing wrapper "%s"' %wrapper_fn)
+    open(wrapper_fn, 'w').write(wrapped)
+
+def background(script, exe='/bin/bash'):
+    """Start script in background (so it keeps going when we exit).
+    Run in cwd.
+    For now, stdout/stderr are captured.
+    Return pid.
+    """
+    args = [exe, script]
+    sin = open(os.devnull)
+    sout = open('stdout', 'w')
+    serr = open('stderr', 'w')
+    pseudo_call = '{exe} {script} 1>|stdout 2>|stderr & '.format(exe=exe, script=script)
+    log.info('dir: {!r}\nCALL:\n {!r}'.format(os.getcwd(), pseudo_call))
+    proc = subprocess.Popen([exe, script], stdin=sin, stdout=sout, stderr=serr)
+    pid = proc.pid
+    log.info('pid=%s pgid=%s sub-pid=%s' %(os.getpid(), os.getpgid(0), proc.pid))
+    #checkcall = 'ls -l /proc/{}/cwd'.format(
+    #        proc.pid)
+    #system(checkcall, checked=True)
+    return pid
+
+def qstripped(option, flag='-q'):
+    """Given a string of options, remove any -q foo.
+    (No longer used.)
+
+    >>> qstripped('-xy -q foo -z bar')
+    '-xy -z bar'
+    >>> qstripped('-xy -p foo -z bar', '-p')
+    '-xy -z bar'
+    """
+    # For now, do not strip -qfoo
+    vals = option.strip().split()
+    while flag in vals:
+        i = vals.index(flag)
+        vals = vals[0:i] + vals[i+2:]
+    return ' '.join(vals)
+
+class MetaJobLocal(object):
+    """For jobs on the local machine, with process-watching.
+    We cannot simply run with '&' because then we would not know how
+    to kill the new background job.
+    """
+    def submit(self, state, exe, script_fn):
+        """Can raise.
+        """
+        pid = background(script_fn, exe=self.mjob.lang_exe)
+    def kill(self, state, heartbeat):
+        """Can raise.
+        (Actually, we could derive heartbeat from state. But for now, we know it anyway.)
+        """
+        hdir = state.get_directory_heartbeats()
+        heartbeat_fn = os.path.join(hdir, heartbeat)
+        with open(heartbeat_fn) as ifs:
+            line = ifs.readline()
+            pid = line.split()[1]
+            pid = int(pid)
+            pgid = line.split()[2]
+            pgid = int(pgid)
+            sig = signal.SIGKILL
+            log.info('Sending signal(%s) to pgid=-%s (pid=%s) based on heartbeat=%r' %(sig, pgid, pid, heartbeat))
+            try:
+                os.kill(-pgid, sig)
+            except Exception:
+                log.exception('Failed to kill(%s) pgid=-%s for %r. Trying pid=%s' %(sig, pgid, heartbeat_fn, pid))
+                os.kill(pid, sig)
+    def __repr__(self):
+        return 'MetaJobLocal(%s)' %repr(self.mjob)
+    def __init__(self, mjob):
+        self.mjob = mjob # PUBLIC
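+# Editorial sketch (not part of the original code): the heartbeat line format
+# that MetaJobLocal.kill() parses above. fs_heartbeat writes
+# "<elapsed> <pid> <pgid>" as the first line; kill() signals -pgid first so the
+# whole process group goes down.
+def _example_parse_heartbeat_line(line='0 12345 12340\n'):
+    fields = line.split()
+    pid = int(fields[1])
+    pgid = int(fields[2])
+    return pid, pgid   # -> (12345, 12340)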
+class MetaJobSubmit(object):
+    """Generic job-submission, non-blocking.
+    Add shebang to script.
+    If running locally, then caller must append '&' onto job_submit to put job in background.
+    """
+    def submit(self, state, exe, script_fn):
+        """Run in cwd, in background.
+        Can raise.
+        """
+        run_dir = os.getcwd()
+        job_name = self.get_job_name()
+        #job_nproc = self.job_nproc
+        #job_mb = self.job_mb
+        #job_queue = self.job_queue
+        # Add shebang, in case shell_start_mode=unix_behavior (for SGE).
+        #   https://github.com/PacificBiosciences/FALCON/pull/348
+        with open(script_fn, 'r') as original:
+            data = original.read()
+        with open(script_fn, 'w') as modified:
+            modified.write("#!/bin/bash\n" + data)
+        mapping = dict(
+                JOB_EXE='/bin/bash',
+                JOB_NAME=job_name,
+                #JOB_OPTS=JOB_OPTS,
+                #JOB_QUEUE=job_queue,
+                JOB_SCRIPT=script_fn, CMD=script_fn,
+                JOB_DIR=run_dir, DIR=run_dir,
+                JOB_STDOUT='stdout', STDOUT_FILE='stdout',
+                JOB_STDERR='stderr', STDERR_FILE='stderr',
+                #MB=pypeflow_mb,
+                #NPROC=pypeflow_nproc,
+        )
+        mapping.update(self.job_dict)
+        if 'JOB_OPTS' in mapping:
+            # a special two-level mapping: ${JOB_OPTS} is substituted first
+            mapping['JOB_OPTS'] = self.sub(mapping['JOB_OPTS'], mapping)
+        sge_cmd = self.sub(self.submit_template, mapping)
+        self.submit_capture = capture(sge_cmd)
+    def kill(self, state, heartbeat=None):
+        """Can raise.
+        """
+        #hdir = state.get_directory_heartbeats()
+        #heartbeat_fn = os.path.join(hdir, heartbeat)
+        #jobid = self.mjob.job.jobid
+        job_name = self.get_job_name()
+        job_num = self.get_job_num()
+        mapping = dict(
+                JOB_NAME=job_name,
+                JOB_NUM=job_num,
+        )
+        mapping.update(self.job_dict)
+        sge_cmd = self.sub(self.kill_template, mapping)
+        system(sge_cmd, checked=False)
+    def sub(self, unsub, mapping):
+        return string.Template(unsub).substitute(mapping)
+    def get_job_name(self):
+        """Some systems are limited to 15 characters, but we expect that to be truncated by the caller.
+        TODO: Choose a sequential jobname and record it. Priority: low, since collisions are very unlikely.
+        """
+        # jobid is an overloaded term in the pbsmrtpipe world, so we use job_name here.
+        return self.mjob.job.jobid
+    def get_job_num(self):
+        """For now, just the jobname.
+        """
+        return self.mjob.job.jobid
+    def __repr__(self):
+        return '{}({!r})'.format(self.__class__.__name__, self.mjob)
+    def __init__(self, mjob):
+        self.mjob = mjob
+        if not hasattr(self, 'JOB_OPTS'):
+            self.JOB_OPTS = None # unreachable, since this is an abstract class
+        self.job_dict = copy.deepcopy(self.mjob.job.options)
+        jd = self.job_dict
+        if 'submit' in jd:
+            self.submit_template = jd['submit']
+        if 'kill' in jd:
+            self.kill_template = jd['kill']
+        if 'JOB_OPTS' not in jd and hasattr(self, 'JOB_OPTS'):
+            jd['JOB_OPTS'] = self.JOB_OPTS
+        assert self.submit_template
+        assert self.kill_template
+        assert self.JOB_OPTS
+class MetaJobSge(MetaJobSubmit):
+    def __init__(self, mjob):
+        # '-V' => pass enV; '-j y' => combine out/err
+        self.submit_template = 'qsub -V -N ${JOB_NAME} ${JOB_OPTS} -cwd -o ${JOB_STDOUT} -e ${JOB_STDERR} -S /bin/bash ${JOB_SCRIPT}'
+        self.JOB_OPTS = '-q ${JOB_QUEUE} -pe smp ${NPROC}' # -l h_vmem=${MB}M does not work within PacBio
+        self.kill_template = 'qdel ${JOB_NAME}'
+        super(MetaJobSge, self).__init__(mjob)
+class MetaJobPbs(MetaJobSubmit):
+    """
+usage: qsub [-a date_time] [-A account_string] [-c interval]
+        [-C directive_prefix] [-e path] [-h ] [-I [-X]] [-j oe|eo] [-J X-Y[:Z]]
+        [-k o|e|oe] [-l resource_list] [-m mail_options] [-M user_list]
+        [-N jobname] [-o path] [-p priority] [-q queue] [-r y|n]
+        [-S path] [-u user_list] [-W otherattributes=value...]
+        [-v variable_list] [-V ] [-z] [script | -- command [arg1 ...]]
+    """
+    def get_job_num(self):
+        """Really an Id, not a number, but JOB_ID was used for something else.
+        See: https://github.com/PacificBiosciences/pypeFLOW/issues/54
+        """
+        cap = self.submit_capture
+        try:
+            re_cap = re.compile(r'\S+')
+            mo = re_cap.search(cap)
+            return mo.group(0)
+        except Exception:
+            log.exception('For PBS, failed to parse submit_capture={!r}\n Using job_name instead.'.format(cap))
+            return self.mjob.job.jobid
+    def __init__(self, mjob):
+        self.submit_template = 'qsub -V -N ${JOB_NAME} ${JOB_OPTS} -o ${JOB_STDOUT} -e ${JOB_STDERR} -S /bin/bash ${JOB_SCRIPT}'
+        self.JOB_OPTS = '-q ${JOB_QUEUE} --cpus-per-task=${NPROC} --mem-per-cpu=${MB}M'
+        self.kill_template = 'qdel ${JOB_NAME}'
+        super(MetaJobPbs, self).__init__(mjob)
+class MetaJobTorque(MetaJobSubmit):
+    # http://docs.adaptivecomputing.com/torque/4-0-2/help.htm#topics/commands/qsub.htm
+    def __init__(self, mjob):
+        self.submit_template = 'qsub -V -N ${JOB_NAME} ${JOB_OPTS} -d ${JOB_DIR} -o ${JOB_STDOUT} -e ${JOB_STDERR} -S /bin/bash ${JOB_SCRIPT}'
+        self.JOB_OPTS = '-q ${JOB_QUEUE} -l procs=${NPROC}'
+        self.kill_template = 'qdel ${JOB_NUM}'
+        super(MetaJobTorque, self).__init__(mjob)
+class MetaJobSlurm(MetaJobSubmit):
+    def __init__(self, mjob):
+        self.submit_template = 'sbatch -J ${JOB_NAME} ${JOB_OPTS} -D ${JOB_DIR} -o ${JOB_STDOUT} -e ${JOB_STDERR} --wrap="/bin/bash ${JOB_SCRIPT}"'
+        self.JOB_OPTS = '-p ${JOB_QUEUE} --mincpus=${NPROC} --mem-per-cpu=${MB}'
+        self.kill_template = 'scancel -n ${JOB_NUM}'
+        super(MetaJobSlurm, self).__init__(mjob)
+class MetaJobLsf(MetaJobSubmit):
+    def __init__(self, mjob):
+        self.submit_template = 'bsub -J ${JOB_NAME} ${JOB_OPTS} -o ${JOB_STDOUT} -e ${JOB_STDERR} "/bin/bash ${JOB_SCRIPT}"'
+        # "Sets the user's execution environment for the job, including the current working directory, file creation mask, and all environment variables, and sets LSF environment variables before starting the job."
+        self.JOB_OPTS = '-q ${JOB_QUEUE} -n ${NPROC}'
+        self.kill_template = 'bkill -J ${JOB_NUM}'
+        super(MetaJobLsf, self).__init__(mjob)
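+
+# Editorial sketch (not part of the original code): roughly what an SGE command
+# looks like after MetaJobSubmit.submit() fills in its template. The queue,
+# core count and file names are invented; JOB_OPTS is shown already expanded.
+def _example_sge_submission():
+    template = ('qsub -V -N ${JOB_NAME} ${JOB_OPTS} -cwd -o ${JOB_STDOUT} '
+                '-e ${JOB_STDERR} -S /bin/bash ${JOB_SCRIPT}')
+    mapping = dict(
+        JOB_NAME='task-1',
+        JOB_OPTS='-q production -pe smp 4',
+        JOB_STDOUT='stdout', JOB_STDERR='stderr',
+        JOB_SCRIPT='wrappers/run-task-1.bash',
+    )
+    return string.Template(template).substitute(mapping)
+    # -> 'qsub -V -N task-1 -q production -pe smp 4 -cwd -o stdout -e stderr -S /bin/bash wrappers/run-task-1.bash'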
+
+def link_rundir(state_rundir, user_rundir):
+    if user_rundir:
+        link_fn = os.path.join(user_rundir, 'pwatcher.dir')
+        if os.path.lexists(link_fn):
+            os.unlink(link_fn)
+        os.symlink(os.path.abspath(state_rundir), link_fn)
+
+def cmd_run(state, jobids, job_type, job_defaults_dict):
+    """On stdin, each line is a unique job-id, followed by run-dir, followed by command+args.
+    Wrap them and run them locally, in the background.
+    """
+    # We don't really need job_defaults_dict as they were already
+    # added to job_dict for each job.
+    jobs = dict()
+    submitted = list()
+    result = {'submitted': submitted}
+    for jobid, desc in jobids.items():
+        options = copy.deepcopy(desc['job_dict']) # defaults were already applied here
+        if not options.get('job_type'):
+            options['job_type'] = job_type
+        if int(desc['job_local']):
+            options['job_type'] = 'local'
+        jobs[jobid] = Job(jobid, desc['cmd'], desc['rundir'], options)
+    log.debug('jobs:\n{}'.format(pprint.pformat(jobs)))
+    for jobid, job in jobs.items():
+        desc = jobids[jobid]
+        mjob = Job_get_MetaJob(job)
+        MetaJob_wrap(mjob, state)
+        options = job.options
+        my_job_type = job.options['job_type']
+        if my_job_type is None:
+            my_job_type = job_type
+        my_job_type = my_job_type.upper()
+        log.info(' starting job {} w/ job_type={}'.format(pprint.pformat(job), my_job_type))
+        if my_job_type == 'LOCAL':
+            bjob = MetaJobLocal(mjob)
+        elif my_job_type == 'SGE':
+            bjob = MetaJobSge(mjob)
+        elif my_job_type == 'PBS':
+            bjob = MetaJobPbs(mjob)
+        elif my_job_type == 'TORQUE':
+            bjob = MetaJobTorque(mjob)
+        elif my_job_type == 'SLURM':
+            bjob = MetaJobSlurm(mjob)
+        elif my_job_type == 'LSF':
+            bjob = MetaJobLsf(mjob)
+        else:
+            raise Exception('Unknown my_job_type=%s' %repr(my_job_type))
+        try:
+            link_rundir(state.get_directory_job(jobid), desc.get('rundir'))
+            state.submit_background(bjob)
+            submitted.append(jobid)
+        except Exception:
+            log.exception('In pwatcher.fs_based.cmd_run(), failed to submit background-job:\n{!r}'.format(
+                bjob))
+            #raise
+    return result
+    # The caller is responsible for deciding what to do about job-submission failures. Re-try, maybe?
+
+re_heartbeat = re.compile(r'heartbeat-(.+)')
+def get_jobid_for_heartbeat(heartbeat):
+    """This cannot fail unless we change the filename format.
+    """
+    mo = re_heartbeat.search(heartbeat)
+    jobid = mo.group(1)
+    return jobid
+def system(call, checked=False):
+    log.info('CALL:\n {}'.format(call))
+    rc = os.system(call)
+    if checked and rc:
+        raise Exception('{} <- {!r}'.format(rc, call))
+
+_warned = dict()
+def warnonce(hashkey, msg):
+    if hashkey in _warned:
+        return
+    log.warning(msg)
+    _warned[hashkey] = True
+
+def get_status(state, elistdir, reference_s, sentinel, heartbeat):
+    heartbeat_path = os.path.join(state.get_directory_heartbeats(), heartbeat)
+    # We take listdir so we can avoid extra system calls.
+    if sentinel in elistdir:
+        try:
+            pass
+            #os.remove(heartbeat_path) # Note: We no longer use the heartbeats.
+        except Exception:
+            log.debug('Unable to remove heartbeat {} when sentinel was found in exit-sentinels listdir.\n{}'.format(
+                repr(heartbeat_path), traceback.format_exc()))
+        sentinel_path = os.path.join(state.get_directory_exits(), sentinel)
+        with open(sentinel_path) as ifs:
+            rc = ifs.read().strip()
+        return 'EXIT {}'.format(rc)
+    ## TODO: Record last stat times, to avoid extra stat if too frequent.
+    #try:
+    #    mtime_s = os.path.getmtime(heartbeat_path)
+    #    if (mtime_s + 3*HEARTBEAT_RATE_S) < reference_s:
+    #        if (ALLOWED_SKEW_S + mtime_s + 3*HEARTBEAT_RATE_S) < reference_s:
+    #            msg = 'DEAD job? {} + 3*{} + {} < {} for {!r}'.format(
+    #                mtime_s, HEARTBEAT_RATE_S, ALLOWED_SKEW_S, reference_s, heartbeat_path)
+    #            log.debug(msg)
+    #            warnonce(heartbeat_path, msg)
+    #            return 'DEAD'
+    #        else:
+    #            log.debug('{} + 3*{} < {} for {!r}. You might have a large clock-skew, or filesystem delays, or just filesystem time-rounding.'.format(
+    #                mtime_s, HEARTBEAT_RATE_S, reference_s, heartbeat_path))
+    #except Exception as exc:
+    #    # Probably, somebody deleted it after our call to os.listdir().
+    #    # TODO: Decide what this really means.
+    #    log.debug('Heartbeat not (yet?) found at %r: %r' %(heartbeat_path, exc))
+    #    return 'UNKNOWN'
+    return 'RUNNING' # but actually it might not have started yet, or it could be dead, since we are not checking the heartbeat
+def cmd_query(state, which, jobids):
+    """Return the state of named jobids.
+    See find_jobids().
+    """
+    found = dict()
+    edir = state.get_directory_exits()
+    for heartbeat in find_heartbeats(state, which, jobids):
+        jobid = get_jobid_for_heartbeat(heartbeat)
+        mji = state.get_mji(jobid)
+        sentinel = mji.get_sentinel()
+        #system('ls -l {}/{} {}/{}'.format(edir, sentinel, hdir, heartbeat), checked=False)
+        found[jobid] = (sentinel, heartbeat)
+    elistdir = os.listdir(edir)
+    current_time_s = time.time()
+    result = dict()
+    jobstats = dict()
+    result['jobids'] = jobstats
+    for jobid, pair in found.items():
+        sentinel, heartbeat = pair
+        status = get_status(state, elistdir, current_time_s, sentinel, heartbeat)
+        log.debug('Status %s for heartbeat:%s' %(status, heartbeat))
+        jobstats[jobid] = status
+    return result
+def get_jobid2pid(pid2mjob):
+    result = dict()
+    for pid, mjob in pid2mjob.items():
+        jobid = mjob.job.jobid
+        result[jobid] = pid
+    return result
+def find_heartbeats(state, which, jobids):
+    """Yield heartbeat filenames.
+    If which=='list', then query jobs listed as jobids.
+    If which=='known', then query all known jobs.
+    If which=='infer', then query all jobs with heartbeats.
+    These are not quite finished, but already useful.
+    """
+    #log.debug('find_heartbeats for which=%s, jobids=%s' %(which, pprint.pformat(jobids)))
+    if which == 'infer':
+        for fn in glob.glob(os.path.join(state.get_directory_heartbeats(), 'heartbeat*')):
+            yield fn
+    elif which == 'known':
+        jobid2mjob = state.get_mjobs()
+        for jobid, mjob in jobid2mjob.items():
+            mji = MetaJobClass(mjob)
+            yield mji.get_heartbeat()
+    elif which == 'list':
+        jobid2mjob = state.get_mjobs()
+        #log.debug('jobid2mjob:\n%s' %pprint.pformat(jobid2mjob))
+        for jobid in jobids:
+            #log.debug('jobid=%s; jobids=%s' %(repr(jobid), repr(jobids)))
+            #if jobid not in jobid2mjob:
+            #    log.info("jobid=%s is not known. Might have been deleted already." %jobid)
+            mjob = jobid2mjob[jobid]
+            mji = MetaJobClass(mjob)
+            yield mji.get_heartbeat()
+    else:
+        raise Exception('which=%s'%repr(which))
+def delete_heartbeat(state, heartbeat, keep=False):
+    """
+    Kill the job with this heartbeat.
+    (If there is no heartbeat, then the job is already gone.)
+    Delete the entry from state and update its jobid.
+    Remove the heartbeat file, unless 'keep'.
+    """
+    hdir = state.get_directory_heartbeats()
+    heartbeat_fn = os.path.join(hdir, heartbeat)
+    jobid = get_jobid_for_heartbeat(heartbeat)
+    try:
+        bjob = state.get_bjob(jobid)
+    except Exception:
+        log.exception('In delete_heartbeat(), unable to find batchjob for %s (from %s)' %(jobid, heartbeat))
+        log.warning('Cannot delete. You might be able to delete this yourself if you examine the content of %s.' %heartbeat_fn)
+        # TODO: Maybe provide a default grid type, so we can attempt to delete anyway?
+        return
+    try:
+        bjob.kill(state, heartbeat)
+    except Exception as exc:
+        log.exception('Failed to kill job for heartbeat {!r} (which might mean it was already gone): {!r}'.format(
+            heartbeat, exc))
+    state.add_deleted_jobid(jobid)
+    # For now, keep it in the 'jobs' table.
+    try:
+        os.remove(heartbeat_fn)
+        log.debug('Removed heartbeat=%s' %repr(heartbeat))
+    except OSError as exc:
+        log.debug('Cannot remove heartbeat {!r}: {!r}'.format(heartbeat_fn, exc))
+    # Note: If sentinel suddenly appeared, that means the job exited. The pwatcher might wrongly think
+    # it was deleted, but its output might be available anyway.
+def cmd_delete(state, which, jobids):
+    """Kill designated jobs, including (hopefully) their
+    entire process groups.
+    If which=='list', then kill all jobs listed as jobids.
+    If which=='known', then kill all known jobs.
+    If which=='infer', then kill all jobs with heartbeats.
+    Remove those heartbeat files.
+    """
+    log.debug('Deleting jobs for jobids from %s (%s)' %(
+        which, repr(jobids)))
+    for heartbeat in find_heartbeats(state, which, jobids):
+        delete_heartbeat(state, heartbeat)
+def makedirs(path):
+    if not os.path.isdir(path):
+        os.makedirs(path)
+def readjson(ifs):
+    """Del keys that start with ~.
+    That lets us have trailing commas on all other lines.
+    """
+    content = ifs.read()
+    log.debug('content:%s' %repr(content))
+    jsonval = json.loads(content)
+    #pprint.pprint(jsonval)
+    def striptildes(subd):
+        if not isinstance(subd, dict):
+            return
+        for k,v in list(subd.items()):
+            if k.startswith('~'):
+                del subd[k]
+            else:
+                striptildes(v)
+    striptildes(jsonval)
+    #pprint.pprint(jsonval)
+    return jsonval
+
+class ProcessWatcher(object):
+    def run(self, jobids, job_type, job_defaults_dict):
+        #import traceback; log.debug(''.join(traceback.format_stack()))
+        log.debug('run(jobids={}, job_type={}, job_defaults_dict={})'.format(
+            '<%s>'%len(jobids), job_type, job_defaults_dict))
+        return cmd_run(self.state, jobids, job_type, job_defaults_dict)
+    def query(self, which='list', jobids=[]):
+        log.debug('query(which={!r}, jobids={})'.format(
+            which, '<%s>'%len(jobids)))
+        return cmd_query(self.state, which, jobids)
+    def delete(self, which='list', jobids=[]):
+        log.debug('delete(which={!r}, jobids={})'.format(
+            which, '<%s>'%len(jobids)))
+        return cmd_delete(self.state, which, jobids)
+    def __init__(self, state):
+        self.state = state
+
+def get_process_watcher(directory):
+    state = get_state(directory)
+    #log.debug('state =\n%s' %pprint.pformat(state.top))
+    return ProcessWatcher(state)
+    #State_save(state)
+
+@contextlib.contextmanager
+def process_watcher(directory):
+    """This will (someday) hold a lock, so that
+    the State can be written safely at the end.
+    """
+    state = get_state(directory)
+    #log.debug('state =\n%s' %pprint.pformat(state.top))
+    yield ProcessWatcher(state)
+    # TODO: Sometimes, maybe we should not save state.
+    # Or maybe we *should* on exception.
+    State_save(state)
+
+def main(prog, cmd, state_dir='mainpwatcher', argsfile=None):
+    logging.basicConfig()
+    logging.getLogger().setLevel(logging.NOTSET)
+    log.warning('logging basically configured')
+    log.debug('debug mode on')
+    assert cmd in ['run', 'query', 'delete']
+    ifs = sys.stdin if not argsfile else open(argsfile)
+    argsdict = readjson(ifs)
+    log.info('argsdict =\n%s' %pprint.pformat(argsdict))
+    with process_watcher(state_dir) as watcher:
+        result = getattr(watcher, cmd)(**argsdict)
+        if result is not None:
+            print(pprint.pformat(result))
+
+
+# With bash, we would need to set the session, rather than
+# the process group. That's not ideal, but this is here for reference.
+#  http://stackoverflow.com/questions/6549663/how-to-set-process-group-of-a-shell-script
+#
+bash_template = """#!%(lang_exe)s
+cmd='%(cmd)s'
+"$cmd"
+"""
+
+# perl might be better, for efficiency.
+# But we will use python for now.
+#
+python_template = r"""#!%(lang_exe)s
+import threading, time, os, sys
+
+cmd='%(cmd)s'
+sentinel_fn='%(sentinel_fn)s'
+heartbeat_fn='%(heartbeat_fn)s'
+sleep_s=%(sleep_s)s
+cwd='%(cwd)s'
+
+os.chdir(cwd)
+
+def log(msg):
+    sys.stderr.write(msg)
+    sys.stderr.write('\n')
+    #sys.stdout.flush()
+
+def thread_heartbeat():
+    ofs = open(heartbeat_fn, 'w')
+    pid = os.getpid()
+    pgid = os.getpgid(0)
+    x = 0
+    while True:
+        ofs.write('{} {} {}\n'.format(x, pid, pgid))
+        ofs.flush()
+        time.sleep(sleep_s)
+        x += 1
+def start_heartbeat():
+    hb = threading.Thread(target=thread_heartbeat)
+    log('alive? {}'.format(hb.is_alive()))
+    hb.daemon = True
+    hb.start()
+    return hb
+def main():
+    log('cwd:{!r}'.format(os.getcwd()))
+    if os.path.exists(sentinel_fn):
+        os.remove(sentinel_fn)
+    if os.path.exists(heartbeat_fn):
+        os.remove(heartbeat_fn)
+    os.system('touch {}'.format(heartbeat_fn))
+    log("before: pid={}s pgid={}s".format(os.getpid(), os.getpgid(0)))
+    try:
+        os.setpgid(0, 0)
+    except OSError as e:
+        log('Unable to set pgid. Possibly a grid job? Hopefully there will be no dangling processes when killed: {}'.format(
+            repr(e)))
+    log("after: pid={}s pgid={}s".format(os.getpid(), os.getpgid(0)))
+    hb = start_heartbeat()
+    log('alive? {} pid={} pgid={}'.format(hb.is_alive(), os.getpid(), os.getpgid(0)))
+    rc = os.system(cmd)
+    # Do not delete the heartbeat here. The discoverer of the sentinel will do that,
+    # to avoid a race condition.
+    #if os.path.exists(heartbeat_fn):
+    #    os.remove(heartbeat_fn)
+    with open(sentinel_fn, 'w') as ofs:
+        ofs.write(str(rc))
+    # sys.exit(rc) # No-one would see this anyway.
+    if rc:
+        raise Exception('{} <- {!r}'.format(rc, cmd))
+main()
+"""
+
+if __name__ == "__main__":
+    import pdb
+    pdb.set_trace()
+    main(*sys.argv) # pylint: disable=no-value-for-parameter

+ 0 - 0
pypeFlow/pwatcher/mains/__init__.py


+ 149 - 0
pypeFlow/pwatcher/mains/fs_heartbeat.py

@@ -0,0 +1,149 @@
+"""Filesystem heartbeat wrapper
+
+Perl might be better for efficiency.
+But we will use python for now.
+
+Non-zero status means *this* failed, not the wrapped command.
+"""
+import argparse
+import os
+import socket
+import sys
+import threading
+import time
+
+DESCRIPTION = """
+We wrap a system call to produce both a heartbeat and an exit-sentinel
+in the filesystem.
+"""
+EPILOG = """
+We share stderr/stdout with the command. We log to stderr (for now).
+"""
+HEARTBEAT_TEMPLATE = '0 {pid} {pgid}\n'
+EXIT_TEMPLATE = '{exit_code}'
+
+class _Formatter(argparse.RawDescriptionHelpFormatter, argparse.ArgumentDefaultsHelpFormatter):
+    pass
+_FORMATTER_CLASS = _Formatter
+
+def parse_args(args):
+    parser = argparse.ArgumentParser(
+        description=DESCRIPTION,
+        epilog=EPILOG,
+        formatter_class=_FORMATTER_CLASS,
+    )
+    parser.add_argument('--rate',
+        help='Heartbeat rate, in seconds',
+        type=float,
+        default=1.0, # TODO: Make this at least 10, maybe 60.
+    )
+    parser.add_argument('--heartbeat-file',
+        help='Path to heartbeat file. The first line will have the format {!r}. The rest are just elapsed time'.format(
+            HEARTBEAT_TEMPLATE),
+        required=True,
+    )
+    parser.add_argument('--exit-file',
+        help='Path to exit sentinel file. At end, it will have the format {!r}'.format(
+            EXIT_TEMPLATE),
+        required=True,
+    )
+    parser.add_argument('--directory',
+        help='Directory in which to run COMMAND.',
+        default='.',
+    )
+    parser.add_argument('command',
+        help='System call (to be joined by " "). We will block on this and return its result.',
+        nargs='+',
+        #required=True,
+    )
+    return parser.parse_args(args)
+
+def log(msg):
+    sys.stderr.write(msg)
+    sys.stderr.write('\n')
+    #sys.stdout.flush() # If we use stdout.
+
+def thread_heartbeat(heartbeat_fn, sleep_s):
+    with open(heartbeat_fn, 'w') as ofs:
+        pid = os.getpid()
+        pgid = os.getpgid(0)
+        ofs.write(HEARTBEAT_TEMPLATE.format(
+            **locals()))
+        elapsed = 0
+        ctime = 0
+        while True:
+            #ctime = time.time()
+            ofs.write('{elapsed} {ctime}\n'.format(
+                **locals()))
+            ofs.flush()
+            time.sleep(sleep_s)
+            elapsed += 1
+
+def start_heartbeat(heartbeat_fn, sleep_s):
+    hb = threading.Thread(target=thread_heartbeat, args=(heartbeat_fn, sleep_s))
+    log('alive? {}'.format(
+        bool(hb.is_alive())))
+    hb.daemon = True
+    hb.start()
+    return hb
+
+def run(args):
+    os.chdir(args.directory)
+    heartbeat_fn = os.path.abspath(args.heartbeat_file)
+    exit_fn = os.path.abspath(args.exit_file)
+    cwd = os.getcwd()
+    hostname = socket.getfqdn()
+    sleep_s = args.rate
+    log("""
+cwd:{cwd!r}
+hostname={hostname}
+heartbeat_fn={heartbeat_fn!r}
+exit_fn={exit_fn!r}
+sleep_s={sleep_s!r}""".format(
+        **locals()))
+    if os.path.exists(exit_fn):
+        os.remove(exit_fn)
+    if os.path.exists(heartbeat_fn):
+        os.remove(heartbeat_fn)
+    #os.system('touch {}'.format(heartbeat_fn)) # This would be over-written anyway.
+    log("before setpgid: pid={} pgid={}".format(os.getpid(), os.getpgid(0)))
+    try:
+        os.setpgid(0, 0) # This allows the entire tree of procs to be killed.
+        log(" after setpgid: pid={} pgid={}".format(
+            os.getpid(), os.getpgid(0)))
+    except OSError as e:
+        log(' Unable to set pgid. Possibly a grid job? Hopefully there will be no dangling processes when killed: {}'.format(
+            repr(e)))
+
+    #thread = start_heartbeat(heartbeat_fn, sleep_s)
+
+    #log('alive? {} pid={} pgid={}'.format(
+    #    bool(thread.is_alive()), os.getpid(), os.getpgid(0)))
+
+    call = ' '.join(args.command)
+    log('In cwd: {}, Blocking call: {!r}'.format(
+        os.getcwd(), call))
+    rc = os.system(call) # Blocking.
+
+    log(' returned: {!r}'.format(
+        rc))
+
+    # Do not delete the heartbeat here. The discoverer of the exit-sentinel will do that,
+    # to avoid a race condition.
+    #if os.path.exists(heartbeat_fn):
+    #    os.remove(heartbeat_fn)
+
+    exit_tmp_fn = exit_fn + '.tmp'
+    with open(exit_tmp_fn, 'w') as ofs:
+        ofs.write(EXIT_TEMPLATE.format(
+            exit_code=rc))
+    os.rename(exit_tmp_fn, exit_fn) # atomic
+    # sys.exit(rc) # No-one would see this anyway.
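+
+# Editorial sketch (not part of the original code): the write-then-rename
+# pattern used just above to publish the exit code atomically, so a poller
+# never reads a half-written sentinel. The paths are illustrative.
+def _example_atomic_exit_write(path='exit-job-0', text='0'):
+    tmp = path + '.tmp'
+    with open(tmp, 'w') as ofs:
+        ofs.write(text)
+    os.rename(tmp, path)  # atomic on POSIX when tmp and path share a filesystem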
+
+def main():
+    args = parse_args(sys.argv[1:])
+    log(repr(args))
+    run(args)
+
+if __name__ == "__main__":
+    main()

+ 36 - 0
pypeFlow/pwatcher/mains/job_start.sh

@@ -0,0 +1,36 @@
+#!/bin/bash
+# vim: filetype=sh ts=4 sw=4 sts=4 et:
+#
+# Wait until file exists, then spawn.
+
+# This is not Python because the start_tmpl from pbsmrtpipe always runs bash.
+# But we use the .py extension because we want this installed with our Python
+# code, so we do not need to deal with mobs for installation. (But we might
+# need to chmod +x.)
+#
+# This can be run via
+#
+#     bash -c pwatcher/mains.job_start.py myprog 60
+#
+# Note: If anyone replaces this, you must ensure that running this is exactly equivalent
+# to running the "executable". In other words, no 'mkdir', no 'cd', etc. That will help
+# with debugging.
+
+set -vex
+executable=${PYPEFLOW_JOB_START_SCRIPT}
+timeout=${PYPEFLOW_JOB_START_TIMEOUT:-60} # wait 60s by default
+
+# Wait up to timeout seconds for the executable to become "executable",
+# then exec.
+#timeleft = int(timeout)
+while [[ ! -x "${executable}" ]]; do
+    if [[ "${timeout}" == "0" ]]; then
+        echo "timed out waiting for (${executable})"
+        exit 77
+    fi
+    echo "not executable: '${executable}', waiting ${timeout}s"
+    sleep 1
+    timeout=$((timeout-1))
+done
+
+/bin/bash ${executable}

+ 176 - 0
pypeFlow/pwatcher/mains/network_heartbeat.py

@@ -0,0 +1,176 @@
+"""Network server heartbeat wrapper
+
+Perl might be better for efficiency.
+But we will use python for now.
+
+Non-zero status means *this* failed, not the wrapped command.
+"""
+import argparse
+import os
+import shlex
+import socket
+import subprocess
+import sys
+import threading
+import time
+
+DESCRIPTION = """
+We wrap a system call to produce a heartbeat.
+"""
+EPILOG = """
+We log to the status server, and forward command stdout/stderr as well.
+"""
+
+class _Formatter(argparse.RawDescriptionHelpFormatter, argparse.ArgumentDefaultsHelpFormatter):
+    pass
+_FORMATTER_CLASS = _Formatter
+
+def parse_args(args):
+    parser = argparse.ArgumentParser(
+        description=DESCRIPTION,
+        epilog=EPILOG,
+        formatter_class=_FORMATTER_CLASS,
+    )
+    parser.add_argument('--rate',
+        help='Heartbeat rate, in seconds',
+        type=int,
+        default=600,
+    )
+    parser.add_argument('--heartbeat-server',
+        help='Address of the heartbeat server',
+        required=True,
+    )
+    parser.add_argument('--heartbeat-port',
+        help='Port of the heartbeat server',
+        type=int,
+        required=True,
+    )
+    parser.add_argument('--jobid',
+        help='Our jobid',
+        required=True,
+    )
+    parser.add_argument('--exit-dir',
+        help='Path to emergency exit sentinel directory',
+        required=True,
+    )
+    parser.add_argument('--directory',
+        help='Directory in which to run COMMAND.',
+        default='.',
+    )
+    parser.add_argument('command',
+        help='System call (to be joined by " "). We will block on this and return its result.',
+        nargs='+',
+        #required=True,
+    )
+    return parser.parse_args(args)
+
+# Send a NUL-delimited message; encode to bytes so this also works on Python 3.
+def socket_send(sock, message):
+    sock.sendall('{}\0'.format(message).encode('ascii', 'replace'))
+
+def log(heartbeat_server, jobid, msg):
+    hsocket = socket.socket()
+    try:
+        hsocket.connect(heartbeat_server)
+        socket_send(hsocket, 's {} {}\n'.format(jobid, msg))
+        hsocket.close()
+    except IOError:		# better to miss a line than terminate
+        pass
+
+def thread_heartbeat(heartbeat_server, jobid, sleep_s):
+    pid = os.getpid()
+    pgid = os.getpgid(0)
+    hsocket = socket.socket()
+    try:
+        hsocket.connect(heartbeat_server)
+        socket_send(hsocket, 'i {} {} {}'.format(jobid, pid, pgid))
+        hsocket.close()
+    except IOError:	# we hope it's a temporary error
+        pass
+    while True:
+        time.sleep(sleep_s)
+        hsocket = socket.socket()
+        try:
+            hsocket.connect(heartbeat_server)
+            socket_send(hsocket, 'h {}'.format(jobid))
+            hsocket.close()
+        except IOError:	# we hope it's a temporary error
+            pass
+
+def start_heartbeat(heartbeat_server, jobid, sleep_s):
+    hb = threading.Thread(target=thread_heartbeat, args=(heartbeat_server, jobid, sleep_s))
+    log(heartbeat_server, jobid, 'alive? {}'.format(
+        bool(hb.is_alive())))
+    hb.daemon = True
+    hb.start()
+    return hb
+
+def run(args):
+    heartbeat_server = (args.heartbeat_server, args.heartbeat_port)
+    jobid = args.jobid
+    log(heartbeat_server, jobid, repr(args))
+    os.chdir(args.directory)
+    exit_dir = args.exit_dir
+    exit_fn = os.path.join(os.path.abspath(exit_dir), jobid)
+    cwd = os.getcwd()
+    hostname = socket.getfqdn()
+    sleep_s = args.rate
+    log(heartbeat_server, jobid, """
+cwd:{cwd!r}
+hostname={hostname}
+heartbeat_server={heartbeat_server!r}
+jobid={jobid}
+exit_dir={exit_dir!r}
+sleep_s={sleep_s!r}""".format(
+        **locals()))
+    log(heartbeat_server, jobid, "before setpgid: pid={} pgid={}".format(os.getpid(), os.getpgid(0)))
+    try:
+        os.setpgid(0, 0) # This allows the entire tree of procs to be killed.
+        log(heartbeat_server, jobid, " after setpgid: pid={} pgid={}".format(
+            os.getpid(), os.getpgid(0)))
+    except OSError as e:
+        log(heartbeat_server, jobid, ' Unable to set pgid. Possibly a grid job? Hopefully there will be no dangling processes when killed: {}'.format(
+            repr(e)))
+
+    thread = start_heartbeat(heartbeat_server, jobid, sleep_s)
+
+    log(heartbeat_server, jobid, 'alive? {} pid={} pgid={}'.format(
+        bool(thread.is_alive()), os.getpid(), os.getpgid(0)))
+
+    call = ' '.join(args.command)
+    log(heartbeat_server, jobid, 'In cwd: {}, Blocking call: {!r}'.format(
+        os.getcwd(), call))
+    sp = subprocess.Popen(shlex.split(call), stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
+    # forward all output to server until job ends, then get exit value
+    with sp.stdout as f:
+        for line in iter(f.readline, b''):
+            # We cannot use log() here: it appends a newline, and each forwarded
+            # line already ends with one. Decode so the message is a clean str.
+            hsocket = socket.socket()
+            try:
+                hsocket.connect(heartbeat_server)
+                socket_send(hsocket, 's {} {}'.format(jobid, line.decode('ascii', 'replace')))
+                hsocket.close()
+            except IOError:  # better to miss a line than terminate
+                pass
+    rc = sp.wait()
+
+    log(heartbeat_server, jobid, ' returned: {!r}'.format(
+        rc))
+
+    hsocket = socket.socket()
+    try:
+        hsocket.connect(heartbeat_server)
+        socket_send(hsocket, 'e {} {}'.format(jobid, rc))
+        hsocket.close()
+    except IOError as e:
+        log(heartbeat_server, jobid, 'could not update heartbeat server with exit status: {} {}: {!r}'.format(jobid, rc, e))
+        with open(exit_fn, 'w') as f:
+            f.write(str(rc))
+    # sys.exit(rc) # No-one would see this anyway.
+
+def main():
+    args = parse_args(sys.argv[1:])
+    run(args)
+
+if __name__ == "__main__":
+    main()

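network_heartbeat.py speaks a small NUL-delimited protocol to the status server: 'i <jobid> <pid> <pgid>' on startup, 'h <jobid>' at every --rate interval, 's <jobid> <text>' for each forwarded output line, and 'e <jobid> <rc>' on exit. The status server itself lives elsewhere in pwatcher and is not shown in this diff; the sketch below (hypothetical, Python 3 names) only illustrates how such messages could be parsed:

    import socketserver

    class HeartbeatHandler(socketserver.BaseRequestHandler):
        """Split NUL-delimited messages and dispatch on the one-letter code."""
        def handle(self):
            buf = b''
            while True:
                chunk = self.request.recv(4096)
                if not chunk:
                    break
                buf += chunk
                while b'\0' in buf:
                    msg, buf = buf.split(b'\0', 1)
                    code, _, rest = msg.decode('ascii', 'replace').partition(' ')
                    if code == 'e':          # exit: "<jobid> <rc>"
                        jobid, _, rc = rest.partition(' ')
                        print('job {} exited with rc={}'.format(jobid, rc))
                    elif code in ('i', 'h', 's'):
                        print(code, rest.rstrip())

    # server = socketserver.TCPServer(('', 0), HeartbeatHandler)  # port 0 picks any free port
    # server.serve_forever()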
+ 12 - 0
pypeFlow/pwatcher/mains/pwatcher.py

@@ -0,0 +1,12 @@
+from .. import fs_based
+import pdb
+import sys
+
+def main():
+    fs_based.main(*sys.argv) # pylint: disable=no-value-for-parameter
+
+# If run directly, rather than via the 'entry-point',
+# then pdb will be used.
+if __name__ == "__main__":
+    #pdb.set_trace()
+    main()

+ 131 - 0
pypeFlow/pwatcher/mains/pypeflow_example.py

@@ -0,0 +1,131 @@
+from pypeflow.simple_pwatcher_bridge import (PypeProcWatcherWorkflow, MyFakePypeThreadTaskBase,
+        makePypeLocalFile, fn, PypeTask)
+import json
+import logging.config
+import os
+import sys
+
+JOB_TYPE = os.environ.get('JOB_TYPE', 'local')
+SLEEP_S = os.environ.get('SLEEP_S', '1')
+log = logging.getLogger(__name__)
+
+def spawn(args, check=False):
+    cmd = args[0]
+    log.debug('$(%s %s)' %(cmd, repr(args)))
+    rc = os.spawnv(os.P_WAIT, cmd, args) # os.spawnvp would add PATH lookup; here cmd must be a path
+    msg = "Call %r returned %d." % (cmd, rc)
+    if rc:
+        log.warning(msg)
+        if check:
+            raise Exception(msg)
+    else:
+        log.debug(msg)
+    return rc
+def system(call, check=False):
+    log.debug('$(%s)' %repr(call))
+    rc = os.system(call)
+    msg = "Call %r returned %d." % (call, rc)
+    if rc:
+        log.warning(msg)
+        if check:
+            raise Exception(msg)
+    else:
+        log.debug(msg)
+    return rc
+def makedirs(d):
+    if not os.path.isdir(d):
+        os.makedirs(d)
+def taskrun0(self):
+    template = """
+sleep_s=%(sleep_s)s
+ofile=%(ofile)s
+
+set -vex
+echo start0
+sleep ${sleep_s}
+touch ${ofile}
+echo end0
+"""
+    bash = template %dict(
+        #ifile=fn(self.i0),
+        ofile=fn(self.f0),
+        sleep_s=self.parameters['sleep_s'],
+    )
+    log.debug('taskrun0 bash:\n' + bash)
+    script = 'taskrun0.sh'
+    with open(script, 'w') as ofs:
+        ofs.write(bash)
+    #system("bash {}".format(script), check=True)
+    #spawn(['/bin/bash', script], check=True) # Beware! Hard to kill procs.
+    self.generated_script_fn = script
+    return script
+def taskrun1(self):
+    template = """
+sleep_s=%(sleep_s)s
+ifile=%(ifile)s
+ofile=%(ofile)s
+
+set -vex
+echo start1
+sleep ${sleep_s}
+cp -f ${ifile} ${ofile}
+echo end1
+"""
+    bash = template %dict(
+        ifile=fn(self.f0),
+        ofile=fn(self.f1),
+        sleep_s=self.parameters['sleep_s'],
+    )
+    log.debug('taskrun1 bash:\n' + bash)
+    script = 'taskrun1.sh'
+    with open(script, 'w') as ofs:
+        ofs.write(bash)
+    #system("bash {}".format(script), check=True)
+    self.generated_script_fn = script
+    return script
+
+def main():
+    lfn = 'logging-cfg.json'
+    if os.path.exists(lfn):
+        logging.config.dictConfig(json.load(open(lfn)))
+    else:
+        logging.basicConfig()
+        logging.getLogger().setLevel(logging.NOTSET)
+        try:
+            import logging_tree
+            logging_tree.printout()
+        except ImportError:
+            pass
+    log.debug('DEBUG LOGGING ON')
+    log.warning('Available via env: JOB_TYPE={}, SLEEP_S={}'.format(
+        JOB_TYPE, SLEEP_S))
+    exitOnFailure = False
+    concurrent_jobs = 2
+    Workflow = PypeProcWatcherWorkflow
+    wf = Workflow(job_type=JOB_TYPE)
+    wf.max_jobs = concurrent_jobs
+
+    par = dict(sleep_s=SLEEP_S)
+    DIR = 'mytmp'
+    makedirs(DIR)
+    f0 = makePypeLocalFile(os.path.join(DIR, 'f0'))
+    f1 = makePypeLocalFile(os.path.join(DIR, 'f1'))
+    make_task = PypeTask(
+            inputs = {},
+            outputs = {'f0': f0},
+            parameters = par,
+    )
+    task = make_task(taskrun0)
+    wf.addTasks([task])
+    make_task = PypeTask(
+            inputs = {'f0': f0},
+            outputs = {'f1': f1},
+            parameters = par,
+    )
+    task = make_task(taskrun1)
+    wf.addTasks([task])
+    wf.refreshTargets([task])
+    #wf.refreshTargets(exitOnFailure=exitOnFailure)
+
+if __name__ == "__main__":
+    main()

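pypeflow_example.py is configured entirely through the JOB_TYPE and SLEEP_S environment variables it reads at import time. One hypothetical way to drive it (the module path assumes the pwatcher package added in this commit is importable):

    import os
    import runpy

    os.environ.setdefault('JOB_TYPE', 'local')  # the example defaults to 'local' as well
    os.environ.setdefault('SLEEP_S', '2')       # seconds each task sleeps
    runpy.run_module('pwatcher.mains.pypeflow_example', run_name='__main__')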
+ 0 - 0
pypeFlow/pwatcher/mains/query_server.py

