#!/usr/bin/env python
# encoding: utf-8
"""
tophat.py

Created by Cole Trapnell on 2008-12-25.
Copyright (c) 2008 Cole Trapnell. All rights reserved.
Updated and maintained by Daehwan Kim and Geo Pertea since Jul 2010.
"""
import sys
try:
    # use the psyco JIT compiler to speed things up, if it is installed
    import psyco
    psyco.full()
except ImportError:
    pass

import getopt
import subprocess
import errno
import os
import warnings
import re
import signal
from datetime import datetime, date, time
from shutil import copy
import logging

use_message = '''
TopHat maps short sequences from spliced transcripts to whole genomes.

Usage:
    tophat [options] <bowtie_index> <reads1[,reads2,...]> [reads1[,reads2,...]] \\
                                    [quals1,[quals2,...]] [quals1[,quals2,...]]

Options:
    -v/--version
    -o/--output-dir                <string>    [ default: ./tophat_out     ]
    --bowtie1                                  [ default: bowtie2          ]
    -a/--min-anchor                <int>       [ default: 8                ]
    -m/--splice-mismatches         <0-2>       [ default: 0                ]
    -i/--min-intron-length         <int>       [ default: 50               ]
    -I/--max-intron-length         <int>       [ default: 500000           ]
    -g/--max-multihits             <int>       [ default: 20               ]
    -x/--transcriptome-max-hits    <int>       [ default: 60               ]
    -n/--transcriptome-mismatches  <int>       [ default: 1                ]
    -M/--prefilter-multihits                   ( for -G/--GTF option, enable
                                                 an initial bowtie search
                                                 against the genome )
    --max-insertion-length         <int>       [ default: 3                ]
    --max-deletion-length          <int>       [ default: 3                ]
    --solexa-quals
    --solexa1.3-quals                          (same as phred64-quals)
    --phred64-quals                            (same as solexa1.3-quals)
    -Q/--quals
    --integer-quals
    -C/--color                                 (SOLiD - color space)
    --color-out
    --library-type                 <string>    (fr-unstranded, fr-firststrand,
                                                fr-secondstrand)
    -p/--num-threads               <int>       [ default: 1                   ]
    -G/--GTF                       <filename>  (GTF/GFF with known transcripts)
    --transcriptome-index          <bwtidx>    (transcriptome bowtie index)
    -T/--transcriptome-only                    (map only to the transcriptome)
    -j/--raw-juncs                 <filename>
    --insertions                   <filename>
    --deletions                    <filename>
    -r/--mate-inner-dist           <int>
    --mate-std-dev                 <int>       [ default: 20                  ]
    --no-novel-juncs
    --no-novel-indels
    --no-gtf-juncs
    --no-coverage-search
    --coverage-search
    --microexon-search
    --keep-tmp
    --tmp-dir                      <dirname>   [ default: <output_dir>/tmp ]
    -z/--zpacker                   <program>   [ default: gzip             ]
    -X/--unmapped-fifo                         (use mkfifo to compress more temporary
                                                files for color space reads)

Advanced Options:
    --report-secondary-alignments
    --report-discordant-pair-alignments
    --genome-read-mismatches       <int>       [ default: 2                ]
    -N/--read-mismatches           <int>       [ default: 2                ]
    --segment-mismatches           <int>       [ default: 2                ]
    --segment-length               <int>       [ default: 25               ]
    --bowtie-n                                 [ default: bowtie -v        ]
    --min-coverage-intron          <int>       [ default: 50               ]
    --max-coverage-intron          <int>       [ default: 20000            ]
    --min-segment-intron           <int>       [ default: 50               ]
    --max-segment-intron           <int>       [ default: 500000           ]
    --no-sort-bam                              (Output BAM is not coordinate-sorted)
    --no-convert-bam                           (Do not output bam format.
                                                Output is <output_dir>/accepted_hits.sam)
    --keep-fasta-order
    --allow-partial-mapping

Bowtie2 related options:
  Preset options in --end-to-end mode (local alignment is not used in TopHat2)
    --b2-very-fast
    --b2-fast
    --b2-sensitive
    --b2-very-sensitive

  Alignment options
    --b2-N                         <int>       [ default: 0                ]
    --b2-L                         <int>       [ default: 20               ]
    --b2-i                         <func>      [ default: S,1,1.25         ]
    --b2-n-ceil                    <func>      [ default: L,0,0.15         ]
    --b2-gbar                      <int>       [ default: 4                ]

  Scoring options
    --b2-mp                        <int>,<int> [ default: 6,2              ]
    --b2-np                        <int>       [ default: 1                ]
    --b2-rdg                       <int>,<int> [ default: 5,3              ]
    --b2-rfg                       <int>,<int> [ default: 5,3              ]
    --b2-score-min                 <func>      [ default: L,-0.6,-0.6      ]

  Effort options
    --b2-D                         <int>       [ default: 15               ]
    --b2-R                         <int>       [ default: 2                ]

Fusion related options:
    --fusion-search
    --fusion-anchor-length         <int>       [ default: 20               ]
    --fusion-min-dist              <int>       [ default: 10000000         ]
    --fusion-read-mismatches       <int>       [ default: 2                ]
    --fusion-multireads            <int>       [ default: 2                ]
    --fusion-multipairs            <int>       [ default: 2                ]
    --fusion-ignore-chromosomes    <list>      ( e.g. chrM,chrX            )

    --fusion-do-not-resolve-conflicts          ( for test purposes         )

SAM Header Options (for embedding sequencing run metadata in output):
    --rg-id                        <string>    (read group ID)
    --rg-sample                    <string>    (sample ID)
    --rg-library                   <string>    (library ID)
    --rg-description               <string>    (descriptive string, no tabs allowed)
    --rg-platform-unit             <string>    (e.g. Illumina lane ID)
    --rg-center                    <string>    (sequencing center name)
    --rg-date                      <string>    (ISO 8601 date of the sequencing run)
    --rg-platform                  <string>    (Sequencing platform descriptor)
'''
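
# Example invocations (the index prefix and read file names below are
# placeholders, not files shipped with TopHat):
#   tophat -p 8 -G genes.gtf -o C1_thout bwt_idx/genome reads_1.fq reads_2.fq
#   tophat --bowtie1 bwt_idx/genome reads.fq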

# Deprecated:
#    --min-closure-exon             <int>       [ default: 100              ]
#    --min-closure-intron           <int>       [ default: 50               ]
#    --max-closure-intron           <int>       [ default: 5000             ]
#    --no-closure-search
#    --closure-search
#    --butterfly-search
#    --no-butterfly-search
#    -F/--min-isoform-fraction      <float>     [ default: 0.15             ]

class Usage(Exception):
    def __init__(self, msg):
        self.msg = msg

output_dir = "./tophat_out/"
logging_dir = output_dir + "logs/"
run_log = None
tophat_log = None  #main log file handle
tophat_logger = None # main logging object
run_cmd = None
tmp_dir = output_dir + "tmp/"
bin_dir = sys.path[0] + "/"
use_zpacker = False # this is set by -z/--zpacker option (-z0 leaves it False)

use_BAM_Unmapped = False # automatically set to True for non-Solid reads, handles unmapped reads in BAM format

use_BWT_FIFO = False # can only be set to True if use_zpacker is True and only with -C/--color
# enabled by -X/--unmapped-fifo option (unless -z0)
unmapped_reads_fifo = None # if use_BWT_FIFO is True, this tricks bowtie into writing the
                           # unmapped reads into a compressed file
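# (roughly: mkfifo a temporary path, hand it to bowtie as its unmapped-read
#  output file, and attach the zpacker to the FIFO's read end so those reads
#  are compressed on the fly instead of written out uncompressed)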

samtools_path = None
bowtie_path = None
fail_str = "\t[FAILED]\n"
gtf_juncs = None #file name with junctions extracted from given GFF file

# mapping types:
_reads_vs_G, _reads_vs_T, _segs_vs_G, _segs_vs_J = range(1,5)
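# (interpreting the names: reads vs. the genome, reads vs. the transcriptome,
#  read segments vs. the genome, and read segments vs. the junction index)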

def init_logger(log_fname):
    global tophat_logger
    tophat_logger = logging.getLogger('project')
    formatter = logging.Formatter('%(asctime)s %(message)s', '[%Y-%m-%d %H:%M:%S]')
    # level = logging.__dict__.get(options.loglevel.upper(),logging.DEBUG)
    tophat_logger.setLevel(logging.DEBUG)

    hstream = logging.StreamHandler(sys.stderr)
    hstream.setFormatter(formatter)
    tophat_logger.addHandler(hstream)
    #
    # Output logging information to file
    if os.path.isfile(log_fname):
        os.remove(log_fname)
    global tophat_log
    logfh = logging.FileHandler(log_fname)
    logfh.setFormatter(formatter)
    tophat_logger.addHandler(logfh)
    tophat_log=logfh.stream
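
# Example: init_logger(logging_dir + "tophat.log") routes every record both to
# stderr and to <output_dir>/logs/tophat.log (used by th_log()/th_logp() below)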


# TopHatParams captures all of the runtime parameters used by TopHat; many of
# these are passed as command line options to executables run by the pipeline

# This class and its nested classes also handle option parsing through
# parse_options() and option validation via the member function check()


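# A simple record grouping the files involved in one filtering Bowtie run:
# the input reads/quals plus the resulting mapped, unmapped and multihit read
# files (apparently used for the -M/--prefilter-multihits pre-screen; see
# preflt_data in TopHatParams below)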
class BowtieFltFiles:
    def __init__(self,
          seqfiles=None, qualfiles=None,
          mappings=None,
          unmapped_reads=None,
          multihit_reads=None):
        self.seqfiles=seqfiles
        self.qualfiles=qualfiles
        self.mappings=mappings
        self.unmapped_reads=unmapped_reads
        self.multihit_reads=multihit_reads

class TopHatParams:

    # SpliceConstraints is a group of runtime parameters that specify what
    # constraints to put on junctions discovered by the program.  These constraints
    # are used to filter out spurious/false positive junctions.

    class SpliceConstraints:
        def __init__(self,
                     min_anchor_length,
                     min_intron_length,
                     max_intron_length,
                     splice_mismatches,
                     min_isoform_fraction):
            self.min_anchor_length = min_anchor_length
            self.min_intron_length = min_intron_length
            self.max_intron_length = max_intron_length
            self.splice_mismatches = splice_mismatches
            self.min_isoform_fraction = min_isoform_fraction

        def parse_options(self, opts):
            for option, value in opts:
                if option in ("-m", "--splice-mismatches"):
                    self.splice_mismatches = int(value)
                elif option in ("-a", "--min-anchor"):
                    self.min_anchor_length = int(value)
                elif option in ("-F", "--min-isoform-fraction"):
                    self.min_isoform_fraction = float(value)
                elif option in ("-i", "--min-intron-length"):
                    self.min_intron_length = int(value)
                elif option in ("-I", "--max-intron-length"):
                    self.max_intron_length = int(value)

        def check(self):
            if self.splice_mismatches not in [0,1,2]:
                die("Error: arg to --splice-mismatches must be 0, 1, or 2")
            if self.min_anchor_length < 4:
                die("Error: arg to --min-anchor must be at least 4")
            if self.min_isoform_fraction < 0.0 or self.min_isoform_fraction > 1.0:
                die("Error: arg to --min-isoform-fraction must be between 0.0 and 1.0")
            if self.min_intron_length <= 0:
                die("Error: arg to --min-intron-length must be greater than 0")
            if self.max_intron_length <= 0:
                die("Error: arg to --max-intron-length must be greater than 0")

    # SystemParams is a group of runtime parameters that determine how to handle
    # temporary files produced during a run and how many threads to use for threaded
    # stages of the pipeline (e.g. Bowtie)

    class SystemParams:
        def __init__(self,
                     num_threads,
                     keep_tmp):
            self.num_threads = num_threads
            self.keep_tmp = keep_tmp
            self.zipper = "gzip"
            self.zipper_opts= []

        def parse_options(self, opts):
            global use_zpacker
            global use_BWT_FIFO
            for option, value in opts:
                if option in ("-p", "--num-threads"):
                    self.num_threads = int(value)
                elif option == "--keep-tmp":
                    self.keep_tmp = True
                elif option in ("-z","--zpacker"):
                    if value.lower() in ["-", " ", ".", "0", "none", "f", "false", "no"]:
                        value=""
                    self.zipper = value
                    #if not self.zipper:
                    #   self.zipper='gzip'
                elif option in ("-X", "--unmapped-fifo"):
                    use_BWT_FIFO=True
            if self.zipper:
                use_zpacker=True
                if self.num_threads>1 and not self.zipper_opts:
                    if self.zipper.endswith('pbzip2') or self.zipper.endswith('pigz'):
                        self.zipper_opts.append('-p'+str(self.num_threads))
            else:
                use_zpacker=False
                if use_BWT_FIFO: use_BWT_FIFO=False
        def cmd(self):
            cmdline=[]
            if self.zipper:
                cmdline.extend(['-z',self.zipper])
            if self.num_threads>1:
                cmdline.extend(['-p'+str(self.num_threads)])
            return cmdline
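        # e.g. with zipper "pigz" and num_threads 4, cmd() returns
        # ['-z', 'pigz', '-p4']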

        def check(self):
            if self.num_threads<1:
                die("Error: arg to --num-threads must be greater than 0")
            if self.zipper:
                xzip=which(self.zipper)
                if not xzip:
                    die("Error: cannot find compression program "+self.zipper)

    # ReadParams is a group of runtime parameters that specify various properties
    # of the user's reads (e.g. which quality scale they are on, how long the
    # fragments are, etc.)
    class ReadParams:
        def __init__(self,
                     solexa_quals,
                     phred64_quals,
                     quals,
                     integer_quals,
                     color,
                     library_type,
                     seed_length,
                     reads_format,
                     mate_inner_dist,
                     mate_inner_dist_std_dev,
                     read_group_id,
                     sample_id,
                     library_id,
                     description,
                     seq_platform_unit,
                     seq_center,
                     seq_run_date,
                     seq_platform):
            self.solexa_quals = solexa_quals
            self.phred64_quals = phred64_quals
            self.quals = quals
            self.integer_quals = integer_quals
            self.color = color
            self.library_type = library_type
            self.seed_length = seed_length
            self.reads_format = reads_format
            self.mate_inner_dist = mate_inner_dist
            self.mate_inner_dist_std_dev = mate_inner_dist_std_dev
            self.read_group_id = read_group_id
            self.sample_id = sample_id
            self.library_id = library_id
            self.description = description
            self.seq_platform_unit = seq_platform_unit
            self.seq_center = seq_center
            self.seq_run_date = seq_run_date
            self.seq_platform = seq_platform

        def parse_options(self, opts):
            for option, value in opts:
                if option == "--solexa-quals":
                    self.solexa_quals = True
                elif option in ("--solexa1.3-quals", "--phred64-quals"):
                    self.phred64_quals = True
                elif option in ("-Q", "--quals"):
                    self.quals = True
                elif option == "--integer-quals":
                    self.integer_quals = True
                elif option in ("-C", "--color"):
                    self.color = True
                elif option == "--library-type":
                    self.library_type = value
                elif option in ("-s", "--seed-length"):
                    self.seed_length = int(value)
                elif option in ("-r", "--mate-inner-dist"):
                    self.mate_inner_dist = int(value)
                elif option == "--mate-std-dev":
                    self.mate_inner_dist_std_dev = int(value)
                elif option == "--rg-id":
                    self.read_group_id = value
                elif option == "--rg-sample":
                    self.sample_id = value
                elif option == "--rg-library":
                    self.library_id = value
                elif option == "--rg-description":
                    self.description = value
                elif option == "--rg-platform-unit":
                    self.seq_platform_unit = value
                elif option == "--rg-center":
                    self.seq_center = value
                elif option == "--rg-date":
                    self.seq_run_date = value
                elif option == "--rg-platform":
                    self.seq_platform = value

        def check(self):
            if self.seed_length and self.seed_length < 20:
                die("Error: arg to --seed-length must be at least 20")

            if self.mate_inner_dist_std_dev is not None and self.mate_inner_dist_std_dev < 0:
                die("Error: arg to --mate-std-dev must be at least 0")
            if (not self.read_group_id and self.sample_id) or (self.read_group_id and not self.sample_id):
                die("Error: --rg-id and --rg-sample must be specified or omitted together")

    # SearchParams is a group of runtime parameters that specify how TopHat will
    # search for splice junctions

    class SearchParams:
        def __init__(self,
                     min_closure_exon,
                     min_closure_intron,
                     max_closure_intron,
                     min_coverage_intron,
                     max_coverage_intron,
                     min_segment_intron,
                     max_segment_intron):

            self.min_closure_exon_length = min_closure_exon
            self.min_closure_intron_length = min_closure_intron
            self.max_closure_intron_length = max_closure_intron
            self.min_coverage_intron_length = min_coverage_intron
            self.max_coverage_intron_length = max_coverage_intron
            self.min_segment_intron_length = min_segment_intron
            self.max_segment_intron_length = max_segment_intron

        def parse_options(self, opts):
            for option, value in opts:
                if option == "--min-closure-exon":
                    self.min_closure_exon_length = int(value)
                if option == "--min-closure-intron":
                    self.min_closure_intron_length = int(value)
                if option == "--max-closure-intron":
                    self.max_closure_intron_length = int(value)
                if option == "--min-coverage-intron":
                    self.min_coverage_intron_length = int(value)
                if option == "--max-coverage-intron":
                    self.max_coverage_intron_length = int(value)
                if option == "--min-segment-intron":
                    self.min_segment_intron_length = int(value)
                if option == "--max-segment-intron":
                    self.max_segment_intron_length = int(value)

        def check(self):
            if self.min_closure_exon_length < 0:
                die("Error: arg to --min-closure-exon must be at least 0")
            if self.min_closure_intron_length < 0:
                die("Error: arg to --min-closure-intron must be at least 0")
            if self.max_closure_intron_length < 0:
                die("Error: arg to --max-closure-intron must be at least 0")
            if self.min_coverage_intron_length < 0:
                die("Error: arg to --min-coverage-intron must be at least 0")
            if self.max_coverage_intron_length < 0:
                die("Error: arg to --max-coverage-intron must be at least 0")
            if self.min_segment_intron_length < 0:
                die("Error: arg to --min-segment-intron must be at least 0")
            if self.max_segment_intron_length < 0:
                die("Error: arg to --max-segment-intron must be at least 0")

    class ReportParams:
        def __init__(self):
            self.sort_bam = True
            self.convert_bam = True

        def parse_options(self, opts):
            for option, value in opts:
                if option == "--no-sort-bam":
                    self.sort_bam = False
                if option == "--no-convert-bam":
                    self.convert_bam = False

    class Bowtie2Params:
        def __init__(self):
            self.very_fast = False
            self.fast = False
            self.sensitive = False
            self.very_sensitive = False

            self.N = 0
            self.L = 20
            self.i = "S,1,1.25"
            self.n_ceil = "L,0,0.15"
            self.gbar = 4

            self.mp = "6,2"
            self.np = 1
            self.rdg = "5,3"
            self.rfg = "5,3"
            self.score_min = "L,-0.6,-0.6"
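            # a Bowtie2 <func> is "<type>,<B>,<A>", meaning f(x) = B + A*g(x)
            # with x the read length (L = linear, S = sqrt, G = natural log,
            # C = constant); e.g. score_min "L,-0.6,-0.6" sets the minimum
            # alignment score to -0.6 + -0.6 * read_length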

            self.D = 15
            self.R = 2

        def parse_options(self, opts):
            for option, value in opts:
                if option == "--b2-very-fast":
                    self.very_fast = True
                if option == "--b2-fast":
                    self.fast = True
                if option == "--b2-sensitive":
                    self.sensitive = True
                if option == "--b2-very-sensitive":
                    self.very_sensitive = True

                if option == "--b2-N":
                    self.N = int(value)
                if option == "--b2-L":
                    self.L = int(value)
                if option == "--b2-i":
                    self.i = value
                if option == "--b2-n-ceil":
                    self.n_ceil = value
                if option == "--b2-gbar":
                    self.gbar = int(value)

                if option == "--b2-mp":
                    self.mp = value
                if option == "--b2-np":
                    self.np = int(value)
                if option == "--b2-rdg":
                    self.rdg = value
                if option == "--b2-rfg":
                    self.rfg = value
                if option == "--b2-score-min":
                    self.score_min = value

                if option == "--b2-D":
                    self.D = int(value)
                if option == "--b2-R":
                    self.R = int(value)

        def check(self):
            # at most one of the four preset options may be specified
            presets = [self.very_fast, self.fast, self.sensitive, self.very_sensitive]
            if presets.count(True) > 1:
                die("Error: use only one of --b2-very-fast, --b2-fast, --b2-sensitive, --b2-very-sensitive")

            if self.N not in [0, 1]:
                die("Error: arg to --b2-N must be either 0 or 1")

            function_re = r'^[CLSG],-?\d+(\.\d+)?,-?\d+(\.\d+)?$'
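            # matches Bowtie2 function strings such as "S,1,1.25" or "L,0,0.15":
            # a function type (C, L, S or G) plus two signed decimal parameters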
            function_match = re.search(function_re, self.i)

            if not function_match:
                die("Error: arg to --b2-i must be <func> (e.g. --b2-i S,1,1.25)")

            function_match = re.search(function_re, self.n_ceil)
            if not function_match:
                die("Error: arg to --b2-n-ceil must be <func> (e.g. --b2-n-ceil L,0,0.15)")

            function_match = re.search(function_re, self.score_min)
            if not function_match:
                die("Error: arg to --b2-score-min must be <func> (e.g. --b2-score-min L,-0.6,-0.6)")

            pair_re = r'^\d+,\d+$'
            pair_match = re.search(pair_re, self.mp)
            if not pair_match:
                die("Error: arg to --b2-mp must be <int>,<int> (e.g. --b2-mp 6,2)")

            pair_match = re.search(pair_re, self.rdg)
            if not pair_match:
                die("Error: arg to --b2-rdg must be <int>,<int> (e.g. --b2-rdg 5,3)")

            pair_match = re.search(pair_re, self.rfg)
            if not pair_match:
                die("Error: arg to --b2-rfg must be <int>,<int> (e.g. --b2-rfg 5,3)")


    def __init__(self):
        self.splice_constraints = self.SpliceConstraints(8,     # min_anchor
                                                         50,    # min_intron
                                                         500000, # max_intron
                                                         0,     # splice_mismatches
                                                         0.15)  # min_isoform_frac

        self.preflt_data = [ BowtieFltFiles(), BowtieFltFiles() ]
        self.sam_header = None
        self.read_params = self.ReadParams(False,               # solexa_quals
                                           False,               # phred64_quals
                                           False,               # quals
                                           None,                # integer quals
                                           False,               # SOLiD - color space
                                           "",                  # library type (e.g. "illumina-stranded-pair-end")
                                           None,                # seed_length
                                           "fastq",             # quality_format
                                           None,                # mate inner distance
                                           20,                  # mate inner dist std dev
                                           None,                # read group id
                                           None,                # sample id
                                           None,                # library id
                                           None,                # description
                                           None,                # platform unit (i.e. lane)
                                           None,                # sequencing center
                                           None,                # run date
                                           None)                # sequencing platform

        self.system_params = self.SystemParams(1,               # bowtie_threads (num_threads)
                                               False)           # keep_tmp

        self.search_params = self.SearchParams(100,             # min_closure_exon_length
                                               50,              # min_closure_intron_length
                                               5000,            # max_closure_intron_length
                                               50,              # min_coverage_intron_length
                                               20000,           # max_coverage_intron_length
                                               50,              # min_segment_intron_length
                                               500000)          # max_segment_intron_length

        self.report_params = self.ReportParams()

        self.bowtie2_params = self.Bowtie2Params()

        self.bowtie2 = True
        self.gff_annotation = None
        self.transcriptome_only = False
        self.transcriptome_index = None
        self.transcriptome_outdir = None
        self.raw_junctions = None
        self.find_novel_juncs = True
        self.find_novel_indels = True
        self.find_novel_fusions = True
        self.find_GFF_juncs = True
        self.max_hits = 20
        self.t_max_hits = 60
        self.max_seg_hits = 40
        self.prefilter_multi = False
        self.genome_read_mismatches = 2
        self.max_read_mismatches = 2
        self.t_mismatches = 1
        self.segment_length = 25
        self.segment_mismatches = 2
        self.bowtie_alignment_option = "-v"
        self.max_insertion_length = 3
        self.max_deletion_length = 3
        self.raw_insertions = None
        self.raw_deletions = None
        self.coverage_search = None
        self.closure_search = False
        #self.butterfly_search = None
        self.butterfly_search = False
        self.microexon_search = False
        self.report_secondary_alignments = False
        self.report_discordant_pair_alignments = False

        self.keep_fasta_order = False
        self.partial_mapping = False

        self.fusion_search = False
        self.fusion_anchor_length = 20
        self.fusion_min_dist = 10000000
        self.fusion_read_mismatches = 2
        self.fusion_multireads = 2
        self.fusion_multipairs = 2
        self.fusion_ignore_chromosomes = []
        self.fusion_do_not_resolve_conflicts = False

    def check(self):
        self.splice_constraints.check()
        self.read_params.check()
        self.system_params.check()
        if self.segment_length < 10:
            die("Error: arg to --segment-length must be at least 10")
        if self.segment_mismatches < 0 or self.segment_mismatches > 3:
            die("Error: arg to --segment-mismatches must be in [0, 3]")
        if self.t_mismatches < 0 or self.t_mismatches > 3:
            die("Error: arg to -n or --transcriptome-mismatches must be in [0, 3]")

        if self.read_params.color:
            if self.bowtie2:
                th_log("Warning: bowtie2 in colorspace is not supported; --bowtie1 option assumed.")
                self.bowtie2=False
            if self.fusion_search:
                die("Error: fusion-search in colorspace is not yet supported")
            if self.butterfly_search:
                die("Error: butterfly-search in colorspace is not yet supported")

        self.bowtie2_params.check()

        library_types = ["fr-unstranded", "fr-firststrand", "fr-secondstrand"]

        if self.read_params.library_type and self.read_params.library_type not in library_types:
            die("Error: library-type should be one of: "+', '.join(library_types))

        self.search_params.max_closure_intron_length = min(self.splice_constraints.max_intron_length,
                                                           self.search_params.max_closure_intron_length)

        self.search_params.max_segment_intron_length = min(self.splice_constraints.max_intron_length,
                                                           self.search_params.max_segment_intron_length)

        self.search_params.max_coverage_intron_length = min(self.splice_constraints.max_intron_length,
                                                            self.search_params.max_coverage_intron_length)

        if self.max_insertion_length >= self.segment_length:
            die("Error: the max insertion length ("+str(self.max_insertion_length)+") cannot be equal to or greater than the segment length ("+str(self.segment_length)+")")

        if self.max_insertion_length < 0:
            die("Error: the max insertion length ("+str(self.max_insertion_length)+") cannot be less than 0")

        if self.max_deletion_length >= self.splice_constraints.min_intron_length:
            die("Error: the max deletion length ("+str(self.max_deletion_length)+") cannot be equal to or greater than the min intron length ("+str(self.splice_constraints.min_intron_length)+")")

        if self.max_deletion_length < 0:
            die("Error: the max deletion length ("+str(self.max_deletion_length)+") cannot be less than 0")

    def cmd(self):
        cmd = ["--min-anchor", str(self.splice_constraints.min_anchor_length),
               "--splice-mismatches", str(self.splice_constraints.splice_mismatches),
               "--min-report-intron", str(self.splice_constraints.min_intron_length),
               "--max-report-intron", str(self.splice_constraints.max_intron_length),
               "--min-isoform-fraction", str(self.splice_constraints.min_isoform_fraction),
               "--output-dir", output_dir,
               "--max-multihits", str(self.max_hits),
               "--max-seg-multihits", str(self.max_seg_hits),
               "--segment-length", str(self.segment_length),
               "--segment-mismatches", str(self.segment_mismatches),
               "--min-closure-exon", str(self.search_params.min_closure_exon_length),
               "--min-closure-intron", str(self.search_params.min_closure_intron_length),
               "--max-closure-intron", str(self.search_params.max_closure_intron_length),
               "--min-coverage-intron", str(self.search_params.min_coverage_intron_length),
               "--max-coverage-intron", str(self.search_params.max_coverage_intron_length),
               "--min-segment-intron", str(self.search_params.min_segment_intron_length),
               "--max-segment-intron", str(self.search_params.max_segment_intron_length),
               "--max-mismatches", str(self.max_read_mismatches),
               "--max-insertion-length", str(self.max_insertion_length),
               "--max-deletion-length", str(self.max_deletion_length)]

        if not self.bowtie2:
            cmd.extend(["--bowtie1"])

        if self.fusion_search:
            cmd.extend(["--fusion-search",
                        "--fusion-anchor-length", str(self.fusion_anchor_length),
                        "--fusion-min-dist", str(self.fusion_min_dist),
                        "--fusion-read-mismatches", str(self.fusion_read_mismatches),
                        "--fusion-multireads", str(self.fusion_multireads),
                        "--fusion-multipairs", str(self.fusion_multipairs)])

            if self.fusion_ignore_chromosomes:
                cmd.extend(["--fusion-ignore-chromosomes", ",".join(self.fusion_ignore_chromosomes)])

            if self.fusion_do_not_resolve_conflicts:
                cmd.extend(["--fusion-do-not-resolve-conflicts"])

        cmd.extend(self.system_params.cmd())

        if self.read_params.mate_inner_dist is not None:
            cmd.extend(["--inner-dist-mean", str(self.read_params.mate_inner_dist),
                        "--inner-dist-std-dev", str(self.read_params.mate_inner_dist_std_dev)])
        if self.gff_annotation is not None:
            cmd.extend(["--gtf-annotations", str(self.gff_annotation)])
            if gtf_juncs:
               cmd.extend(["--gtf-juncs", gtf_juncs])
        if not self.closure_search:
            cmd.append("--no-closure-search")
        if not self.coverage_search:
            cmd.append("--no-coverage-search")
        if not self.microexon_search:
            cmd.append("--no-microexon-search")
        if self.butterfly_search:
            cmd.append("--butterfly-search")
        if self.read_params.solexa_quals:
            cmd.append("--solexa-quals")
        if self.read_params.quals:
            cmd.append("--quals")
        if self.read_params.integer_quals:
            cmd.append("--integer-quals")
        if self.read_params.color:
            cmd.append("--color")
        if self.read_params.library_type:
            cmd.extend(["--library-type", self.read_params.library_type])
        if self.read_params.read_group_id:
            cmd.extend(["--rg-id", self.read_params.read_group_id])
        if self.read_params.phred64_quals:
            cmd.append("--phred64-quals")
        return cmd

    # This is the master options parsing routine, which calls parse_options for
    # the delegate classes (e.g. SpliceConstraints) that handle certain groups
    # of options.
    def parse_options(self, argv):
        try:
            opts, args = getopt.getopt(argv[1:], "hvp:m:n:N:F:a:i:I:G:Tr:o:j:Xz:s:g:x:MQC",
                                        ["version",
                                         "help",
                                         "output-dir=",
                                         "bowtie1",
                                         "solexa-quals",
                                         "solexa1.3-quals",
                                         "phred64-quals",
                                         "quals",
                                         "integer-quals",
                                         "color",
                                         "library-type=",
                                         "num-threads=",
                                         "splice-mismatches=",
                                         "max-multihits=",
                                         "min-isoform-fraction=",
                                         "min-anchor-length=",
                                         "min-intron-length=",
                                         "max-intron-length=",
                                         "GTF=",
                                         "transcriptome-only",
                                         "transcriptome-max-hits=",
                                         "transcriptome-index=",
                                         "raw-juncs=",
                                         "no-novel-juncs",
                                         "allow-fusions",
                                         "fusion-search",
                                         "fusion-anchor-length=",
                                         "fusion-min-dist=",
                                         "fusion-read-mismatches=",
                                         "fusion-multireads=",
                                         "fusion-multipairs=",
                                         "fusion-ignore-chromosomes=",
                                         "fusion-do-not-resolve-conflicts",
                                         "no-novel-indels",
                                         "no-gtf-juncs",
                                         "mate-inner-dist=",
                                         "mate-std-dev=",
                                         "no-coverage-search",
                                         "coverage-search",
                                         #"no-closure-search",
                                         #"closure-search",
                                         #"min-closure-exon=",
                                         #"min-closure-intron=",
                                         #"max-closure-intron=",
                                         #"butterfly-search",
                                         #"no-butterfly-search",
                                         "prefilter-multihits",
                                         "microexon-search",
                                         "min-coverage-intron=",
                                         "max-coverage-intron=",
                                         "min-segment-intron=",
                                         "max-segment-intron=",
                                         "seed-length=",
                                         "genome-read-mismatches=",
                                         "read-mismatches=",
                                         "max-read-mismatches=",
                                         "transcriptome-mismatches=",
                                         "segment-length=",
                                         "segment-mismatches=",
                                         "bowtie-n",
                                         "keep-tmp",
                                         "rg-id=",
                                         "rg-sample=",
                                         "rg-library=",
                                         "rg-description=",
                                         "rg-platform-unit=",
                                         "rg-center=",
                                         "rg-date=",
                                         "rg-platform=",
                                         "tmp-dir=",
                                         "zpacker=",
                                         "unmapped-fifo",
                                         "max-insertion-length=",
                                         "max-deletion-length=",
                                         "insertions=",
                                         "deletions=",
                                         "no-sort-bam",
                                         "no-convert-bam",
                                         "report-secondary-alignments",
                                         "report-discordant-pair-alignments",
                                         "keep-fasta-order",
                                         "allow-partial-mapping",
                                         "b2-very-fast",
                                         "b2-fast",
                                         "b2-sensitive",
                                         "b2-very-sensitive",
                                         "b2-N=",
                                         "b2-L=",
                                         "b2-i=",
                                         "b2-n-ceil=",
                                         "b2-gbar=",
                                         "b2-ma=",
                                         "b2-mp=",
                                         "b2-np=",
                                         "b2-rdg=",
                                         "b2-rfg=",
                                         "b2-score-min=",
                                         "b2-D=",
                                         "b2-R="])
        except getopt.error, msg:
            raise Usage(msg)

        self.splice_constraints.parse_options(opts)
        self.system_params.parse_options(opts)
        self.read_params.parse_options(opts)
        self.search_params.parse_options(opts)
        self.report_params.parse_options(opts)
        self.bowtie2_params.parse_options(opts)
        global use_BWT_FIFO
        global use_BAM_Unmapped
        if not self.read_params.color:
           use_BWT_FIFO=False
           use_BAM_Unmapped=True
        global output_dir
        global logging_dir
        global tmp_dir

        custom_tmp_dir = None
        custom_out_dir = None
        # option processing
        for option, value in opts:
            if option in ("-v", "--version"):
                print "TopHat v%s" % (get_version())
                sys.exit(0)
            if option in ("-h", "--help"):
                raise Usage(use_message)
            if option == "--bowtie1":
                self.bowtie2 = False
            if option in ("-g", "--max-multihits"):
                self.max_hits = int(value)
                self.max_seg_hits = max(10, self.max_hits * 2)
            if option in ("-x", "--transcriptome-max-hits"):
                self.t_max_hits = int(value)
            if option in ("-G", "--GTF"):
                self.gff_annotation = value
            if option in ("-T", "--transcriptome-only"):
                self.transcriptome_only = True
            if option == "--transcriptome-index":
                self.transcriptome_index = value
            if option in("-M", "--prefilter-multihits"):
                self.prefilter_multi = True
            if option in ("-j", "--raw-juncs"):
                self.raw_junctions = value
            if option == "--no-novel-juncs":
                self.find_novel_juncs = False
            if option == "--no-novel-indels":
                self.find_novel_indels = False
            if option == "--fusion-search":
                self.fusion_search = True
            if option == "--fusion-anchor-length":
                self.fusion_anchor_length = int(value)
            if option == "--fusion-min-dist":
                self.fusion_min_dist = int(value)
            if option == "--fusion-read-mismatches":
                self.fusion_read_mismatches = int(value)
            if option == "--fusion-multireads":
                self.fusion_multireads = int(value)
            if option == "--fusion-multipairs":
                self.fusion_multipairs = int(value)
            if option == "--fusion-ignore-chromosomes":
                self.fusion_ignore_chromosomes = value.split(",")
            if option == "--fusion-do-not-resolve-conflicts":
                self.fusion_do_not_resolve_conflicts = True
            if option == "--no-gtf-juncs":
                self.find_GFF_juncs = False
            if option == "--no-coverage-search":
                self.coverage_search = False
            if option == "--coverage-search":
                self.coverage_search = True

            # closure search is deprecated and always disabled
            self.closure_search = False
            if option == "--microexon-search":
                self.microexon_search = True

            # butterfly search is deprecated and always disabled
            self.butterfly_search = False
            if option == "--genome-read-mismatches":
                self.genome_read_mismatches = int(value)
            if option in ("-N", "--read-mismatches", "--max-read-mismatches"):
                self.max_read_mismatches = int(value)
            if option in ("-n", "--transcriptome-mismatches"):
                self.t_mismatches = int(value)
            if option == "--segment-length":
                self.segment_length = int(value)
            if option == "--segment-mismatches":
                self.segment_mismatches = int(value)
            if option == "--bowtie-n":
                self.bowtie_alignment_option = "-n"
            if option == "--max-insertion-length":
                self.max_insertion_length = int(value)
            if option == "--max-deletion-length":
                self.max_deletion_length = int(value)
            if option == "--insertions":
                self.raw_insertions = value
            if option == "--deletions":
                self.raw_deletions = value
            if option == "--report-secondary-alignments":
                self.report_secondary_alignments = True
            if option == "--report-discordant-pair-alignments":
                self.report_discordant_pair_alignments = True
            if option == "--keep-fasta-order":
                self.keep_fasta_order = True
            if option == "--allow-partial-mapping":
                self.partial_mapping = True
            if option in ("-o", "--output-dir"):
                custom_out_dir = value + "/"
            if option == "--tmp-dir":
                custom_tmp_dir = value + "/"
        if self.transcriptome_only:
            self.find_novel_juncs=False
            self.find_novel_indels=False
        if custom_out_dir:
            output_dir = custom_out_dir
            logging_dir = output_dir + "logs/"
            tmp_dir = output_dir + "tmp/"
            self.sam_header = tmp_dir + "stub_header.sam"
        if custom_tmp_dir:
            tmp_dir = custom_tmp_dir
            self.sam_header = tmp_dir + "stub_header.sam"
        if len(args) < 2:
            raise Usage(use_message)
        return args


def nonzeroFile(filepath):
  if os.path.exists(filepath):
     fpath, fname=os.path.split(filepath)
     fbase, fext =os.path.splitext(fname)
     if fext.lower() == ".bam":
        output = os.popen("samtools view %s | head | wc -l" % filepath).read().strip()
        if output and int(output) > 0:
          return True
     else:
        if os.path.getsize(filepath)>25:
          return True
  return False

def bamExists_and_NonEmpty(filepath):
  if os.path.exists(filepath):
      output = os.popen("samtools view %s | head | wc -l" % filepath).read().strip()
      if output and int(output) > 0:
          return True

  return False

# check if a file exists and has non-zero (or minimum) size
def fileExists(filepath, minfsize=1):
  if os.path.exists(filepath) and os.path.getsize(filepath)>=minfsize:
     return True
  else:
     return False

def removeFileWithIndex(filepath):
    if os.path.exists(filepath):
        os.remove(filepath)

        fileindexpath = filepath + ".index"
        if os.path.exists(fileindexpath):
            os.remove(fileindexpath)

def getFileDir(filepath):
   #returns the directory part of filepath including the trailing /, or "" if none
   fpath, fname=os.path.split(filepath)
   if fpath: fpath+='/'
   return fpath

def getFileBaseName(filepath):
   fpath, fname=os.path.split(filepath)
   fbase, fext =os.path.splitext(fname)
   fx=fext.lower()
   if (fx in ['.fq','.txt','.seq','.bwtout'] or fx.find('.fa')==0) and len(fbase)>0:
      return fbase
   elif fx == '.z' or fx.find('.gz')==0 or fx.find('.bz')==0:
      fb, fext = os.path.splitext(fbase)
      fx=fext.lower()
      if (fx in ['.fq','.txt','.seq','.bwtout'] or fx.find('.fa')==0) and len(fb)>0:
         return fb
      else:
         return fbase
   else:
     if len(fbase)>0:
        return fbase
     else:
        return fname
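
# a few illustrative cases for getFileBaseName:
#   "reads_1.fq"       -> "reads_1"
#   "reads_1.fastq.gz" -> "reads_1"
#   "sample.dat"       -> "sample"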

# Returns the current time in a nice format
def right_now():
    curr_time = datetime.now()
    return curr_time.strftime("%c")

# The TopHat logging helpers
def th_log(out_str):
  #print >> sys.stderr, "[%s] %s" % (right_now(), out_str)
  if tophat_logger:
       tophat_logger.info(out_str)

def th_logp(out_str=""):
  print >> sys.stderr, out_str
  if tophat_log:
        print >> tophat_log, out_str

def die(msg=None):
  if msg is not None:
    th_logp(msg)
  sys.exit(1)

# Ensures that the output, logging, and temp directories are present. If not,
# they are created
def prepare_output_dir():

    #th_log("Preparing output location "+output_dir)
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)

    if not os.path.exists(logging_dir):
        os.mkdir(logging_dir)

    if not os.path.exists(tmp_dir):
        try:
          os.makedirs(tmp_dir)
        except OSError, o:
          die("\nError creating directory %s (%s)" % (tmp_dir, o))


# to be added as preexec_fn for every subprocess.Popen() call:
# see http://bugs.python.org/issue1652
def subprocess_setup():
    # Python installs a SIGPIPE handler by default, which causes
    # gzip or other de/compression pipes to complain about "stdout: Broken pipe"
    signal.signal(signal.SIGPIPE, signal.SIG_DFL)
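
# e.g.: subprocess.Popen(cmd, preexec_fn=subprocess_setup, stdout=subprocess.PIPE)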

# Check that the Bowtie index specified by the user is present and all files
# are there.
def check_bowtie_index(idx_prefix, is_bowtie2):
    th_log("Checking for Bowtie index files")
    idxext="ebwt"
    bowtie_ver=""
    if is_bowtie2:
        idxext="bt2"
        bowtie_ver="2 "

    idx_fwd_1 = idx_prefix + ".1."+idxext
    idx_fwd_2 = idx_prefix + ".2."+idxext
    idx_rev_1 = idx_prefix + ".rev.1."+idxext
    idx_rev_2 = idx_prefix + ".rev.2."+idxext
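
    # e.g. for idx_prefix "hg19" with Bowtie 2 this looks for hg19.1.bt2,
    # hg19.2.bt2, hg19.rev.1.bt2 and hg19.rev.2.bt2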

    if os.path.exists(idx_fwd_1) and \
       os.path.exists(idx_fwd_2) and \
       os.path.exists(idx_rev_1) and \
       os.path.exists(idx_rev_2):
        return
    else:
        bwtidxerr="Error: Could not find Bowtie "+bowtie_ver+"index files (" + idx_prefix + ".*."+idxext+")"
        bwtidx_env = os.environ.get("BOWTIE_INDEXES")

        if bwtidx_env is None:
            die(bwtidxerr)
        if os.path.exists(bwtidx_env+idx_fwd_1) and \
           os.path.exists(bwtidx_env+idx_fwd_2) and \
           os.path.exists(bwtidx_env+idx_rev_1) and \
           os.path.exists(bwtidx_env+idx_rev_2):
            return
        else:
            die(bwtidxerr)

# Reconstructs the multifasta file from which the Bowtie index was created, if
# it's not already there.
def bowtie_idx_to_fa(idx_prefix, is_bowtie2):
    idx_name = idx_prefix.split('/')[-1]
    th_log("Reconstituting reference FASTA file from Bowtie index")

    try:
        tmp_fasta_file_name = tmp_dir + idx_name + ".fa"
        tmp_fasta_file = open(tmp_fasta_file_name, "w")

        inspect_log = open(logging_dir + "bowtie_inspect_recons.log", "w")

        if is_bowtie2:
            inspect_cmd = [prog_path("bowtie2-inspect")]
        else:
            inspect_cmd = [prog_path("bowtie-inspect")]

        inspect_cmd += [idx_prefix]

        th_logp("  Executing: " + " ".join(inspect_cmd) + " > " + tmp_fasta_file_name)
        ret = subprocess.call(inspect_cmd,
                              stdout=tmp_fasta_file,
                              stderr=inspect_log)
        # Bowtie reported an error
        if ret != 0:
           die(fail_str+"Error: bowtie-inspect returned an error\n"+log_tail(logging_dir + "bowtie_inspect_recons.log"))

    # Bowtie not found
    except OSError, o:
        if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
            die(fail_str+"Error: bowtie-inspect not found on this system.  Did you forget to include it in your PATH?")

    return tmp_fasta_file_name

# Checks whether the multifasta file for the genome is present alongside the
# Bowtie index files for it.
def check_fasta(idx_prefix, is_bowtie2):
    th_log("Checking for reference FASTA file")
    idx_fasta = idx_prefix + ".fa"
    if os.path.exists(idx_fasta):
        return idx_fasta
    else:
        bowtie_idx_env_var = os.environ.get("BOWTIE_INDEXES")
        if bowtie_idx_env_var:
            idx_fasta = bowtie_idx_env_var + idx_prefix + ".fa"
            if os.path.exists(idx_fasta):
                return idx_fasta

        th_logp("\tWarning: Could not find FASTA file " + idx_fasta)
        idx_fa = bowtie_idx_to_fa(idx_prefix, is_bowtie2)
        return idx_fa

# Check that both the Bowtie index and the genome's fasta file are present
def check_index(idx_prefix, is_bowtie2):
    check_bowtie_index(idx_prefix, is_bowtie2)
    ref_fasta_file = check_fasta(idx_prefix, is_bowtie2)

    return (ref_fasta_file, None)

# Retrieve a list containing the system's version of Bowtie.  Parsed from
# `bowtie --version`
def get_bowtie_version(is_bowtie2):
    try:
        # Launch Bowtie to capture its version info
        proc = subprocess.Popen([bowtie_path, "--version"],
                          stdout=subprocess.PIPE)

        stdout_value = proc.communicate()[0]

        bowtie_version = None
        bowtie_out = repr(stdout_value)

        # Find the version identifier
        if is_bowtie2:
            version_str = "bowtie2-align version "
        else:
            version_str = "bowtie version "

        ver_str_idx = bowtie_out.find(version_str)
        if ver_str_idx != -1:
            nl = bowtie_out.find("\\n", ver_str_idx)
            version_val = bowtie_out[ver_str_idx + len(version_str):nl]
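            # e.g. stdout "bowtie2-align version 2.0.0-beta7" gives version_val
            # "2.0.0-beta7", which is parsed below into [2, 0, 0, 7]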

            if is_bowtie2:
                ver_numbers = version_val.split('.')
                ver_numbers[2:] = ver_numbers[2].split('-')
                bowtie_version = [int(x) for x in ver_numbers[:3]]
                if len(ver_numbers) > 3 and ver_numbers[3].startswith("beta"):
                    bowtie_version.append(int(ver_numbers[3][4:]))
                else:
                    # no "-betaN" suffix: a final release, newer than any beta
                    bowtie_version.append(sys.maxint)
            else:
                bowtie_version = [int(x) for x in version_val.split('.')]

        if bowtie_version and len(bowtie_version) == 3:
            bowtie_version.append(0)

        return bowtie_version
    except OSError, o:
       errmsg=fail_str+str(o)+"\n"
       if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
           errmsg+="Error: bowtie not found on this system"
       die(errmsg)

def get_index_sam_header(params, idx_prefix, name = ""):
    try:
        temp_sam_header_filename = tmp_dir + "temp.samheader.sam"
        temp_sam_header_file = open(temp_sam_header_filename, "w")

        bowtie_header_cmd = [bowtie_path]

        read_params = params.read_params
        if not params.bowtie2:
            bowtie_header_cmd += ["--sam"]

        if read_params.color:
            bowtie_header_cmd.append('-C')

        bowtie_header_cmd.extend([idx_prefix, '/dev/null'])
        subprocess.call(bowtie_header_cmd,
                        stdout=temp_sam_header_file,
                        stderr=open('/dev/null', 'w'))

        temp_sam_header_file.close()
        temp_sam_header_file = open(temp_sam_header_filename, "r")

        bowtie_sam_header_filename = tmp_dir + idx_prefix.split('/')[-1]
        if name != "":
             bowtie_sam_header_filename += ("_" + name)
        bowtie_sam_header_filename += ".bwt.samheader.sam"

        bowtie_sam_header_file = open(bowtie_sam_header_filename, "w")

        preamble = []
        sq_dict_lines = []

        for line in temp_sam_header_file.readlines():
            line = line.strip()
            if line.find("@SQ") != -1:
                # Sequence dictionary record
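                # e.g. "@SQ\tSN:chr1\tLN:249250621" -> seq_name "chr1"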
                cols = line.split('\t')
                seq_name = None
                for col in cols:
                    fields = col.split(':')
                    #print fields
                    if len(fields) > 0 and fields[0] == "SN":
                        seq_name = fields[1]
                if seq_name == None:
                    die("Error: malformed sequence dictionary in sam header")
                sq_dict_lines.append([seq_name,line])
            elif line.find("CL"):
                continue
            else:
                preamble.append(line)

        print >> bowtie_sam_header_file, "@HD\tVN:1.0\tSO:coordinate"
        if read_params.read_group_id and read_params.sample_id:
            rg_str = "@RG\tID:%s\tSM:%s" % (read_params.read_group_id,
                                            read_params.sample_id)
            if read_params.library_id:
                rg_str += "\tLB:%s" % read_params.library_id
            if read_params.description:
                rg_str += "\tDS:%s" % read_params.description
            if read_params.seq_platform_unit:
                rg_str += "\tPU:%s" % read_params.seq_platform_unit
            if read_params.seq_center:
                rg_str += "\tCN:%s" % read_params.seq_center
            if read_params.mate_inner_dist:
                rg_str += "\tPI:%s" % read_params.mate_inner_dist
            if read_params.seq_run_date:
                rg_str += "\tDT:%s" % read_params.seq_run_date
            if read_params.seq_platform:
                rg_str += "\tPL:%s" % read_params.seq_platform

            print >> bowtie_sam_header_file, rg_str

        if not params.keep_fasta_order:
            sq_dict_lines.sort(lambda x,y: cmp(x[0],y[0]))

        for [sname, sline] in sq_dict_lines:
            print >> bowtie_sam_header_file, sline
        print >> bowtie_sam_header_file, "@PG\tID:TopHat\tVN:%s\tCL:%s" % (get_version(), run_cmd)

        bowtie_sam_header_file.close()
        temp_sam_header_file.close()
        return bowtie_sam_header_filename

    except OSError, o:
       errmsg=fail_str+str(o)+"\n"
       if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
           errmsg+="Error: bowtie not found on this system"
       die(errmsg)
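
# A header assembled above looks roughly like this (values illustrative):
#   @HD  VN:1.0  SO:coordinate
#   @RG  ID:rg1  SM:sample1
#   @SQ  SN:chr1  LN:249250621
#   @PG  ID:TopHat  VN:<version>  CL:<command line>
# with one @SQ line per reference sequence, sorted by name unless
# --keep-fasta-order was given.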

# Make sure Bowtie is installed and is recent enough to be useful
def check_bowtie(params):
    bowtie_req=""
    if params.bowtie2:
        bowtie_req="2-align"
    log_msg = "Checking for Bowtie"
    th_log(log_msg)

    bowtie_bin = "bowtie"+bowtie_req

    global bowtie_path
    bowtie_version = None
    bowtie_path=which(bowtie_bin)
    if bowtie_path:
      bowtie_version = get_bowtie_version(params.bowtie2)
    if params.bowtie2 and bowtie_version == None:
        th_logp("  Bowtie 2 not found, checking for older version..")
        #try to fallback on bowtie 1
        params.bowtie2=False
        bowtie_path=which('bowtie')
        if bowtie_path:
           bowtie_version=get_bowtie_version(params.bowtie2)
    if bowtie_version == None:
           die("Error: Bowtie not found on this system.")
    if params.bowtie2:
        if bowtie_version < [2, 0, 0, 5]:
            die("Error: TopHat requires Bowtie 2.0.0-beta5 or later")
    else:
        if bowtie_version < [0, 12, 3, 0]:
            die("Error: TopHat requires Bowtie 0.12.3 or later")
    th_logp("\t\t  Bowtie version:\t %s" % ".".join([str(x) for x in bowtie_version]))


# Retrieve a tuple containing the system's version of samtools, parsed from
# the usage message that `samtools` prints to stderr
def get_samtools_version():
    try:
        # Launch samtools to capture its version info
        proc = subprocess.Popen(samtools_path, stderr=subprocess.PIPE)
        samtools_out = proc.communicate()[1]

        # Find the version identifier
        version_match = re.search(r'Version:\s+(\d+)\.(\d+)\.(\d+)([a-zA-Z]?)', samtools_out)
        if version_match is None:
            return None, None
        samtools_version_arr = [int(version_match.group(x)) for x in [1,2,3]]
        if version_match.group(4):
            samtools_version_arr.append(version_match.group(4))
        else:
            samtools_version_arr.append(0)

        return version_match.group(), samtools_version_arr
    except OSError, o:
       errmsg=fail_str+str(o)+"\n"
       if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
           errmsg+="Error: samtools not found on this system"
       die(errmsg)
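
# For example, samtools prints "Version: 0.1.18 (r982:295)" in its usage
# message, which the regex above parses into ("Version: 0.1.18", [0, 1, 18, 0]).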

# Make sure the SAM tools are installed and are recent enough to be useful
def check_samtools():
    th_log("Checking for Samtools")
    global samtools_path
    samtools_path=prog_path("samtools")
    samtools_version_str, samtools_version_arr = get_samtools_version()
    if samtools_version_str == None:
        die("Error: Samtools not found on this system")
    elif samtools_version_arr[:3] < [0, 1, 7]:
        die("Error: TopHat requires Samtools 0.1.7 or later")
    th_logp("\t\tSamtools version:\t %s" % ".".join([str(x) for x in samtools_version_arr]))



class FastxReader:
  def __init__(self, i_file, is_color=0, fname=''):
    self.bufline=None
    self.format=None
    self.ifile=i_file
    self.nextRecord=None
    self.eof=None
    self.fname=fname
    self.lastline=None
    self.numrecords=0
    self.isColor=0
    if is_color : self.isColor=1
    # determine file type
    #no records processed yet, skip custom header lines if any
    hlines=10 # allow maximum 10 header lines
    self.lastline=" "
    while hlines>0 and self.lastline[0] not in "@>" :
       self.lastline=self.ifile.readline()
       hlines-=1
    if self.lastline[0] == '@':
      self.format='fastq'
      self.nextRecord=self.nextFastq
    elif self.lastline[0] == '>':
      self.format='fasta'
      self.nextRecord=self.nextFasta
    else:
      die("Error: cannot determine record type in input file %s" % fname)
    self.bufline=self.lastline
    self.lastline=None

  def nextFastq(self):
    # returning tuple: (seqID, sequence_string, seq_len, qv_string)
    seqid,seqstr,qstr,seq_len='','','',0
    if self.eof: return (seqid, seqstr, seq_len, qstr)
    fline=self.getLine #shortcut to save a bit of time
    line=fline()

    if not line : return (seqid, seqstr, seq_len, qstr)
    while len(line.rstrip())==0: # skip empty lines
      line=fline()
      if not line : return (seqid, seqstr,seq_len, qstr)
    try:
      if line[0] != "@":
          raise ValueError("Records in Fastq files should start with '@' character")

      seqid = line[1:].rstrip()
      seqstr = fline().rstrip()

      #There may now be more sequence lines, or the "+" quality marker line:
      while True:
          line = fline()
          if not line:
             raise ValueError("Premature end of file (missing quality values for "+seqid+")")
          if line[0] == "+":
             # -- sequence string ended
             #qtitle = line[1:].rstrip()
             #if qtitle and qtitle != seqid:
             #   raise ValueError("Different read ID for sequence and quality (%s vs %s)" \
             #                    % (seqid, qtitle))
             break
          seqstr += line.rstrip() #removes trailing newlines
          #loop until + found
      seq_len = len(seqstr)
      #at least one line of quality data should follow
      qstrlen=0
      #now read next lines as quality values until seq_len is reached
      while True:
          line=fline()
          if not line : break #end of file
          qstr += line.rstrip()
          qstrlen=len(qstr)
          if qstrlen + self.isColor >= seq_len :
               break # qv string has reached the length of seq string
          #loop until qv has the same length as seq

      if self.isColor:
           # and qstrlen==seq_len :
           if qstrlen==seq_len:
             #qual string may have a dummy qv at the beginning, should be stripped
             qstr = qstr[1:]
             qstrlen -= 1
           if qstrlen!=seq_len-1:
             raise ValueError("Length mismatch between sequence and quality strings "+ \
                                "for %s (%i vs %i)." % (seqid, seq_len, qstrlen))
      else:
           if seq_len != qstrlen :
              raise ValueError("Length mismatch between sequence and quality strings "+ \
                                "for %s (%i vs %i)." % (seqid, seq_len, qstrlen))
    except ValueError, err:
        die("\nError encountered parsing file "+self.fname+":\n "+str(err))
    #return the record
    self.numrecords+=1
    ##--discard the primer base [NO]
    if self.isColor :
        seq_len-=1
        seqstr = seqstr[1:]
    return (seqid, seqstr, seq_len, qstr)

  def nextFasta(self):
    # returning tuple: (seqID, sequence_string, seq_len)
    seqid,seqstr,seq_len='','',0
    fline=self.getLine # shortcut to readline function of f
    line=fline() # this will use the buffer line if it's there
    if not line : return (seqid, seqstr, seq_len, None)
    while len(line.rstrip())==0: # skip empty lines
      line=fline()
      if not line : return (seqid, seqstr, seq_len, None)
    try:
       if line[0] != ">":
          raise ValueError("Records in Fasta files must start with '>' character")
       seqid = line[1:].split()[0]
       #read sequence lines until the next '>' record header (or EOF):
       while True:
          line = fline()
          if not line: break
          if line[0] == '>':
             #next sequence starts here
             self.ungetLine()
             break
          seqstr += line.rstrip()
          #loop until '>' found
       seq_len = len(seqstr)
       if seq_len < 3:
          raise ValueError("Read %s too short (%i)." \
                           % (seqid, seq_len))
    except ValueError, err:
        die("\nError encountered parsing fasta file "+self.fname+"\n "+str(err))
    #return the record and continue
    self.numrecords+=1
    if self.isColor : # -- discard primer base
        seq_len-=1
        seqstr=seqstr[1:]
    return (seqid, seqstr, seq_len, None)

  def getLine(self):
      if self.bufline: #return previously buffered line
         r=self.bufline
         self.bufline=None
         return r
      else: #read a new line from stream and return it
         if self.eof: return None
         self.lastline=self.ifile.readline()
         if not self.lastline:
            self.eof=1
            return None
         return self.lastline
  def ungetLine(self):
      if self.lastline is None:
         th_logp("Warning: FastxReader called ungetLine() with no prior line!")
      self.bufline=self.lastline
      self.lastline=None
#< class FastxReader

def fa_write(fhandle, seq_id, seq):
    """
    Write to a file in the FASTA format.

    Arguments:
    - `fhandle`: A file handle open for writing
    - `seq_id`: The sequence id string for this sequence
    - `seq`: An unformatted string of the sequence to write
    """
    line_len = 60
    fhandle.write(">" + seq_id + "\n")
    # ceil(len/line_len) lines, avoiding a spurious blank line when the
    # length is an exact multiple of line_len
    for i in xrange((len(seq) + line_len - 1) / line_len):
        start = i * line_len
        end = min((i + 1) * line_len, len(seq))
        fhandle.write(seq[start:end] + "\n")
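
# For example, fa_write(f, "read1", "A" * 80) writes ">read1" followed by a
# 60-base line and a 20-base line, wrapping at line_len characters.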

class ZReader:
    def __init__(self, filename, params, guess=True):
        self.fname=filename
        self.file=None
        self.fsrc=None
        self.popen=None
        sys_params = params.system_params
        pipecmd=[]
        s=filename.lower()
        if s.endswith(".bam"):
           pipecmd=[prog_path("bam2fastx")]
           if params.read_params.color:
               pipecmd+=["--color"]
           pipecmd+=["--all", "-"]
        else:
          if guess:
             if s.endswith(".z") or s.endswith(".gz") or s.endswith(".gzip"):
                  pipecmd=['gzip']
             elif s.endswith(".bz2") or s.endswith(".bzip2") or s.endswith(".bzip"):
                  pipecmd=['bzip2']
             if len(pipecmd)>0 and which(pipecmd[0]) is None:
                 die("Error: cannot find %s to decompress input file %s " % (pipecmd[0], filename))
             if len(pipecmd)>0:
                if pipecmd[0]=='gzip' and sys_params.zipper.endswith('pigz'):
                   pipecmd[0]=sys_params.zipper
                   pipecmd.extend(sys_params.zipper_opts)
                elif pipecmd[0]=='bzip2' and sys_params.zipper.endswith('pbzip2'):
                   pipecmd[0]=sys_params.zipper
                   pipecmd.extend(sys_params.zipper_opts)
          else: #not guessing, but must still check if it's a compressed file
             if use_zpacker and filename.endswith(".z"):
                pipecmd=[sys_params.zipper]
                pipecmd.extend(sys_params.zipper_opts)

          if pipecmd:
             pipecmd+=['-cd']
        if pipecmd:
           try:
              self.fsrc=open(self.fname, 'rb')
              self.popen=subprocess.Popen(pipecmd,
                    preexec_fn=subprocess_setup,
                    stdin=self.fsrc,
                    stdout=subprocess.PIPE, stderr=tophat_log, close_fds=True)
           except Exception:
              die("Error: could not open pipe "+' '.join(pipecmd)+' < '+ self.fname)
           self.file=self.popen.stdout
        else:
           self.file=open(filename)
    def close(self):
       if self.fsrc: self.fsrc.close()
       self.file.close()
       if self.popen:
           self.popen.wait()
           self.popen=None

class ZWriter:
   def __init__(self, filename, sysparams):
      self.fname=filename
      if use_zpacker:
          pipecmd=[sysparams.zipper,"-cf", "-"]
          self.ftarget=open(filename, "wb")
          try:
             self.popen=subprocess.Popen(pipecmd,
                   preexec_fn=subprocess_setup,
                   stdin=subprocess.PIPE,
                   stderr=tophat_log, stdout=self.ftarget, close_fds=True)
          except Exception:
              die("Error: could not open writer pipe "+' '.join(pipecmd)+' < '+ self.fname)
          self.file=self.popen.stdin # client writes to this end of the pipe
      else: #no compression
          self.file=open(filename, "w")
          self.ftarget=None
          self.popen=None
   def close(self):
      self.file.close()
      if self.ftarget: self.ftarget.close()
      if self.popen:
          self.popen.wait() #! required to actually flush the pipes (eek!)
          self.popen=None
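
# The two classes mirror each other; a round trip looks like this (file name
# illustrative; compression is used only when use_zpacker is set):
#   zw = ZWriter("reads.fq.z", params.system_params)
#   zw.file.write("@r1\nACGT\n+\nIIII\n")
#   zw.close()
#   zr = ZReader("reads.fq.z", params)
#   data = zr.file.read()
#   zr.close()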

# check_reads_format() examines the first few records in the user files
# to determine the file format
def check_reads_format(params, reads_files):
    #seed_len = params.read_params.seed_length
    fileformat = params.read_params.reads_format

    observed_formats = set([])
    # observed_scales = set([])
    min_seed_len = 99999
    max_seed_len = 0
    files = reads_files.split(',')

    for f_name in files:
        #try:
        zf = ZReader(f_name, params)
        #except IOError:
        #   die("Error: could not open file "+f_name)
        freader=FastxReader(zf.file, params.read_params.color, zf.fname)
        toread=4 #just sample the first 4 reads
        while toread>0:
            seqid, seqstr, seq_len, qstr = freader.nextRecord()
            if not seqid: break
            toread-=1
            if seq_len < 20:
                th_logp("Warning: found a read < 20bp in "+f_name)
            else:
                min_seed_len = min(seq_len, min_seed_len)
                max_seed_len = max(seq_len, max_seed_len)
        zf.close()
        observed_formats.add(freader.format)
    if len(observed_formats) > 1:
        die("Error: TopHat requires all reads be either FASTQ or FASTA.  Mixing formats is not supported.")
    fileformat=list(observed_formats)[0]
    #if seed_len != None:
    #    seed_len = max(seed_len, max_seed_len)
    #else:
    #    seed_len = max_seed_len
    #print >> sys.stderr, "\tmin read length: %dbp, max read length: %dbp" % (min_seed_len, max_seed_len)
    th_logp("\tformat:\t\t %s" % fileformat)
    if fileformat == "fastq":
        quality_scale = "phred33 (default)"
        if params.read_params.solexa_quals and not params.read_params.phred64_quals:
            quality_scale = "solexa33 (reads generated with GA pipeline version < 1.3)"
        elif params.read_params.phred64_quals:
            quality_scale = "phred64 (reads generated with GA pipeline version >= 1.3)"
        th_logp("\tquality scale:\t %s" % quality_scale)
    elif fileformat == "fasta":
        if params.read_params.color:
            params.read_params.integer_quals = True

    #print seed_len, format, solexa_scale
    #NOTE: seed_len will be re-evaluated later by prep_reads
    return TopHatParams.ReadParams(params.read_params.solexa_quals,
                                   params.read_params.phred64_quals,
                                   params.read_params.quals,
                                   params.read_params.integer_quals,
                                   params.read_params.color,
                                   params.read_params.library_type,
                                   #seed_len,
                                   params.read_params.seed_length,
                                   fileformat,
                                   params.read_params.mate_inner_dist,
                                   params.read_params.mate_inner_dist_std_dev,
                                   params.read_params.read_group_id,
                                   params.read_params.sample_id,
                                   params.read_params.library_id,
                                   params.read_params.description,
                                   params.read_params.seq_platform_unit,
                                   params.read_params.seq_center,
                                   params.read_params.seq_run_date,
                                   params.read_params.seq_platform)

def log_tail(logfile, lines=1):
    f=open(logfile, "r")
    f.seek(0, 2)
    fbytes= f.tell()
    size=lines
    block=-1
    while size > 0 and fbytes+block*1024  > 0:
        if (fbytes+block*1024 > 0):
            ##Seek back once more, if possible
            f.seek( block*1024, 2 )
        else:
            #Seek to the beginning
            f.seek(0, 0)
        data= f.read( 1024 )
        linesFound= data.count('\n')
        size -= linesFound
        block -= 1
    if (fbytes + block*1024 > 0):
       f.seek(block*1024, 2)
    else:
       f.seek(0,0)
    f.readline() # skip the (likely partial) first line of the block
    lastBlocks= list( f.readlines() )
    f.close()
    return "".join(lastBlocks[-lines:])

# Format a timedelta as a pretty string, including the day count when nonzero.
def formatTD(td):
    days = td.days
    hours = td.seconds // 3600
    minutes = (td.seconds % 3600) // 60
    seconds = td.seconds % 60

    if days > 0:
        return '%d days %02d:%02d:%02d' % (days, hours, minutes, seconds)
    else:
        return '%02d:%02d:%02d' % (hours, minutes, seconds)
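
# e.g. a timedelta of 3723 seconds formats as "01:02:03", and one of
# 2 days and 5 seconds as "2 days 00:00:05".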

class PrepReadsInfo:
    def __init__(self, fname, out_fname):
           self.min_len  = [0, 0]
           self.max_len  = [0, 0]
           self.in_count = [0, 0]
           self.out_count= [0, 0]
           self.kept_reads = [None, None]
           try:
             f=open(fname,"r")
             self.min_len[0]=int(f.readline().split("=")[-1])
             self.max_len[0]=int(f.readline().split("=")[-1])
             self.in_count[0]=int(f.readline().split("=")[-1])
             self.out_count[0]=int(f.readline().split("=")[-1])
             if (self.out_count[0]==0) or (self.max_len[0]<16):
               raise Exception()
             line=f.readline()
             if line and line.find("=") > 0:
                self.min_len[1]=int(line.split("=")[-1])
                self.max_len[1]=int(f.readline().split("=")[-1])
                self.in_count[1]=int(f.readline().split("=")[-1])
                self.out_count[1]=int(f.readline().split("=")[-1])
                if (self.out_count[1]==0) or (self.max_len[1]<16):
                   raise Exception()
           except Exception, e:
             die(fail_str+"Error retrieving prep_reads info.")
           sides=["left", "right"]
           for ri in (0,1):
               if self.in_count[ri]==0: break
               trashed=self.in_count[ri]-self.out_count[ri]
               self.kept_reads[ri]=out_fname.replace("%side%", sides[ri])
               th_logp("\t%5s reads: min. length=%s, max. length=%s, %s kept reads (%s discarded)" %  (sides[ri], self.min_len[ri], self.max_len[ri], self.out_count[ri], trashed))

def prep_reads_cmd(params, l_reads_list, l_quals_list=None, r_reads_list=None, r_quals_list=None, out_file=None, aux_file=None,
                                 index_file=None, filter_reads=[], hits_to_filter=[]):
  #generate a prep_reads cmd arguments
  prep_cmd = [prog_path("prep_reads")]

  prep_cmd.extend(params.cmd())

  if params.read_params.reads_format == "fastq":
      prep_cmd += ["--fastq"]
  elif params.read_params.reads_format == "fasta":
      prep_cmd += ["--fasta"]
  if hits_to_filter:
    prep_cmd += ["--flt-hits=" + ",".join(hits_to_filter)]
  if aux_file:
    prep_cmd += ["--aux-outfile="+aux_file]
  if index_file:
      prep_cmd += ["--index-outfile="+index_file] # could be a template
  if filter_reads:
    prep_cmd += ["--flt-reads=" + ",".join(filter_reads)]
  if params.sam_header:
    prep_cmd += ["--sam-header="+params.sam_header]
  if out_file:
    prep_cmd += ["--outfile="+out_file] #could be a template
  prep_cmd.append(l_reads_list)
  if l_quals_list:
        prep_cmd.append(l_quals_list)
  if r_reads_list:
    prep_cmd.append(r_reads_list)
    if r_quals_list:
        prep_cmd.append(r_quals_list)

  return prep_cmd
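
# A generated command line might look like this (paths illustrative):
#   prep_reads --fastq --aux-outfile=<run>/prep_reads.info \
#              --index-outfile=<tmp>/left_kept_reads.bam.index \
#              --sam-header=<tmp>/genome.bwt.samheader.sam \
#              --outfile=<tmp>/left_kept_reads.bam  reads_1.fq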

# Calls the prep_reads executable, which prepares an internal read library.
# The read library features reads with monotonically increasing integer IDs.
# prep_reads also filters out very low-complexity or garbage reads, as well as
# poly-A reads.
#--> returns a PrepReadsInfo structure
def prep_reads(params, l_reads_list, l_quals_list, r_reads_list, r_quals_list, prefilter_reads=[]):
    reads_suffix = ".bam"
    use_bam = True

    #if params.read_params.color:
    #   reads_suffix = ".fq"
    #   use_bam = False

    # for parallelization, we don't compress the read files
    do_use_zpacker = use_zpacker and not use_bam
    if do_use_zpacker and params.system_params.num_threads > 1:
        do_use_zpacker = False

    if do_use_zpacker: reads_suffix += ".z"

    out_suffix = "_kept_reads" + reads_suffix
    #kept_reads_filename = tmp_dir + output_name + reads_suffix

    for side in ("left", "right"):
       kept_reads_filename = tmp_dir + side + out_suffix
    if os.path.exists(kept_reads_filename):
        os.remove(kept_reads_filename)
    out_tmpl="left"
    out_fname=None
    kept_reads = None #output file handle
    if r_reads_list:
        out_tmpl="%side%"
    if use_bam:
       out_fname = tmp_dir + out_tmpl + out_suffix
    else:
      #assumed no right reads given here, only one side is being processed
      kept_reads = open(tmp_dir + out_tmpl + out_suffix, "wb")
    log_fname=logging_dir + "prep_reads.log"
    filter_log = open(log_fname,"w")

    info_file = output_dir+"prep_reads.info"
    index_file = out_fname + ".index"
    if do_use_zpacker: index_file=None

    prep_cmd=prep_reads_cmd(params, l_reads_list, l_quals_list, r_reads_list, r_quals_list,
                                       out_fname, info_file, index_file, prefilter_reads)
    shell_cmd = ' '.join(prep_cmd)
    #finally, add the compression pipe if needed
    zip_cmd=[]
    if do_use_zpacker:
       zip_cmd=[ params.system_params.zipper ]
       zip_cmd.extend(params.system_params.zipper_opts)
       zip_cmd.extend(['-c','-'])
       shell_cmd +=' | '+' '.join(zip_cmd)
    if not use_bam: shell_cmd += ' >' +kept_reads_filename
    retcode = None
    try:
        print >> run_log, shell_cmd
        if do_use_zpacker:
            filter_proc = subprocess.Popen(prep_cmd,
                                  stdout=subprocess.PIPE,
                                  stderr=filter_log)
            zip_proc=subprocess.Popen(zip_cmd,
                                  preexec_fn=subprocess_setup,
                                  stdin=filter_proc.stdout,
                                  stderr=tophat_log, stdout=kept_reads)
            filter_proc.stdout.close() #as per http://bugs.python.org/issue7678
            zip_proc.communicate()
            retcode=filter_proc.poll()
            if retcode==0:
              retcode=zip_proc.poll()
        else:
            if use_bam:
              retcode = subprocess.call(prep_cmd, stderr=filter_log)
            else:
              retcode = subprocess.call(prep_cmd,
                                 stdout=kept_reads, stderr=filter_log)
        if retcode:
            die(fail_str+"Error running 'prep_reads'\n"+log_tail(log_fname))

    except OSError, o:
        errmsg=fail_str+str(o)
        die(errmsg+"\n"+log_tail(log_fname))

    if kept_reads: kept_reads.close()
    return PrepReadsInfo(info_file, tmp_dir + out_tmpl + out_suffix)


# Call bowtie
def bowtie(params,
           bwt_idx_prefix,
           sam_header_filename,
           reads_list,
           reads_format,
           num_mismatches,
           mapped_reads,
           unmapped_reads,
           extra_output = "",
           mapping_type = _reads_vs_G,
           multihits_out = None): #only --prefilter-multihits should activate this parameter for the initial prefilter search
    start_time = datetime.now()
    bwt_idx_name = bwt_idx_prefix.split('/')[-1]
    reads_file=reads_list[0]
    readfile_basename=getFileBaseName(reads_file)

    g_mapping, t_mapping, seg_mapping = False, False, False
    if mapping_type == _reads_vs_T:
        t_mapping = True
    elif mapping_type >= _segs_vs_G:
        seg_mapping = True
    else:
        g_mapping = True

    bowtie_str = "Bowtie"
    if params.bowtie2:
        bowtie_str += "2"

    if seg_mapping:
        if not params.bowtie2:
            backup_bowtie_alignment_option = params.bowtie_alignment_option
            params.bowtie_alignment_option = "-v"

    if t_mapping:
       th_log("Mapping %s to transcriptome %s with %s %s" % (readfile_basename,
                     bwt_idx_name, bowtie_str, extra_output))
    else:
       qryname = readfile_basename
       if len(reads_list) > 1:
           bnames=[]
           for fname in reads_list:
              bnames += [getFileBaseName(fname)]
           qryname = ",".join(bnames)
       th_log("Mapping %s to genome %s with %s %s" % (qryname,
                     bwt_idx_name, bowtie_str, extra_output))
    bwt_log = open(logging_dir + 'bowtie.'+readfile_basename+'.fixmap.log', "w")
    #bwt_mapped=mapped_reads
    unmapped_reads_out=None
    if unmapped_reads:
         unmapped_reads_out=unmapped_reads+".fq"
    use_FIFO = use_BWT_FIFO and use_zpacker and unmapped_reads and params.read_params.color
    if use_FIFO:
         global unmapped_reads_fifo
         unmapped_reads_fifo=unmapped_reads+".fifo"
         if os.path.exists(unmapped_reads_fifo):
             os.remove(unmapped_reads_fifo)
         try:
             os.mkfifo(unmapped_reads_fifo)
         except OSError, o:
             die(fail_str+"Error at mkfifo("+unmapped_reads_fifo+'). '+str(o))
         unmapped_reads_out+=".z"

    # Launch Bowtie
    try:
        bowtie_cmd = [bowtie_path]
        if reads_format == "fastq":
            bowtie_cmd += ["-q"]
        elif reads_format == "fasta":
            bowtie_cmd += ["-f"]

        if params.read_params.color:
            bowtie_cmd += ["-C", "--col-keepends"]
            #if unmapped_reads:
            #   bowtie_cmd += ["--un", unmapped_reads_out]

        unzip_cmd=None
        bam_input=False
        if len(reads_list) > 0 and reads_list[0].endswith('.bam'):
           bam_input=True
           unzip_cmd=[ prog_path('bam2fastx'), "--all" ]
           if params.read_params.color:
               unzip_cmd.append("--color")
           if reads_format:
              unzip_cmd.append("--" + reads_format)
           unzip_cmd+=[reads_list[0]]

        if use_zpacker and (unzip_cmd is None):
          unzip_cmd=[ params.system_params.zipper ]
          unzip_cmd.extend(params.system_params.zipper_opts)
          unzip_cmd+=['-cd']

        fifo_pid=None
        if use_FIFO:
             unm_zipcmd=[ params.system_params.zipper ]
             unm_zipcmd.extend(params.system_params.zipper_opts)
             unm_zipcmd+=['-c']
             print >> run_log, ' '.join(unm_zipcmd)+' < '+ unmapped_reads_fifo + ' > '+ unmapped_reads_out + ' & '
             fifo_pid=os.fork()
             if fifo_pid==0:
                 def on_sig_exit(sig, func=None):
                    os._exit(os.EX_OK)
                 signal.signal(signal.SIGTERM, on_sig_exit)
                 subprocess.call(unm_zipcmd,
                                 stdin=open(unmapped_reads_fifo, "r"),
                                 stderr=tophat_log,
                                 stdout=open(unmapped_reads_out, "wb"))
                 os._exit(os.EX_OK)

        fix_map_cmd = [prog_path('fix_map_ordering')]
        if params.read_params.color:
            fix_map_cmd += ["--color"]

        if params.bowtie2:
            if t_mapping or g_mapping:
                max_penalty, min_penalty = params.bowtie2_params.mp.split(',')
                max_penalty, min_penalty = int(max_penalty), int(min_penalty)
                min_score = (max_penalty - 1) * num_mismatches
                fix_map_cmd += ["--bowtie2-min-score", str(min_score)]

        samzip_cmd=None

        if unmapped_reads or params.system_params.num_threads >= 1:
           #mapping on the genome
           #fix_map_ordering will write BAM file directly, and unmapped_reads as BAM file too
           mapped_reads += ".bam"

           fix_map_cmd += ["--sam-header", sam_header_filename, "-", mapped_reads]
           if unmapped_reads:
               unmapped_reads_out = unmapped_reads + ".bam"
               fix_map_cmd += [unmapped_reads_out]
        else:
           #mapping on segment_juncs (spliced potential junctions)
           # SAM headers could be huge
           mapped_reads += ".sam"
           fix_map_cmd += ["-"]
           if use_zpacker:
              fix_map_cmd += ["-"]
              samzip_cmd=[ params.system_params.zipper ]
              samzip_cmd.extend(params.system_params.zipper_opts)
              samzip_cmd += ["-c"]
              mapped_reads += ".z"
           else:
              fix_map_cmd += [mapped_reads]

        fix_map_cmd += ["--index-outfile", mapped_reads + ".index"]
        if not params.bowtie2:
           fix_map_cmd += ["--bowtie1"]
        if multihits_out != None:
           fix_map_cmd += ["--aux-outfile", params.preflt_data[multihits_out].multihit_reads]
           fix_map_cmd += ["--max-multihits", str(params.max_hits)]

        if t_mapping:
            max_hits = params.t_max_hits
        elif seg_mapping:
            max_hits = params.max_seg_hits
        else:
            max_hits = params.max_hits

        if num_mismatches > 3:
           num_mismatches = 3

        if params.bowtie2:
            if seg_mapping or multihits_out != None:
                # since bowtie2 does not suppress reads that map to too many places,
                # we suppress those in segment_juncs and long_spanning_reads.
                bowtie_cmd += ["-k", str(max_hits + 1)]
            else:
                bowtie_cmd += ["-k", str(max_hits)]

            bowtie2_params = params.bowtie2_params
            if seg_mapping:
                # after intensive testing,
                # the following parameters seem to work faster than Bowtie1 and as sensitive as Bowtie1,
                # but room for further improvements remains.
                bowtie_cmd += ["-N", str(min(num_mismatches, 1))]
                bowtie_cmd += ["-i", "C,2,0"]
                bowtie_cmd += ["-L", "20"]
                # bowtie_cmd += ["-L", str(min(params.segment_length, 20))]
            else:
                bowtie2_preset = ""
                if bowtie2_params.very_fast:
                    bowtie2_preset = "--very-fast"
                elif bowtie2_params.fast:
                    bowtie2_preset = "--fast"
                elif bowtie2_params.sensitive:
                    bowtie2_preset = "--sensitive"
                elif bowtie2_params.very_sensitive:
                    bowtie2_preset = "--very-sensitive"

                if bowtie2_preset != "":
                    bowtie_cmd += [bowtie2_preset]
                else:
                    bowtie_cmd += ["-D", str(bowtie2_params.D),
                                   "-R", str(bowtie2_params.R),
                                   "-N", str(bowtie2_params.N),
                                   "-L", str(bowtie2_params.L),
                                   "-i", bowtie2_params.i]

                # "--n-ceil" is not correctly parsed in Bowtie2,
                # I  (daehwan) already talked to Ben who will fix the problem.
                bowtie_cmd += [# "--n-ceil", bowtie2_params.n_ceil,
                               "--gbar", str(bowtie2_params.gbar),
                               "--mp", bowtie2_params.mp,
                               "--np", str(bowtie2_params.np),
                               "--rdg", bowtie2_params.rdg,
                               "--rfg", bowtie2_params.rfg,
                               "--score-min", bowtie2_params.score_min]

        else:
            bowtie_cmd += [params.bowtie_alignment_option, str(num_mismatches),
                           "-k", str(max_hits),
                           "-m", str(max_hits),
                           "-S"]

        bowtie_cmd += ["-p", str(params.system_params.num_threads)]

        if params.bowtie2: #always use headerless SAM file
            bowtie_cmd += ["--sam-no-hd"]
        else:
            bowtie_cmd += ["--sam-nohead"]

        if not params.bowtie2:
            if multihits_out != None:
                bowtie_cmd += ["--max", params.preflt_data[multihits_out].multihit_reads]
            else:
                bowtie_cmd += ["--max", "/dev/null"]

        if params.bowtie2:
            bowtie_cmd += ["-x"]

        bowtie_cmd += [ bwt_idx_prefix ]
        bowtie_proc=None
        shellcmd=""
        unzip_proc=None

        if multihits_out != None:
           #special prefilter bowtie run: we use prep_reads on the fly
           #in order to get multi-mapped reads to exclude later
           prep_cmd = prep_reads_cmd(params, params.preflt_data[0].seqfiles, params.preflt_data[0].qualfiles,
                                      params.preflt_data[1].seqfiles, params.preflt_data[1].qualfiles)
           prep_cmd.insert(1,"--flt-side="+str(multihits_out))
           sides=["left", "right"]
           preplog_fname=logging_dir + "prep_reads.prefilter_%s.log" % sides[multihits_out]
           prepfilter_log = open(preplog_fname,"w")
           unzip_proc = subprocess.Popen(prep_cmd,
                                stdout=subprocess.PIPE,
                                stderr=prepfilter_log)
           shellcmd=' '.join(prep_cmd) + "|"
        else:
           z_input=use_zpacker and reads_file.endswith(".z")
           if z_input:
              unzip_proc = subprocess.Popen(unzip_cmd,
                                     stdin=open(reads_file, "rb"),
                                     stderr=tophat_log, stdout=subprocess.PIPE)
              shellcmd=' '.join(unzip_cmd) + "< " +reads_file +"|"
           else:
               #must be uncompressed fastq input (unmapped reads from a previous run)
               #or a BAM file with unmapped reads
               if bam_input:
                   unzip_proc = subprocess.Popen(unzip_cmd, stderr=tophat_log, stdout=subprocess.PIPE)
                   shellcmd=' '.join(unzip_cmd) + "|"
               else:
                   bowtie_cmd += [reads_file]
                   if not unzip_proc:
                        bowtie_proc = subprocess.Popen(bowtie_cmd,
                                     stdout=subprocess.PIPE,
                                     stderr=bwt_log)
        if unzip_proc:
              #input is compressed OR prep_reads is used as a filter
              bowtie_cmd += ['-']
              bowtie_proc = subprocess.Popen(bowtie_cmd,
                                     stdin=unzip_proc.stdout,
                                     stdout=subprocess.PIPE,
                                     stderr=bwt_log)
              unzip_proc.stdout.close() # see http://bugs.python.org/issue7678

        shellcmd += ' '.join(bowtie_cmd) + '|' + ' '.join(fix_map_cmd)
        pipeline_proc = None
        fix_order_proc = None
        if samzip_cmd:
           shellcmd += "|"+ ' '.join(samzip_cmd)+" > " + mapped_reads
           fix_order_proc = subprocess.Popen(fix_map_cmd,
                                             stdin=bowtie_proc.stdout,
                                             stderr=tophat_log,
                                             stdout=subprocess.PIPE)
           bowtie_proc.stdout.close() # close our copy of the pipe (see http://bugs.python.org/issue7678)
           pipeline_proc = subprocess.Popen(samzip_cmd,
                                 preexec_fn=subprocess_setup,
                                 stdin=fix_order_proc.stdout,
                                 stderr=tophat_log,
                                 stdout=open(mapped_reads, "wb"))
           fix_order_proc.stdout.close()
        else:
           #write BAM output directly
           fix_order_proc = subprocess.Popen(fix_map_cmd,
                                          stdin=bowtie_proc.stdout,
                                          stderr=tophat_log)
                                        # stdout=open(mapped_reads, "w"))
           pipeline_proc = fix_order_proc
           bowtie_proc.stdout.close()

        # shellcmd += " > " + mapped_reads
        print >> run_log, shellcmd
        retcode = None
        if pipeline_proc:
            pipeline_proc.communicate()
            retcode = pipeline_proc.returncode
        if use_FIFO:
            if fifo_pid and not os.path.exists(unmapped_reads_out):
                try:
                  os.kill(fifo_pid, signal.SIGTERM)
                except:
                  pass
        if retcode:
            die(fail_str+"Error running:\n"+shellcmd)
    except OSError, o:
        die(fail_str+"Error: "+str(o))

    # Success
    #finish_time = datetime.now()
    #duration = finish_time - start_time
    #print >> sys.stderr, "\t\t\t[%s elapsed]" %  formatTD(duration)
    if use_FIFO:
        try:
          os.remove(unmapped_reads_fifo)
        except:
          pass
    if multihits_out != None and not os.path.exists(params.preflt_data[multihits_out].multihit_reads):
        open(params.preflt_data[multihits_out].multihit_reads, "w").close()

    if seg_mapping:
        if not params.bowtie2:
            params.bowtie_alignment_option = backup_bowtie_alignment_option

    return (mapped_reads, unmapped_reads_out)


# Retrieve a .juncs file from a GFF file by calling the gtf_juncs executable
def get_gtf_juncs(gff_annotation):
    th_log("Reading known junctions from GTF file")
    gtf_juncs_log = open(logging_dir + "gtf_juncs.log", "w")

    gff_prefix = gff_annotation.split('/')[-1].split('.')[0]

    gtf_juncs_out_name  = tmp_dir + gff_prefix + ".juncs"
    gtf_juncs_out = open(gtf_juncs_out_name, "w")

    gtf_juncs_cmd=[prog_path("gtf_juncs"), gff_annotation]
    try:
        print >> run_log, " ".join(gtf_juncs_cmd), " > "+gtf_juncs_out_name
        retcode = subprocess.call(gtf_juncs_cmd,
                                  stderr=gtf_juncs_log,
                                  stdout=gtf_juncs_out)
        # gtf_juncs exits with 1 when it finds no junctions
        if retcode == 1:
            th_logp("\tWarning: TopHat did not find any junctions in GTF file")
            return (False, gtf_juncs_out_name)
        elif retcode != 0:
            die(fail_str+"Error: GTF junction extraction failed with err ="+str(retcode))

    # gtf_juncs not found
    except OSError, o:
       errmsg=fail_str+str(o)+"\n"
       if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
           errmsg+="Error: gtf_juncs not found on this system"
       die(errmsg)
    return (True, gtf_juncs_out_name)
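
# The .juncs file produced above lists one junction per line:
#   <chrom> TAB <left> TAB <right> TAB <+/->
# with zero-based coordinates marking the last base of the left exon and the
# first base of the right exon (e.g. "chr1  1000  2000  +").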

# Call bowtie-build on the FASTA file of synthetic splice junction sequences
def build_juncs_bwt_index(is_bowtie2, external_splice_prefix, color):
    th_log("Indexing splices")
    bowtie_build_log = open(logging_dir + "bowtie_build.log", "w")

    #user_splices_out_prefix  = output_dir + "user_splices_idx"

    if is_bowtie2:
        bowtie_build_cmd = [prog_path("bowtie2-build")]
    else:
        bowtie_build_cmd = [prog_path("bowtie-build")]

    if color:
        bowtie_build_cmd += ["-C"]

    bowtie_build_cmd += [external_splice_prefix + ".fa",
                         external_splice_prefix]
    try:
        print >> run_log, " ".join(bowtie_build_cmd)
        retcode = subprocess.call(bowtie_build_cmd,
                                 stdout=bowtie_build_log)

        if retcode != 0:
            die(fail_str+"Error: Splice sequence indexing failed with err ="+ str(retcode))
    except OSError, o:
        errmsg=fail_str+str(o)+"\n"
        if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
            errmsg+="Error: bowtie-build not found on this system"
        die(errmsg)
    return external_splice_prefix

# Build a splice index from a .juncs file, suitable for use with specified read
# (or read segment) lengths
def build_juncs_index(is_bowtie2,
                      min_anchor_length,
                      max_seg_len,
                      juncs_prefix,
                      external_juncs,
                      external_insertions,
                      external_deletions,
                      external_fusions,
                      reference_fasta,
                      color):
    th_log("Retrieving sequences for splices")

    juncs_file_list = ",".join(external_juncs)
    insertions_file_list = ",".join(external_insertions)
    deletions_file_list = ",".join(external_deletions)
    fusions_file_list = ",".join(external_fusions)

    # do not use insertions and deletions in case of Bowtie2
    if is_bowtie2:
        insertions_file_list = "/dev/null"
        deletions_file_list = "/dev/null"

    juncs_db_log = open(logging_dir + "juncs_db.log", "w")

    external_splices_out_prefix  = tmp_dir + juncs_prefix
    external_splices_out_name = external_splices_out_prefix + ".fa"

    external_splices_out = open(external_splices_out_name, "w")
    # juncs_db_cmd = [bin_dir + "juncs_db",
    juncs_db_cmd = [prog_path("juncs_db"),
                    str(min_anchor_length),
                    str(max_seg_len),
                    juncs_file_list,
                    insertions_file_list,
                    deletions_file_list,
                    fusions_file_list,
                    reference_fasta]
    try:
        print >> run_log, " ".join(juncs_db_cmd) + " > " + external_splices_out_name
        retcode = subprocess.call(juncs_db_cmd,
                                 stderr=juncs_db_log,
                                 stdout=external_splices_out)

        if retcode != 0:
            die(fail_str+"Error: Splice sequence retrieval failed with err ="+str(retcode))
    # juncs_db not found
    except OSError, o:
       errmsg=fail_str+str(o)+"\n"
       if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
           errmsg+="Error: juncs_db not found on this system"
       die(errmsg)

    external_splices_out_prefix = build_juncs_bwt_index(is_bowtie2, external_splices_out_prefix, color)
    return external_splices_out_prefix

def build_idx_from_fa(is_bowtie2, fasta_fname, out_dir, color):
    """ Build a bowtie index from a FASTA file.

    Arguments:
    - `fasta_fname`: File path to FASTA file.
    - `out_dir`: Output directory to place index in. (includes os.sep)

    Returns:
    - The path to the Bowtie index.
    """
    bwt_idx_path = out_dir + os.path.splitext(os.path.basename(fasta_fname))[0]

    if is_bowtie2:
        bowtie_idx_cmd = [prog_path("bowtie2-build")]
    else:
        bowtie_idx_cmd = [prog_path("bowtie-build")]

    if color:
        bowtie_idx_cmd += ["-C"]

    bowtie_idx_cmd += [fasta_fname,
                       bwt_idx_path]
    try:
        th_log("Building Bowtie index from " + os.path.basename(fasta_fname))
        print >> run_log, " ".join(bowtie_idx_cmd)
        retcode = subprocess.call(bowtie_idx_cmd,
                                  stdout=open(os.devnull, "w"),
                                  stderr=open(os.devnull, "w"))
        if retcode != 0:
            die(fail_str + "Error: Couldn't build bowtie index with err = "
                + str(retcode))
    except OSError, o:
       errmsg=fail_str+str(o)+"\n"
       if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
           errmsg+="Error: bowtie-build not found on this system"
       die(errmsg)

    return bwt_idx_path

# Print out the sam header, embedding the user's specified library properties.
# FIXME: also needs SQ dictionary lines
def write_sam_header(read_params, sam_file):
    print >> sam_file, "@HD\tVN:1.0\tSO:coordinate"
    if read_params.read_group_id and read_params.sample_id:
        rg_str = "@RG\tID:%s\tSM:%s" % (read_params.read_group_id,
                                        read_params.sample_id)
        if read_params.library_id:
            rg_str += "\tLB:%s" % read_params.library_id
        if read_params.description:
            rg_str += "\tDS:%s" % read_params.description
        if read_params.seq_platform_unit:
            rg_str += "\tPU:%s" % read_params.seq_platform_unit
        if read_params.seq_center:
            rg_str += "\tCN:%s" % read_params.seq_center
        if read_params.mate_inner_dist:
            rg_str += "\tPI:%s" % read_params.mate_inner_dist
        if read_params.seq_run_date:
            rg_str += "\tDT:%s" % read_params.seq_run_date
        if read_params.seq_platform:
            rg_str += "\tPL:%s" % read_params.seq_platform

        print >> sam_file, rg_str
    print >> sam_file, "@PG\tID:TopHat\tVN:%s\tCL:%s" % (get_version(), run_cmd)

# Write final TopHat output, via tophat_reports and wiggles
def compile_reports(params, sam_header_filename, ref_fasta, mappings, readfiles, gff_annotation):
    th_log("Reporting output tracks")
    left_maps, right_maps = mappings
    left_reads, right_reads = readfiles
    left_maps = [x for x in left_maps if (os.path.exists(x) and os.path.getsize(x) > 25)]
    left_maps = ','.join(left_maps)

    if len(right_maps) > 0:
        right_maps = [x for x in right_maps if (os.path.exists(x) and os.path.getsize(x) > 25)]
        right_maps = ','.join(right_maps)

    log_fname = logging_dir + "reports.log"
    report_log = open(log_fname, "w")
    junctions = output_dir + "junctions.bed"
    insertions = output_dir + "insertions.bed"
    deletions = output_dir + "deletions.bed"
    accepted_hits = output_dir + "accepted_hits"
    report_cmdpath = prog_path("tophat_reports")
    fusions = output_dir + "fusions.out"
    report_cmd = [report_cmdpath]

    alignments_output_filename = tmp_dir + "accepted_hits"

    report_cmd.extend(params.cmd())
    report_cmd += ["--sam-header", sam_header_filename]
    if params.report_secondary_alignments:
        report_cmd += ["--report-secondary-alignments"]

    if params.report_discordant_pair_alignments:
        report_cmd += ["--report-discordant-pair-alignments"]

    report_cmd.extend(["--samtools="+samtools_path])

    b2_params = params.bowtie2_params
    max_penalty, min_penalty = b2_params.mp.split(',')
    report_cmd += ["--bowtie2-max-penalty", max_penalty,
                  "--bowtie2-min-penalty", min_penalty]

    report_cmd += ["--bowtie2-penalty-for-N", str(b2_params.np)]

    read_gap_open, read_gap_cont = b2_params.rdg.split(',')
    report_cmd += ["--bowtie2-read-gap-open", read_gap_open,
                  "--bowtie2-read-gap-cont", read_gap_cont]

    ref_gap_open, ref_gap_cont = b2_params.rfg.split(',')
    report_cmd += ["--bowtie2-ref-gap-open", ref_gap_open,
                  "--bowtie2-ref-gap-cont", ref_gap_cont]

    report_cmd.extend([ref_fasta,
                       junctions,
                       insertions,
                       deletions,
                       fusions,
                       alignments_output_filename,
                       left_maps,
                       left_reads])

    if len(right_maps) > 0 and right_reads:
        report_cmd.append(right_maps)
        report_cmd.append(right_reads)

    try:
        print >> run_log, " ".join(report_cmd)
        retcode = subprocess.call(report_cmd,
                                  preexec_fn=subprocess_setup,
                                  stderr=report_log)
        if retcode != 0:
              die(fail_str+"Error running "+" ".join(report_cmd)+"\n"+log_tail(log_fname))
        bam_parts = []
        for i in range(params.system_params.num_threads):
               bam_part_filename = "%s%d.bam" % (alignments_output_filename, i)
               if os.path.exists(bam_part_filename):
                  bam_parts.append(bam_part_filename)
               else:
                  break
        num_bam_parts = len(bam_parts)

        if params.report_params.sort_bam:
            pids = [0 for i in range(num_bam_parts)]
            sorted_bam_parts = ["%s%d_sorted" % (alignments_output_filename, i) for i in range(num_bam_parts)]
            #left_um_parts = ["%s%s%d_sorted" % (alignments_output_filename, i) for i in range(num_bam_parts)]
            #right_um_parts = ["%s%d_sorted" % (alignments_output_filename, i) for i in range(num_bam_parts)]
            for i in range(num_bam_parts):
                    bamsort_cmd = [samtools_path,
                                   "sort",
                                   bam_parts[i],
                                   sorted_bam_parts[i]]

                    sorted_bam_parts[i] += ".bam"
                    print >> run_log, " ".join(bamsort_cmd)
                    pid = os.fork()
                    if pid == 0:
                        subprocess.call(bamsort_cmd,
                                        stderr=open(logging_dir + "reports.samtools_sort.log%d" % i, "w"))
                        os._exit(os.EX_OK)
                    else:
                        pids[i] = pid

            for i in range(len(pids)):
                    if pids[i] > 0:
                        os.waitpid(pids[i], 0)
                        pids[i] = 0

            for bam_part in bam_parts:
                os.remove(bam_part)
            bam_parts = sorted_bam_parts[:]
        #-- endif sort_bam

        if num_bam_parts > 1:
            if params.report_params.sort_bam:
               bammerge_cmd = [samtools_path,
                    "merge","-f","-h", sam_header_filename]
               if not params.report_params.convert_bam:
                    bammerge_cmd += ["-u"]
            else: #not sorted, so just raw merge
               bammerge_cmd = [prog_path("bam_merge"), "-Q",
                     "--sam-header", sam_header_filename]

            if params.report_params.convert_bam:
               bammerge_cmd += ["%s.bam" % accepted_hits]
               bammerge_cmd += bam_parts
               print >> run_log, " ".join(bammerge_cmd)
               subprocess.call(bammerge_cmd,
                      stderr=open(logging_dir + "reports.merge_bam.log", "w"))
            else: #make .sam
               bammerge_cmd += ["-"]
               bammerge_cmd += bam_parts
               merge_proc = subprocess.Popen(bammerge_cmd,
                            stdout=subprocess.PIPE,
                            stderr=open(logging_dir + "reports.merge_bam.log", "w"))
               bam2sam_cmd = [samtools_path, "view", "-h", "-"]
               sam_proc = subprocess.Popen(bam2sam_cmd,
                              stdin=merge_proc.stdout,
                              stdout=open(accepted_hits + ".sam", "w"),
                              stderr=open(logging_dir + "accepted_hits_bam_to_sam.log", "w"))
               merge_proc.stdout.close()
               shellcmd = " ".join(bammerge_cmd) + " | " + " ".join(bam2sam_cmd)
               print >> run_log, shellcmd
               sam_proc.communicate()
               retcode = sam_proc.returncode
               if retcode:
                 die(fail_str+"Error running:\n"+shellcmd)
        else: # only one file
            os.rename(bam_parts[0], accepted_hits+".bam")
            if not params.report_params.convert_bam:
               #just convert to .sam
               bam2sam_cmd = [samtools_path, "view", "-h", accepted_hits+".bam"]
               shellcmd = " ".join(bam2sam_cmd) + " > " + accepted_hits + ".sam"
               print >> run_log, shellcmd
               r = subprocess.call(bam2sam_cmd,
                              stdout=open(accepted_hits + ".sam", "w"),
                              stderr=open(logging_dir + "accepted_hits_bam_to_sam.log", "w"))
               if r != 0:
                  die(fail_str+"Error running: "+shellcmd)
               os.remove(accepted_hits+".bam")

    except OSError, o:
          die(fail_str+"Error: "+str(o)+"\n"+log_tail(log_fname))

    try:
    # -- merge the unmapped files
      um_parts = []
      um_merged = output_dir + "unmapped.bam"
      for i in range(params.system_params.num_threads):
          left_um_file =  tmp_dir + "unmapped_left_%d.bam" % i
          right_um_file = tmp_dir + "unmapped_right_%d.bam" % i
          if bamExists_and_NonEmpty(left_um_file):
             um_parts.append(left_um_file)
          if right_reads and bamExists_and_NonEmpty(right_um_file):
             um_parts.append(right_um_file)

      if len(um_parts) > 0:
          if len(um_parts)==1:
            os.rename(um_parts[0], um_merged)
          else:
            merge_cmd=[prog_path("bam_merge"), "-Q",
              "--sam-header", sam_header_filename, um_merged]
            merge_cmd += um_parts
            print >> run_log, " ".join(merge_cmd)
            ret = subprocess.call( merge_cmd,
                                   stderr=open(logging_dir + "bam_merge_um.log", "w") )
            if ret != 0:
                die(fail_str+"Error executing: "+" ".join(merge_cmd)+"\n"+log_tail(logging_dir+"bam_merge_um.log"))
    except OSError, o:
          die(fail_str+"Error: "+str(o)+"\n"+log_tail(log_fname))

    return junctions


# Split up each read in a FASTQ file into multiple segments. Creates a FASTQ file
# for each segment. This function needs to be fixed to support mixed read-length
# inputs.
def open_output_files(prefix, num_files_prev, num_files, out_segf, extension, params):
       i = num_files_prev + 1
       while i <= num_files:
          segfname=prefix+("_seg%d" % i)+extension
          out_segf.append(ZWriter(segfname,params.system_params))
          i += 1

def split_reads(reads_filename,
                prefix,
                fasta,
                params,
                segment_length):
    #reads_file = open(reads_filename)
    zreads = ZReader(reads_filename, params, False)
    out_segfiles = []

    if fasta:
        extension = ".fa"
    else:
        extension = ".fq"
    if use_zpacker: extension += ".z"
    def convert_color_to_bp(color_seq):
        decode_dic = { 'A0':'A', 'A1':'C', 'A2':'G', 'A3':'T', 'A4':'N', 'A.':'N', 'AN':'N',
                       'C0':'C', 'C1':'A', 'C2':'T', 'C3':'G', 'C4':'N', 'C.':'N', 'CN':'N',
                       'G0':'G', 'G1':'T', 'G2':'A', 'G3':'C', 'G4':'N', 'G.':'N', 'GN':'N',
                       'T0':'T', 'T1':'G', 'T2':'C', 'T3':'A', 'T4':'N', 'T.':'N', 'TN':'N',
                       'N0':'N', 'N1':'N', 'N2':'N', 'N3':'N', 'N4':'N', 'N.':'N', 'NN':'N',
                       '.0':'N', '.1':'N', '.2':'N', '.3':'N', '.4':'N', '..':'N', '.N':'N' }

        base = color_seq[0]
        bp_seq = base
        for ch in color_seq[1:]:
            base = decode_dic[base+ch]
            bp_seq += base
        return bp_seq

    def convert_bp_to_color(bp_seq):
        encode_dic = { 'AA':'0', 'CC':'0', 'GG':'0', 'TT':'0',
                       'AC':'1', 'CA':'1', 'GT':'1', 'TG':'1',
                       'AG':'2', 'CT':'2', 'GA':'2', 'TC':'2',
                       'AT':'3', 'CG':'3', 'GC':'3', 'TA':'3',
                       'A.':'4', 'C.':'4', 'G.':'4', 'T.':'4',
                       '.A':'4', '.C':'4', '.G':'4', '.T':'4',
                       '.N':'4', 'AN':'4', 'CN':'4', 'GN':'4',
                       'TN':'4', 'NA':'4', 'NC':'4', 'NG':'4',
                       'NT':'4', 'NN':'4', 'N.':'4', '..':'4' }

        base = bp_seq[0]
        color_seq = base
        for ch in bp_seq[1:]:
            color_seq += encode_dic[base + ch]
            base = ch

        return color_seq
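    # e.g. convert_bp_to_color("TTGAA") == "T0120", the inverse of the example
    # above: each adjacent base pair is encoded as a single color digit.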

    def split_record(read_name, read_seq, read_qual, out_segf, offsets, color):
        if color:
            color_offset = 1
            read_seq_temp = convert_color_to_bp(read_seq)

            seg_num = 1
            while seg_num + 1 < len(offsets):
                if read_seq[offsets[seg_num]+1] not in ['0', '1', '2', '3']:
                    return
                seg_num += 1
        else:
            color_offset = 0

        seg_num = 0
        last_seq_offset = 0
        while seg_num + 1 < len(offsets):
            f = out_segf[seg_num].file
            seg_seq = read_seq[last_seq_offset+color_offset:offsets[seg_num + 1]+color_offset]
            print >> f, "%s|%d:%d:%d" % (read_name,last_seq_offset,seg_num, len(offsets) - 1)
            if color:
                print >> f, "%s%s" % (read_seq_temp[last_seq_offset], seg_seq)
            else:
                print >> f, seg_seq
            if not fasta:
                seg_qual = read_qual[last_seq_offset:offsets[seg_num + 1]]
                print >> f, "+"
                print >> f, seg_qual
            seg_num += 1
            last_seq_offset = offsets[seg_num]

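    # Parse the records with a small state machine; for FASTQ the states are
    # 0=name, 1=sequence, 2='+' separator, 3=qualities (FASTA uses only 0 and 1).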
    line_state = 0
    read_name = ""
    read_seq = ""
    read_quals = ""
    num_segments = 0
    offsets = []
    for line in zreads.file:
        if line.strip() == "":
            continue
        if line_state == 0:
            read_name = line.strip()
        elif line_state == 1:
            read_seq = line.strip()

            read_length = len(read_seq)
            tmp_num_segments = read_length / segment_length
            offsets = [segment_length * i for i in range(0, tmp_num_segments + 1)]
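            # e.g. (illustrative) a 60bp read with segment_length=25 gives
            # tmp_num_segments=2 and offsets=[0, 25, 50]; the 10bp remainder is
            # below the minimum, so the last offset is stretched to 60 below,
            # making the final segment 35bp long.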

            # Bowtie's minimum read length here is 20bp, so if the last segment
            # is between 20 and segment_length bp long, go ahead and write it out
            if read_length % segment_length >= min(segment_length - 2, 20):
                offsets.append(read_length)
                tmp_num_segments += 1
            else:
                offsets[-1] = read_length

            if tmp_num_segments == 1:
                offsets = [0, read_length]

            if tmp_num_segments > num_segments:
                open_output_files(prefix, num_segments, tmp_num_segments, out_segfiles, extension, params)
                num_segments = tmp_num_segments

            if fasta:
                split_record(read_name, read_seq, None, out_segfiles, offsets, params.read_params.color)
        elif line_state == 2:
            pass # the FASTQ '+' separator line carries nothing we need
        else:
            read_quals = line.strip()
            if not fasta:
                split_record(read_name, read_seq, read_quals, out_segfiles, offsets, params.read_params.color)

        line_state += 1
        if fasta:
            line_state %= 2
        else:
            line_state %= 4
    zreads.close()
    out_fnames=[]
    for zf in out_segfiles:
        zf.close()
        out_fnames.append(zf.fname)
    #return [o.fname for o in out_segfiles]
    return out_fnames

# Find possible splice junctions using the "closure search" strategy, and report
# them in closure.juncs.  Calls the executable closure_juncs
def junctions_from_closures(params,
                            sam_header_filename,
                            left_maps,
                            right_maps,
                            ref_fasta):
    th_log("Searching for junctions via mate-pair closures")


    #maps = [x for x in seg_maps if (os.path.exists(x) and os.path.getsize(x) > 0)]
    #if len(maps) == 0:
    #    return None
    slash = left_maps[0].rfind('/')
    juncs_out = ""
    if slash != -1:
        juncs_out += left_maps[0][:slash+1]
    fusions_out = juncs_out

    juncs_out += "closure.juncs"
    fusions_out += "closure.fusions"

    juncs_log = open(logging_dir + "closure.log", "w")
    juncs_cmdpath=prog_path("closure_juncs")
    juncs_cmd = [juncs_cmdpath]

    left_maps = ','.join(left_maps)
    right_maps = ','.join(right_maps)

    juncs_cmd.extend(params.cmd())
    juncs_cmd.extend(["--sam-header", sam_header_filename,
                      juncs_out,
                      fusions_out,
                      ref_fasta,
                      left_maps,
                      right_maps])
    try:
        print >> run_log, ' '.join(juncs_cmd)
        retcode = subprocess.call(juncs_cmd,
                                 stderr=juncs_log)

        # closure_juncs returned an error
        if retcode != 0:
           die(fail_str+"Error: closure-based junction search failed with err ="+str(retcode))
    # closure_juncs not found
    except OSError, o:
        if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
           th_logp(fail_str + "Error: closure_juncs not found on this system")
        die(str(o))
    return [juncs_out]

# Find possible junctions by examining coverage and split segments in the initial
# map and segment maps.  Report junctions, insertions, deletions, and fusions in
# segment.juncs, segment.insertions, segment.deletions, and segment.fusions.
# Calls the executable segment_juncs
def junctions_from_segments(params,
                            sam_header_filename,
                            left_reads,
                            left_reads_map,
                            left_seg_maps,
                            right_reads,
                            right_reads_map,
                            right_seg_maps,
                            unmapped_reads,
                            reads_format,
                            ref_fasta):
    # if left_reads_map != left_seg_maps[0]:
    th_log("Searching for junctions via segment mapping")

    if params.coverage_search == True:
        print >> sys.stderr, "\tCoverage-search algorithm is turned on, making this step very slow"
        print >> sys.stderr, "\tPlease try running TopHat again with the option (--no-coverage-search) if this step takes too much time or memory."

    out_path=getFileDir(left_seg_maps[0])
    juncs_out=out_path+"segment.juncs"
    insertions_out=out_path+"segment.insertions"
    deletions_out =out_path+"segment.deletions"
    fusions_out = out_path+"segment.fusions"

    left_maps = ','.join(left_seg_maps)
    log_fname = logging_dir + "segment_juncs.log"
    segj_log = open(log_fname, "w")
    segj_cmd = [prog_path("segment_juncs")]

    segj_cmd.extend(params.cmd())
    segj_cmd.extend(["--sam-header", sam_header_filename,
                     "--ium-reads", ",".join(unmapped_reads),
                     ref_fasta,
                     juncs_out,
                     insertions_out,
                     deletions_out,
                     fusions_out,
                     left_reads,
                     left_reads_map,
                     left_maps])
    if right_seg_maps:
        right_maps = ','.join(right_seg_maps)
        segj_cmd.extend([right_reads, right_reads_map, right_maps])
    try:
        print >> run_log, " ".join(segj_cmd)
        retcode = subprocess.call(segj_cmd,
                                 preexec_fn=subprocess_setup,
                                 stderr=segj_log)

        # segment_juncs returned an error
        if retcode != 0:
           die(fail_str+"Error: segment-based junction search failed with err ="+str(retcode)+"\n"+log_tail(log_fname))

    # segment_juncs not found
    except OSError, o:
        if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
           th_logp(fail_str + "Error: segment_juncs not found on this system")
        die(str(o))

    return [juncs_out, insertions_out, deletions_out, fusions_out]

# Joins mapped segments into full-length read alignments via the executable
# long_spanning_reads
def join_mapped_segments(params,
                         sam_header_filename,
                         reads,
                         ref_fasta,
                         possible_juncs,
                         possible_insertions,
                         possible_deletions,
                         possible_fusions,
                         contig_seg_maps,
                         spliced_seg_maps,
                         alignments_out_name):
    rn=""
    if len(contig_seg_maps)>1:
       th_log("Joining segment hits")
       rn=".segs"
    else:
       th_log("Processing bowtie hits")
    contig_seg_maps = ','.join(contig_seg_maps)

    possible_juncs = ','.join(possible_juncs)
    possible_insertions = ",".join(possible_insertions)
    possible_deletions = ",".join(possible_deletions)
    possible_fusions = ",".join(possible_fusions)

    log_fname=logging_dir + "long_spanning_reads"+rn+".log"
    align_log = open(log_fname, "w")
    align_cmd = [prog_path("long_spanning_reads")]

    align_cmd.extend(params.cmd())
    align_cmd += ["--sam-header", sam_header_filename]

    b2_params = params.bowtie2_params
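    # b2_params.mp holds bowtie2's MX,MN mismatch penalty pair (e.g. the
    # bowtie2 default "6,2"), so splitting on ',' yields max and min penalties.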
    max_penalty, min_penalty = b2_params.mp.split(',')
    align_cmd += ["--bowtie2-max-penalty", max_penalty,
                  "--bowtie2-min-penalty", min_penalty]

    align_cmd += ["--bowtie2-penalty-for-N", str(b2_params.np)]

    read_gap_open, read_gap_cont = b2_params.rdg.split(',')
    align_cmd += ["--bowtie2-read-gap-open", read_gap_open,
                  "--bowtie2-read-gap-cont", read_gap_cont]

    ref_gap_open, ref_gap_cont = b2_params.rfg.split(',')
    align_cmd += ["--bowtie2-ref-gap-open", ref_gap_open,
                  "--bowtie2-ref-gap-cont", ref_gap_cont]

    align_cmd.append(ref_fasta)
    align_cmd.extend([reads,
                      possible_juncs,
                      possible_insertions,
                      possible_deletions,
                      possible_fusions,
                      alignments_out_name,
                      contig_seg_maps])

    if spliced_seg_maps:
        spliced_seg_maps = ','.join(spliced_seg_maps)
        align_cmd.append(spliced_seg_maps)

    try:
        print >> run_log, " ".join(align_cmd)
        ret = subprocess.call(align_cmd,
                                  stderr=align_log)
        if ret:
          die(fail_str+"Error running 'long_spanning_reads':"+log_tail(log_fname))
    except OSError, o:
        die(fail_str+"Error: "+str(o))

# This class collects spliced and unspliced alignments for each of the
# left and right read files provided by the user.
class Maps:
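        """Per-read-side alignment bundle:
        - unspliced_sam: initial (unspliced) genome alignments
        - seg_maps: genome alignments of each read segment
        - unmapped_segs: unmapped reads for each segment
        - segs: the segment read files themselves
        """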
        def __init__(self,
                     unspliced_sam,
                     seg_maps,
                     unmapped_segs,
                     segs):
            self.unspliced_sam = unspliced_sam
            self.seg_maps = seg_maps
            self.unmapped_segs = unmapped_segs
            self.segs = segs

# Map2GTF stuff
def m2g_convert_coords(params, sam_header_filename, gtf_fname, reads, out_fname):
    """ajjkljlks

    Arguments:
    - `params`: TopHat parameters
    - `gtf_fname`: File name pointing to the annotation.
    - `reads`: The reads to convert coords (in Bowtie format).
    - `out_fname`: The file name pointing to the output.
    """
    m2g_cmd = [prog_path("map2gtf")]
    m2g_cmd.extend(params.cmd())
    m2g_cmd += ["--sam-header", sam_header_filename]
    m2g_cmd.append(gtf_fname)
    m2g_cmd.append(reads) #could be BAM file
    m2g_cmd.append(out_fname)
    fbasename = getFileBaseName(reads)
    m2g_log = logging_dir + "m2g_" + fbasename + ".out"
    m2g_err = logging_dir + "m2g_" + fbasename + ".err"

    try:
        th_log("Converting " + fbasename + " to genomic coordinates (map2gtf)")
        print >> run_log, " ".join(m2g_cmd) + " > " + m2g_log
        ret = subprocess.call(m2g_cmd,
                              stdout=open(m2g_log, "w"),
                              stderr=open(m2g_err, "w"))
        if ret != 0:
            die(fail_str + " Error: map2gtf returned an error")
    except OSError, o:
        err_msg = fail_str + str(o)
        die(err_msg + "\n")


def gtf_to_fasta(params, trans_gtf, genome, out_fname):
    """ Make the transcriptome from a GTF.

    Arguments:
    - `trans_gtf`:
    - `genome`:
    - `out_fname`:
    """
    # TODO: Call gtf_to_fasta
    g2f_cmd = [prog_path("gtf_to_fasta")]
    g2f_cmd.extend(params.cmd())
    g2f_cmd.append(trans_gtf)
    g2f_cmd.append(genome)
    g2f_cmd.append(out_fname)
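    # Resulting command line (hypothetical file names):
    #   gtf_to_fasta [params...] genes.gtf genome.fa transcripts.fa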

    g2f_log = logging_dir + "g2f.out"
    g2f_err = logging_dir + "g2f.err"

    try:
        print >> run_log, " ".join(g2f_cmd)+" > " + g2f_log
        ret = subprocess.call(g2f_cmd,
                              stdout = open(g2f_log, "w"),
                              stderr = open(g2f_err, "w"))
        if ret != 0:
            die(fail_str + " Error: gtf_to_fasta returned an error.")
    except OSError, o:
        err_msg = fail_str + str(o)
        die(err_msg + "\n")

def map2gtf(params, genome_sam_header_filename, ref_fasta, left_reads, right_reads):
    """ Main GTF mapping function

    Arguments:
    - `params`: The TopHat parameters.
    - `ref_fasta`: The reference genome.
    - `left_reads`: A list of reads.
    - `right_reads`: A list of reads (empty if single-end).

    """
    test_input_file(params.gff_annotation)

    # th_log("Reading in GTF file: " + params.gff_annotation)
    # transcripts = gtf_to_transcripts(params.gff_annotation)

    gtf_name = getFileBaseName(params.gff_annotation)
    m2g_bwt_idx = None
    t_out_dir = tmp_dir
    if params.transcriptome_index and not params.transcriptome_outdir:
       m2g_bwt_idx = params.transcriptome_index
       th_log("Using pre-built transcriptome index..")
    else:
       th_log("Creating transcriptome data files..")
       if params.transcriptome_outdir:
         t_out_dir=params.transcriptome_outdir+"/"
       m2g_ref_fasta = t_out_dir + gtf_name + ".fa"
       gtf_to_fasta(params, params.gff_annotation, ref_fasta, m2g_ref_fasta)
       m2g_bwt_idx = build_idx_from_fa(params.bowtie2, m2g_ref_fasta, t_out_dir, params.read_params.color)

    transcriptome_header_filename = get_index_sam_header(params, m2g_bwt_idx)

    mapped_gtf_list = []
    unmapped_gtf_list = []
    # do the initial mapping in GTF coordinates
    for reads in [left_reads, right_reads]:
        if reads is None or os.path.getsize(reads) < 25:
            continue
        fbasename = getFileBaseName(reads)
        mapped_gtf_out = tmp_dir + fbasename + ".m2g"
        #if use_zpacker:
        #    mapped_gtf_out+=".z"

        unmapped_gtf = tmp_dir + fbasename + ".m2g_um"
        #if use_BWT_FIFO:
        #    unmapped_gtf += ".z"

        (mapped_gtf_map, unmapped) = bowtie(params,
                                            m2g_bwt_idx,
                                            transcriptome_header_filename,
                                            [reads],
                                            "fastq",
                                            params.t_mismatches,
                                            mapped_gtf_out,
                                            unmapped_gtf,
                                            "", _reads_vs_T)
        mapped_gtf_list.append(mapped_gtf_map)
        unmapped_gtf_list.append(unmapped)

    bam_gtf_list = []
    for reads in mapped_gtf_list:
        fbasename = getFileBaseName(reads)
        bam_out_fname = tmp_dir + fbasename + "_converted.bam"
        m2g_convert_coords(params,
                           genome_sam_header_filename,
                           params.gff_annotation,
                           reads,
                           bam_out_fname)

        bam_gtf_list.append(bam_out_fname)
        if not params.system_params.keep_tmp:
            removeFileWithIndex(reads)

    if len(bam_gtf_list) < 2:
        bam_gtf_list.append(None)
        unmapped_gtf_list.append(None)

    return (bam_gtf_list, unmapped_gtf_list)
# end Map2GTF

def get_preflt_data(params, ri, target_reads, out_mappings, out_unmapped):
    ## extract mappings and unmapped reads from prefilter mappings and preflt_ium
    ##
    # this is accomplished by a special prep_reads usage (triggered by --flt-hits)
    out_bam = None
    #if params.read_params.color:
    #  out_unmapped += ".fq"
    #  #if use_zpacker: out_unmapped += ".z"
    #else:
    out_unmapped += ".bam"
    out_bam = out_unmapped
    # no colorspace reads
    do_use_zpacker = use_zpacker and not out_bam
    prep_cmd = prep_reads_cmd(params, params.preflt_data[ri].unmapped_reads, None,
                              None, None, # right-side mates
                              out_bam, # stdout file
                              out_mappings, # aux file (filtered mappings)
                              None, # no index for out_bam
                              [target_reads], # prefilter reads
                              [params.preflt_data[ri].mappings]) # mappings to filter
    if not out_bam: um_reads = open(out_unmapped, "wb")
    sides = ["left", "right"]
    log_fname = logging_dir + "prep_reads.from_preflt." + sides[ri] + ".log"
    filter_log = open(log_fname, "w")

    shell_cmd = " ".join(prep_cmd)
    # add the compression pipe
    zip_cmd = []
    if do_use_zpacker:
        zip_cmd = [params.system_params.zipper]
        zip_cmd.extend(params.system_params.zipper_opts)
        zip_cmd.extend(['-c', '-'])
        shell_cmd += ' | ' + ' '.join(zip_cmd)
    if not out_bam:
        shell_cmd += ' >' + out_unmapped
    retcode = 0
    try:
        print >> run_log, shell_cmd
        if do_use_zpacker:
            prep_proc = subprocess.Popen(prep_cmd,
                                         stdout=subprocess.PIPE,
                                         stderr=filter_log)
            zip_proc = subprocess.Popen(zip_cmd,
                                        preexec_fn=subprocess_setup,
                                        stdin=prep_proc.stdout,
                                        stderr=tophat_log, stdout=um_reads)
            prep_proc.stdout.close() # as per http://bugs.python.org/issue7678
            zip_proc.communicate()
            retcode = prep_proc.poll()
            if retcode == 0:
                retcode = zip_proc.poll()
        else:
            if out_bam:
                retcode = subprocess.call(prep_cmd, stderr=filter_log)
            else:
                retcode = subprocess.call(prep_cmd, stdout=um_reads,
                                          stderr=filter_log)
        if retcode:
            die(fail_str+"Error running 'prep_reads'\n"+log_tail(log_fname))

    except OSError, o:
        errmsg = fail_str + str(o)
        die(errmsg + "\n" + log_tail(log_fname))
    if not out_bam: um_reads.close()

    return (out_mappings, out_unmapped)


# The main alignment routine of TopHat.  This function executes most of the
# workflow, producing a set of candidate alignments for each cDNA fragment in a
# pair of SAM alignment files (for paired-end reads).
def spliced_alignment(params,
                      bwt_idx_prefix,
                      sam_header_filename,
                      ref_fasta,
                      read_len,
                      segment_len,
                      prepared_reads,
                      user_supplied_junctions,
                      user_supplied_insertions,
                      user_supplied_deletions):

    possible_juncs = []
    possible_juncs.extend(user_supplied_junctions)

    possible_insertions = []
    possible_insertions.extend(user_supplied_insertions)
    possible_deletions = []
    possible_deletions.extend(user_supplied_deletions)
    possible_fusions = []

    left_reads, right_reads = prepared_reads

    maps = [[], []] # maps[0] = left reads mapping data, maps[1] = right reads mapping data
    # Before anything, map the reads using Map2GTF (if using annotation)
    m2g_maps = [ None, None ] # left, right
    initial_reads = [ left_reads, right_reads ]

    if params.gff_annotation:
        (mapped_gtf_list, unmapped_gtf_list) = \
            map2gtf(params, sam_header_filename, ref_fasta, left_reads, right_reads)

        m2g_left_maps, m2g_right_maps = mapped_gtf_list
        m2g_maps = [m2g_left_maps, m2g_right_maps]
        if params.transcriptome_only or not fileExists(unmapped_gtf_list[0]):
            # The case where the user doesn't want to map to anything other
            # than the transcriptome OR we have no unmapped reads
            maps[0] = [m2g_left_maps]
            if right_reads:
                maps[1] = [m2g_right_maps]

            return maps
        # Feed the unmapped reads into spliced_alignment()
        initial_reads = unmapped_gtf_list[:]
        th_log("Resuming TopHat pipeline with unmapped reads")

        if not nonzeroFile(initial_reads[0]) and \
                (not initial_reads[1] or not nonzeroFile(initial_reads[1])):

            if m2g_maps[1]:
                return [[m2g_maps[0]], [m2g_maps[1]]]
            else:
                return [[m2g_maps[0]], []]

    max_seg_len = segment_len #this is the ref seq span on either side of the junctions
                              #to be extracted into segment_juncs.fa

    num_segs = int(read_len / segment_len)
    if (read_len % segment_len) >= min(segment_len-2, 20):
        #remainder is shorter but long enough to become a new segment
        num_segs += 1
    else:
       # the last segment is longer
       if num_segs>1: max_seg_len += (read_len % segment_len)
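    # e.g. (illustrative) read_len=75, segment_len=25 gives num_segs=3;
    # read_len=60 leaves a 10bp remainder (< 20), so num_segs stays 2 and
    # max_seg_len grows to 35.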

    if num_segs <= 1:
         th_logp("Warning: you have only one segment per read.\n\tIf the read length is greater than or equal to 45bp,\n\twe strongly recommend that you decrease --segment-length to about half the read length because TopHat will work better with multiple segments")

    # Using the num_segs value computed above,
    # decide which junction discovery strategy to use
    if num_segs < 3:
       #if params.butterfly_search != False:
       #   params.butterfly_search = True
       if params.coverage_search != False:
           params.coverage_search = True
       if num_segs == 1:
         segment_len = read_len
    else: #num_segs >= 3
        # If we have at least three segments, just use split segment search,
        # which is the most sensitive, most specific, fastest, and lightest-weight
        # strategy; unless specifically requested, disable the other junction searches.
        if params.closure_search != True:
               params.closure_search = False
        if params.coverage_search != True:
               params.coverage_search = False
        if params.butterfly_search != True:
                params.butterfly_search = False

    # Perform the first part of the TopHat work flow on the left and right
    # reads of paired ends separately - we'll use the pairing information later
    have_left_IUM = False
    for ri in (0,1):
        reads=initial_reads[ri]
        if reads == None or not nonzeroFile(reads):
            continue

        fbasename=getFileBaseName(reads)
        unspliced_out = tmp_dir + fbasename + ".mapped"
        unspliced_sam = None
        unmapped_reads = None
        #if use_zpacker: unspliced_out+=".z"
        unmapped_unspliced = tmp_dir + fbasename + "_unmapped"
        if params.prefilter_multi:
          #unmapped_unspliced += ".z"
          (unspliced_sam, unmapped_reads) = get_preflt_data(params, ri, reads, unspliced_out, unmapped_unspliced)
        else:
          # Perform the initial Bowtie mapping of the full length reads
          (unspliced_sam, unmapped_reads) = bowtie(params,
                                                   bwt_idx_prefix,
                                                   sam_header_filename,
                                                   [reads],
                                                   "fastq",
                                                   params.genome_read_mismatches,
                                                   unspliced_out,
                                                   unmapped_unspliced,
                                                   "",
                                                   _reads_vs_G)

        seg_maps = []
        unmapped_segs = []
        segs = []

        have_IUM = nonzeroFile(unmapped_reads)
        if ri==0 and have_IUM:
           have_left_IUM = True
        if num_segs > 1 and have_IUM:
            # split up the IUM reads into segments
            # unmapped_reads can be in BAM format
            read_segments = split_reads(unmapped_reads,
                                        tmp_dir + fbasename,
                                        False,
                                        params,
                                        segment_len)

            if not params.system_params.keep_tmp:
                removeFileWithIndex(unmapped_reads)

            # Map each segment file independently with Bowtie
            for i in range(len(read_segments)):
                seg = read_segments[i]
                fbasename=getFileBaseName(seg)
                seg_out =  tmp_dir + fbasename
                unmapped_seg = tmp_dir + fbasename + "_unmapped"
                extra_output = "(%d/%d)" % (i+1, len(read_segments))
                (seg_map, unmapped) = bowtie(params,
                                             bwt_idx_prefix,
                                             sam_header_filename,
                                             [seg],
                                             "fastq",
                                             params.segment_mismatches,
                                             seg_out,
                                             unmapped_seg,
                                             extra_output,
                                             _segs_vs_G)
                seg_maps.append(seg_map)
                unmapped_segs.append(unmapped)
                segs.append(seg)

            # Collect the segment maps for left and right reads together
            maps[ri] = Maps(unspliced_sam, seg_maps, unmapped_segs, segs)
        else:
            # if there's only one segment, just collect the initial map as the only
            # map to be used downstream for coverage-based junction discovery
            read_segments = [reads]
            maps[ri] = Maps(unspliced_sam, [unspliced_sam], [unmapped_reads], [unmapped_reads])

    # XXX: At this point if using M2G, have three sets of reads:
    # mapped to transcriptome, mapped to genome, and unmapped (potentially
    # spliced or poly-A tails) - hp
    unmapped_reads = []
    if maps[0]:
        left_reads_map = maps[0].unspliced_sam
        left_seg_maps = maps[0].seg_maps
        unmapped_reads = maps[0].unmapped_segs
    else:
        left_reads_map = None
        left_seg_maps = None

    if right_reads and maps[1]:
        right_reads_map = maps[1].unspliced_sam
        right_seg_maps = maps[1].seg_maps
        unmapped_reads.extend(maps[1].unmapped_segs)
    else:
        right_reads_map = None
        right_seg_maps = None

    if params.find_novel_juncs and have_left_IUM: # or params.find_novel_indels:
        # Call segment_juncs to infer a list of possible splice junctions from
        # the regions of the genome covered in the initial and segment maps
        #if params.find_novel_juncs:
        #TODO: in m2g case, we might want to pass the m2g mappings as well,
        #      or perhaps the GTF file directly
        #      -> this could improve alternative junction detection?
        juncs = junctions_from_segments(params,
                                        sam_header_filename,
                                        left_reads,
                                        left_reads_map,
                                        left_seg_maps,
                                        right_reads,
                                        right_reads_map,
                                        right_seg_maps,
                                        unmapped_reads,
                                        "fastq",
                                        ref_fasta)

        if not params.system_params.keep_tmp:
            for unmapped_seg in unmapped_reads:
                removeFileWithIndex(unmapped_seg)

        if os.path.getsize(juncs[0]) != 0:
            possible_juncs.append(juncs[0])
        if params.find_novel_indels:
            if os.path.getsize(juncs[1]) != 0:
                possible_insertions.append(juncs[1])
            if os.path.getsize(juncs[2]) != 0:
                possible_deletions.append(juncs[2])
        if params.find_novel_fusions:
            if os.path.getsize(juncs[3]) != 0:
                possible_fusions.append(juncs[3])
        # Optionally, and for paired reads only, use a closure search to
        # discover additional junctions
        if params.closure_search and left_reads and right_reads:
            juncs = junctions_from_closures(params,
                                            sam_header_filename,
                                            [maps[0].unspliced_sam, maps[0].seg_maps[-1]],
                                            [maps[1].unspliced_sam, maps[1].seg_maps[-1]],
                                            ref_fasta)
            if os.path.getsize(juncs[0]) != 0:
                possible_juncs.extend(juncs)

    if len(possible_insertions) == 0 and len(possible_deletions) == 0 and len(possible_juncs) == 0 and len(possible_fusions) == 0:
        spliced_seg_maps = None
        junc_idx_prefix = None
    else:
        junc_idx_prefix = "segment_juncs"
    if len(possible_insertions) == 0:
        possible_insertions.append(os.devnull)
        # print >> sys.stderr, "Warning: insertions database is empty!"
    if len(possible_deletions) == 0:
        possible_deletions.append(os.devnull)
        # print >> sys.stderr, "Warning: deletions database is empty!"
    if len(possible_juncs) == 0:
        possible_juncs.append(os.devnull)
        th_logp("Warning: junction database is empty!")
    if len(possible_fusions) == 0:
        possible_fusions.append(os.devnull)
    if junc_idx_prefix:
        juncs_bwt_idx = build_juncs_index(params.bowtie2,
                                          3,
                                          max_seg_len,
                                          junc_idx_prefix,
                                          possible_juncs,
                                          possible_insertions,
                                          possible_deletions,
                                          possible_fusions,
                                          ref_fasta,
                                          params.read_params.color)

        juncs_bwt_samheader = get_index_sam_header(params, juncs_bwt_idx)

    # Now map read segments (or whole IUM reads, if num_segs == 1) to the splice
    # index with Bowtie
    # for reads in [left_reads, right_reads]:
    for ri in (0,1):
        reads = initial_reads[ri]
        if not reads:
            continue

        spliced_seg_maps = []
        rfname=getFileBaseName(reads)
        rfdir=getFileDir(reads)

        m2g_map = m2g_maps[ri]
        mapped_reads = rfdir + rfname + ".candidates.bam"
        merged_map = rfdir + rfname + ".candidates_and_unspl.bam"

        if maps[ri]:
            unspl_samfile = maps[ri].unspliced_sam
        else:
            unspl_samfile = None

        have_IUM = True
        if reads == None or not nonzeroFile(reads):
            have_IUM = False

        if have_IUM:
            if junc_idx_prefix:
                i = 0
                for seg in maps[ri].segs:
                    #search each segment
                    fsegname = getFileBaseName(seg)
                    seg_out = tmp_dir + fsegname + ".to_spliced"
                    #if use_zpacker: seg_out += ".z"
                    extra_output = "(%d/%d)" % (i+1, len(maps[ri].segs))
                    (seg_map, unmapped) = bowtie(params,
                                                 tmp_dir + junc_idx_prefix,
                                                 juncs_bwt_samheader,
                                                 [seg],
                                                 "fastq",
                                                 params.segment_mismatches,
                                                 seg_out,
                                                 None,
                                                 extra_output,
                                                 _segs_vs_J)
                    spliced_seg_maps.append(seg_map)
                    i += 1

                    if not params.system_params.keep_tmp:
                        removeFileWithIndex(seg)


                # Join the contiguous and spliced segment hits into full-length
                #   read alignments
                # -- spliced mappings built from all segment mappings vs genome and junc_db
                join_mapped_segments(params,
                                     sam_header_filename,
                                     reads,
                                     ref_fasta,
                                     possible_juncs,
                                     possible_insertions,
                                     possible_deletions,
                                     possible_fusions,
                                     maps[ri].seg_maps,
                                     spliced_seg_maps,
                                     mapped_reads)

                if not params.system_params.keep_tmp:
                    for seg_map in maps[ri].seg_maps:
                        removeFileWithIndex(seg_map)

                    for spliced_seg_map in spliced_seg_maps:
                        removeFileWithIndex(spliced_seg_map)


        #if num_segs > 1 or m2g_map:
        # Merge the spliced and unspliced full length alignments into
        # a single SAM file.
        # The individual SAM files are all already sorted in
        # increasing read ID order.
        # NOTE: We also should be able to address bug #134 here, by replacing
        # contiguous alignments that poke into an intron by a small amount by
        # the correct spliced alignment.

        try:
            merge_cmd = [prog_path("bam_merge"),
                         "--index-outfile", merged_map + ".index",
                         "--sam-header", sam_header_filename,
                         merged_map]

            merge_files = []

            if unspl_samfile and \
                    bamExists_and_NonEmpty(unspl_samfile):
                merge_files += [unspl_samfile]

            if m2g_map:
                merge_files += [m2g_map]

            if os.path.exists(mapped_reads):
                merge_files += [mapped_reads]
            else:
                for bam_i in range(0, params.system_params.num_threads):
                    temp_bam = mapped_reads[:-4] + str(bam_i) + ".bam"
                    if os.path.exists(temp_bam):
                        merge_files += [temp_bam]
                    else:
                        break

            if len(merge_files) >= 2:
                merge_cmd += merge_files
                print >> run_log, " ".join(merge_cmd)
                ret = subprocess.call( merge_cmd,
                                       stderr=open(logging_dir + "bam_merge.log", "w") )

                if ret != 0:
                    die(fail_str+"Error executing: "+" ".join(merge_cmd))

                if not params.system_params.keep_tmp:
                    for merge_file in merge_files:
                        removeFileWithIndex(merge_file)

            elif len(merge_files) == 1:
                os.rename(merge_files[0], merged_map)

        except OSError, o:
            die(fail_str+"Error: "+str(o))

        maps[ri] = [merged_map]

        if not params.system_params.keep_tmp:
            if mapped_reads and os.path.exists(mapped_reads):
                 os.remove(mapped_reads)
        #else:
        #    # no segments or transcriptome mappings, so no merge is needed
        #    #  because join_mapped_segments() produced the final BAM already
        #    os.rename(mapped_reads, merged_map)
        #    maps[ri] = [merged_map]

        #if not params.system_params.keep_tmp:
        #    if unspl_samfile and os.path.exists(unspl_samfile):
        #        os.remove(unspl_samfile)

    return maps

# rough equivalent of the 'which' command to find external programs
# (current script path is tested first, then PATH envvar)
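# e.g. which("samtools") -> "/usr/local/bin/samtools" (illustrative path),
# or None if the program cannot be found.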
def which(program):
    def is_executable(fpath):
        return os.path.isfile(fpath) and os.access(fpath, os.X_OK)
    fpath, fname = os.path.split(program)
    if fpath:
        if is_executable(program):
            return program
    else:
        progpath = os.path.join(bin_dir, program)
        if is_executable(progpath):
           return progpath
        for path in os.environ["PATH"].split(os.pathsep):
           progpath = os.path.join(path, program)
           if is_executable(progpath):
              return progpath
    return None

def prog_path(program):
    progpath=which(program)
    if progpath == None:
        die("Error locating program: "+program)
    return progpath

# FIXME: this should get set during the make dist autotools phase of the build
def get_version():
   return "2.0.3"

def mlog(msg):
  print >> sys.stderr, "[DBGLOG]:"+msg

def test_input_file(filename):
    try:
        test_file = open(filename, "r")
        test_file.close()
    except IOError, o:
        die("Error: cannot open file %s" % filename)
    return

def main(argv=None):
    warnings.filterwarnings("ignore", "tmpnam is a potential security risk")

    # Initialize default parameter values
    params = TopHatParams()
    try:
        if argv is None:
            argv = sys.argv
        args = params.parse_options(argv)
        params.check()

        bwt_idx_prefix = args[0]
        left_reads_list = args[1]
        left_quals_list, right_quals_list = None, None
        if (not params.read_params.quals and len(args) > 2) or (params.read_params.quals and len(args) > 3):
            if params.read_params.mate_inner_dist == None:
                params.read_params.mate_inner_dist = 50
                #die("Error: you must set the mean inner distance between mates with -r")

            right_reads_list = args[2]
            if params.read_params.quals:
                left_quals_list = args[3]
                right_quals_list = args[4]
        else:
            right_reads_list = None
            if params.read_params.quals:
                left_quals_list = args[2]

        start_time = datetime.now()
        prepare_output_dir()
        init_logger(logging_dir + "tophat.log")

        th_logp()
        th_log("Beginning TopHat run (v"+get_version()+")")
        th_logp("-----------------------------------------------")

        global run_log
        run_log = open(logging_dir + "run.log", "w", 0)
        global run_cmd
        run_cmd = " ".join(argv)
        print >> run_log, run_cmd

        check_bowtie(params)
        check_samtools()

        # Validate all the input files, check all prereqs before committing
        # to the run
        if params.gff_annotation:
           if not os.path.exists(params.gff_annotation):
             die("Error: cannot find transcript file %s" % params.gff_annotation)
           if os.path.getsize(params.gff_annotation)<10:
             die("Error: invalid transcript file %s" % params.gff_annotation)

        if params.transcriptome_index:
           if params.gff_annotation:
               #gff file given, so transcriptome data will be written there
               gff_basename = getFileBaseName(params.gff_annotation)
               #just in case, check if it's not already there (-G/--GTF given again by mistake)
               tpath, tname = os.path.split(params.transcriptome_index)
               new_subdir=False
               if tpath in (".", "./") or not tpath:
                  if not os.path.exists(params.transcriptome_index):
                    os.makedirs(params.transcriptome_index)
                    new_subdir=True
               if new_subdir or (os.path.exists(params.transcriptome_index) and os.path.isdir(params.transcriptome_index)):
                   params.transcriptome_index = os.path.join(params.transcriptome_index, gff_basename)
               gff_out=params.transcriptome_index+".gff"
               if not (os.path.exists(gff_out) and os.path.getsize(gff_out)==os.path.getsize(params.gff_annotation)):
                  #generate the transcriptome data files
                  tpath, tname = os.path.split(params.transcriptome_index)
                  params.transcriptome_outdir=tpath
           t_gff=params.transcriptome_index+".gff"
           if params.transcriptome_outdir:
              #will create the transcriptome data files
              if not os.path.exists(params.transcriptome_outdir):
                os.makedirs(params.transcriptome_outdir)
              copy(params.gff_annotation, t_gff)
           else:
              #try to use existing transcriptome data files

              if not (os.path.exists(t_gff) and os.path.getsize(t_gff)>10):
                  die("Error: GFF transcripts file not found or invalid (%s)" % t_gff)
              check_bowtie_index(params.transcriptome_index, params.bowtie2)
           params.gff_annotation=t_gff
           #end @ transcriptome_index given

        (ref_fasta, ref_seq_dict) = check_index(bwt_idx_prefix, params.bowtie2)

        th_log("Generating SAM header for "+bwt_idx_prefix)
        # we need to provide another name for this sam header as genome and transcriptome may have the same prefix.
        sam_header_filename = get_index_sam_header(params, bwt_idx_prefix, "genome")
        params.sam_header = sam_header_filename
        #if not params.skip_check_reads:
        reads_list = left_reads_list
        if right_reads_list:
                reads_list = reads_list + "," + right_reads_list
        params.read_params = check_reads_format(params, reads_list)

        user_supplied_juncs = []
        user_supplied_insertions = []
        user_supplied_deletions = []
        user_supplied_fusions = []
        global gtf_juncs
        if params.gff_annotation and params.find_GFF_juncs:
            test_input_file(params.gff_annotation)
            (found_juncs, gtf_juncs) = get_gtf_juncs(params.gff_annotation)
            ##-- we shouldn't need these junctions in user_supplied_juncs anymore because now map2gtf does a much better job
            ## but we still need them loaded in gtf_juncs for later splice verification
            if found_juncs:
                ## and not params.gff_annotation:
                user_supplied_juncs.append(gtf_juncs)
            #else:
            #    gtf_juncs = None
        if params.raw_junctions:
            test_input_file(params.raw_junctions)
            user_supplied_juncs.append(params.raw_junctions)

        if params.raw_insertions:
            test_input_file(params.raw_insertions)
            user_supplied_insertions.append(params.raw_insertions)

        if params.raw_deletions:
            test_input_file(params.raw_deletions)
            user_supplied_deletions.append(params.raw_deletions)

        global unmapped_reads_fifo
        unmapped_reads_fifo = tmp_dir + str(os.getpid())+".bwt_unmapped.z.fifo"

        # Now start the time consuming stuff
        if params.prefilter_multi:
            sides=("left","right")
            read_lists=(left_reads_list, right_reads_list)
            qual_lists=(left_quals_list, right_quals_list)
            for ri in (0,1):
               reads_list=read_lists[ri]
               if not reads_list:
                  continue
               fmulti_ext="bam"
               if not params.bowtie2:
                 fmulti_ext="fq"
               params.preflt_data[ri].seqfiles = reads_list
               params.preflt_data[ri].qualfiles = qual_lists[ri]
               params.preflt_data[ri].multihit_reads = tmp_dir + sides[ri]+"_multimapped."+fmulti_ext
               side_imap = tmp_dir + sides[ri]+"_im"
               #if use_zpacker: side_imap+=".z"
               side_ium = tmp_dir + sides[ri]+"_ium"
               #if use_BWT_FIFO and not params.bowtie2:
               #   side_ium += ".z"
               th_log("Pre-filtering multi-mapped "+sides[ri]+" reads")
               rdlist=reads_list.split(',')
               bwt=bowtie(params, bwt_idx_prefix, sam_header_filename, rdlist,
                     params.read_params.reads_format,
                     params.genome_read_mismatches,
                     side_imap, side_ium,
                     "", _reads_vs_G,  ri )             #  multi-mapped reads will be in params.preflt_data[ri].multihit_reads
               params.preflt_data[ri].mappings = bwt[0] # initial mappings
               params.preflt_data[ri].unmapped_reads = bwt[1] # IUM reads

        th_log("Preparing reads")
        prep_info=None

        multihit_reads = []
        if params.preflt_data[0].multihit_reads:
           multihit_reads += [params.preflt_data[0].multihit_reads]
        if params.preflt_data[1].multihit_reads:
           multihit_reads += [params.preflt_data[1].multihit_reads]
        prep_info= prep_reads(params,
                         left_reads_list, left_quals_list,
                         right_reads_list, right_quals_list,
                         multihit_reads)

        min_read_len = prep_info.min_len[0]
        if prep_info.min_len[1] > 0 and min_read_len > prep_info.min_len[1]:
           min_read_len = prep_info.min_len[1]

        if min_read_len < 20:
                  th_logp("Warning: short reads (<20bp) will make TopHat quite slow and take large amount of memory because they are likely to be mapped to too many places")

        max_read_len=max(prep_info.max_len[0], prep_info.max_len[1])

        seed_len=params.read_params.seed_length
        if seed_len: #if read len was explicitly given
            seed_len = max(seed_len, min_read_len)
            #can't be smaller than minimum length observed
        else:
            seed_len = max_read_len
        params.read_params.seed_length=seed_len
        # turn off integer-quals
        if params.read_params.integer_quals:
            params.read_params.integer_quals = False

        input_reads = prep_info.kept_reads[:]
        mappings = spliced_alignment(params,
                              bwt_idx_prefix,
                              sam_header_filename,
                              ref_fasta,
                              params.read_params.seed_length,
                              params.segment_length,
                              input_reads,
                              user_supplied_juncs,
                              user_supplied_insertions,
                              user_supplied_deletions)

        compile_reports(params,
                        sam_header_filename,
                        ref_fasta,
                        mappings,
                        input_reads,
                        params.gff_annotation)

        if not params.system_params.keep_tmp:
            try:
              for m in mappings[0]:
                  os.remove(m)
              for m in input_reads:
                  if m and os.path.exists(m): os.remove(m)
              for m in mappings[1]:
                  os.remove(m)
              tmp_files = os.listdir(tmp_dir)
              for t in tmp_files:
                  os.remove(tmp_dir+t)
              os.rmdir(tmp_dir)
            except OSError, o:
              th_logp("Warning: couldn't remove all temporary files in "+tmp_dir)

        finish_time = datetime.now()
        duration = finish_time - start_time
        th_logp("-----------------------------------------------")
        th_log("Run complete: %s elapsed" %  formatTD(duration))

    except Usage, err:
        th_logp(sys.argv[0].split("/")[-1] + ": " + str(err.msg))
        th_logp("    for detailed help see http://tophat.cbcb.umd.edu/manual.html")
        return 2


if __name__ == "__main__":
    sys.exit(main())
