#!/bin/env perl

=pod

=head1 NAME

calcdatarange - calculate data range and produce new 'chromosomes' field

=head1 SYNOPSIS

  cat data.txt | calcdatarange -karyotype file.txt [-padding 1e6] [-segments] [-breaks]

=head1 DESCRIPTION

Generates a list of intervals that contain data points in the provided file. 

=head1 INPUT 

The input format is

  chr start end

Any other fields are disregarded. The chromosome names must be the
same as provided in the karyotype file.

  # data.txt
  hs2 158957620 159057620
  hs2 159048074 159148074
  hs7 129578309 129678309 
  hs7 129655696 129755696
  hs19 9299916 9399916
  ...

=head1 OPTIONS

=head1 -karyotype FILE

You must provide the karyotype file, which is necessary to define the
extent of the chromosomes. The chromosome names in your input file
must match those in the karyotype file.

The default is human hg19 karyotype in

  data/karyotype/karyotype.human.hg19.txt

=head2 -padding INT

Each data interval is padded by the value provided via C<-padding>. Overlapping padded intervals are combined.

=head2 -segments

Generate a value for Circos' C<chromosomes> field which defines ideogram segments where data is found.

  chromosomes = hs1[hs1-0]:7.499806-10.157563;hs1[hs1-1]:20.749674-23.578635 ...

=head2 -breaks

Generates a value for Circos' C<chromosomes_breaks> field which defines ideogram crop regions, where data is not found.

=head2 -chromosomes_units INT

The value of C<chromosomes_units> for your image. This is usually set to 1000000, which is this script's default.

=head1 HISTORY

=over

=item 20 Nov 2012

Expanded documentation.

-segments is now default if neither -segments nor -breaks is provided

Added chromosomes_units.

=back 

=head1 AUTHOR

Martin Krzywinski

=head1 CONTACT

Martin Krzywinski
Genome Sciences Center
BC Cancer Research Center
100-570 W 7th Ave
Vancouver BC V5Z 4S6

mkweb.bcgsc.ca
martink@bcgsc.ca

=cut

use strict;
use warnings FATAL=>"all";

use Carp;
use Config::General;
use Cwd qw(getcwd abs_path);
use Data::Dumper;
use File::Basename;
use FindBin;
use Getopt::Long;
use Math::VecStat qw(sum min max average);
use Pod::Usage;
use Time::HiRes qw(gettimeofday tv_interval);
use Storable;
use lib "$FindBin::RealBin";
use lib "$FindBin::RealBin/../lib";
use lib "$FindBin::RealBin/lib";

our (%OPT,%CONF,$conf);
our @COMMAND_LINE = ("karyotype=s",
										 "breaks",
										 "segments",
										 "chromosomes_units=i",
										 "padding=f",
										 "configfile=s",
										 "help",
										 "man",
										 "debug:i");
our $VERSION = 0.01;

# common and custom module imports below
#use Regexp::Common;
#use IO::File;
#use List::Util;
#use List::MoreUtils;
use Set::IntSpan;
use Math::Round qw(nearest_ceil nearest_floor);
#use Statistics::Descriptive;

# read and parse configuration file
_parse_config();
#_dump_config();

# read the karyotype and store the span of each chromosome
my $k = read_k($CONF{karyotype});

my $data_range;
# read all the data spans and construct their union
# on each chromosome
#
# pad the data span, if -padding is used
while(<>) {
	chomp;
	my ($chr,$start,$end) = split;
	next unless defined $end;
	my $set = Set::IntSpan->new("$start-$end");
	$data_range->{$chr} ||= Set::IntSpan->new();
	if($CONF{padding}) {
		$set = Set::IntSpan->new(sprintf("%d-%d",
																		$set->min-$CONF{padding},
																		$set->max+$CONF{padding}));
	}
	printdebug(1,$chr,$set->min,$set->max);
	$data_range->{$chr}->U( $set );
}

if($CONF{segments}) {
	my $circos_segments;
	for my $chr (sort {$k->{$a}{idx} <=> $k->{$b}{idx}} keys %$data_range) {
		printdebug(1,"calculating segments",$chr);
		my $id=0;
		for my $set ($data_range->{$chr}->sets) {
			my $chrset       = $k->{$chr}{set}->intersect($set);
			my ($start,$end) = ($chrset->min,$chrset->max);
			push @$circos_segments, {range=>sprintf("%f-%f",
																							$start/$CONF{chromosomes_units},
																							$end/$CONF{chromosomes_units}),
															 chr=>$chr,
															 tag=>"$chr-$id"};
			$id++;
		}
	}
	printinfo("chromosomes = " . join(";", map { sprintf("%s[%s]:%s",$_->{chr},$_->{tag},$_->{range}) } @$circos_segments));
}

if($CONF{breaks}) {
	my ($data_breaks,$circos_breaks);
	# for each chromosome in the karyotype, determine
	# parts of the chromosome which are not covered by data spans
	for my $chr (sort {$k->{$a}{idx} <=> $k->{$b}{idx}} keys %$k) {
		printdebug(1,"calculating breaks",$chr);
		if($data_range->{lc $chr}) {
			$data_breaks->{$chr} = $k->{$chr}{set}->diff($data_range->{lc $chr});
			for my $set ($data_breaks->{$chr}->spans) {
				my ($start,$end) = @$set;
				push @{$circos_breaks->{$chr}}, sprintf("%f-%f",
																								$start/$CONF{chromosomes_units},
																								$end/$CONF{chromosomes_units});
			}
		} else {
			push @{$circos_breaks->{$chr}}, "(-)";
		}
	}
	printinfo("chromosomes_breaks = " . join(";",(map { sprintf("-%s:%s", $_, join(",",@{$circos_breaks->{$_}})) } sort keys %$circos_breaks)));
}

sub read_k {
	my $file = shift;
	open(F,$file);
	my $k;
	my $idx = 0;
	while(<F>) {
		chomp;
		next unless /^chr/;
		my @tok = split;
		my ($chr,$start,$end) = @tok[2,4,5];
		my $set = Set::IntSpan->new("$start-$end");
		$k->{$chr} = {set=>$set,idx=>$idx++};
		printdebug(1,"k",$chr,$set->run_list);
	}
	return $k;
}

sub validateconfiguration {
	$CONF{segments}            = 1 if ! $CONF{segments} && ! $CONF{breaks};
	$CONF{padding}           ||= 1e6;
	$CONF{chromosomes_units} ||= 1e6;
}

# HOUSEKEEPING ###############################################################

sub _dump_config {
    printdumper(\%OPT,\%CONF);
}

sub _parse_config {
  my $dump_debug_level = 3;
  GetOptions(\%OPT,@COMMAND_LINE);
  pod2usage() if $OPT{help};
  pod2usage(-verbose=>2) if $OPT{man};
  loadconfiguration($OPT{configfile});
  populateconfiguration(); # copy command line options to config hash
  validateconfiguration(); 
  if(defined $CONF{debug} && $CONF{debug} == $dump_debug_level) {
    $Data::Dumper::Indent    = 2;
    $Data::Dumper::Quotekeys = 0;
    $Data::Dumper::Terse     = 0;
    $Data::Dumper::Sortkeys  = 1;
    $Data::Dumper::Varname = "OPT";
    printdumper(\%OPT);
    $Data::Dumper::Varname = "CONF";
    printdumper(\%CONF);
    exit;
  }
}

sub populateconfiguration {
  for my $var (keys %OPT) {
    $CONF{$var} = $OPT{$var};
  }
  repopulateconfiguration(\%CONF);
}

sub repopulateconfiguration {
  my $root     = shift;
  return unless ref($root) eq "HASH";
  for my $key (keys %$root) {
      my $value = $root->{$key};
      if(ref($value) eq "HASH") {
	  repopulateconfiguration($value);
      } elsif (ref($value) eq "ARRAY") {
	  for my $item (@$value) {
	      repopulateconfiguration($item);
	  }
      } elsif(defined $value) {
	  while($value =~ /__([^_].+?)__/g) {
	      my $source = "__" . $1 . "__";
	      my $target = eval $1;
	      $value =~ s/\Q$source\E/$target/g;
	  }
	  $root->{$key} = $value;
      }
  }
}

################################################################
#
#

sub loadconfiguration {
  my $file = shift;
  if(defined $file) {
    if(-e $file && -r _) {
      # provided configuration file exists and can be read
      $file = abs_path($file);
    } else {
      confess "The configuration file [$file] passed with -configfile does not exist or cannot be read.";
    }
  } else {
    # otherwise, try to automatically find a configuration file
    my ($scriptname,$path,$suffix) = fileparse($0);
    my $cwd     = getcwd();
    my $bindir  = $FindBin::RealBin;
    my $userdir = $ENV{HOME};
    my @candidate_files = (
	"$cwd/$scriptname.conf",
	"$cwd/etc/$scriptname.conf",
	"$cwd/../etc/$scriptname.conf",
	"$bindir/$scriptname.conf",
	"$bindir/etc/$scriptname.conf",
	"$bindir/../etc/$scriptname.conf",
	"$userdir/.$scriptname.conf",
	);
    my @additional_files = (
	
	);
    for my $candidate_file (@additional_files,@candidate_files) {
	#printinfo("configsearch",$candidate_file);
	if(-e $candidate_file && -r _) {
	    $file = $candidate_file;
	    #printinfo("configfound",$candidate_file);
	    last;
	}
    }
  }
  if(defined $file) {
    $OPT{configfile} = $file;
    $conf = new Config::General(
	-ConfigFile=>$file,
	-IncludeRelative=>1,
	-IncludeAgain=>1,
	-ExtendedAccess=>1,
	-AllowMultiOptions=>"yes",
	-LowerCaseNames=>1,
	-AutoTrue=>1
	);
    %CONF = $conf->getall;
  }
}

sub printdebug {
    my ($level,@msg) = @_;
    my $prefix = "debug";
    if(defined $CONF{debug} && $CONF{debug} >= $level) {
	printinfo(sprintf("%s[%d]",$prefix,$level),@msg);
    }
}

sub printinfo {
    print join(" ",@_),"\n";
}

sub printdumper {
    print Dumper(@_);
}

