#!/usr/bin/perl -w -CIOE
use strict;

my $TARGET;
my $DATA="data";
my ($H,$O)=('','');

while(@ARGV)
{
    $_=shift;
    if (/^-t|--target$/)
    {
        # No longer required.
    }
    elsif (/^--data-dir$/)
    {
        @ARGV or die "$0: $_ requires an argument.\n";
        $DATA=shift;
    }
    elsif (/^-d|--debug$/)
    {
        ($H,$O)=("\e[31m","\e[0m");
    }
    elsif (/^-/)
    {
        die "$0: Unknown option: $_\n";
    }
    else
    {
        $TARGET.=" $_";
    }
}

my %tran=('latin'=>{});
my $maxlen=0;
defined $TARGET or $TARGET="latin";
$TARGET=~s/^\s*//;
$TARGET="\L$TARGET";

sub tran_one
{
    my ($l,$r,$ls,$rs)=@_;
    !defined ${$tran{$ls}}{$l}
        or die "Conflict for [$l] in $rs>$ls, it resolves to [${$tran{$ls}}{$l}] and [$r]\n";
    $r =~ s/\*//; # '*' is the carrier for combining chars and empty string
    ${$tran{$ls}}{$l}=$r;
    length($l)<$maxlen or $maxlen=length($l);
    length($r)<$maxlen or $maxlen=length($r);
}
  
sub read_config_file($)
{
    my ($f)=@_;
    undef my $SCRIPT;
    my $NOLOWER=0;
    
    unless (open F, "<utf8", $f)
    {
        warn "Can't read file: $f\n";
        return;
    }
    while(<F>)
    {
        chomp;
        s/#.*// unless s/#VERBATIM#$//;
        s/^\s+//;
        s/\s+$//;
        next if /^$/;
        
        if (/^NOLOWER/)
        {
            $NOLOWER=1;
            next;
        }
        if (/^ALLUPPER/)
        {
            $NOLOWER=2;
            next;
        }
        if (m{^SCRIPT(?:\s+|:\s*)([a-zA-Z0-9_/ -]+)$})
        {
            $SCRIPT="\L$1";
            !defined($tran{"$SCRIPT"}) or die "Script $SCRIPT defined twice, 2nd time in $f\n";
            $tran{"$SCRIPT"}={};
            next;
        }
        /(\S+)\s*(|<|>|=|\s)\s*(\S+)$/
            or die "$f: cannot parse line: [$H$_$O]\n";
        defined $SCRIPT
            or die "$f: character found before SCRIPT started\n";
        unless ($NOLOWER)
        {
            tran_one("\L$1", "\L$3", 'latin', $SCRIPT) unless $2 eq '<';
            tran_one("\L$3", "\L$1", $SCRIPT, 'latin') unless $2 eq '>';
        }
        elsif ($NOLOWER == 1)
        {
            tran_one("$1", "$3", 'latin', $SCRIPT) unless $2 eq '<';
            tran_one("$3", "$1", $SCRIPT, 'latin') unless $2 eq '>';
        }
        else # ALLUPPER
        {
            tran_one(  "$1", "$3", 'latin', $SCRIPT) unless $2 eq '<';
            tran_one("\L$3", "$1", $SCRIPT, 'latin') unless $2 eq '>';
        }
    }
    close F;
}

sub read_config
{
    my ($f)=@_;
    return read_config_file($f) if -f $f;
    unless(opendir(DIR, $f))
    {
        warn "Can't read dir: $f\n";
        return;
    }
    read_config("$f/$_") for sort grep /^[a-zA-Z0-9_-]+$/, readdir DIR;
    closedir DIR;
}

read_config($DATA);

$tran{$TARGET}
  or die "Unknown script \"$TARGET\".  Valid ones:\n".join("\n",sort keys %tran)."\n";

sub tran_shift($)
{
    my ($t)=@_;
    my ($l,$r,$lc);
        
    for(my $len=$maxlen;$len;$len--)
    {
        # Slooow, but lowercasing/consuming can be very tricky because some
        # characters like ß expand.

        # Because of glibc not supporting lowercasing some scripts yet,
        # we need to try uppercase first manually.
        $l=substr($_,0,$len);
        if (defined ($r=${$t}{$l}))
        {
            substr($_,0,$len)='';
            print $r;
            return 1;
        }

        $lc=lc($l);
        next unless ($lc ne $l);  #input was not in lowercase

        if (defined ($r=${$t}{$lc}))
        {
            substr($_,0,$len)='';
            $r=/^\p{IsLower}/? ucfirst($r) : uc($r);  #UPPER vs Title case
            print $r;
            return 1;
        }
    }
    return 0;
}

while(<>)
{
    #s/\x{0130}/I\x{0307}/g; # The only character which expands when lowercasing.
    while($_ ne '')
    {
        next if tran_shift($tran{$TARGET});
        print $H,substr($_,0,1),$O;
        substr($_,0,1)='';
    }
}
