#!/usr/bin/perl
# -*- mode: cperl; indent-tabs-mode: nil; tab-width: 3; cperl-indent-level: 3; -*-
use strict;
use warnings;
use utf8;
use FileHandle;
use IPC::Open2;
use Digest::SHA qw(sha1_base64);
use FindBin qw($Bin);

BEGIN {
	$| = 1;
	binmode(STDIN, ':encoding(UTF-8)');
	binmode(STDOUT, ':encoding(UTF-8)');
}
use open qw( :encoding(UTF-8) :std );
use feature 'unicode_strings';

use Getopt::Long;
Getopt::Long::Configure('no_ignore_case');
my $opt_bin = '';
my $opt_fst = 0;
my $opt_verbose = 0;
my $opt_noquote_hyph = 0;
my $opt_noquote_raw = 0;
my $rop = GetOptions(
	'bin|b=s' => \$opt_bin,
	'fst|f' => \$opt_fst,
	'verbose|v' => \$opt_verbose,
	'no-quote-hyphen|H' => \$opt_noquote_hyph,
	'no-quote-raw|R' => \$opt_noquote_raw,
                    );

my @bins = ($opt_bin, '/usr/local/bin/hfst-tokenise', '/opt/local/bin/hfst-tokenise', '/usr/bin/hfst-tokenise');
my $bin = '';
foreach my $f (@bins) {
   if ($f && -x $f) {
      $bin = $f;
      last;
   }
}
if (!$bin || !-x $bin) {
   die("No usable hfst-tokenise given or found!\n");
}
if ($opt_verbose) {
   print STDERR "kal-tokenise: Using ${bin}\n";
}

my $prefix = '/usr';
my @fsts = ('', "${prefix}/share/giella/kal/tokeniser-disamb-gt-desc.pmhfst", '/usr/local/share/giella/kal/tokeniser-disamb-gt-desc.pmhfst', '/usr/share/giella/kal/tokeniser-disamb-gt-desc.pmhfst', "$Bin/../tokenisers/tokeniser-disamb-gt-desc.pmhfst");
if (defined $ARGV[0] && -s $ARGV[0]) {
   $fsts[0] = $ARGV[0];
}
my $fst = '';
foreach my $f (@fsts) {
   if ($f && -s $f && -r $f) {
      $fst = $f;
      last;
   }
}

if (!$fst || !-s $fst) {
   die("No usable tokeniser.pmhfst given or found!\n");
}
if ($opt_verbose) {
   print STDERR "kal-tokenise: Using ${fst}\n";
}

my $t = "${bin} -L ${fst} | grep --line-buffered -v -F 'Use/Hybrid'";

# Prefixes from root.lexc ca. lines 14-15
my @p = qw(AA TA);

# Main POS tags from root.lexc ca. lines 19-29
my @m = qw(N V Pali Conj Adv Interj Pron Prop Num Symbol);

# Other POS tags from root.lexc ca. lines 114-180
my @a = qw(
   Sg Du Pl
   Abs Rel Trm Abl Lok Aeq Ins Via
   Nom Akk
   Ind Int Imp Opt Cau Con Par Cont ContNeg IteCau
   1Sg 2Sg 3Sg 4Sg 1Pl 2Pl 3Pl 4Pl 1Du 2Du 3Du 4Du
   1SgO 2SgO 3SgO 4SgO 1PlO 2PlO 3PlO 4PlO 1DuO 2DuO 3DuO 4DuO
   1SgPoss 2SgPoss 3SgPoss 4SgPoss 1PlPoss 2PlPoss 3PlPoss 4PlPoss
   );

# List of tags that should block Gram/TV and Gram/IV propagation
my @v_block = qw(
   NIQAR HTR NAR GUMINAR SIMA TARIAQAR
   UTE
   SAAR SAR TIP NIRAR SURE SUGE GASUGE GASURE NASUGE NASURE TSAALI TIR TITIR QQU
   );

my $pp = join('|', @p);
my $mp = join('|', @m);
my $ap = join('|', @a);
my $vb = join('|', @v_block);
my $i = 'i';

open2(*OUT, *IN, $t) or die $!;
binmode(OUT, ':encoding(UTF-8)');
binmode(IN, ':encoding(UTF-8)');
autoflush OUT 1;
autoflush IN 1;

my %cache = ();
my $puncts = '\x{2018}-\x{201F}\x{2039}\x{203A}\x{2E42}\x{3008}-\x{300B}\x{301D}-\x{301F}”“"\'´`‘’()»«';
my $ps_double = '[\x{2018}-\x{201B}\'´`‘’]{2,}|[\x{201C}-\x{201F}\x{2E42}\x{301D}-\x{301F}”“"]+';
my $ps_single_r = '\x{2018}-\x{201B}\'´`‘’';
my $ps_single = "[$ps_single_r]+";
my $ps_parens = '[()]';
my $ps_angle = '[\x{2039}\x{203A}\x{3008}-\x{300B}»«]';

sub call_tokenizer {
   my ($in) = @_;

   # Protect tags early, to avoid that the various workarounds mangle them
   if ($in =~ m@^</?\w+.*>@) {
      return $in;
   }

   # Turn very long ALL-UPPER strings to First Upper to avoid geometric slowdown
   while ($in =~ m@(\p{Lu}{15,})@) {
      my ($w,$t) = ($1, ucfirst(lc($1)));
      $in =~ s@$w@\x{e001} $t@g;
   }

   # Turn URLs into placeholders to avoid endless loop
   my %url_cache = ();
   while ($in =~ m@((?:(ht|f)tp(s?)://)?[^\s\pZ/]+\.[a-z]{2,}/\S*)@) {
      my ($s,$u) = ($1,$1);
      utf8::encode($s); # sha1_base64() can't handle UTF-8 for some reason
      my $hash = sha1_base64($s);
      $hash =~ s/[^a-zA-Z0-9]/x/g;
      $url_cache{$u} = "URL_$hash";
      $in =~ s@\Q$u\E@URL_$hash@g;
      #print STDERR "URL $u -> $hash\n";
   }

   my @qhyphs = ();

   # Turn "quoted with hyphen suffix"-imi into placeholders
   if (!$opt_noquote_hyph) {
      # Keep suffixes in line with affixes/propernouns.lexc lexicon ZcitationsformZ
      my @qs = split(m@([$puncts]+\p{Pd}i?(?:p|mut|mit|minngaannit|minngaanniit|miit|mi|tut|mik|kkut))@, $in);
      if (scalar(@qs) > 1) {
         for (my $i=1 ; $i<scalar(@qs) ; $i+=2) {
            my ($punct,$dash,$suffix) = ($qs[$i] =~ m@([$puncts]+)(\p{Pd})(i?(?:p|mut|mit|minngaannit|minngaanniit|miit|mi|tut|mik|kkut))@);

            my @accent = ();
            my $ps = "[$puncts]+";
            if ($punct =~ m@^$ps_double$@) {
               # Match double single-quotes with actual double-quotes
               $ps = $ps_double;
            }
            elsif ($punct =~ m@^$ps_single$@) {
               # Protect things like o'Malley d'Aix l'Étudiant 4'eren and possessive 's
               while ($qs[$i-1] =~ s@(\s[\pL\pN\pM]+)([$ps_single_r])([^$ps_single_r]+)$@$1\x{E000}$3@i) {
                  push(@accent, $2);
               }
               $ps = $ps_single;
            }
            elsif ($punct =~ m@^$ps_parens$@) {
               $ps = $ps_parens;
            }
            elsif ($punct =~ m@^$ps_angle$@) {
               # Various angle-brackets
               $ps = $ps_angle;
            }

            if ($qs[$i-1] =~ m@(.*)(^|[\s.,:;?()])([\pL\pN\pM]*$ps)([\pL\pN\pM\s!].*?)$@) {
               my ($pre,$ppunct,$quote) = ($1.$2, $3, $4);
               # Skip if the quoted part contains what looks like a sentence fragment
               if ($quote !~ m@\s[\p{Ll}\pN\pM]+\.\s\p{Lu}@ && scalar(() = ($quote =~ m@(\s+)@g)) < 20) {
                  my $n = scalar(@qhyphs);
                  $qs[$i-1] =~ s@(\Q$pre\E)\Q$ppunct$quote\E$@$1 QuotedHyphen$n-$suffix @;
                  for my $ip (@accent) {
                     $quote =~ s@\x{E000}@$ip@;
                  }
                  $qs[$i] = '';
                  push(@qhyphs, ["$ppunct$quote$punct", $dash, $suffix]);
                  #print STDERR "QHYPH $qhyphs[$n]->[0] $dash $suffix\n";
               }
               else {
                  #print STDERR "REJECTED $ppunct$quote$qs[$i]\n";
               }
            }
            else {
               #print STDERR "NOMATCH $qs[$i-1] $qs[$i]\n";
            }
            for my $ip (@accent) {
               $qs[$i-1] =~ s@\x{E000}@$ip@;
            }
         }
         $in = join('', @qs);
      }
   }

   if (!$opt_noquote_raw) {
=pod
      # Turn "double-quoted without hyphen suffix" into placeholders
      while ($in =~ m@(^|\s|\pP)($ps_double)((?:\S+?\s+?){0,6}?\S[\pL\pN\pM]*?)($ps_double)(\s|\pP|$)@) {
         my ($pre,$pq,$q,$sq,$suf) = ($1,$2,$3,$4,$5);
         my $n = scalar(@qhyphs);
         $in =~ s@\Q$pq$q$sq\E@ QuotedHyphen$n @g;
         push(@qhyphs, ["$pq$q$sq", '', '']);
         #print STDERR "DQUOTE $qhyphs[$n]->[0]\n";
      }

      # Turn 'single-quoted without hyphen suffix' into placeholders
      while ($in =~ m@(^|\s|\pP)($ps_single)((?:\S+?\s+?){0,6}?\S[\pL\pN\pM]*?)($ps_single)(\s|\pP|$)@) {
         my ($pre,$pq,$q,$sq,$suf) = ($1,$2,$3,$4,$5);
         my $n = scalar(@qhyphs);
         $in =~ s@\Q$pq$q$sq\E@ QuotedHyphen$n @g;
         push(@qhyphs, ["$pq$q$sq", '', '']);
         #print STDERR "SQUOTE $qhyphs[$n]->[0]\n";
      }

      # Turn (parentheticals) into placeholders
      while ($in =~ m@(^|\s|\pP)([(])((?:\S+?\s+?){0,6}?\S[\pL\pN\pM]*?)([)])(\s|\pP|$)@) {
         my ($pre,$pq,$q,$sq,$suf) = ($1,$2,$3,$4,$5);
         my $n = scalar(@qhyphs);
         $in =~ s@\Q$pq$q$sq\E@ QuotedHyphen$n @g;
         push(@qhyphs, ["$pq$q$sq", '', '']);
         print STDERR "PQUOTE $qhyphs[$n]->[0]\n";
      }
=cut
   }

   # If the input consisted only of quotes, undo the placeholders
   if ($in =~ m@^(\s*QuotedHyphen\d+(?:-\S+)?)+\s*$@) {
      for (my $n=0 ; $n<scalar(@qhyphs) ; ++$n) {
         my ($q,$d,$s) = @{$qhyphs[$n]};
         $in =~ s@QuotedHyphen$n-@$q$d@g;
         $in =~ s@QuotedHyphen$n(\s|$)@$q$1@g;
      }
   }

   my $out = '';
   if (defined $cache{$in}) {
      $out = $cache{$in};
   }
   else {
      my $nonce = '<nonce-'.rand().'>';
      print IN "$in\n$nonce\n";
      while ($out !~ /\Q$nonce\E/) {
         $out .= <OUT>;
      }
      $out =~ s@\Q$nonce\E\s*@@sg;

      # Restore ALL-UPPER forms
      $out =~ s@\x{e001} *\n"<([^\n]+?)>"@"<\U$1>"@g;
      $out =~ s@"<\x{e001}>"\n\t"\x{e001}" \?\n"<(.+?)>"@"<\U$1>"@g;

      # Restore URLs and give them the default analysis
      while (my ($u,$h) = each(%url_cache)) {
         $out =~ s@"<\Q$h\E>"@"<$u>"@g;
         $out =~ s@\t"\Q$h\E" \?@\t"$u" Sem/Url Prop Abs\n\t"$u" URL@g;
      }

      for (my $n=0 ; $n<scalar(@qhyphs) ; ++$n) {
         my ($q,$d,$s) = @{$qhyphs[$n]};
         $q =~ s@[\s\pZ]+@\x{E020}@g;
         $out =~ s@"QuotedHyphen$n"( N [^\n]+)@"$q"$1 Heur/Quote@g;
         $out =~ s@<QuotedHyphen$n-@<$q$d@g;
         $out =~ s@<QuotedHyphen$n>@<$q>@g;
      }

      # Move prefixes after the base form
      $out =~ s@^(\s+)($pp) (".+?" )@$1$3Prefix/$2 @mg;

      # Special case +URL
      $out =~ s/"\+URL/" URL/g;

      # Remove empty lines, because they were simply spaces in the input
      # $out =~ s/\n([\s\t]*\n)+/\n/g; # HANDLE BETTER

      if ($in !~ /\s/) {
         $cache{$in} = $out;
      }
   }

   return $out;
}

sub parse_cohorts {
   my ($out) = @_;

   my @cs = ();
   my $in_c = 0;
   my $last = 0;
   # Parse hfst-tokenize's output to array of cohorts
   foreach (split(/\n+/, $out)) {
      my $f = $_;
      s/\s+$//g;

      if (/^"<.*?>"/) {
         # Start of a cohort
         if ($in_c) {
            ++$last;
         }
         $cs[$last][0] = $_;
         $cs[$last][1] = '';
         $in_c = 1;
      }
      elsif (/^\t+/ && $in_c) {
         # A reading
         $cs[$last][1] .= "$_\n";
      }
      else {
         # Neither cohort nor reading
         if ($in_c) {
            ++$last;
         }
         $cs[$last][0] = $_;
         $cs[$last][1] = '';
         if (!/^</ && $_) {
            # Not a tag, so give it a null analysis
            $cs[$last][0] = "\"<$_>\"";
            $cs[$last][1] = "\t\"$_\" ?\n";
         }
         ++$last;
         if ($f =~ /^[.]+([ \t]+)$/) {
            $cs[$last][0] = $1;
            $cs[$last][1] = '';
            ++$last;
         }
         $in_c = 0;
      }
   }

   return @cs;
}

sub analyze {
   my ($in) = @_;
   my $out = call_tokenizer($in);

   my @cs = parse_cohorts($out);

   my $did = 1;
   while ($did) {
      $did = 0;

      for (my $i=0 ; $i<scalar(@cs) ; ++$i) {
         # Skip empty lines and tags
         if (!$cs[$i]->[0] || $cs[$i]->[0] =~ /^</) {
            next;
         }

         # Work around HFST bug where abbreviations without space after are misanalyzed
         if (defined $cs[$i+1] && $cs[$i+1]->[0] =~ /^"<\.>"/ && ($cs[$i]->[1] =~ m@ Gram/Abbr\b@ || $cs[$i]->[1] =~ / \?/)) {
            my ($f) = ($cs[$i]->[0] =~ /^"<(.*)>"/);
            my $m = call_tokenizer("$f.");
            if ($m =~ /\t"/ && $m !~ / \?/ && $m =~ /^("<[^\n]*>"[^\n]*)\n(.+)$/s) {
               ($cs[$i]->[0], $cs[$i]->[1]) = ($1, $2);
               splice(@cs, $i+1, 1);
               $did = 1;
               last;
            }
         }

         # Merge sequential dots
         if ($cs[$i]->[0] =~ /^"<([.]+)>"/) {
            my $dots = $1;
            if (defined $cs[$i+1] && $cs[$i+1]->[0] =~ /^"<([.]+)>"/) {
               $dots .= $1;
               $cs[$i]->[0] =~ s/^"<[.]+>"/"<$dots>"/;
               $cs[$i]->[1] =~ s/(\s)"[.]+"/$1"$dots"/g;
               splice(@cs, $i+1, 1);
               $did = 1;
               last;
            }
         }

         if ($cs[$i]->[1] !~ / \?/) {
            next;
         }

         # Work around HFST bug by recursing un-analyzed chunks
         my ($f) = ($cs[$i]->[0] =~ /^"<(.*)>"/);
         my %heur = ();
         my $orig = '';

         my $merge_ncs = sub {
            my ($m) = @_;
            my @ncs = parse_cohorts($m);
            my $h = join(' ', sort(keys(%heur)));
            foreach my $n (@ncs) {
               if ($orig) {
                  $n->[0] =~ s/^("<.*>")/"<$orig>"/;
               }
               if ($h) {
                  $n->[1] =~ s/(\t".*"[^\n]+)/$1 $h/g;
               }
            }
            splice(@cs, $i, 1, @ncs);
            $did = 1;
         };

         while (42) {
            # If input has non-alphanumerics, try it again as-is - this catches punctuation
            if ($f =~ /[^\pL\pN\pM]/) {
               my $m = call_tokenizer($f);
               if ($m =~ /\t"[^\n]*"[^?\n]+\n/) {
                  $merge_ncs->($m);
                  last;
               }
            }

            # Split non-alphanumeric from alphanumeric and retry them separately
            if ($f =~ /^([^\pL\pN\pM]+)(.+)$/ || $f =~ /^(.+?)([^\pL\pN\pM]+)$/) {
               my ($f1,$f2) = ($1, $2);
               my $m = call_tokenizer($f1).call_tokenizer($f2);
               if ($m =~ /\t"[^\n]*"[^?\n]+\n/) {
                  $merge_ncs->($m);
                  last;
               }
            }

            # If input might be lacking a -, try with one
            if ($f =~ /^([\p{Lu}\pN\pM]+)(\P{Lu}+)$/ && $f !~ /-/) {
               $orig = $orig || $f;
               my $m = call_tokenizer("$1-$2");
               if ($m =~ /\t"[^\n]*"[^?\n]+\n/ && $m !~ /^"<.*?>".*?\n"<.*?>"/s && $m !~ /(?:^|\n)[^\n<>"]+(?:\n|$)/s) {
                  $heur{'Heur/Hyphen'} = 1;
                  $merge_ncs->($m);
                  last;
               }
            }

            # If input doesn't have lower-case letters, try it as first-upper
            if ($f =~ /^[\pL\pN]/ && $f =~ /\p{Lu}/ && $f !~ /\p{Ll}/) {
               $orig = $orig || $f;
               $f = ucfirst(lc($f));
               if ($f ne $orig) {
                  $heur{'Heur/Case'} = 1;
                  my $m = call_tokenizer($f);
                  if ($m =~ /\t"[^\n]*"[^?\n]+\n/) {
                     $merge_ncs->($m);
                     last;
                  }
                  next;
               }
            }

            # If input is first-upper, try it as lower-case
            if ($f =~ /^\p{Lu}\P{Lu}+/ && $f =~ /\p{Ll}/) {
               $heur{'Heur/Case'} = 1;
               $orig = $orig || $f;
               $f = lc($f);
               my $m = call_tokenizer($f);
               if ($m =~ /\t"[^\n]*"[^?\n]+\n/) {
                  $merge_ncs->($m);
                  last;
               }
               next;
            }

            # If input doesn't have upper-case letters, try it as first-upper
            if ($f =~ /^[\pL\pN]/ && $f =~ /\p{Ll}/ && $f !~ /\p{Lu}/) {
               $orig = $orig || $f;
               $f = ucfirst(lc($f));
               if ($f ne $orig) {
                  $heur{'Heur/Case'} = 1;
                  my $m = call_tokenizer($f);
                  if ($m =~ /\t"[^\n]*"[^?\n]+\n/) {
                     $merge_ncs->($m);
                     last;
                  }
                  # This one should not recurse
               }
            }

            last;
         }
      }
   }

   $out = '';
   foreach my $c (@cs) {
      $out .= $c->[0]."\n";
      if ($c->[1]) {
         $out .= $c->[1];
      }
   }

   return $out;
}

sub propagate_tags {
   my ($c) = @_;

   if ($c !~ m@^\s+"@ || $c !~ m@Gram/[TI]V@) {
      return $c;
   }

   my ($b, $t) = ($c =~ m@^(\s+"[^\n]+?"?[^"]*")(\s+.*)$@);
   my @ts = split(/(?= (?:[A-Z]+|$mp) )/, $t);

   my $gram = '';
   my $lastm = '';
   my $block = 0;

   my $prepend = sub {
      if ($gram && !$block && $ts[0] !~ m@ Gram/[TI]V@) {
         $ts[0] = $ts[0].' '.$gram;
      }
      $gram = '';
      $block = 1;
   };

   for (my $i=0 ; $i<scalar(@ts) ; ++$i) {
      $ts[$i] =~ s@(Gram/[TI]V) ?(.*?)(\1)@$2$3@g;

      $_ = " ${ts[$i]} ";
      if (m@ (?:$mp) @) {
         $gram = '';
         $block = 1;
      }
      elsif (m@ (?:$vb) .*?Der/vv @) {
         $prepend->();
      }
      elsif ($gram && m@ Der/vn @) {
         if ($i && $ts[$i-1] !~ m@ Gram/[TI]V@) {
            $ts[$i-1] = $ts[$i-1].' '.$gram;
            $ts[$i-1] =~ s@( Gram/(?:Pass|Refl).*?) Gram/[TI]V@$1@g;
         }
         $prepend->();
      }
      elsif (m@Der/(vn|nv|nn)@) {
         $gram = '';
         $block = 1;
      }
      if (m@ (Gram/[TI]V) @) {
         $gram = $1;
      }
   }

   return $b.join('', @ts);
}

my $out = "\"<¶>\"\n\t\"¶\" CLB\n";
my $wf = '';
while (<STDIN>) {
   s/\x{feff}//g; # Unicode BOM

   if (scalar(keys(%cache)) > 10000) {
      %cache = ();
   }

   $out = "\"<¶>\"\n\t\"¶\" CLB\n";
   if (!/^\s*$/) {
      $out = analyze($_);
   }

   my @ls = split(/\n+/, $out, -1);
   foreach (@ls) {
      $_ = propagate_tags($_);

      if ($opt_fst) {
         if (m@^"<(.+)>"@) {
            # Store wordform for later
            $wf = $1;
            print "\n";
            next;
         }
         elsif (m@^\s+"([^\n]+?"?[^"]*)"@) {
            # Turn CG back into FST, protecting baseforms with spaces
            my $b = $1;
            s/^\s+//g;
            s/"\Q$b\E"/XBASEFORMX/;
            s/\s+/+/g;
            s@^(.+)\+Prefix/($pp)@$2+$1@;
            s/XBASEFORMX/$b/;
            # Prepend previously seen wordform
            $_ = "$wf\t$_";
         }
      }
      elsif (m@^\s+"@) {
         # Mark all internal POS tags so they don't confuse the later steps
         while (s@ ($mp|$ap)( .+ (?:$mp)(?: |$))@ $i$1$2@g) {}
      }
      print "$_\n";
   }
}

if ($out ne "\"<¶>\"\n\t\"¶\" CLB\n") {
   if ($opt_fst) {
      print "¶\t¶+CLB\n";
   }
   else {
      print "\"<¶>\"\n\t\"¶\" CLB\n";
   }
}
