[ Index ]

PHP Cross Reference of Unnamed Project

title

Body

[close]

/se3-unattended/var/se3/unattended/install/linuxaux/opt/perl/lib/5.10.0/Unicode/ -> Collate.pm (source)

   1  package Unicode::Collate;
   2  
   3  BEGIN {
   4      unless ("A" eq pack('U', 0x41)) {
   5      die "Unicode::Collate cannot stringify a Unicode code point\n";
   6      }
   7  }
   8  
   9  use 5.006;
  10  use strict;
  11  use warnings;
  12  use Carp;
  13  use File::Spec;
  14  
  15  no warnings 'utf8';
  16  
  17  our $VERSION = '0.52';
  18  our $PACKAGE = __PACKAGE__;
  19  
  20  my @Path = qw(Unicode Collate);
  21  my $KeyFile = "allkeys.txt";
  22  
  23  # Perl's boolean
  24  use constant TRUE  => 1;
  25  use constant FALSE => "";
  26  use constant NOMATCHPOS => -1;
  27  
  28  # A coderef to get combining class imported from Unicode::Normalize
  29  # (i.e. \&Unicode::Normalize::getCombinClass).
  30  # This is also used as a HAS_UNICODE_NORMALIZE flag.
  31  my $CVgetCombinClass;
  32  
  33  # Supported Levels
  34  use constant MinLevel => 1;
  35  use constant MaxLevel => 4;
  36  
  37  # Minimum weights at level 2 and 3, respectively
  38  use constant Min2Wt => 0x20;
  39  use constant Min3Wt => 0x02;
  40  
  41  # Shifted weight at 4th level
  42  use constant Shift4Wt => 0xFFFF;
  43  
  44  # A boolean for Variable and 16-bit weights at 4 levels of Collation Element
  45  # PROBLEM: The Default Unicode Collation Element Table
  46  # has weights over 0xFFFF at the 4th level.
  47  # The tie-breaking in the variable weights
  48  # other than "shift" (as well as "shift-trimmed") is unreliable.
  49  use constant VCE_TEMPLATE => 'Cn4';
  50  
  51  # A sort key: 16-bit weights
  52  # See also the PROBLEM on VCE_TEMPLATE above.
  53  use constant KEY_TEMPLATE => 'n*';
  54  
  55  # Level separator in a sort key:
  56  # i.e. pack(KEY_TEMPLATE, 0)
  57  use constant LEVEL_SEP => "\0\0";
  58  
  59  # As Unicode code point separator for hash keys.
  60  # A joined code point string (denoted by JCPS below)
  61  # like "65;768" is used for internal processing
  62  # instead of Perl's Unicode string like "\x41\x{300}",
  63  # as the native code point is different from the Unicode code point
  64  # on EBCDIC platform.
  65  # This character must not be included in any stringified
  66  # representation of an integer.
  67  use constant CODE_SEP => ';';
  68  
  69  # boolean values of variable weights
  70  use constant NON_VAR => 0; # Non-Variable character
  71  use constant VAR     => 1; # Variable character
  72  
  73  # specific code points
  74  use constant Hangul_LBase  => 0x1100;
  75  use constant Hangul_LIni   => 0x1100;
  76  use constant Hangul_LFin   => 0x1159;
  77  use constant Hangul_LFill  => 0x115F;
  78  use constant Hangul_VBase  => 0x1161;
  79  use constant Hangul_VIni   => 0x1160; # from Vowel Filler
  80  use constant Hangul_VFin   => 0x11A2;
  81  use constant Hangul_TBase  => 0x11A7; # from "no-final" codepoint
  82  use constant Hangul_TIni   => 0x11A8;
  83  use constant Hangul_TFin   => 0x11F9;
  84  use constant Hangul_TCount => 28;
  85  use constant Hangul_NCount => 588;
  86  use constant Hangul_SBase  => 0xAC00;
  87  use constant Hangul_SIni   => 0xAC00;
  88  use constant Hangul_SFin   => 0xD7A3;
  89  use constant CJK_UidIni    => 0x4E00;
  90  use constant CJK_UidFin    => 0x9FA5;
  91  use constant CJK_UidF41    => 0x9FBB;
  92  use constant CJK_ExtAIni   => 0x3400;
  93  use constant CJK_ExtAFin   => 0x4DB5;
  94  use constant CJK_ExtBIni   => 0x20000;
  95  use constant CJK_ExtBFin   => 0x2A6D6;
  96  use constant BMP_Max       => 0xFFFF;
  97  
  98  # Logical_Order_Exception in PropList.txt
  99  my $DefaultRearrange = [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ];
 100  
 101  sub UCA_Version { "14" }
 102  
 103  sub Base_Unicode_Version { "4.1.0" }
 104  
 105  ######
 106  
 107  sub pack_U {
 108      return pack('U*', @_);
 109  }
 110  
 111  sub unpack_U {
 112      return unpack('U*', shift(@_).pack('U*'));
 113  }
 114  
 115  ######
 116  
 117  my (%VariableOK);
 118  @VariableOK{ qw/
 119      blanked  non-ignorable  shifted  shift-trimmed
 120    / } = (); # keys lowercased
 121  
 122  our @ChangeOK = qw/
 123      alternate backwards level normalization rearrange
 124      katakana_before_hiragana upper_before_lower
 125      overrideHangul overrideCJK preprocess UCA_Version
 126      hangul_terminator variable
 127    /;
 128  
 129  our @ChangeNG = qw/
 130      entry mapping table maxlength
 131      ignoreChar ignoreName undefChar undefName variableTable
 132      versionTable alternateTable backwardsTable forwardsTable rearrangeTable
 133      derivCode normCode rearrangeHash
 134      backwardsFlag
 135    /;
 136  # The hash key 'ignored' is deleted at v 0.21.
 137  # The hash key 'isShift' is deleted at v 0.23.
 138  # The hash key 'combining' is deleted at v 0.24.
 139  # The hash key 'entries' is deleted at v 0.30.
 140  # The hash key 'L3_ignorable' is deleted at v 0.40.
 141  
 142  sub version {
 143      my $self = shift;
 144      return $self->{versionTable} || 'unknown';
 145  }
 146  
 147  my (%ChangeOK, %ChangeNG);
 148  @ChangeOK{ @ChangeOK } = ();
 149  @ChangeNG{ @ChangeNG } = ();
 150  
 151  sub change {
 152      my $self = shift;
 153      my %hash = @_;
 154      my %old;
 155      if (exists $hash{variable} && exists $hash{alternate}) {
 156      delete $hash{alternate};
 157      }
 158      elsif (!exists $hash{variable} && exists $hash{alternate}) {
 159      $hash{variable} = $hash{alternate};
 160      }
 161      foreach my $k (keys %hash) {
 162      if (exists $ChangeOK{$k}) {
 163          $old{$k} = $self->{$k};
 164          $self->{$k} = $hash{$k};
 165      }
 166      elsif (exists $ChangeNG{$k}) {
 167          croak "change of $k via change() is not allowed!";
 168      }
 169      # else => ignored
 170      }
 171      $self->checkCollator();
 172      return wantarray ? %old : $self;
 173  }
 174  
 175  sub _checkLevel {
 176      my $level = shift;
 177      my $key   = shift; # 'level' or 'backwards'
 178      MinLevel <= $level or croak sprintf
 179      "Illegal level %d (in value for key '%s') lower than %d.",
 180          $level, $key, MinLevel;
 181      $level <= MaxLevel or croak sprintf
 182      "Unsupported level %d (in value for key '%s') higher than %d.",
 183          $level, $key, MaxLevel;
 184  }
 185  
 186  my %DerivCode = (
 187      8 => \&_derivCE_8,
 188      9 => \&_derivCE_9,
 189     11 => \&_derivCE_9, # 11 == 9
 190     14 => \&_derivCE_14,
 191  );
 192  
 193  sub checkCollator {
 194      my $self = shift;
 195      _checkLevel($self->{level}, "level");
 196  
 197      $self->{derivCode} = $DerivCode{ $self->{UCA_Version} }
 198      or croak "Illegal UCA version (passed $self->{UCA_Version}).";
 199  
 200      $self->{variable} ||= $self->{alternate} || $self->{variableTable} ||
 201                  $self->{alternateTable} || 'shifted';
 202      $self->{variable} = $self->{alternate} = lc($self->{variable});
 203      exists $VariableOK{ $self->{variable} }
 204      or croak "$PACKAGE unknown variable parameter name: $self->{variable}";
 205  
 206      if (! defined $self->{backwards}) {
 207      $self->{backwardsFlag} = 0;
 208      }
 209      elsif (! ref $self->{backwards}) {
 210      _checkLevel($self->{backwards}, "backwards");
 211      $self->{backwardsFlag} = 1 << $self->{backwards};
 212      }
 213      else {
 214      my %level;
 215      $self->{backwardsFlag} = 0;
 216      for my $b (@{ $self->{backwards} }) {
 217          _checkLevel($b, "backwards");
 218          $level{$b} = 1;
 219      }
 220      for my $v (sort keys %level) {
 221          $self->{backwardsFlag} += 1 << $v;
 222      }
 223      }
 224  
 225      defined $self->{rearrange} or $self->{rearrange} = [];
 226      ref $self->{rearrange}
 227      or croak "$PACKAGE: list for rearrangement must be store in ARRAYREF";
 228  
 229      # keys of $self->{rearrangeHash} are $self->{rearrange}.
 230      $self->{rearrangeHash} = undef;
 231  
 232      if (@{ $self->{rearrange} }) {
 233      @{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = ();
 234      }
 235  
 236      $self->{normCode} = undef;
 237  
 238      if (defined $self->{normalization}) {
 239      eval { require Unicode::Normalize };
 240      $@ and croak "Unicode::Normalize is required to normalize strings";
 241  
 242      $CVgetCombinClass ||= \&Unicode::Normalize::getCombinClass;
 243  
 244      if ($self->{normalization} =~ /^(?:NF)D\z/) { # tweak for default
 245          $self->{normCode} = \&Unicode::Normalize::NFD;
 246      }
 247      elsif ($self->{normalization} ne 'prenormalized') {
 248          my $norm = $self->{normalization};
 249          $self->{normCode} = sub {
 250          Unicode::Normalize::normalize($norm, shift);
 251          };
 252          eval { $self->{normCode}->("") }; # try
 253          $@ and croak "$PACKAGE unknown normalization form name: $norm";
 254      }
 255      }
 256      return;
 257  }
 258  
 259  sub new
 260  {
 261      my $class = shift;
 262      my $self = bless { @_ }, $class;
 263  
 264      # If undef is passed explicitly, no file is read.
 265      $self->{table} = $KeyFile if ! exists $self->{table};
 266      $self->read_table() if defined $self->{table};
 267  
 268      if ($self->{entry}) {
 269      while ($self->{entry} =~ /([^\n]+)/g) {
 270          $self->parseEntry($1);
 271      }
 272      }
 273  
 274      $self->{level} ||= MaxLevel;
 275      $self->{UCA_Version} ||= UCA_Version();
 276  
 277      $self->{overrideHangul} = FALSE
 278      if ! exists $self->{overrideHangul};
 279      $self->{overrideCJK} = FALSE
 280      if ! exists $self->{overrideCJK};
 281      $self->{normalization} = 'NFD'
 282      if ! exists $self->{normalization};
 283      $self->{rearrange} = $self->{rearrangeTable} ||
 284      ($self->{UCA_Version} <= 11 ? $DefaultRearrange : [])
 285      if ! exists $self->{rearrange};
 286      $self->{backwards} = $self->{backwardsTable}
 287      if ! exists $self->{backwards};
 288  
 289      $self->checkCollator();
 290  
 291      return $self;
 292  }
 293  
 294  sub read_table {
 295      my $self = shift;
 296  
 297      my($f, $fh);
 298      foreach my $d (@INC) {
 299      $f = File::Spec->catfile($d, @Path, $self->{table});
 300      last if open($fh, $f);
 301      $f = undef;
 302      }
 303      if (!defined $f) {
 304      $f = File::Spec->catfile(@Path, $self->{table});
 305      croak("$PACKAGE: Can't locate $f in \@INC (\@INC contains: @INC)");
 306      }
 307  
 308      while (my $line = <$fh>) {
 309      next if $line =~ /^\s*#/;
 310      unless ($line =~ s/^\s*\@//) {
 311          $self->parseEntry($line);
 312          next;
 313      }
 314  
 315      # matched ^\s*\@
 316      if ($line =~ /^version\s*(\S*)/) {
 317          $self->{versionTable} ||= $1;
 318      }
 319      elsif ($line =~ /^variable\s+(\S*)/) { # since UTS #10-9
 320          $self->{variableTable} ||= $1;
 321      }
 322      elsif ($line =~ /^alternate\s+(\S*)/) { # till UTS #10-8
 323          $self->{alternateTable} ||= $1;
 324      }
 325      elsif ($line =~ /^backwards\s+(\S*)/) {
 326          push @{ $self->{backwardsTable} }, $1;
 327      }
 328      elsif ($line =~ /^forwards\s+(\S*)/) { # parhaps no use
 329          push @{ $self->{forwardsTable} }, $1;
 330      }
 331      elsif ($line =~ /^rearrange\s+(.*)/) { # (\S*) is NG
 332          push @{ $self->{rearrangeTable} }, _getHexArray($1);
 333      }
 334      }
 335      close $fh;
 336  }
 337  
 338  
 339  ##
 340  ## get $line, parse it, and write an entry in $self
 341  ##
 342  sub parseEntry
 343  {
 344      my $self = shift;
 345      my $line = shift;
 346      my($name, $entry, @uv, @key);
 347  
 348      return if $line !~ /^\s*[0-9A-Fa-f]/;
 349  
 350      # removes comment and gets name
 351      $name = $1
 352      if $line =~ s/[#%]\s*(.*)//;
 353      return if defined $self->{undefName} && $name =~ /$self->{undefName}/;
 354  
 355      # gets element
 356      my($e, $k) = split /;/, $line;
 357      croak "Wrong Entry: <charList> must be separated by ';' from <collElement>"
 358      if ! $k;
 359  
 360      @uv = _getHexArray($e);
 361      return if !@uv;
 362  
 363      $entry = join(CODE_SEP, @uv); # in JCPS
 364  
 365      if (defined $self->{undefChar} || defined $self->{ignoreChar}) {
 366      my $ele = pack_U(@uv);
 367  
 368      # regarded as if it were not entried in the table
 369      return
 370          if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/;
 371  
 372      # replaced as completely ignorable
 373      $k = '[.0000.0000.0000.0000]'
 374          if defined $self->{ignoreChar} && $ele =~ /$self->{ignoreChar}/;
 375      }
 376  
 377      # replaced as completely ignorable
 378      $k = '[.0000.0000.0000.0000]'
 379      if defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/;
 380  
 381      my $is_L3_ignorable = TRUE;
 382  
 383      foreach my $arr ($k =~ /\[([^\[\]]+)\]/g) { # SPACEs allowed
 384      my $var = $arr =~ /\*/; # exactly /^\*/ but be lenient.
 385      my @wt = _getHexArray($arr);
 386      push @key, pack(VCE_TEMPLATE, $var, @wt);
 387      $is_L3_ignorable = FALSE
 388          if $wt[0] || $wt[1] || $wt[2];
 389      # Conformance Test for 3.1.1 and 4.0.0 shows Level 3 ignorable
 390      # is completely ignorable.
 391      # For expansion, an entry $is_L3_ignorable
 392      # if and only if "all" CEs are [.0000.0000.0000].
 393      }
 394  
 395      $self->{mapping}{$entry} = $is_L3_ignorable ? [] : \@key;
 396  
 397      if (@uv > 1) {
 398      (!$self->{maxlength}{$uv[0]} || $self->{maxlength}{$uv[0]} < @uv)
 399          and $self->{maxlength}{$uv[0]} = @uv;
 400      }
 401  }
 402  
 403  
 404  ##
 405  ## VCE = _varCE(variable term, VCE)
 406  ##
 407  sub _varCE
 408  {
 409      my $vbl = shift;
 410      my $vce = shift;
 411      if ($vbl eq 'non-ignorable') {
 412      return $vce;
 413      }
 414      my ($var, @wt) = unpack VCE_TEMPLATE, $vce;
 415  
 416      if ($var) {
 417      return pack(VCE_TEMPLATE, $var, 0, 0, 0,
 418          $vbl eq 'blanked' ? $wt[3] : $wt[0]);
 419      }
 420      elsif ($vbl eq 'blanked') {
 421      return $vce;
 422      }
 423      else {
 424      return pack(VCE_TEMPLATE, $var, @wt[0..2],
 425          $vbl eq 'shifted' && $wt[0]+$wt[1]+$wt[2] ? Shift4Wt : 0);
 426      }
 427  }
 428  
 429  sub viewSortKey
 430  {
 431      my $self = shift;
 432      $self->visualizeSortKey($self->getSortKey(@_));
 433  }
 434  
 435  sub visualizeSortKey
 436  {
 437      my $self = shift;
 438      my $view = join " ", map sprintf("%04X", $_), unpack(KEY_TEMPLATE, shift);
 439  
 440      if ($self->{UCA_Version} <= 8) {
 441      $view =~ s/ ?0000 ?/|/g;
 442      } else {
 443      $view =~ s/\b0000\b/|/g;
 444      }
 445      return "[$view]";
 446  }
 447  
 448  
 449  ##
 450  ## arrayref of JCPS   = splitEnt(string to be collated)
 451  ## arrayref of arrayref[JCPS, ini_pos, fin_pos] = splitEnt(string, true)
 452  ##
 453  sub splitEnt
 454  {
 455      my $self = shift;
 456      my $wLen = $_[1];
 457  
 458      my $code = $self->{preprocess};
 459      my $norm = $self->{normCode};
 460      my $map  = $self->{mapping};
 461      my $max  = $self->{maxlength};
 462      my $reH  = $self->{rearrangeHash};
 463      my $ver9 = $self->{UCA_Version} >= 9 && $self->{UCA_Version} <= 11;
 464  
 465      my ($str, @buf);
 466  
 467      if ($wLen) {
 468      $code and croak "Preprocess breaks character positions. "
 469              . "Don't use with index(), match(), etc.";
 470      $norm and croak "Normalization breaks character positions. "
 471              . "Don't use with index(), match(), etc.";
 472      $str = $_[0];
 473      }
 474      else {
 475      $str = $_[0];
 476      $str = &$code($str) if ref $code;
 477      $str = &$norm($str) if ref $norm;
 478      }
 479  
 480      # get array of Unicode code point of string.
 481      my @src = unpack_U($str);
 482  
 483      # rearrangement:
 484      # Character positions are not kept if rearranged,
 485      # then neglected if $wLen is true.
 486      if ($reH && ! $wLen) {
 487      for (my $i = 0; $i < @src; $i++) {
 488          if (exists $reH->{ $src[$i] } && $i + 1 < @src) {
 489          ($src[$i], $src[$i+1]) = ($src[$i+1], $src[$i]);
 490          $i++;
 491          }
 492      }
 493      }
 494  
 495      # remove a code point marked as a completely ignorable.
 496      for (my $i = 0; $i < @src; $i++) {
 497      $src[$i] = undef
 498          if _isIllegal($src[$i]) || ($ver9 &&
 499          $map->{ $src[$i] } && @{ $map->{ $src[$i] } } == 0);
 500      }
 501  
 502      for (my $i = 0; $i < @src; $i++) {
 503      my $jcps = $src[$i];
 504  
 505      # skip removed code point
 506      if (! defined $jcps) {
 507          if ($wLen && @buf) {
 508          $buf[-1][2] = $i + 1;
 509          }
 510          next;
 511      }
 512  
 513      my $i_orig = $i;
 514  
 515      # find contraction
 516      if ($max->{$jcps}) {
 517          my $temp_jcps = $jcps;
 518          my $jcpsLen = 1;
 519          my $maxLen = $max->{$jcps};
 520  
 521          for (my $p = $i + 1; $jcpsLen < $maxLen && $p < @src; $p++) {
 522          next if ! defined $src[$p];
 523          $temp_jcps .= CODE_SEP . $src[$p];
 524          $jcpsLen++;
 525          if ($map->{$temp_jcps}) {
 526              $jcps = $temp_jcps;
 527              $i = $p;
 528          }
 529          }
 530  
 531      # not-contiguous contraction with Combining Char (cf. UTS#10, S2.1).
 532      # This process requires Unicode::Normalize.
 533      # If "normalization" is undef, here should be skipped *always*
 534      # (in spite of bool value of $CVgetCombinClass),
 535      # since canonical ordering cannot be expected.
 536      # Blocked combining character should not be contracted.
 537  
 538          if ($self->{normalization})
 539          # $self->{normCode} is false in the case of "prenormalized".
 540          {
 541          my $preCC = 0;
 542          my $curCC = 0;
 543  
 544          for (my $p = $i + 1; $p < @src; $p++) {
 545              next if ! defined $src[$p];
 546              $curCC = $CVgetCombinClass->($src[$p]);
 547              last unless $curCC;
 548              my $tail = CODE_SEP . $src[$p];
 549              if ($preCC != $curCC && $map->{$jcps.$tail}) {
 550              $jcps .= $tail;
 551              $src[$p] = undef;
 552              } else {
 553              $preCC = $curCC;
 554              }
 555          }
 556          }
 557      }
 558  
 559      # skip completely ignorable
 560      if ($map->{$jcps} && @{ $map->{$jcps} } == 0) {
 561          if ($wLen && @buf) {
 562          $buf[-1][2] = $i + 1;
 563          }
 564          next;
 565      }
 566  
 567      push @buf, $wLen ? [$jcps, $i_orig, $i + 1] : $jcps;
 568      }
 569      return \@buf;
 570  }
 571  
 572  
 573  ##
 574  ## list of VCE = getWt(JCPS)
 575  ##
 576  sub getWt
 577  {
 578      my $self = shift;
 579      my $u    = shift;
 580      my $vbl  = $self->{variable};
 581      my $map  = $self->{mapping};
 582      my $der  = $self->{derivCode};
 583  
 584      return if !defined $u;
 585      return map(_varCE($vbl, $_), @{ $map->{$u} })
 586      if $map->{$u};
 587  
 588      # JCPS must not be a contraction, then it's a code point.
 589      if (Hangul_SIni <= $u && $u <= Hangul_SFin) {
 590      my $hang = $self->{overrideHangul};
 591      my @hangulCE;
 592      if ($hang) {
 593          @hangulCE = map(pack(VCE_TEMPLATE, NON_VAR, @$_), &$hang($u));
 594      }
 595      elsif (!defined $hang) {
 596          @hangulCE = $der->($u);
 597      }
 598      else {
 599          my $max  = $self->{maxlength};
 600          my @decH = _decompHangul($u);
 601  
 602          if (@decH == 2) {
 603          my $contract = join(CODE_SEP, @decH);
 604          @decH = ($contract) if $map->{$contract};
 605          } else { # must be <@decH == 3>
 606          if ($max->{$decH[0]}) {
 607              my $contract = join(CODE_SEP, @decH);
 608              if ($map->{$contract}) {
 609              @decH = ($contract);
 610              } else {
 611              $contract = join(CODE_SEP, @decH[0,1]);
 612              $map->{$contract} and @decH = ($contract, $decH[2]);
 613              }
 614              # even if V's ignorable, LT contraction is not supported.
 615              # If such a situatution were required, NFD should be used.
 616          }
 617          if (@decH == 3 && $max->{$decH[1]}) {
 618              my $contract = join(CODE_SEP, @decH[1,2]);
 619              $map->{$contract} and @decH = ($decH[0], $contract);
 620          }
 621          }
 622  
 623          @hangulCE = map({
 624              $map->{$_} ? @{ $map->{$_} } : $der->($_);
 625          } @decH);
 626      }
 627      return map _varCE($vbl, $_), @hangulCE;
 628      }
 629      elsif (_isUIdeo($u, $self->{UCA_Version})) {
 630      my $cjk  = $self->{overrideCJK};
 631      return map _varCE($vbl, $_),
 632          $cjk
 633          ? map(pack(VCE_TEMPLATE, NON_VAR, @$_), &$cjk($u))
 634          : defined $cjk && $self->{UCA_Version} <= 8 && $u < 0x10000
 635              ? _uideoCE_8($u)
 636              : $der->($u);
 637      }
 638      else {
 639      return map _varCE($vbl, $_), $der->($u);
 640      }
 641  }
 642  
 643  
 644  ##
 645  ## string sortkey = getSortKey(string arg)
 646  ##
 647  sub getSortKey
 648  {
 649      my $self = shift;
 650      my $lev  = $self->{level};
 651      my $rEnt = $self->splitEnt(shift); # get an arrayref of JCPS
 652      my $v2i  = $self->{UCA_Version} >= 9 &&
 653          $self->{variable} ne 'non-ignorable';
 654  
 655      my @buf; # weight arrays
 656      if ($self->{hangul_terminator}) {
 657      my $preHST = '';
 658      foreach my $jcps (@$rEnt) {
 659          # weird things like VL, TL-contraction are not considered!
 660          my $curHST = '';
 661          foreach my $u (split /;/, $jcps) {
 662          $curHST .= getHST($u);
 663          }
 664          if ($preHST && !$curHST || # hangul before non-hangul
 665          $preHST =~ /L\z/ && $curHST =~ /^T/ ||
 666          $preHST =~ /V\z/ && $curHST =~ /^L/ ||
 667          $preHST =~ /T\z/ && $curHST =~ /^[LV]/) {
 668  
 669          push @buf, $self->getWtHangulTerm();
 670          }
 671          $preHST = $curHST;
 672  
 673          push @buf, $self->getWt($jcps);
 674      }
 675      $preHST # end at hangul
 676          and push @buf, $self->getWtHangulTerm();
 677      }
 678      else {
 679      foreach my $jcps (@$rEnt) {
 680          push @buf, $self->getWt($jcps);
 681      }
 682      }
 683  
 684      # make sort key
 685      my @ret = ([],[],[],[]);
 686      my $last_is_variable;
 687  
 688      foreach my $vwt (@buf) {
 689      my($var, @wt) = unpack(VCE_TEMPLATE, $vwt);
 690  
 691      # "Ignorable (L1, L2) after Variable" since track. v. 9
 692      if ($v2i) {
 693          if ($var) {
 694          $last_is_variable = TRUE;
 695          }
 696          elsif (!$wt[0]) { # ignorable
 697          next if $last_is_variable;
 698          }
 699          else {
 700          $last_is_variable = FALSE;
 701          }
 702      }
 703      foreach my $v (0..$lev-1) {
 704          0 < $wt[$v] and push @{ $ret[$v] }, $wt[$v];
 705      }
 706      }
 707  
 708      # modification of tertiary weights
 709      if ($self->{upper_before_lower}) {
 710      foreach my $w (@{ $ret[2] }) {
 711          if    (0x8 <= $w && $w <= 0xC) { $w -= 6 } # lower
 712          elsif (0x2 <= $w && $w <= 0x6) { $w += 6 } # upper
 713          elsif ($w == 0x1C)             { $w += 1 } # square upper
 714          elsif ($w == 0x1D)             { $w -= 1 } # square lower
 715      }
 716      }
 717      if ($self->{katakana_before_hiragana}) {
 718      foreach my $w (@{ $ret[2] }) {
 719          if    (0x0F <= $w && $w <= 0x13) { $w -= 2 } # katakana
 720          elsif (0x0D <= $w && $w <= 0x0E) { $w += 5 } # hiragana
 721      }
 722      }
 723  
 724      if ($self->{backwardsFlag}) {
 725      for (my $v = MinLevel; $v <= MaxLevel; $v++) {
 726          if ($self->{backwardsFlag} & (1 << $v)) {
 727          @{ $ret[$v-1] } = reverse @{ $ret[$v-1] };
 728          }
 729      }
 730      }
 731  
 732      join LEVEL_SEP, map pack(KEY_TEMPLATE, @$_), @ret;
 733  }
 734  
 735  
 736  ##
 737  ## int compare = cmp(string a, string b)
 738  ##
 739  sub cmp { $_[0]->getSortKey($_[1]) cmp $_[0]->getSortKey($_[2]) }
 740  sub eq  { $_[0]->getSortKey($_[1]) eq  $_[0]->getSortKey($_[2]) }
 741  sub ne  { $_[0]->getSortKey($_[1]) ne  $_[0]->getSortKey($_[2]) }
 742  sub lt  { $_[0]->getSortKey($_[1]) lt  $_[0]->getSortKey($_[2]) }
 743  sub le  { $_[0]->getSortKey($_[1]) le  $_[0]->getSortKey($_[2]) }
 744  sub gt  { $_[0]->getSortKey($_[1]) gt  $_[0]->getSortKey($_[2]) }
 745  sub ge  { $_[0]->getSortKey($_[1]) ge  $_[0]->getSortKey($_[2]) }
 746  
 747  ##
 748  ## list[strings] sorted = sort(list[strings] arg)
 749  ##
 750  sub sort {
 751      my $obj = shift;
 752      return
 753      map { $_->[1] }
 754          sort{ $a->[0] cmp $b->[0] }
 755          map [ $obj->getSortKey($_), $_ ], @_;
 756  }
 757  
 758  
 759  sub _derivCE_14 {
 760      my $u = shift;
 761      my $base =
 762      (CJK_UidIni  <= $u && $u <= CJK_UidF41)
 763          ? 0xFB40 : # CJK
 764      (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin ||
 765       CJK_ExtBIni <= $u && $u <= CJK_ExtBFin)
 766          ? 0xFB80   # CJK ext.
 767          : 0xFBC0;  # others
 768  
 769      my $aaaa = $base + ($u >> 15);
 770      my $bbbb = ($u & 0x7FFF) | 0x8000;
 771      return
 772      pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u),
 773      pack(VCE_TEMPLATE, NON_VAR, $bbbb,      0,      0, $u);
 774  }
 775  
 776  sub _derivCE_9 {
 777      my $u = shift;
 778      my $base =
 779      (CJK_UidIni  <= $u && $u <= CJK_UidFin)
 780          ? 0xFB40 : # CJK
 781      (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin ||
 782       CJK_ExtBIni <= $u && $u <= CJK_ExtBFin)
 783          ? 0xFB80   # CJK ext.
 784          : 0xFBC0;  # others
 785  
 786      my $aaaa = $base + ($u >> 15);
 787      my $bbbb = ($u & 0x7FFF) | 0x8000;
 788      return
 789      pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u),
 790      pack(VCE_TEMPLATE, NON_VAR, $bbbb,      0,      0, $u);
 791  }
 792  
 793  sub _derivCE_8 {
 794      my $code = shift;
 795      my $aaaa =  0xFF80 + ($code >> 15);
 796      my $bbbb = ($code & 0x7FFF) | 0x8000;
 797      return
 798      pack(VCE_TEMPLATE, NON_VAR, $aaaa, 2, 1, $code),
 799      pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $code);
 800  }
 801  
 802  sub _uideoCE_8 {
 803      my $u = shift;
 804      return pack(VCE_TEMPLATE, NON_VAR, $u, Min2Wt, Min3Wt, $u);
 805  }
 806  
 807  sub _isUIdeo {
 808      my ($u, $uca_vers) = @_;
 809      return(
 810      (CJK_UidIni <= $u &&
 811          ($uca_vers >= 14 ? ( $u <= CJK_UidF41) : ($u <= CJK_UidFin)))
 812          ||
 813      (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin)
 814          ||
 815      (CJK_ExtBIni <= $u && $u <= CJK_ExtBFin)
 816      );
 817  }
 818  
 819  
 820  sub getWtHangulTerm {
 821      my $self = shift;
 822      return _varCE($self->{variable},
 823      pack(VCE_TEMPLATE, NON_VAR, $self->{hangul_terminator}, 0,0,0));
 824  }
 825  
 826  
 827  ##
 828  ## "hhhh hhhh hhhh" to (dddd, dddd, dddd)
 829  ##
 830  sub _getHexArray { map hex, $_[0] =~ /([0-9a-fA-F]+)/g }
 831  
 832  #
 833  # $code *must* be in Hangul syllable.
 834  # Check it before you enter here.
 835  #
 836  sub _decompHangul {
 837      my $code = shift;
 838      my $si = $code - Hangul_SBase;
 839      my $li = int( $si / Hangul_NCount);
 840      my $vi = int(($si % Hangul_NCount) / Hangul_TCount);
 841      my $ti =      $si % Hangul_TCount;
 842      return (
 843      Hangul_LBase + $li,
 844      Hangul_VBase + $vi,
 845      $ti ? (Hangul_TBase + $ti) : (),
 846      );
 847  }
 848  
 849  sub _isIllegal {
 850      my $code = shift;
 851      return ! defined $code                      # removed
 852      || ($code < 0 || 0x10FFFF < $code)      # out of range
 853      || (($code & 0xFFFE) == 0xFFFE)         # ??FFF[EF] (cf. utf8.c)
 854      || (0xD800 <= $code && $code <= 0xDFFF) # unpaired surrogates
 855      || (0xFDD0 <= $code && $code <= 0xFDEF) # other non-characters
 856      ;
 857  }
 858  
 859  # Hangul Syllable Type
 860  sub getHST {
 861      my $u = shift;
 862      return
 863      Hangul_LIni <= $u && $u <= Hangul_LFin || $u == Hangul_LFill ? "L" :
 864      Hangul_VIni <= $u && $u <= Hangul_VFin         ? "V" :
 865      Hangul_TIni <= $u && $u <= Hangul_TFin         ? "T" :
 866      Hangul_SIni <= $u && $u <= Hangul_SFin ?
 867          ($u - Hangul_SBase) % Hangul_TCount ? "LVT" : "LV" : "";
 868  }
 869  
 870  
 871  ##
 872  ## bool _nonIgnorAtLevel(arrayref weights, int level)
 873  ##
 874  sub _nonIgnorAtLevel($$)
 875  {
 876      my $wt = shift;
 877      return if ! defined $wt;
 878      my $lv = shift;
 879      return grep($wt->[$_-1] != 0, MinLevel..$lv) ? TRUE : FALSE;
 880  }
 881  
 882  ##
 883  ## bool _eqArray(
 884  ##    arrayref of arrayref[weights] source,
 885  ##    arrayref of arrayref[weights] substr,
 886  ##    int level)
 887  ## * comparison of graphemes vs graphemes.
 888  ##   @$source >= @$substr must be true (check it before call this);
 889  ##
 890  sub _eqArray($$$)
 891  {
 892      my $source = shift;
 893      my $substr = shift;
 894      my $lev = shift;
 895  
 896      for my $g (0..@$substr-1){
 897      # Do the $g'th graphemes have the same number of AV weigths?
 898      return if @{ $source->[$g] } != @{ $substr->[$g] };
 899  
 900      for my $w (0..@{ $substr->[$g] }-1) {
 901          for my $v (0..$lev-1) {
 902          return if $source->[$g][$w][$v] != $substr->[$g][$w][$v];
 903          }
 904      }
 905      }
 906      return 1;
 907  }
 908  
 909  ##
 910  ## (int position, int length)
 911  ## int position = index(string, substring, position, [undoc'ed grobal])
 912  ##
 913  ## With "grobal" (only for the list context),
 914  ##  returns list of arrayref[position, length].
 915  ##
 916  sub index
 917  {
 918      my $self = shift;
 919      my $str  = shift;
 920      my $len  = length($str);
 921      my $subE = $self->splitEnt(shift);
 922      my $pos  = @_ ? shift : 0;
 923         $pos  = 0 if $pos < 0;
 924      my $grob = shift;
 925  
 926      my $lev  = $self->{level};
 927      my $v2i  = $self->{UCA_Version} >= 9 &&
 928          $self->{variable} ne 'non-ignorable';
 929  
 930      if (! @$subE) {
 931      my $temp = $pos <= 0 ? 0 : $len <= $pos ? $len : $pos;
 932      return $grob
 933          ? map([$_, 0], $temp..$len)
 934          : wantarray ? ($temp,0) : $temp;
 935      }
 936      $len < $pos
 937      and return wantarray ? () : NOMATCHPOS;
 938      my $strE = $self->splitEnt($pos ? substr($str, $pos) : $str, TRUE);
 939      @$strE
 940      or return wantarray ? () : NOMATCHPOS;
 941  
 942      my(@strWt, @iniPos, @finPos, @subWt, @g_ret);
 943  
 944      my $last_is_variable;
 945      for my $vwt (map $self->getWt($_), @$subE) {
 946      my($var, @wt) = unpack(VCE_TEMPLATE, $vwt);
 947      my $to_be_pushed = _nonIgnorAtLevel(\@wt,$lev);
 948  
 949      # "Ignorable (L1, L2) after Variable" since track. v. 9
 950      if ($v2i) {
 951          if ($var) {
 952          $last_is_variable = TRUE;
 953          }
 954          elsif (!$wt[0]) { # ignorable
 955          $to_be_pushed = FALSE if $last_is_variable;
 956          }
 957          else {
 958          $last_is_variable = FALSE;
 959          }
 960      }
 961  
 962      if (@subWt && !$var && !$wt[0]) {
 963          push @{ $subWt[-1] }, \@wt if $to_be_pushed;
 964      } else {
 965          push @subWt, [ \@wt ];
 966      }
 967      }
 968  
 969      my $count = 0;
 970      my $end = @$strE - 1;
 971  
 972      $last_is_variable = FALSE; # reuse
 973      for (my $i = 0; $i <= $end; ) { # no $i++
 974      my $found_base = 0;
 975  
 976      # fetch a grapheme
 977      while ($i <= $end && $found_base == 0) {
 978          for my $vwt ($self->getWt($strE->[$i][0])) {
 979          my($var, @wt) = unpack(VCE_TEMPLATE, $vwt);
 980          my $to_be_pushed = _nonIgnorAtLevel(\@wt,$lev);
 981  
 982          # "Ignorable (L1, L2) after Variable" since track. v. 9
 983          if ($v2i) {
 984              if ($var) {
 985              $last_is_variable = TRUE;
 986              }
 987              elsif (!$wt[0]) { # ignorable
 988              $to_be_pushed = FALSE if $last_is_variable;
 989              }
 990              else {
 991              $last_is_variable = FALSE;
 992              }
 993          }
 994  
 995          if (@strWt && !$var && !$wt[0]) {
 996              push @{ $strWt[-1] }, \@wt if $to_be_pushed;
 997              $finPos[-1] = $strE->[$i][2];
 998          } elsif ($to_be_pushed) {
 999              push @strWt, [ \@wt ];
1000              push @iniPos, $found_base ? NOMATCHPOS : $strE->[$i][1];
1001              $finPos[-1] = NOMATCHPOS if $found_base;
1002              push @finPos, $strE->[$i][2];
1003              $found_base++;
1004          }
1005          # else ===> no-op
1006          }
1007          $i++;
1008      }
1009  
1010      # try to match
1011      while ( @strWt > @subWt || (@strWt == @subWt && $i > $end) ) {
1012          if ($iniPos[0] != NOMATCHPOS &&
1013              $finPos[$#subWt] != NOMATCHPOS &&
1014              _eqArray(\@strWt, \@subWt, $lev)) {
1015          my $temp = $iniPos[0] + $pos;
1016  
1017          if ($grob) {
1018              push @g_ret, [$temp, $finPos[$#subWt] - $iniPos[0]];
1019              splice @strWt,  0, $#subWt;
1020              splice @iniPos, 0, $#subWt;
1021              splice @finPos, 0, $#subWt;
1022          }
1023          else {
1024              return wantarray
1025              ? ($temp, $finPos[$#subWt] - $iniPos[0])
1026              :  $temp;
1027          }
1028          }
1029          shift @strWt;
1030          shift @iniPos;
1031          shift @finPos;
1032      }
1033      }
1034  
1035      return $grob
1036      ? @g_ret
1037      : wantarray ? () : NOMATCHPOS;
1038  }
1039  
1040  ##
1041  ## scalarref to matching part = match(string, substring)
1042  ##
1043  sub match
1044  {
1045      my $self = shift;
1046      if (my($pos,$len) = $self->index($_[0], $_[1])) {
1047      my $temp = substr($_[0], $pos, $len);
1048      return wantarray ? $temp : \$temp;
1049      # An lvalue ref \substr should be avoided,
1050      # since its value is affected by modification of its referent.
1051      }
1052      else {
1053      return;
1054      }
1055  }
1056  
1057  ##
1058  ## arrayref matching parts = gmatch(string, substring)
1059  ##
1060  sub gmatch
1061  {
1062      my $self = shift;
1063      my $str  = shift;
1064      my $sub  = shift;
1065      return map substr($str, $_->[0], $_->[1]),
1066          $self->index($str, $sub, 0, 'g');
1067  }
1068  
1069  ##
1070  ## bool subst'ed = subst(string, substring, replace)
1071  ##
1072  sub subst
1073  {
1074      my $self = shift;
1075      my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE;
1076  
1077      if (my($pos,$len) = $self->index($_[0], $_[1])) {
1078      if ($code) {
1079          my $mat = substr($_[0], $pos, $len);
1080          substr($_[0], $pos, $len, $code->($mat));
1081      } else {
1082          substr($_[0], $pos, $len, $_[2]);
1083      }
1084      return TRUE;
1085      }
1086      else {
1087      return FALSE;
1088      }
1089  }
1090  
1091  ##
1092  ## int count = gsubst(string, substring, replace)
1093  ##
1094  sub gsubst
1095  {
1096      my $self = shift;
1097      my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE;
1098      my $cnt = 0;
1099  
1100      # Replacement is carried out from the end, then use reverse.
1101      for my $pos_len (reverse $self->index($_[0], $_[1], 0, 'g')) {
1102      if ($code) {
1103          my $mat = substr($_[0], $pos_len->[0], $pos_len->[1]);
1104          substr($_[0], $pos_len->[0], $pos_len->[1], $code->($mat));
1105      } else {
1106          substr($_[0], $pos_len->[0], $pos_len->[1], $_[2]);
1107      }
1108      $cnt++;
1109      }
1110      return $cnt;
1111  }
1112  
1113  1;
1114  __END__
1115  
1116  =head1 NAME
1117  
1118  Unicode::Collate - Unicode Collation Algorithm
1119  
1120  =head1 SYNOPSIS
1121  
1122    use Unicode::Collate;
1123  
1124    #construct
1125    $Collator = Unicode::Collate->new(%tailoring);
1126  
1127    #sort
1128    @sorted = $Collator->sort(@not_sorted);
1129  
1130    #compare
1131    $result = $Collator->cmp($a, $b); # returns 1, 0, or -1.
1132  
1133    # If %tailoring is false (i.e. empty),
1134    # $Collator should do the default collation.
1135  
1136  =head1 DESCRIPTION
1137  
1138  This module is an implementation of Unicode Technical Standard #10
1139  (a.k.a. UTS #10) - Unicode Collation Algorithm (a.k.a. UCA).
1140  
1141  =head2 Constructor and Tailoring
1142  
1143  The C<new> method returns a collator object.
1144  
1145     $Collator = Unicode::Collate->new(
1146        UCA_Version => $UCA_Version,
1147        alternate => $alternate, # deprecated: use of 'variable' is recommended.
1148        backwards => $levelNumber, # or \@levelNumbers
1149        entry => $element,
1150        hangul_terminator => $term_primary_weight,
1151        ignoreName => qr/$ignoreName/,
1152        ignoreChar => qr/$ignoreChar/,
1153        katakana_before_hiragana => $bool,
1154        level => $collationLevel,
1155        normalization  => $normalization_form,
1156        overrideCJK => \&overrideCJK,
1157        overrideHangul => \&overrideHangul,
1158        preprocess => \&preprocess,
1159        rearrange => \@charList,
1160        table => $filename,
1161        undefName => qr/$undefName/,
1162        undefChar => qr/$undefChar/,
1163        upper_before_lower => $bool,
1164        variable => $variable,
1165     );
1166  
1167  =over 4
1168  
1169  =item UCA_Version
1170  
1171  If the tracking version number of UCA is given,
1172  behavior of that tracking version is emulated on collating.
1173  If omitted, the return value of C<UCA_Version()> is used.
1174  C<UCA_Version()> should return the latest tracking version supported.
1175  
1176  The supported tracking version: 8, 9, 11, or 14.
1177  
1178       UCA       Unicode Standard         DUCET (@version)
1179       ---------------------------------------------------
1180        8              3.1                3.0.1 (3.0.1d9)
1181        9     3.1 with Corrigendum 3      3.1.1 (3.1.1)
1182       11              4.0                4.0.0 (4.0.0)
1183       14             4.1.0               4.1.0 (4.1.0)
1184  
1185  Note: Recent UTS #10 renames "Tracking Version" to "Revision."
1186  
1187  =item alternate
1188  
1189  -- see 3.2.2 Alternate Weighting, version 8 of UTS #10
1190  
1191  For backward compatibility, C<alternate> (old name) can be used
1192  as an alias for C<variable>.
1193  
1194  =item backwards
1195  
1196  -- see 3.1.2 French Accents, UTS #10.
1197  
1198       backwards => $levelNumber or \@levelNumbers
1199  
1200  Weights in reverse order; ex. level 2 (diacritic ordering) in French.
1201  If omitted, forwards at all the levels.
1202  
1203  =item entry
1204  
1205  -- see 3.1 Linguistic Features; 3.2.1 File Format, UTS #10.
1206  
1207  If the same character (or a sequence of characters) exists
1208  in the collation element table through C<table>,
1209  mapping to collation elements is overrided.
1210  If it does not exist, the mapping is defined additionally.
1211  
1212      entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt)
1213  0063 0068 ; [.0E6A.0020.0002.0063] # ch
1214  0043 0068 ; [.0E6A.0020.0007.0043] # Ch
1215  0043 0048 ; [.0E6A.0020.0008.0043] # CH
1216  006C 006C ; [.0F4C.0020.0002.006C] # ll
1217  004C 006C ; [.0F4C.0020.0007.004C] # Ll
1218  004C 004C ; [.0F4C.0020.0008.004C] # LL
1219  00F1      ; [.0F7B.0020.0002.00F1] # n-tilde
1220  006E 0303 ; [.0F7B.0020.0002.00F1] # n-tilde
1221  00D1      ; [.0F7B.0020.0008.00D1] # N-tilde
1222  004E 0303 ; [.0F7B.0020.0008.00D1] # N-tilde
1223  ENTRY
1224  
1225      entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt)
1226  00E6 ; [.0E33.0020.0002.00E6][.0E8B.0020.0002.00E6] # ae ligature as <a><e>
1227  00C6 ; [.0E33.0020.0008.00C6][.0E8B.0020.0008.00C6] # AE ligature as <A><E>
1228  ENTRY
1229  
1230  B<NOTE:> The code point in the UCA file format (before C<';'>)
1231  B<must> be a Unicode code point (defined as hexadecimal),
1232  but not a native code point.
1233  So C<0063> must always denote C<U+0063>,
1234  but not a character of C<"\x63">.
1235  
1236  Weighting may vary depending on collation element table.
1237  So ensure the weights defined in C<entry> will be consistent with
1238  those in the collation element table loaded via C<table>.
1239  
1240  In DUCET v4.0.0, primary weight of C<C> is C<0E60>
1241  and that of C<D> is C<0E6D>. So setting primary weight of C<CH> to C<0E6A>
1242  (as a value between C<0E60> and C<0E6D>)
1243  makes ordering as C<C E<lt> CH E<lt> D>.
1244  Exactly speaking DUCET already has some characters between C<C> and C<D>:
1245  C<small capital C> (C<U+1D04>) with primary weight C<0E64>,
1246  C<c-hook/C-hook> (C<U+0188/U+0187>) with C<0E65>,
1247  and C<c-curl> (C<U+0255>) with C<0E69>.
1248  Then primary weight C<0E6A> for C<CH> makes C<CH>
1249  ordered between C<c-curl> and C<D>.
1250  
1251  =item hangul_terminator
1252  
1253  -- see 7.1.4 Trailing Weights, UTS #10.
1254  
1255  If a true value is given (non-zero but should be positive),
1256  it will be added as a terminator primary weight to the end of
1257  every standard Hangul syllable. Secondary and any higher weights
1258  for terminator are set to zero.
1259  If the value is false or C<hangul_terminator> key does not exist,
1260  insertion of terminator weights will not be performed.
1261  
1262  Boundaries of Hangul syllables are determined
1263  according to conjoining Jamo behavior in F<the Unicode Standard>
1264  and F<HangulSyllableType.txt>.
1265  
1266  B<Implementation Note:>
1267  (1) For expansion mapping (Unicode character mapped
1268  to a sequence of collation elements), a terminator will not be added
1269  between collation elements, even if Hangul syllable boundary exists there.
1270  Addition of terminator is restricted to the next position
1271  to the last collation element.
1272  
1273  (2) Non-conjoining Hangul letters
1274  (Compatibility Jamo, halfwidth Jamo, and enclosed letters) are not
1275  automatically terminated with a terminator primary weight.
1276  These characters may need terminator included in a collation element
1277  table beforehand.
1278  
1279  =item ignoreChar
1280  
1281  =item ignoreName
1282  
1283  -- see 3.2.2 Variable Weighting, UTS #10.
1284  
1285  Makes the entry in the table completely ignorable;
1286  i.e. as if the weights were zero at all level.
1287  
1288  Through C<ignoreChar>, any character matching C<qr/$ignoreChar/>
1289  will be ignored. Through C<ignoreName>, any character whose name
1290  (given in the C<table> file as a comment) matches C<qr/$ignoreName/>
1291  will be ignored.
1292  
1293  E.g. when 'a' and 'e' are ignorable,
1294  'element' is equal to 'lament' (or 'lmnt').
1295  
1296  =item katakana_before_hiragana
1297  
1298  -- see 7.3.1 Tertiary Weight Table, UTS #10.
1299  
1300  By default, hiragana is before katakana.
1301  If the parameter is made true, this is reversed.
1302  
1303  B<NOTE>: This parameter simplemindedly assumes that any hiragana/katakana
1304  distinctions must occur in level 3, and their weights at level 3 must be
1305  same as those mentioned in 7.3.1, UTS #10.
1306  If you define your collation elements which violate this requirement,
1307  this parameter does not work validly.
1308  
1309  =item level
1310  
1311  -- see 4.3 Form Sort Key, UTS #10.
1312  
1313  Set the maximum level.
1314  Any higher levels than the specified one are ignored.
1315  
1316    Level 1: alphabetic ordering
1317    Level 2: diacritic ordering
1318    Level 3: case ordering
1319    Level 4: tie-breaking (e.g. in the case when variable is 'shifted')
1320  
1321    ex.level => 2,
1322  
1323  If omitted, the maximum is the 4th.
1324  
1325  =item normalization
1326  
1327  -- see 4.1 Normalize, UTS #10.
1328  
1329  If specified, strings are normalized before preparation of sort keys
1330  (the normalization is executed after preprocess).
1331  
1332  A form name C<Unicode::Normalize::normalize()> accepts will be applied
1333  as C<$normalization_form>.
1334  Acceptable names include C<'NFD'>, C<'NFC'>, C<'NFKD'>, and C<'NFKC'>.
1335  See C<Unicode::Normalize::normalize()> for detail.
1336  If omitted, C<'NFD'> is used.
1337  
1338  C<normalization> is performed after C<preprocess> (if defined).
1339  
1340  Furthermore, special values, C<undef> and C<"prenormalized">, can be used,
1341  though they are not concerned with C<Unicode::Normalize::normalize()>.
1342  
1343  If C<undef> (not a string C<"undef">) is passed explicitly
1344  as the value for this key,
1345  any normalization is not carried out (this may make tailoring easier
1346  if any normalization is not desired). Under C<(normalization =E<gt> undef)>,
1347  only contiguous contractions are resolved;
1348  e.g. even if C<A-ring> (and C<A-ring-cedilla>) is ordered after C<Z>,
1349  C<A-cedilla-ring> would be primary equal to C<A>.
1350  In this point,
1351  C<(normalization =E<gt> undef, preprocess =E<gt> sub { NFD(shift) })>
1352  B<is not> equivalent to C<(normalization =E<gt> 'NFD')>.
1353  
1354  In the case of C<(normalization =E<gt> "prenormalized")>,
1355  any normalization is not performed, but
1356  non-contiguous contractions with combining characters are performed.
1357  Therefore
1358  C<(normalization =E<gt> 'prenormalized', preprocess =E<gt> sub { NFD(shift) })>
1359  B<is> equivalent to C<(normalization =E<gt> 'NFD')>.
1360  If source strings are finely prenormalized,
1361  C<(normalization =E<gt> 'prenormalized')> may save time for normalization.
1362  
1363  Except C<(normalization =E<gt> undef)>,
1364  B<Unicode::Normalize> is required (see also B<CAVEAT>).
1365  
1366  =item overrideCJK
1367  
1368  -- see 7.1 Derived Collation Elements, UTS #10.
1369  
1370  By default, CJK Unified Ideographs are ordered in Unicode codepoint order
1371  but C<CJK Unified Ideographs> (if C<UCA_Version> is 8 to 11, its range is
1372  C<U+4E00..U+9FA5>; if C<UCA_Version> is 14, its range is C<U+4E00..U+9FBB>)
1373  are lesser than C<CJK Unified Ideographs Extension> (its range is
1374  C<U+3400..U+4DB5> and C<U+20000..U+2A6D6>).
1375  
1376  Through C<overrideCJK>, ordering of CJK Unified Ideographs can be overrided.
1377  
1378  ex. CJK Unified Ideographs in the JIS code point order.
1379  
1380    overrideCJK => sub {
1381        my $u = shift;             # get a Unicode codepoint
1382        my $b = pack('n', $u);     # to UTF-16BE
1383        my $s = your_unicode_to_sjis_converter($b); # convert
1384        my $n = unpack('n', $s);   # convert sjis to short
1385        [ $n, 0x20, 0x2, $u ];     # return the collation element
1386    },
1387  
1388  ex. ignores all CJK Unified Ideographs.
1389  
1390    overrideCJK => sub {()}, # CODEREF returning empty list
1391  
1392     # where ->eq("Pe\x{4E00}rl", "Perl") is true
1393     # as U+4E00 is a CJK Unified Ideograph and to be ignorable.
1394  
1395  If C<undef> is passed explicitly as the value for this key,
1396  weights for CJK Unified Ideographs are treated as undefined.
1397  But assignment of weight for CJK Unified Ideographs
1398  in table or C<entry> is still valid.
1399  
1400  =item overrideHangul
1401  
1402  -- see 7.1 Derived Collation Elements, UTS #10.
1403  
1404  By default, Hangul Syllables are decomposed into Hangul Jamo,
1405  even if C<(normalization =E<gt> undef)>.
1406  But the mapping of Hangul Syllables may be overrided.
1407  
1408  This parameter works like C<overrideCJK>, so see there for examples.
1409  
1410  If you want to override the mapping of Hangul Syllables,
1411  NFD, NFKD, and FCD are not appropriate,
1412  since they will decompose Hangul Syllables before overriding.
1413  
1414  If C<undef> is passed explicitly as the value for this key,
1415  weight for Hangul Syllables is treated as undefined
1416  without decomposition into Hangul Jamo.
1417  But definition of weight for Hangul Syllables
1418  in table or C<entry> is still valid.
1419  
1420  =item preprocess
1421  
1422  -- see 5.1 Preprocessing, UTS #10.
1423  
1424  If specified, the coderef is used to preprocess
1425  before the formation of sort keys.
1426  
1427  ex. dropping English articles, such as "a" or "the".
1428  Then, "the pen" is before "a pencil".
1429  
1430       preprocess => sub {
1431             my $str = shift;
1432             $str =~ s/\b(?:an?|the)\s+//gi;
1433             return $str;
1434          },
1435  
1436  C<preprocess> is performed before C<normalization> (if defined).
1437  
1438  =item rearrange
1439  
1440  -- see 3.1.3 Rearrangement, UTS #10.
1441  
1442  Characters that are not coded in logical order and to be rearranged.
1443  If C<UCA_Version> is equal to or lesser than 11, default is:
1444  
1445      rearrange => [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ],
1446  
1447  If you want to disallow any rearrangement, pass C<undef> or C<[]>
1448  (a reference to empty list) as the value for this key.
1449  
1450  If C<UCA_Version> is equal to 14, default is C<[]> (i.e. no rearrangement).
1451  
1452  B<According to the version 9 of UCA, this parameter shall not be used;
1453  but it is not warned at present.>
1454  
1455  =item table
1456  
1457  -- see 3.2 Default Unicode Collation Element Table, UTS #10.
1458  
1459  You can use another collation element table if desired.
1460  
1461  The table file should locate in the F<Unicode/Collate> directory
1462  on C<@INC>. Say, if the filename is F<Foo.txt>,
1463  the table file is searched as F<Unicode/Collate/Foo.txt> in C<@INC>.
1464  
1465  By default, F<allkeys.txt> (as the filename of DUCET) is used.
1466  If you will prepare your own table file, any name other than F<allkeys.txt>
1467  may be better to avoid namespace conflict.
1468  
1469  If C<undef> is passed explicitly as the value for this key,
1470  no file is read (but you can define collation elements via C<entry>).
1471  
1472  A typical way to define a collation element table
1473  without any file of table:
1474  
1475     $onlyABC = Unicode::Collate->new(
1476         table => undef,
1477         entry => << 'ENTRIES',
1478  0061 ; [.0101.0020.0002.0061] # LATIN SMALL LETTER A
1479  0041 ; [.0101.0020.0008.0041] # LATIN CAPITAL LETTER A
1480  0062 ; [.0102.0020.0002.0062] # LATIN SMALL LETTER B
1481  0042 ; [.0102.0020.0008.0042] # LATIN CAPITAL LETTER B
1482  0063 ; [.0103.0020.0002.0063] # LATIN SMALL LETTER C
1483  0043 ; [.0103.0020.0008.0043] # LATIN CAPITAL LETTER C
1484  ENTRIES
1485      );
1486  
1487  If C<ignoreName> or C<undefName> is used, character names should be
1488  specified as a comment (following C<#>) on each line.
1489  
1490  =item undefChar
1491  
1492  =item undefName
1493  
1494  -- see 6.3.4 Reducing the Repertoire, UTS #10.
1495  
1496  Undefines the collation element as if it were unassigned in the table.
1497  This reduces the size of the table.
1498  If an unassigned character appears in the string to be collated,
1499  the sort key is made from its codepoint
1500  as a single-character collation element,
1501  as it is greater than any other assigned collation elements
1502  (in the codepoint order among the unassigned characters).
1503  But, it'd be better to ignore characters
1504  unfamiliar to you and maybe never used.
1505  
1506  Through C<undefChar>, any character matching C<qr/$undefChar/>
1507  will be undefined. Through C<undefName>, any character whose name
1508  (given in the C<table> file as a comment) matches C<qr/$undefName/>
1509  will be undefined.
1510  
1511  ex. Collation weights for beyond-BMP characters are not stored in object:
1512  
1513      undefChar => qr/[^\0-\x{fffd}]/,
1514  
1515  =item upper_before_lower
1516  
1517  -- see 6.6 Case Comparisons, UTS #10.
1518  
1519  By default, lowercase is before uppercase.
1520  If the parameter is made true, this is reversed.
1521  
1522  B<NOTE>: This parameter simplemindedly assumes that any lowercase/uppercase
1523  distinctions must occur in level 3, and their weights at level 3 must be
1524  same as those mentioned in 7.3.1, UTS #10.
1525  If you define your collation elements which differs from this requirement,
1526  this parameter doesn't work validly.
1527  
1528  =item variable
1529  
1530  -- see 3.2.2 Variable Weighting, UTS #10.
1531  
1532  This key allows to variable weighting for variable collation elements,
1533  which are marked with an ASTERISK in the table
1534  (NOTE: Many punction marks and symbols are variable in F<allkeys.txt>).
1535  
1536     variable => 'blanked', 'non-ignorable', 'shifted', or 'shift-trimmed'.
1537  
1538  These names are case-insensitive.
1539  By default (if specification is omitted), 'shifted' is adopted.
1540  
1541     'Blanked'        Variable elements are made ignorable at levels 1 through 3;
1542                      considered at the 4th level.
1543  
1544     'Non-Ignorable'  Variable elements are not reset to ignorable.
1545  
1546     'Shifted'        Variable elements are made ignorable at levels 1 through 3
1547                      their level 4 weight is replaced by the old level 1 weight.
1548                      Level 4 weight for Non-Variable elements is 0xFFFF.
1549  
1550     'Shift-Trimmed'  Same as 'shifted', but all FFFF's at the 4th level
1551                      are trimmed.
1552  
1553  =back
1554  
1555  =head2 Methods for Collation
1556  
1557  =over 4
1558  
1559  =item C<@sorted = $Collator-E<gt>sort(@not_sorted)>
1560  
1561  Sorts a list of strings.
1562  
1563  =item C<$result = $Collator-E<gt>cmp($a, $b)>
1564  
1565  Returns 1 (when C<$a> is greater than C<$b>)
1566  or 0 (when C<$a> is equal to C<$b>)
1567  or -1 (when C<$a> is lesser than C<$b>).
1568  
1569  =item C<$result = $Collator-E<gt>eq($a, $b)>
1570  
1571  =item C<$result = $Collator-E<gt>ne($a, $b)>
1572  
1573  =item C<$result = $Collator-E<gt>lt($a, $b)>
1574  
1575  =item C<$result = $Collator-E<gt>le($a, $b)>
1576  
1577  =item C<$result = $Collator-E<gt>gt($a, $b)>
1578  
1579  =item C<$result = $Collator-E<gt>ge($a, $b)>
1580  
1581  They works like the same name operators as theirs.
1582  
1583     eq : whether $a is equal to $b.
1584     ne : whether $a is not equal to $b.
1585     lt : whether $a is lesser than $b.
1586     le : whether $a is lesser than $b or equal to $b.
1587     gt : whether $a is greater than $b.
1588     ge : whether $a is greater than $b or equal to $b.
1589  
1590  =item C<$sortKey = $Collator-E<gt>getSortKey($string)>
1591  
1592  -- see 4.3 Form Sort Key, UTS #10.
1593  
1594  Returns a sort key.
1595  
1596  You compare the sort keys using a binary comparison
1597  and get the result of the comparison of the strings using UCA.
1598  
1599     $Collator->getSortKey($a) cmp $Collator->getSortKey($b)
1600  
1601        is equivalent to
1602  
1603     $Collator->cmp($a, $b)
1604  
1605  =item C<$sortKeyForm = $Collator-E<gt>viewSortKey($string)>
1606  
1607  Converts a sorting key into its representation form.
1608  If C<UCA_Version> is 8, the output is slightly different.
1609  
1610     use Unicode::Collate;
1611     my $c = Unicode::Collate->new();
1612     print $c->viewSortKey("Perl"),"\n";
1613  
1614     # output:
1615     # [0B67 0A65 0B7F 0B03 | 0020 0020 0020 0020 | 0008 0002 0002 0002 | FFFF FFFF FFFF FFFF]
1616     #  Level 1               Level 2               Level 3               Level 4
1617  
1618  =back
1619  
1620  =head2 Methods for Searching
1621  
1622  B<DISCLAIMER:> If C<preprocess> or C<normalization> parameter is true
1623  for C<$Collator>, calling these methods (C<index>, C<match>, C<gmatch>,
1624  C<subst>, C<gsubst>) is croaked,
1625  as the position and the length might differ
1626  from those on the specified string.
1627  (And C<rearrange> and C<hangul_terminator> parameters are neglected.)
1628  
1629  The C<match>, C<gmatch>, C<subst>, C<gsubst> methods work
1630  like C<m//>, C<m//g>, C<s///>, C<s///g>, respectively,
1631  but they are not aware of any pattern, but only a literal substring.
1632  
1633  =over 4
1634  
1635  =item C<$position = $Collator-E<gt>index($string, $substring[, $position])>
1636  
1637  =item C<($position, $length) = $Collator-E<gt>index($string, $substring[, $position])>
1638  
1639  If C<$substring> matches a part of C<$string>, returns
1640  the position of the first occurrence of the matching part in scalar context;
1641  in list context, returns a two-element list of
1642  the position and the length of the matching part.
1643  
1644  If C<$substring> does not match any part of C<$string>,
1645  returns C<-1> in scalar context and
1646  an empty list in list context.
1647  
1648  e.g. you say
1649  
1650    my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
1651                                       # (normalization => undef) is REQUIRED.
1652    my $str = "Ich muß studieren Perl.";
1653    my $sub = "MÜSS";
1654    my $match;
1655    if (my($pos,$len) = $Collator->index($str, $sub)) {
1656        $match = substr($str, $pos, $len);
1657    }
1658  
1659  and get C<"muß"> in C<$match> since C<"muß">
1660  is primary equal to C<"MÜSS">.
1661  
1662  =item C<$match_ref = $Collator-E<gt>match($string, $substring)>
1663  
1664  =item C<($match)   = $Collator-E<gt>match($string, $substring)>
1665  
1666  If C<$substring> matches a part of C<$string>, in scalar context, returns
1667  B<a reference to> the first occurrence of the matching part
1668  (C<$match_ref> is always true if matches,
1669  since every reference is B<true>);
1670  in list context, returns the first occurrence of the matching part.
1671  
1672  If C<$substring> does not match any part of C<$string>,
1673  returns C<undef> in scalar context and
1674  an empty list in list context.
1675  
1676  e.g.
1677  
1678      if ($match_ref = $Collator->match($str, $sub)) { # scalar context
1679      print "matches [$$match_ref].\n";
1680      } else {
1681      print "doesn't match.\n";
1682      }
1683  
1684       or
1685  
1686      if (($match) = $Collator->match($str, $sub)) { # list context
1687      print "matches [$match].\n";
1688      } else {
1689      print "doesn't match.\n";
1690      }
1691  
1692  =item C<@match = $Collator-E<gt>gmatch($string, $substring)>
1693  
1694  If C<$substring> matches a part of C<$string>, returns
1695  all the matching parts (or matching count in scalar context).
1696  
1697  If C<$substring> does not match any part of C<$string>,
1698  returns an empty list.
1699  
1700  =item C<$count = $Collator-E<gt>subst($string, $substring, $replacement)>
1701  
1702  If C<$substring> matches a part of C<$string>,
1703  the first occurrence of the matching part is replaced by C<$replacement>
1704  (C<$string> is modified) and return C<$count> (always equals to C<1>).
1705  
1706  C<$replacement> can be a C<CODEREF>,
1707  taking the matching part as an argument,
1708  and returning a string to replace the matching part
1709  (a bit similar to C<s/(..)/$coderef-E<gt>($1)/e>).
1710  
1711  =item C<$count = $Collator-E<gt>gsubst($string, $substring, $replacement)>
1712  
1713  If C<$substring> matches a part of C<$string>,
1714  all the occurrences of the matching part is replaced by C<$replacement>
1715  (C<$string> is modified) and return C<$count>.
1716  
1717  C<$replacement> can be a C<CODEREF>,
1718  taking the matching part as an argument,
1719  and returning a string to replace the matching part
1720  (a bit similar to C<s/(..)/$coderef-E<gt>($1)/eg>).
1721  
1722  e.g.
1723  
1724    my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
1725                                       # (normalization => undef) is REQUIRED.
1726    my $str = "Camel donkey zebra came\x{301}l CAMEL horse cAm\0E\0L...";
1727    $Collator->gsubst($str, "camel", sub { "<b>$_[0]</b>" });
1728  
1729    # now $str is "<b>Camel</b> donkey zebra <b>came\x{301}l</b> <b>CAMEL</b> horse <b>cAm\0E\0L</b>...";
1730    # i.e., all the camels are made bold-faced.
1731  
1732  =back
1733  
1734  =head2 Other Methods
1735  
1736  =over 4
1737  
1738  =item C<%old_tailoring = $Collator-E<gt>change(%new_tailoring)>
1739  
1740  Change the value of specified keys and returns the changed part.
1741  
1742      $Collator = Unicode::Collate->new(level => 4);
1743  
1744      $Collator->eq("perl", "PERL"); # false
1745  
1746      %old = $Collator->change(level => 2); # returns (level => 4).
1747  
1748      $Collator->eq("perl", "PERL"); # true
1749  
1750      $Collator->change(%old); # returns (level => 2).
1751  
1752      $Collator->eq("perl", "PERL"); # false
1753  
1754  Not all C<(key,value)>s are allowed to be changed.
1755  See also C<@Unicode::Collate::ChangeOK> and C<@Unicode::Collate::ChangeNG>.
1756  
1757  In the scalar context, returns the modified collator
1758  (but it is B<not> a clone from the original).
1759  
1760      $Collator->change(level => 2)->eq("perl", "PERL"); # true
1761  
1762      $Collator->eq("perl", "PERL"); # true; now max level is 2nd.
1763  
1764      $Collator->change(level => 4)->eq("perl", "PERL"); # false
1765  
1766  =item C<$version = $Collator-E<gt>version()>
1767  
1768  Returns the version number (a string) of the Unicode Standard
1769  which the C<table> file used by the collator object is based on.
1770  If the table does not include a version line (starting with C<@version>),
1771  returns C<"unknown">.
1772  
1773  =item C<UCA_Version()>
1774  
1775  Returns the tracking version number of UTS #10 this module consults.
1776  
1777  =item C<Base_Unicode_Version()>
1778  
1779  Returns the version number of UTS #10 this module consults.
1780  
1781  =back
1782  
1783  =head1 EXPORT
1784  
1785  No method will be exported.
1786  
1787  =head1 INSTALL
1788  
1789  Though this module can be used without any C<table> file,
1790  to use this module easily, it is recommended to install a table file
1791  in the UCA format, by copying it under the directory
1792  <a place in @INC>/Unicode/Collate.
1793  
1794  The most preferable one is "The Default Unicode Collation Element Table"
1795  (aka DUCET), available from the Unicode Consortium's website:
1796  
1797     http://www.unicode.org/Public/UCA/
1798  
1799     http://www.unicode.org/Public/UCA/latest/allkeys.txt (latest version)
1800  
1801  If DUCET is not installed, it is recommended to copy the file
1802  from http://www.unicode.org/Public/UCA/latest/allkeys.txt
1803  to <a place in @INC>/Unicode/Collate/allkeys.txt
1804  manually.
1805  
1806  =head1 CAVEATS
1807  
1808  =over 4
1809  
1810  =item Normalization
1811  
1812  Use of the C<normalization> parameter requires the B<Unicode::Normalize>
1813  module (see L<Unicode::Normalize>).
1814  
1815  If you need not it (say, in the case when you need not
1816  handle any combining characters),
1817  assign C<normalization =E<gt> undef> explicitly.
1818  
1819  -- see 6.5 Avoiding Normalization, UTS #10.
1820  
1821  =item Conformance Test
1822  
1823  The Conformance Test for the UCA is available
1824  under L<http://www.unicode.org/Public/UCA/>.
1825  
1826  For F<CollationTest_SHIFTED.txt>,
1827  a collator via C<Unicode::Collate-E<gt>new( )> should be used;
1828  for F<CollationTest_NON_IGNORABLE.txt>, a collator via
1829  C<Unicode::Collate-E<gt>new(variable =E<gt> "non-ignorable", level =E<gt> 3)>.
1830  
1831  B<Unicode::Normalize is required to try The Conformance Test.>
1832  
1833  =back
1834  
1835  =head1 AUTHOR, COPYRIGHT AND LICENSE
1836  
1837  The Unicode::Collate module for perl was written by SADAHIRO Tomoyuki,
1838  <SADAHIRO@cpan.org>. This module is Copyright(C) 2001-2005,
1839  SADAHIRO Tomoyuki. Japan. All rights reserved.
1840  
1841  This module is free software; you can redistribute it and/or
1842  modify it under the same terms as Perl itself.
1843  
1844  The file Unicode/Collate/allkeys.txt was copied directly
1845  from L<http://www.unicode.org/Public/UCA/4.1.0/allkeys.txt>.
1846  This file is Copyright (c) 1991-2005 Unicode, Inc. All rights reserved.
1847  Distributed under the Terms of Use in L<http://www.unicode.org/copyright.html>.
1848  
1849  =head1 SEE ALSO
1850  
1851  =over 4
1852  
1853  =item Unicode Collation Algorithm - UTS #10
1854  
1855  L<http://www.unicode.org/reports/tr10/>
1856  
1857  =item The Default Unicode Collation Element Table (DUCET)
1858  
1859  L<http://www.unicode.org/Public/UCA/latest/allkeys.txt>
1860  
1861  =item The conformance test for the UCA
1862  
1863  L<http://www.unicode.org/Public/UCA/latest/CollationTest.html>
1864  
1865  L<http://www.unicode.org/Public/UCA/latest/CollationTest.zip>
1866  
1867  =item Hangul Syllable Type
1868  
1869  L<http://www.unicode.org/Public/UNIDATA/HangulSyllableType.txt>
1870  
1871  =item Unicode Normalization Forms - UAX #15
1872  
1873  L<http://www.unicode.org/reports/tr15/>
1874  
1875  =back
1876  
1877  =cut


Generated: Tue Mar 17 22:47:18 2015 Cross-referenced by PHPXref 0.7.1