[ Index ]

PHP Cross Reference of Unnamed Project

title

Body

[close]

/se3-unattended/var/se3/unattended/install/linuxaux/opt/perl/lib/5.10.0/Unicode/ -> UCD.pm (source)

   1  package Unicode::UCD;
   2  
   3  use strict;
   4  use warnings;
   5  
   6  our $VERSION = '0.25';
   7  
   8  use Storable qw(dclone);
   9  
  10  require Exporter;
  11  
  12  our @ISA = qw(Exporter);
  13  
  14  our @EXPORT_OK = qw(charinfo
  15              charblock charscript
  16              charblocks charscripts
  17              charinrange
  18              general_categories bidi_types
  19              compexcl
  20              casefold casespec
  21              namedseq);
  22  
  23  use Carp;
  24  
  25  =head1 NAME
  26  
  27  Unicode::UCD - Unicode character database
  28  
  29  =head1 SYNOPSIS
  30  
  31      use Unicode::UCD 'charinfo';
  32      my $charinfo   = charinfo($codepoint);
  33  
  34      use Unicode::UCD 'charblock';
  35      my $charblock  = charblock($codepoint);
  36  
  37      use Unicode::UCD 'charscript';
  38      my $charscript = charscript($codepoint);
  39  
  40      use Unicode::UCD 'charblocks';
  41      my $charblocks = charblocks();
  42  
  43      use Unicode::UCD 'charscripts';
  44      my $charscripts = charscripts();
  45  
  46      use Unicode::UCD qw(charscript charinrange);
  47      my $range = charscript($script);
  48      print "looks like $script\n" if charinrange($range, $codepoint);
  49  
  50      use Unicode::UCD qw(general_categories bidi_types);
  51      my $categories = general_categories();
  52      my $types = bidi_types();
  53  
  54      use Unicode::UCD 'compexcl';
  55      my $compexcl = compexcl($codepoint);
  56  
  57      use Unicode::UCD 'namedseq';
  58      my $namedseq = namedseq($named_sequence_name);
  59  
  60      my $unicode_version = Unicode::UCD::UnicodeVersion();
  61  
  62  =head1 DESCRIPTION
  63  
  64  The Unicode::UCD module offers a simple interface to the Unicode
  65  Character Database.
  66  
  67  =cut
  68  
  69  my $UNICODEFH;
  70  my $BLOCKSFH;
  71  my $SCRIPTSFH;
  72  my $VERSIONFH;
  73  my $COMPEXCLFH;
  74  my $CASEFOLDFH;
  75  my $CASESPECFH;
  76  my $NAMEDSEQFH;
  77  
  78  sub openunicode {
  79      my ($rfh, @path) = @_;
  80      my $f;
  81      unless (defined $$rfh) {
  82      for my $d (@INC) {
  83          use File::Spec;
  84          $f = File::Spec->catfile($d, "unicore", @path);
  85          last if open($$rfh, $f);
  86          undef $f;
  87      }
  88      croak __PACKAGE__, ": failed to find ",
  89                File::Spec->catfile(@path), " in @INC"
  90          unless defined $f;
  91      }
  92      return $f;
  93  }
  94  
  95  =head2 charinfo
  96  
  97      use Unicode::UCD 'charinfo';
  98  
  99      my $charinfo = charinfo(0x41);
 100  
 101  charinfo() returns a reference to a hash that has the following fields
 102  as defined by the Unicode standard:
 103  
 104      key
 105  
 106      code             code point with at least four hexdigits
 107      name             name of the character IN UPPER CASE
 108      category         general category of the character
 109      combining        classes used in the Canonical Ordering Algorithm
 110      bidi             bidirectional type
 111      decomposition    character decomposition mapping
 112      decimal          if decimal digit this is the integer numeric value
 113      digit            if digit this is the numeric value
 114      numeric          if numeric is the integer or rational numeric value
 115      mirrored         if mirrored in bidirectional text
 116      unicode10        Unicode 1.0 name if existed and different
 117      comment          ISO 10646 comment field
 118      upper            uppercase equivalent mapping
 119      lower            lowercase equivalent mapping
 120      title            titlecase equivalent mapping
 121  
 122      block            block the character belongs to (used in \p{In...})
 123      script           script the character belongs to
 124  
 125  If no match is found, a reference to an empty hash is returned.
 126  
 127  The C<block> property is the same as returned by charinfo().  It is
 128  not defined in the Unicode Character Database proper (Chapter 4 of the
 129  Unicode 3.0 Standard, aka TUS3) but instead in an auxiliary database
 130  (Chapter 14 of TUS3).  Similarly for the C<script> property.
 131  
 132  Note that you cannot do (de)composition and casing based solely on the
 133  above C<decomposition> and C<lower>, C<upper>, C<title>, properties,
 134  you will need also the compexcl(), casefold(), and casespec() functions.
 135  
 136  =cut
 137  
 138  # NB: This function is duplicated in charnames.pm
 139  sub _getcode {
 140      my $arg = shift;
 141  
 142      if ($arg =~ /^[1-9]\d*$/) {
 143      return $arg;
 144      } elsif ($arg =~ /^(?:[Uu]\+|0[xX])?([[:xdigit:]]+)$/) {
 145      return hex($1);
 146      }
 147  
 148      return;
 149  }
 150  
 151  # Lingua::KO::Hangul::Util not part of the standard distribution
 152  # but it will be used if available.
 153  
 154  eval { require Lingua::KO::Hangul::Util };
 155  my $hasHangulUtil = ! $@;
 156  if ($hasHangulUtil) {
 157      Lingua::KO::Hangul::Util->import();
 158  }
 159  
 160  sub hangul_decomp { # internal: called from charinfo
 161      if ($hasHangulUtil) {
 162      my @tmp = decomposeHangul(shift);
 163      return sprintf("%04X %04X",      @tmp) if @tmp == 2;
 164      return sprintf("%04X %04X %04X", @tmp) if @tmp == 3;
 165      }
 166      return;
 167  }
 168  
 169  sub hangul_charname { # internal: called from charinfo
 170      return sprintf("HANGUL SYLLABLE-%04X", shift);
 171  }
 172  
 173  sub han_charname { # internal: called from charinfo
 174      return sprintf("CJK UNIFIED IDEOGRAPH-%04X", shift);
 175  }
 176  
 177  my @CharinfoRanges = (
 178  # block name
 179  # [ first, last, coderef to name, coderef to decompose ],
 180  # CJK Ideographs Extension A
 181    [ 0x3400,   0x4DB5,   \&han_charname,   undef  ],
 182  # CJK Ideographs
 183    [ 0x4E00,   0x9FA5,   \&han_charname,   undef  ],
 184  # Hangul Syllables
 185    [ 0xAC00,   0xD7A3,   $hasHangulUtil ? \&getHangulName : \&hangul_charname,  \&hangul_decomp ],
 186  # Non-Private Use High Surrogates
 187    [ 0xD800,   0xDB7F,   undef,   undef  ],
 188  # Private Use High Surrogates
 189    [ 0xDB80,   0xDBFF,   undef,   undef  ],
 190  # Low Surrogates
 191    [ 0xDC00,   0xDFFF,   undef,   undef  ],
 192  # The Private Use Area
 193    [ 0xE000,   0xF8FF,   undef,   undef  ],
 194  # CJK Ideographs Extension B
 195    [ 0x20000,  0x2A6D6,  \&han_charname,   undef  ],
 196  # Plane 15 Private Use Area
 197    [ 0xF0000,  0xFFFFD,  undef,   undef  ],
 198  # Plane 16 Private Use Area
 199    [ 0x100000, 0x10FFFD, undef,   undef  ],
 200  );
 201  
 202  sub charinfo {
 203      my $arg  = shift;
 204      my $code = _getcode($arg);
 205      croak __PACKAGE__, "::charinfo: unknown code '$arg'"
 206      unless defined $code;
 207      my $hexk = sprintf("%06X", $code);
 208      my($rcode,$rname,$rdec);
 209      foreach my $range (@CharinfoRanges){
 210        if ($range->[0] <= $code && $code <= $range->[1]) {
 211          $rcode = $hexk;
 212      $rcode =~ s/^0+//;
 213      $rcode =  sprintf("%04X", hex($rcode));
 214          $rname = $range->[2] ? $range->[2]->($code) : '';
 215          $rdec  = $range->[3] ? $range->[3]->($code) : '';
 216          $hexk  = sprintf("%06X", $range->[0]); # replace by the first
 217          last;
 218        }
 219      }
 220      openunicode(\$UNICODEFH, "UnicodeData.txt");
 221      if (defined $UNICODEFH) {
 222      use Search::Dict 1.02;
 223      if (look($UNICODEFH, "$hexk;", { xfrm => sub { $_[0] =~ /^([^;]+);(.+)/; sprintf "%06X;$2", hex($1) } } ) >= 0) {
 224          my $line = <$UNICODEFH>;
 225          return unless defined $line;
 226          chomp $line;
 227          my %prop;
 228          @prop{qw(
 229               code name category
 230               combining bidi decomposition
 231               decimal digit numeric
 232               mirrored unicode10 comment
 233               upper lower title
 234              )} = split(/;/, $line, -1);
 235          $hexk =~ s/^0+//;
 236          $hexk =  sprintf("%04X", hex($hexk));
 237          if ($prop{code} eq $hexk) {
 238          $prop{block}  = charblock($code);
 239          $prop{script} = charscript($code);
 240          if(defined $rname){
 241                      $prop{code} = $rcode;
 242                      $prop{name} = $rname;
 243                      $prop{decomposition} = $rdec;
 244                  }
 245          return \%prop;
 246          }
 247      }
 248      }
 249      return;
 250  }
 251  
 252  sub _search { # Binary search in a [[lo,hi,prop],[...],...] table.
 253      my ($table, $lo, $hi, $code) = @_;
 254  
 255      return if $lo > $hi;
 256  
 257      my $mid = int(($lo+$hi) / 2);
 258  
 259      if ($table->[$mid]->[0] < $code) {
 260      if ($table->[$mid]->[1] >= $code) {
 261          return $table->[$mid]->[2];
 262      } else {
 263          _search($table, $mid + 1, $hi, $code);
 264      }
 265      } elsif ($table->[$mid]->[0] > $code) {
 266      _search($table, $lo, $mid - 1, $code);
 267      } else {
 268      return $table->[$mid]->[2];
 269      }
 270  }
 271  
 272  sub charinrange {
 273      my ($range, $arg) = @_;
 274      my $code = _getcode($arg);
 275      croak __PACKAGE__, "::charinrange: unknown code '$arg'"
 276      unless defined $code;
 277      _search($range, 0, $#$range, $code);
 278  }
 279  
 280  =head2 charblock
 281  
 282      use Unicode::UCD 'charblock';
 283  
 284      my $charblock = charblock(0x41);
 285      my $charblock = charblock(1234);
 286      my $charblock = charblock("0x263a");
 287      my $charblock = charblock("U+263a");
 288  
 289      my $range     = charblock('Armenian');
 290  
 291  With a B<code point argument> charblock() returns the I<block> the character
 292  belongs to, e.g.  C<Basic Latin>.  Note that not all the character
 293  positions within all blocks are defined.
 294  
 295  See also L</Blocks versus Scripts>.
 296  
 297  If supplied with an argument that can't be a code point, charblock() tries
 298  to do the opposite and interpret the argument as a character block. The
 299  return value is a I<range>: an anonymous list of lists that contain
 300  I<start-of-range>, I<end-of-range> code point pairs. You can test whether
 301  a code point is in a range using the L</charinrange> function. If the
 302  argument is not a known character block, C<undef> is returned.
 303  
 304  =cut
 305  
 306  my @BLOCKS;
 307  my %BLOCKS;
 308  
 309  sub _charblocks {
 310      unless (@BLOCKS) {
 311      if (openunicode(\$BLOCKSFH, "Blocks.txt")) {
 312          local $_;
 313          while (<$BLOCKSFH>) {
 314          if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) {
 315              my ($lo, $hi) = (hex($1), hex($2));
 316              my $subrange = [ $lo, $hi, $3 ];
 317              push @BLOCKS, $subrange;
 318              push @{$BLOCKS{$3}}, $subrange;
 319          }
 320          }
 321          close($BLOCKSFH);
 322      }
 323      }
 324  }
 325  
 326  sub charblock {
 327      my $arg = shift;
 328  
 329      _charblocks() unless @BLOCKS;
 330  
 331      my $code = _getcode($arg);
 332  
 333      if (defined $code) {
 334      _search(\@BLOCKS, 0, $#BLOCKS, $code);
 335      } else {
 336      if (exists $BLOCKS{$arg}) {
 337          return dclone $BLOCKS{$arg};
 338      } else {
 339          return;
 340      }
 341      }
 342  }
 343  
 344  =head2 charscript
 345  
 346      use Unicode::UCD 'charscript';
 347  
 348      my $charscript = charscript(0x41);
 349      my $charscript = charscript(1234);
 350      my $charscript = charscript("U+263a");
 351  
 352      my $range      = charscript('Thai');
 353  
 354  With a B<code point argument> charscript() returns the I<script> the
 355  character belongs to, e.g.  C<Latin>, C<Greek>, C<Han>.
 356  
 357  See also L</Blocks versus Scripts>.
 358  
 359  If supplied with an argument that can't be a code point, charscript() tries
 360  to do the opposite and interpret the argument as a character script. The
 361  return value is a I<range>: an anonymous list of lists that contain
 362  I<start-of-range>, I<end-of-range> code point pairs. You can test whether a
 363  code point is in a range using the L</charinrange> function. If the
 364  argument is not a known character script, C<undef> is returned.
 365  
 366  =cut
 367  
 368  my @SCRIPTS;
 369  my %SCRIPTS;
 370  
 371  sub _charscripts {
 372      unless (@SCRIPTS) {
 373      if (openunicode(\$SCRIPTSFH, "Scripts.txt")) {
 374          local $_;
 375          while (<$SCRIPTSFH>) {
 376          if (/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/) {
 377              my ($lo, $hi) = (hex($1), $2 ? hex($2) : hex($1));
 378              my $script = lc($3);
 379              $script =~ s/\b(\w)/uc($1)/ge;
 380              my $subrange = [ $lo, $hi, $script ];
 381              push @SCRIPTS, $subrange;
 382              push @{$SCRIPTS{$script}}, $subrange;
 383          }
 384          }
 385          close($SCRIPTSFH);
 386          @SCRIPTS = sort { $a->[0] <=> $b->[0] } @SCRIPTS;
 387      }
 388      }
 389  }
 390  
 391  sub charscript {
 392      my $arg = shift;
 393  
 394      _charscripts() unless @SCRIPTS;
 395  
 396      my $code = _getcode($arg);
 397  
 398      if (defined $code) {
 399      _search(\@SCRIPTS, 0, $#SCRIPTS, $code);
 400      } else {
 401      if (exists $SCRIPTS{$arg}) {
 402          return dclone $SCRIPTS{$arg};
 403      } else {
 404          return;
 405      }
 406      }
 407  }
 408  
 409  =head2 charblocks
 410  
 411      use Unicode::UCD 'charblocks';
 412  
 413      my $charblocks = charblocks();
 414  
 415  charblocks() returns a reference to a hash with the known block names
 416  as the keys, and the code point ranges (see L</charblock>) as the values.
 417  
 418  See also L</Blocks versus Scripts>.
 419  
 420  =cut
 421  
 422  sub charblocks {
 423      _charblocks() unless %BLOCKS;
 424      return dclone \%BLOCKS;
 425  }
 426  
 427  =head2 charscripts
 428  
 429      use Unicode::UCD 'charscripts';
 430  
 431      my $charscripts = charscripts();
 432  
 433  charscripts() returns a reference to a hash with the known script
 434  names as the keys, and the code point ranges (see L</charscript>) as
 435  the values.
 436  
 437  See also L</Blocks versus Scripts>.
 438  
 439  =cut
 440  
 441  sub charscripts {
 442      _charscripts() unless %SCRIPTS;
 443      return dclone \%SCRIPTS;
 444  }
 445  
 446  =head2 Blocks versus Scripts
 447  
 448  The difference between a block and a script is that scripts are closer
 449  to the linguistic notion of a set of characters required to present
 450  languages, while block is more of an artifact of the Unicode character
 451  numbering and separation into blocks of (mostly) 256 characters.
 452  
 453  For example the Latin B<script> is spread over several B<blocks>, such
 454  as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and
 455  C<Latin Extended-B>.  On the other hand, the Latin script does not
 456  contain all the characters of the C<Basic Latin> block (also known as
 457  the ASCII): it includes only the letters, and not, for example, the digits
 458  or the punctuation.
 459  
 460  For blocks see http://www.unicode.org/Public/UNIDATA/Blocks.txt
 461  
 462  For scripts see UTR #24: http://www.unicode.org/unicode/reports/tr24/
 463  
 464  =head2 Matching Scripts and Blocks
 465  
 466  Scripts are matched with the regular-expression construct
 467  C<\p{...}> (e.g. C<\p{Tibetan}> matches characters of the Tibetan script),
 468  while C<\p{In...}> is used for blocks (e.g. C<\p{InTibetan}> matches
 469  any of the 256 code points in the Tibetan block).
 470  
 471  =head2 Code Point Arguments
 472  
 473  A I<code point argument> is either a decimal or a hexadecimal scalar
 474  designating a Unicode character, or C<U+> followed by hexadecimals
 475  designating a Unicode character.  In other words, if you want a code
 476  point to be interpreted as a hexadecimal number, you must prefix it
 477  with either C<0x> or C<U+>, because a string like e.g. C<123> will
 478  be interpreted as a decimal code point.  Also note that Unicode is
 479  B<not> limited to 16 bits (the number of Unicode characters is
 480  open-ended, in theory unlimited): you may have more than 4 hexdigits.
 481  
 482  =head2 charinrange
 483  
 484  In addition to using the C<\p{In...}> and C<\P{In...}> constructs, you
 485  can also test whether a code point is in the I<range> as returned by
 486  L</charblock> and L</charscript> or as the values of the hash returned
 487  by L</charblocks> and L</charscripts> by using charinrange():
 488  
 489      use Unicode::UCD qw(charscript charinrange);
 490  
 491      $range = charscript('Hiragana');
 492      print "looks like hiragana\n" if charinrange($range, $codepoint);
 493  
 494  =cut
 495  
 496  my %GENERAL_CATEGORIES =
 497   (
 498      'L'  =>         'Letter',
 499      'LC' =>         'CasedLetter',
 500      'Lu' =>         'UppercaseLetter',
 501      'Ll' =>         'LowercaseLetter',
 502      'Lt' =>         'TitlecaseLetter',
 503      'Lm' =>         'ModifierLetter',
 504      'Lo' =>         'OtherLetter',
 505      'M'  =>         'Mark',
 506      'Mn' =>         'NonspacingMark',
 507      'Mc' =>         'SpacingMark',
 508      'Me' =>         'EnclosingMark',
 509      'N'  =>         'Number',
 510      'Nd' =>         'DecimalNumber',
 511      'Nl' =>         'LetterNumber',
 512      'No' =>         'OtherNumber',
 513      'P'  =>         'Punctuation',
 514      'Pc' =>         'ConnectorPunctuation',
 515      'Pd' =>         'DashPunctuation',
 516      'Ps' =>         'OpenPunctuation',
 517      'Pe' =>         'ClosePunctuation',
 518      'Pi' =>         'InitialPunctuation',
 519      'Pf' =>         'FinalPunctuation',
 520      'Po' =>         'OtherPunctuation',
 521      'S'  =>         'Symbol',
 522      'Sm' =>         'MathSymbol',
 523      'Sc' =>         'CurrencySymbol',
 524      'Sk' =>         'ModifierSymbol',
 525      'So' =>         'OtherSymbol',
 526      'Z'  =>         'Separator',
 527      'Zs' =>         'SpaceSeparator',
 528      'Zl' =>         'LineSeparator',
 529      'Zp' =>         'ParagraphSeparator',
 530      'C'  =>         'Other',
 531      'Cc' =>         'Control',
 532      'Cf' =>         'Format',
 533      'Cs' =>         'Surrogate',
 534      'Co' =>         'PrivateUse',
 535      'Cn' =>         'Unassigned',
 536   );
 537  
 538  sub general_categories {
 539      return dclone \%GENERAL_CATEGORIES;
 540  }
 541  
 542  =head2 general_categories
 543  
 544      use Unicode::UCD 'general_categories';
 545  
 546      my $categories = general_categories();
 547  
 548  The general_categories() returns a reference to a hash which has short
 549  general category names (such as C<Lu>, C<Nd>, C<Zs>, C<S>) as keys and long
 550  names (such as C<UppercaseLetter>, C<DecimalNumber>, C<SpaceSeparator>,
 551  C<Symbol>) as values.  The hash is reversible in case you need to go
 552  from the long names to the short names.  The general category is the
 553  one returned from charinfo() under the C<category> key.
 554  
 555  =cut
 556  
 557  my %BIDI_TYPES =
 558   (
 559     'L'   => 'Left-to-Right',
 560     'LRE' => 'Left-to-Right Embedding',
 561     'LRO' => 'Left-to-Right Override',
 562     'R'   => 'Right-to-Left',
 563     'AL'  => 'Right-to-Left Arabic',
 564     'RLE' => 'Right-to-Left Embedding',
 565     'RLO' => 'Right-to-Left Override',
 566     'PDF' => 'Pop Directional Format',
 567     'EN'  => 'European Number',
 568     'ES'  => 'European Number Separator',
 569     'ET'  => 'European Number Terminator',
 570     'AN'  => 'Arabic Number',
 571     'CS'  => 'Common Number Separator',
 572     'NSM' => 'Non-Spacing Mark',
 573     'BN'  => 'Boundary Neutral',
 574     'B'   => 'Paragraph Separator',
 575     'S'   => 'Segment Separator',
 576     'WS'  => 'Whitespace',
 577     'ON'  => 'Other Neutrals',
 578   ); 
 579  
 580  sub bidi_types {
 581      return dclone \%BIDI_TYPES;
 582  }
 583  
 584  =head2 bidi_types
 585  
 586      use Unicode::UCD 'bidi_types';
 587  
 588      my $categories = bidi_types();
 589  
 590  The bidi_types() returns a reference to a hash which has the short
 591  bidi (bidirectional) type names (such as C<L>, C<R>) as keys and long
 592  names (such as C<Left-to-Right>, C<Right-to-Left>) as values.  The
 593  hash is reversible in case you need to go from the long names to the
 594  short names.  The bidi type is the one returned from charinfo()
 595  under the C<bidi> key.  For the exact meaning of the various bidi classes
 596  the Unicode TR9 is recommended reading:
 597  http://www.unicode.org/reports/tr9/tr9-17.html
 598  (as of Unicode 5.0.0)
 599  
 600  =cut
 601  
 602  =head2 compexcl
 603  
 604      use Unicode::UCD 'compexcl';
 605  
 606      my $compexcl = compexcl("09dc");
 607  
 608  The compexcl() returns the composition exclusion (that is, if the
 609  character should not be produced during a precomposition) of the 
 610  character specified by a B<code point argument>.
 611  
 612  If there is a composition exclusion for the character, true is
 613  returned.  Otherwise, false is returned.
 614  
 615  =cut
 616  
 617  my %COMPEXCL;
 618  
 619  sub _compexcl {
 620      unless (%COMPEXCL) {
 621      if (openunicode(\$COMPEXCLFH, "CompositionExclusions.txt")) {
 622          local $_;
 623          while (<$COMPEXCLFH>) {
 624          if (/^([0-9A-F]+)\s+\#\s+/) {
 625              my $code = hex($1);
 626              $COMPEXCL{$code} = undef;
 627          }
 628          }
 629          close($COMPEXCLFH);
 630      }
 631      }
 632  }
 633  
 634  sub compexcl {
 635      my $arg  = shift;
 636      my $code = _getcode($arg);
 637      croak __PACKAGE__, "::compexcl: unknown code '$arg'"
 638      unless defined $code;
 639  
 640      _compexcl() unless %COMPEXCL;
 641  
 642      return exists $COMPEXCL{$code};
 643  }
 644  
 645  =head2 casefold
 646  
 647      use Unicode::UCD 'casefold';
 648  
 649      my $casefold = casefold("00DF");
 650  
 651  The casefold() returns the locale-independent case folding of the
 652  character specified by a B<code point argument>.
 653  
 654  If there is a case folding for that character, a reference to a hash
 655  with the following fields is returned:
 656  
 657      key
 658  
 659      code             code point with at least four hexdigits
 660      status           "C", "F", "S", or "I"
 661      mapping          one or more codes separated by spaces
 662  
 663  The meaning of the I<status> is as follows:
 664  
 665     C                 common case folding, common mappings shared
 666                       by both simple and full mappings
 667     F                 full case folding, mappings that cause strings
 668                       to grow in length. Multiple characters are separated
 669                       by spaces
 670     S                 simple case folding, mappings to single characters
 671                       where different from F
 672     I                 special case for dotted uppercase I and
 673                       dotless lowercase i
 674                       - If this mapping is included, the result is
 675                         case-insensitive, but dotless and dotted I's
 676                         are not distinguished
 677                       - If this mapping is excluded, the result is not
 678                         fully case-insensitive, but dotless and dotted
 679                         I's are distinguished
 680  
 681  If there is no case folding for that character, C<undef> is returned.
 682  
 683  For more information about case mappings see
 684  http://www.unicode.org/unicode/reports/tr21/
 685  
 686  =cut
 687  
 688  my %CASEFOLD;
 689  
 690  sub _casefold {
 691      unless (%CASEFOLD) {
 692      if (openunicode(\$CASEFOLDFH, "CaseFolding.txt")) {
 693          local $_;
 694          while (<$CASEFOLDFH>) {
 695          if (/^([0-9A-F]+); ([CFSI]); ([0-9A-F]+(?: [0-9A-F]+)*);/) {
 696              my $code = hex($1);
 697              $CASEFOLD{$code} = { code    => $1,
 698                       status  => $2,
 699                       mapping => $3 };
 700          }
 701          }
 702          close($CASEFOLDFH);
 703      }
 704      }
 705  }
 706  
 707  sub casefold {
 708      my $arg  = shift;
 709      my $code = _getcode($arg);
 710      croak __PACKAGE__, "::casefold: unknown code '$arg'"
 711      unless defined $code;
 712  
 713      _casefold() unless %CASEFOLD;
 714  
 715      return $CASEFOLD{$code};
 716  }
 717  
 718  =head2 casespec
 719  
 720      use Unicode::UCD 'casespec';
 721  
 722      my $casespec = casespec("FB00");
 723  
 724  The casespec() returns the potentially locale-dependent case mapping
 725  of the character specified by a B<code point argument>.  The mapping
 726  may change the length of the string (which the basic Unicode case
 727  mappings as returned by charinfo() never do).
 728  
 729  If there is a case folding for that character, a reference to a hash
 730  with the following fields is returned:
 731  
 732      key
 733  
 734      code             code point with at least four hexdigits
 735      lower            lowercase
 736      title            titlecase
 737      upper            uppercase
 738      condition        condition list (may be undef)
 739  
 740  The C<condition> is optional.  Where present, it consists of one or
 741  more I<locales> or I<contexts>, separated by spaces (other than as
 742  used to separate elements, spaces are to be ignored).  A condition
 743  list overrides the normal behavior if all of the listed conditions are
 744  true.  Case distinctions in the condition list are not significant.
 745  Conditions preceded by "NON_" represent the negation of the condition.
 746  
 747  Note that when there are multiple case folding definitions for a
 748  single code point because of different locales, the value returned by
 749  casespec() is a hash reference which has the locales as the keys and
 750  hash references as described above as the values.
 751  
 752  A I<locale> is defined as a 2-letter ISO 3166 country code, possibly
 753  followed by a "_" and a 2-letter ISO language code (possibly followed
 754  by a "_" and a variant code).  You can find the lists of those codes,
 755  see L<Locale::Country> and L<Locale::Language>.
 756  
 757  A I<context> is one of the following choices:
 758  
 759      FINAL            The letter is not followed by a letter of
 760                       general category L (e.g. Ll, Lt, Lu, Lm, or Lo)
 761      MODERN           The mapping is only used for modern text
 762      AFTER_i          The last base character was "i" (U+0069)
 763  
 764  For more information about case mappings see
 765  http://www.unicode.org/unicode/reports/tr21/
 766  
 767  =cut
 768  
 769  my %CASESPEC;
 770  
 771  sub _casespec {
 772      unless (%CASESPEC) {
 773      if (openunicode(\$CASESPECFH, "SpecialCasing.txt")) {
 774          local $_;
 775          while (<$CASESPECFH>) {
 776          if (/^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; (\w+(?: \w+)*)?/) {
 777              my ($hexcode, $lower, $title, $upper, $condition) =
 778              ($1, $2, $3, $4, $5);
 779              my $code = hex($hexcode);
 780              if (exists $CASESPEC{$code}) {
 781              if (exists $CASESPEC{$code}->{code}) {
 782                  my ($oldlower,
 783                  $oldtitle,
 784                  $oldupper,
 785                  $oldcondition) =
 786                      @{$CASESPEC{$code}}{qw(lower
 787                                 title
 788                                 upper
 789                                 condition)};
 790                  if (defined $oldcondition) {
 791                  my ($oldlocale) =
 792                  ($oldcondition =~ /^([a-z][a-z](?:_\S+)?)/);
 793                  delete $CASESPEC{$code};
 794                  $CASESPEC{$code}->{$oldlocale} =
 795                  { code      => $hexcode,
 796                    lower     => $oldlower,
 797                    title     => $oldtitle,
 798                    upper     => $oldupper,
 799                    condition => $oldcondition };
 800                  }
 801              }
 802              my ($locale) =
 803                  ($condition =~ /^([a-z][a-z](?:_\S+)?)/);
 804              $CASESPEC{$code}->{$locale} =
 805              { code      => $hexcode,
 806                lower     => $lower,
 807                title     => $title,
 808                upper     => $upper,
 809                condition => $condition };
 810              } else {
 811              $CASESPEC{$code} =
 812              { code      => $hexcode,
 813                lower     => $lower,
 814                title     => $title,
 815                upper     => $upper,
 816                condition => $condition };
 817              }
 818          }
 819          }
 820          close($CASESPECFH);
 821      }
 822      }
 823  }
 824  
 825  sub casespec {
 826      my $arg  = shift;
 827      my $code = _getcode($arg);
 828      croak __PACKAGE__, "::casespec: unknown code '$arg'"
 829      unless defined $code;
 830  
 831      _casespec() unless %CASESPEC;
 832  
 833      return ref $CASESPEC{$code} ? dclone $CASESPEC{$code} : $CASESPEC{$code};
 834  }
 835  
 836  =head2 namedseq()
 837  
 838      use Unicode::UCD 'namedseq';
 839  
 840      my $namedseq = namedseq("KATAKANA LETTER AINU P");
 841      my @namedseq = namedseq("KATAKANA LETTER AINU P");
 842      my %namedseq = namedseq();
 843  
 844  If used with a single argument in a scalar context, returns the string
 845  consisting of the code points of the named sequence, or C<undef> if no
 846  named sequence by that name exists.  If used with a single argument in
 847  a list context, returns list of the code points.  If used with no
 848  arguments in a list context, returns a hash with the names of the
 849  named sequences as the keys and the named sequences as strings as
 850  the values.  Otherwise, returns C<undef> or empty list depending
 851  on the context.
 852  
 853  (New from Unicode 4.1.0)
 854  
 855  =cut
 856  
 857  my %NAMEDSEQ;
 858  
 859  sub _namedseq {
 860      unless (%NAMEDSEQ) {
 861      if (openunicode(\$NAMEDSEQFH, "NamedSequences.txt")) {
 862          local $_;
 863          while (<$NAMEDSEQFH>) {
 864          if (/^(.+)\s*;\s*([0-9A-F]+(?: [0-9A-F]+)*)$/) {
 865              my ($n, $s) = ($1, $2);
 866              my @s = map { chr(hex($_)) } split(' ', $s);
 867              $NAMEDSEQ{$n} = join("", @s);
 868          }
 869          }
 870          close($NAMEDSEQFH);
 871      }
 872      }
 873  }
 874  
 875  sub namedseq {
 876      _namedseq() unless %NAMEDSEQ;
 877      my $wantarray = wantarray();
 878      if (defined $wantarray) {
 879      if ($wantarray) {
 880          if (@_ == 0) {
 881          return %NAMEDSEQ;
 882          } elsif (@_ == 1) {
 883          my $s = $NAMEDSEQ{ $_[0] };
 884          return defined $s ? map { ord($_) } split('', $s) : ();
 885          }
 886      } elsif (@_ == 1) {
 887          return $NAMEDSEQ{ $_[0] };
 888      }
 889      }
 890      return;
 891  }
 892  
 893  =head2 Unicode::UCD::UnicodeVersion
 894  
 895  Unicode::UCD::UnicodeVersion() returns the version of the Unicode
 896  Character Database, in other words, the version of the Unicode
 897  standard the database implements.  The version is a string
 898  of numbers delimited by dots (C<'.'>).
 899  
 900  =cut
 901  
 902  my $UNICODEVERSION;
 903  
 904  sub UnicodeVersion {
 905      unless (defined $UNICODEVERSION) {
 906      openunicode(\$VERSIONFH, "version");
 907      chomp($UNICODEVERSION = <$VERSIONFH>);
 908      close($VERSIONFH);
 909      croak __PACKAGE__, "::VERSION: strange version '$UNICODEVERSION'"
 910          unless $UNICODEVERSION =~ /^\d+(?:\.\d+)+$/;
 911      }
 912      return $UNICODEVERSION;
 913  }
 914  
 915  =head2 Implementation Note
 916  
 917  The first use of charinfo() opens a read-only filehandle to the Unicode
 918  Character Database (the database is included in the Perl distribution).
 919  The filehandle is then kept open for further queries.  In other words,
 920  if you are wondering where one of your filehandles went, that's where.
 921  
 922  =head1 BUGS
 923  
 924  Does not yet support EBCDIC platforms.
 925  
 926  =head1 AUTHOR
 927  
 928  Jarkko Hietaniemi
 929  
 930  =cut
 931  
 932  1;


Generated: Tue Mar 17 22:47:18 2015 Cross-referenced by PHPXref 0.7.1