[ Index ]

PHP Cross Reference of Unnamed Project

title

Body

[close]

/se3-unattended/var/se3/unattended/install/linuxaux/opt/perl/lib/5.10.0/i586-linux-thread-multi/ -> Encode.pm (source)

   1  #
   2  # $Id: Encode.pm,v 2.23 2007/05/29 18:15:32 dankogai Exp dankogai $
   3  #
   4  package Encode;
   5  use strict;
   6  use warnings;
   7  our $VERSION = sprintf "%d.%02d", q$Revision: 2.23 $ =~ /(\d+)/g;
   8  sub DEBUG () { 0 }
   9  use XSLoader ();
  10  XSLoader::load( __PACKAGE__, $VERSION );
  11  
  12  require Exporter;
  13  use base qw/Exporter/;
  14  
  15  # Public, encouraged API is exported by default
  16  
  17  our @EXPORT = qw(
  18    decode  decode_utf8  encode  encode_utf8 str2bytes bytes2str
  19    encodings  find_encoding clone_encoding
  20  );
  21  our @FB_FLAGS = qw(
  22    DIE_ON_ERR WARN_ON_ERR RETURN_ON_ERR LEAVE_SRC
  23    PERLQQ HTMLCREF XMLCREF STOP_AT_PARTIAL
  24  );
  25  our @FB_CONSTS = qw(
  26    FB_DEFAULT FB_CROAK FB_QUIET FB_WARN
  27    FB_PERLQQ FB_HTMLCREF FB_XMLCREF
  28  );
  29  our @EXPORT_OK = (
  30      qw(
  31        _utf8_off _utf8_on define_encoding from_to is_16bit is_8bit
  32        is_utf8 perlio_ok resolve_alias utf8_downgrade utf8_upgrade
  33        ),
  34      @FB_FLAGS, @FB_CONSTS,
  35  );
  36  
  37  our %EXPORT_TAGS = (
  38      all          => [ @EXPORT,    @EXPORT_OK ],
  39      fallbacks    => [@FB_CONSTS],
  40      fallback_all => [ @FB_CONSTS, @FB_FLAGS ],
  41  );
  42  
  43  # Documentation moved after __END__ for speed - NI-S
  44  
  45  our $ON_EBCDIC = ( ord("A") == 193 );
  46  
  47  use Encode::Alias;
  48  
  49  # Make a %Encoding package variable to allow a certain amount of cheating
  50  our %Encoding;
  51  our %ExtModule;
  52  require Encode::Config;
  53  eval { require Encode::ConfigLocal };
  54  
  # Class method: list the names of known encodings, sorted case-insensitively.
  #
  #   Encode->encodings()           names registered in %Encoding
  #   Encode->encodings(":all")     the above plus every name in %ExtModule
  #   Encode->encodings("JP", ...)  the above plus the %ExtModule names that
  #                                 map to the given module(s); "Encode::" is
  #                                 prepended to a module name lacking "::"
  #
  # The names Internal, Unicode and Guess are never reported.
  sub encodings {
      my ( $class, @wanted ) = @_;

      my %known = %Encoding;
      if ( @wanted and $wanted[0] eq ":all" ) {
          %known = ( %Encoding, %ExtModule );
      }
      else {
          # Normalise every requested module name before matching.
          my @modules =
            map { $_ =~ m/::/ ? $_ : "Encode::$_" } @wanted;
          for my $module (@modules) {
              DEBUG and warn $module;
              for my $name ( keys %ExtModule ) {
                  next unless $ExtModule{$name} eq $module;
                  $known{$name} = $module;
              }
          }
      }

      return sort { lc $a cmp lc $b }
        grep { $_ !~ /^(?:Internal|Unicode|Guess)$/ } keys %known;
  }
  73  
  # Report whether an encoding can be used as a PerlIO layer.
  #
  # Accepts either an encoding object or an encoding name (resolved with
  # find_encoding()).  Returns whatever the object's own perlio_ok() method
  # returns, or 0 when there is nothing to ask: the name is unknown, the
  # argument is an unblessed reference, or the object has no perlio_ok method.
  sub perlio_ok {
      my ($target) = @_;
      my $obj = ref $target ? $target : find_encoding($target);

      # Only a blessed object can be queried with ->can; anything else
      # (undef from a failed lookup, a plain reference) would die there.
      require Scalar::Util;
      return 0 unless Scalar::Util::blessed($obj);

      $obj->can("perlio_ok") and return $obj->perlio_ok();
      return 0;    # safety net
  }
  79  
  # Register an encoding object under its canonical name.
  #
  #   define_encoding($obj, $name, @aliases)
  #
  # Stores $obj in %Encoding under $name, aliases the lower-cased name to it
  # when that differs from $name, then defines every extra alias in order.
  # Returns $obj.
  sub define_encoding {
      my ( $obj, $name, @aliases ) = @_;

      $Encoding{$name} = $obj;

      my $folded = lc $name;
      if ( $folded ne $name ) {
          define_alias( $folded => $obj );
      }

      for my $alias (@aliases) {
          define_alias( $alias, $obj );
      }

      return $obj;
  }
  92  
  # Class method: resolve an encoding name (or object) to an encoding object.
  #
  #   $class->getEncoding($name [, $skip_external])
  #
  # Lookup order:
  #   1. $name is itself an object that can('renew')  -> returned as is
  #   2. %Encoding under $name, then under lc($name)
  #   3. $class->find_alias() for $name, then for lc($name)
  #   4. unless $skip_external is true: load the module that %ExtModule lists
  #      for $name or lc($name), then look in %Encoding again
  #
  # Returns the encoding object, or nothing (undef / empty list) when no
  # encoding matches.
  sub getEncoding {
      my ( $class, $name, $skip_external ) = @_;

      # An undefined name cannot match anything; bail out before it is used
      # as a hash key or handed to the alias machinery.
      defined $name or return;

      if ( ref $name ) {
          # ->can is only legal on blessed references.
          require Scalar::Util;
          Scalar::Util::blessed($name) && $name->can('renew')
            and return $name;
      }
      exists $Encoding{$name} and return $Encoding{$name};
      my $lc = lc $name;
      exists $Encoding{$lc} and return $Encoding{$lc};

      my $oc = $class->find_alias($name);
      defined($oc) and return $oc;
      $lc ne $name and $oc = $class->find_alias($lc);
      defined($oc) and return $oc;

      unless ($skip_external) {
          if ( my $mod = $ExtModule{$name} || $ExtModule{$lc} ) {
              $mod =~ s,::,/,g;
              $mod .= '.pm';

              # Best effort: a module that fails to load simply leaves the
              # encoding undefined.
              eval { require $mod; };

              # The module may have been selected through the lower-cased
              # key, in which case its encoding is registered under that
              # key, so both spellings have to be checked.
              for my $key ( $name, $lc ) {
                  exists $Encoding{$key} and return $Encoding{$key};
              }
          }
      }
      return;
  }
 116  
 # Function front end to getEncoding(): look up the encoding object for $name.
 # A true second argument is passed through as getEncoding()'s $skip_external.
 sub find_encoding($;$) {
     my $name          = $_[0];
     my $skip_external = $_[1];
     return __PACKAGE__->getEncoding( $name, $skip_external );
 }
 121  
 # Return the name reported by the encoding object that the given name or
 # alias resolves to, or nothing when find_encoding() does not know it.
 sub resolve_alias($) {
     my ($alias) = @_;
     my $obj = find_encoding($alias);
     return unless defined $obj;
     return $obj->name;
 }
 127  
 # Return a deep copy (Storable::dclone) of the encoding object found for the
 # given name.  Returns nothing when the lookup does not yield a reference or
 # when Storable cannot be loaded.
 sub clone_encoding($) {
     my ($name) = @_;
     my $obj = find_encoding($name);
     return unless ref $obj;

     eval { require Storable };
     return if $@;

     return Storable::dclone($obj);
 }
 135  
 # $octets = encode(ENCODING, $string [, CHECK])
 #
 # Encode $string with the encoding object found for ENCODING and return the
 # result.  Returns undef when $string is undefined and croaks when ENCODING
 # is unknown.  When CHECK is a true non-reference value without the
 # LEAVE_SRC bit, the caller's second argument is overwritten with whatever
 # the encoder left in its working copy of the string.
 sub encode($$;$) {
     my $name   = $_[0];
     my $string = $_[1];
     my $check  = $_[2] || 0;

     defined $string or return undef;
     $string .= '' if ref $string;    # force a reference into a plain string

     my $enc = find_encoding($name);
     if ( !defined $enc ) {
         require Carp;
         Carp::croak("Unknown encoding '$name'");
     }

     my $octets = $enc->encode( $string, $check );

     # The bit test is only reached for a true, non-reference CHECK.
     my $write_back =
       $check && !ref($check) && !( $check & LEAVE_SRC() );
     $_[1] = $string if $write_back;

     return $octets;
 }
 *str2bytes = \&encode;
 151  
 # $string = decode(ENCODING, $octets [, CHECK])
 #
 # Decode $octets with the encoding object found for ENCODING and return the
 # result.  Returns undef when $octets is undefined and croaks when ENCODING
 # is unknown.  When CHECK is a true non-reference value without the
 # LEAVE_SRC bit, the caller's second argument is overwritten with whatever
 # the decoder left in its working copy of the octets.
 sub decode($$;$) {
     my $name   = $_[0];
     my $octets = $_[1];
     my $check  = $_[2] || 0;

     defined $octets or return undef;
     $octets .= '' if ref $octets;    # force a reference into a plain string

     my $enc = find_encoding($name);
     if ( !defined $enc ) {
         require Carp;
         Carp::croak("Unknown encoding '$name'");
     }

     my $string = $enc->decode( $octets, $check );

     # The bit test is only reached for a true, non-reference CHECK.
     my $write_back =
       $check && !ref($check) && !( $check & LEAVE_SRC() );
     $_[1] = $octets if $write_back;

     return $string;
 }
 *bytes2str = \&decode;
 167  
 # [$length =] from_to($octets, FROM_ENC, TO_ENC [, CHECK])
 #
 # Convert the caller's first argument in place from FROM_ENC to TO_ENC.
 # CHECK is applied to the encoding step only; decoding always runs without
 # it.  Croaks when either encoding is unknown.  Returns undef when the
 # input is undefined, when CHECK is true and the encoder left the decoded
 # text non-empty, or when encoding yielded undef; otherwise returns the
 # length of the converted string.
 sub from_to($$$;$) {
     my ( $string, $from, $to, $check ) = @_;
     return undef unless defined $string;
     $check ||= 0;

     # Resolve both encodings, source first, failing on the first unknown one.
     my @codec;
     for my $enc_name ( $from, $to ) {
         my $found = find_encoding($enc_name);
         if ( !defined $found ) {
             require Carp;
             Carp::croak("Unknown encoding '$enc_name'");
         }
         push @codec, $found;
     }
     my ( $source, $target ) = @codec;

     my $chars = $source->decode($string);
     $string = $target->encode( $chars, $check );
     $_[0]   = $string;

     return undef if $check && length($chars);
     return defined( $_[0] ) ? length($string) : undef;
 }
 187  
 # $octets = encode_utf8($string)
 #
 # Return a copy of $string converted to UTF-8 octets with utf8::encode().
 # The caller's variable is not modified.  An undefined $string yields undef,
 # matching encode(), instead of being pushed through utf8::encode().
 sub encode_utf8($) {
     my ($str) = @_;
     return undef unless defined $str;
     utf8::encode($str);
     return $str;
 }
 193  
 # $string = decode_utf8($octets [, CHECK])
 #
 # Return $octets unchanged when is_utf8() reports it as already flagged;
 # otherwise return decode("utf8", $octets, CHECK).  A false or missing CHECK
 # behaves exactly like calling decode() without one, because decode()
 # defaults a false CHECK to 0.
 sub decode_utf8($;$) {
     my ( $str, $check ) = @_;
     return $str if is_utf8($str);
     return decode( "utf8", $str, $check );
 }
 205  
 206  predefine_encodings(1);
 207  
 208  #
 209  # This is to restore %Encoding if really needed;
 210  #
 211  
 # Install the encodings that are defined directly in this file and register
 # them in %Encode::Encoding:
 #
 #   Unicode       Encode::UTF_EBCDIC when $ON_EBCDIC is true,
 #                 Encode::Internal otherwise
 #   utf8          Encode::utf8
 #   utf-8-strict  Encode::utf8 with strict_utf8 set
 #
 # The single argument selects the Encode::utf8 implementation: true aliases
 # decode/encode to decode_xs/encode_xs (presumably supplied by the XS part
 # of the module -- they are not defined in this file), false installs the
 # pure-Perl versions below.
 #
 # The sub may be called more than once; each call re-installs the methods
 # and re-creates the registered objects.
 sub predefine_encodings {
     require Encode::Encoding;
     no warnings 'redefine';
     my $use_xs = shift;
     if ($ON_EBCDIC) {

         # was in Encode::UTF_EBCDIC
         package Encode::UTF_EBCDIC;

         # Add the base class once only, however often this sub is called.
         push @Encode::UTF_EBCDIC::ISA, 'Encode::Encoding'
           unless grep { $_ eq 'Encode::Encoding' } @Encode::UTF_EBCDIC::ISA;

         # Map every character through utf8::unicode_to_native().
         *decode = sub {
             my ( $obj, $str, $chk ) = @_;
             my $res = '';
             for ( my $i = 0 ; $i < length($str) ; $i++ ) {
                 $res .=
                   chr(
                     utf8::unicode_to_native( ord( substr( $str, $i, 1 ) ) )
                   );
             }
             $_[1] = '' if $chk;    # whole input consumed
             return $res;
         };

         # Map every character through utf8::native_to_unicode().
         *encode = sub {
             my ( $obj, $str, $chk ) = @_;
             my $res = '';
             for ( my $i = 0 ; $i < length($str) ; $i++ ) {
                 $res .=
                   chr(
                     utf8::native_to_unicode( ord( substr( $str, $i, 1 ) ) )
                   );
             }
             $_[1] = '' if $chk;    # whole input consumed
             return $res;
         };
         $Encode::Encoding{Unicode} =
           bless { Name => "UTF_EBCDIC" } => "Encode::UTF_EBCDIC";
     }
     else {

         package Encode::Internal;

         # Add the base class once only, however often this sub is called.
         push @Encode::Internal::ISA, 'Encode::Encoding'
           unless grep { $_ eq 'Encode::Encoding' } @Encode::Internal::ISA;

         # Both directions return an upgraded copy of the input.
         *decode = sub {
             my ( $obj, $str, $chk ) = @_;
             utf8::upgrade($str);
             $_[1] = '' if $chk;    # whole input consumed
             return $str;
         };
         *encode = \&decode;
         $Encode::Encoding{Unicode} =
           bless { Name => "Internal" } => "Encode::Internal";
     }

     {

         # was in Encode::utf8
         package Encode::utf8;

         # Add the base class once only, however often this sub is called.
         push @Encode::utf8::ISA, 'Encode::Encoding'
           unless grep { $_ eq 'Encode::Encoding' } @Encode::utf8::ISA;

         #
         if ($use_xs) {
             Encode::DEBUG and warn __PACKAGE__, " XS on";
             *decode = \&decode_xs;
             *encode = \&encode_xs;
         }
         else {
             Encode::DEBUG and warn __PACKAGE__, " XS off";
             *decode = sub {
                 my ( $obj, $octets, $chk ) = @_;
                 return undef unless defined $octets;

                 # Decode a private copy directly.  Going through
                 # Encode::decode_utf8() would call decode("utf8", ...),
                 # which dispatches straight back to this method and never
                 # terminates.
                 my $str = $octets;
                 if ( !utf8::is_utf8($str) ) {
                     utf8::decode($str) or return undef;    # malformed input
                 }
                 $_[1] = '' if $chk;    # whole input consumed
                 return $str;
             };
             *encode = sub {
                 my ( $obj, $string, $chk ) = @_;
                 my $octets = Encode::encode_utf8($string);
                 $_[1] = '' if $chk;    # whole input consumed
                 return $octets;
             };
         }
         *cat_decode = sub {    # ($obj, $dst, $src, $pos, $trm, $chk)
                                # currently ignores $chk
             my ( $obj, undef, undef, $pos, $trm ) = @_;

             # References to the caller's $dst, $src and $pos so that they
             # can be updated in place.
             my ( $rdst, $rsrc, $rpos ) = \@_[ 1, 2, 3 ];
             use bytes;
             if ( ( my $npos = index( $$rsrc, $trm, $pos ) ) >= 0 ) {

                 # Terminator found: append everything up to and including
                 # it, and move the position just past it.
                 $$rdst .=
                   substr( $$rsrc, $pos, $npos - $pos + length($trm) );
                 $$rpos = $npos + length($trm);
                 return 1;
             }

             # No terminator: append the rest of the source.
             $$rdst .= substr( $$rsrc, $pos );
             $$rpos = length($$rsrc);
             return '';
         };
         $Encode::Encoding{utf8} =
           bless { Name => "utf8" } => "Encode::utf8";
         $Encode::Encoding{"utf-8-strict"} =
           bless { Name => "utf-8-strict", strict_utf8 => 1 } =>
           "Encode::utf8";
     }
 }
 315  
 316  1;
 317  
 318  __END__
 319  
 320  =head1 NAME
 321  
 322  Encode - character encodings
 323  
 324  =head1 SYNOPSIS
 325  
 326      use Encode;
 327  
 328  =head2 Table of Contents
 329  
 330  Encode consists of a collection of modules whose details are too big
 331  to fit in one document.  This POD itself explains the top-level APIs
 332  and general topics at a glance.  For other topics and more details,
 333  see the PODs below:
 334  
 335    Name                    Description
 336    --------------------------------------------------------
 337    Encode::Alias         Alias definitions to encodings
 338    Encode::Encoding      Encode Implementation Base Class
 339    Encode::Supported     List of Supported Encodings
 340    Encode::CN            Simplified Chinese Encodings
 341    Encode::JP            Japanese Encodings
 342    Encode::KR            Korean Encodings
 343    Encode::TW            Traditional Chinese Encodings
 344    --------------------------------------------------------
 345  
 346  =head1 DESCRIPTION
 347  
 348  The C<Encode> module provides the interfaces between Perl's strings
 349  and the rest of the system.  Perl strings are sequences of
 350  B<characters>.
 351  
 352  The repertoire of characters that Perl can represent is at least that
 353  defined by the Unicode Consortium. On most platforms the ordinal
 354  values of the characters (as returned by C<ord(ch)>) is the "Unicode
 355  codepoint" for the character (the exceptions are those platforms where
 356  the legacy encoding is some variant of EBCDIC rather than a super-set
 357  of ASCII - see L<perlebcdic>).
 358  
 359  Traditionally, computer data has been moved around in 8-bit chunks
 360  often called "bytes". These chunks are also known as "octets" in
 361  networking standards. Perl is widely used to manipulate data of many
 362  types - not only strings of characters representing human or computer
 363  languages but also "binary" data being the machine's representation of
 364  numbers, pixels in an image - or just about anything.
 365  
 366  When Perl is processing "binary data", the programmer wants Perl to
 367  process "sequences of bytes". This is not a problem for Perl - as a
 368  byte has 256 possible values, it easily fits in Perl's much larger
 369  "logical character".
 370  
 371  =head2 TERMINOLOGY
 372  
 373  =over 2
 374  
 375  =item *
 376  
 377  I<character>: a character in the range 0..(2**32-1) (or more).
 378  (What Perl's strings are made of.)
 379  
 380  =item *
 381  
 382  I<byte>: a character in the range 0..255
 383  (A special case of a Perl character.)
 384  
 385  =item *
 386  
 387  I<octet>: 8 bits of data, with ordinal values 0..255
 388  (Term for bytes passed to or from a non-Perl context, e.g. a disk file.)
 389  
 390  =back
 391  
 392  =head1 PERL ENCODING API
 393  
 394  =over 2
 395  
 396  =item $octets  = encode(ENCODING, $string [, CHECK])
 397  
 398  Encodes a string from Perl's internal form into I<ENCODING> and returns
 399  a sequence of octets.  ENCODING can be either a canonical name or
 400  an alias.  For encoding names and aliases, see L</"Defining Aliases">.
 401  For CHECK, see L</"Handling Malformed Data">.
 402  
 403  For example, to convert a string from Perl's internal format to
 404  iso-8859-1 (also known as Latin1),
 405  
 406    $octets = encode("iso-8859-1", $string);
 407  
 408  B<CAVEAT>: When you run C<$octets = encode("utf8", $string)>, then
 409  $octets B<may not be equal to> $string.  Though they both contain the
 410  same data, the UTF8 flag for $octets is B<always> off.  When you
 411  encode anything, UTF8 flag of the result is always off, even when it
 412  contains completely valid utf8 string. See L</"The UTF8 flag"> below.
 413  
 414  If the $string is C<undef> then C<undef> is returned.
 415  
 416  =item $string = decode(ENCODING, $octets [, CHECK])
 417  
 418  Decodes a sequence of octets assumed to be in I<ENCODING> into Perl's
 419  internal form and returns the resulting string.  As in encode(),
 420  ENCODING can be either a canonical name or an alias. For encoding names
 421  and aliases, see L</"Defining Aliases">.  For CHECK, see
 422  L</"Handling Malformed Data">.
 423  
 424  For example, to convert ISO-8859-1 data to a string in Perl's internal format:
 425  
 426    $string = decode("iso-8859-1", $octets);
 427  
 428  B<CAVEAT>: When you run C<$string = decode("utf8", $octets)>, then $string
 429  B<may not be equal to> $octets.  Though they both contain the same data,
 430  the UTF8 flag for $string is on unless $octets entirely consists of
 431  ASCII data (or EBCDIC on EBCDIC machines).  See L</"The UTF8 flag">
 432  below.
 433  
 434  If $octets is C<undef> then C<undef> is returned.
 435  
 436  =item [$obj =] find_encoding(ENCODING)
 437  
 438  Returns the I<encoding object> corresponding to ENCODING.  Returns
 439  undef if no matching ENCODING is found.
 440  
 441  This object is what actually does the actual (en|de)coding.
 442  
 443    $utf8 = decode($name, $bytes);
 444  
 445  is in fact
 446  
 447    $utf8 = do{
 448      $obj = find_encoding($name);
 449      croak qq(encoding "$name" not found) unless ref $obj;
 450      $obj->decode($bytes)
 451    };
 452  
 453  with more error checking.
 454  
 455  Therefore you can save time by reusing this object as follows;
 456  
 457    my $enc = find_encoding("iso-8859-1");
 458    while(<>){
 459       my $utf8 = $enc->decode($_);
 460       # and do something with $utf8;
 461    }
 462  
 463  Besides C<< ->decode >> and C<< ->encode >>, other methods are
 464  available as well.  For instance, C<< ->name >> returns the canonical
 465  name of the encoding object.
 466  
 467    find_encoding("latin1")->name; # iso-8859-1
 468  
 469  See L<Encode::Encoding> for details.
 470  
 471  =item [$length =] from_to($octets, FROM_ENC, TO_ENC [, CHECK])
 472  
 473  Converts B<in-place> data between two encodings. The data in $octets
 474  must be encoded as octets and not as characters in Perl's internal
 475  format. For example, to convert ISO-8859-1 data to Microsoft's CP1250
 476  encoding:
 477  
 478    from_to($octets, "iso-8859-1", "cp1250");
 479  
 480  and to convert it back:
 481  
 482    from_to($octets, "cp1250", "iso-8859-1");
 483  
 484  Note that because the conversion happens in place, the data to be
 485  converted cannot be a string constant; it must be a scalar variable.
 486  
 487  from_to() returns the length of the converted string in octets on
 488  success, I<undef> on error.
 489  
 490  B<CAVEAT>: The following operations look the same but are not quite so;
 491  
 492    from_to($data, "iso-8859-1", "utf8"); #1
 493    $data = decode("iso-8859-1", $data);  #2
 494  
 495  Both #1 and #2 make $data consist of a completely valid UTF-8 string
 496  but only #2 turns UTF8 flag on.  #1 is equivalent to
 497  
 498    $data = encode("utf8", decode("iso-8859-1", $data));
 499  
 500  See L</"The UTF8 flag"> below.
 501  
 502  Also note that
 503  
 504    from_to($octets, $from, $to, $check);
 505  
 506  is equivalent to
 507  
 508    $octets = encode($to, decode($from, $octets), $check);
 509  
 510  Yes, it does not respect the $check during decoding.  It is
 511  deliberately done that way.  If you need minute control, C<decode>
 512  then C<encode> as follows;
 513  
 514    $octets = encode($to, decode($from, $octets, $check_from), $check_to);
 515  
 516  =item $octets = encode_utf8($string);
 517  
 518  Equivalent to C<$octets = encode("utf8", $string);> The characters
 519  that comprise $string are encoded in Perl's internal format and the
 520  result is returned as a sequence of octets. All possible
 521  characters have a UTF-8 representation so this function cannot fail.
 522  
 523  
 524  =item $string = decode_utf8($octets [, CHECK]);
 525  
 526  equivalent to C<$string = decode("utf8", $octets [, CHECK])>.
 527  The sequence of octets represented by
 528  $octets is decoded from UTF-8 into a sequence of logical
 529  characters. Not all sequences of octets form valid UTF-8 encodings, so
 530  it is possible for this call to fail.  For CHECK, see
 531  L</"Handling Malformed Data">.
 532  
 533  =back
 534  
 535  =head2 Listing available encodings
 536  
 537    use Encode;
 538    @list = Encode->encodings();
 539  
 540  Returns a list of the canonical names of the available encodings that
 541  are loaded.  To get a list of all available encodings including the
 542  ones that are not loaded yet, say
 543  
 544    @all_encodings = Encode->encodings(":all");
 545  
 546  Or you can give the name of a specific module.
 547  
 548    @with_jp = Encode->encodings("Encode::JP");
 549  
 550  When "::" is not in the name, "Encode::" is assumed.
 551  
 552    @ebcdic = Encode->encodings("EBCDIC");
 553  
 554  To find out in detail which encodings are supported by this package,
 555  see L<Encode::Supported>.
 556  
 557  =head2 Defining Aliases
 558  
 559  To add a new alias to a given encoding, use:
 560  
 561    use Encode;
 562    use Encode::Alias;
 563    define_alias(newName => ENCODING);
 564  
 565  After that, newName can be used as an alias for ENCODING.
 566  ENCODING may be either the name of an encoding or an
 567  I<encoding object>
 568  
 569  But before you do so, make sure the alias is nonexistent with
 570  C<resolve_alias()>, which returns the canonical name thereof.
 571  i.e.
 572  
 573    Encode::resolve_alias("latin1") eq "iso-8859-1" # true
 574    Encode::resolve_alias("iso-8859-12")   # false; nonexistent
 575    Encode::resolve_alias($name) eq $name  # true if $name is canonical
 576  
 577  resolve_alias() does not need C<use Encode::Alias>; it can be
 578  exported via C<use Encode qw(resolve_alias)>.
 579  
 580  See L<Encode::Alias> for details.
 581  
 582  =head2 Finding IANA Character Set Registry names
 583  
 584  The canonical name of a given encoding does not necessarily agree with
 585  the IANA Character Set Registry, commonly seen as C<< Content-Type:
 586  text/plain; charset=I<whatever> >>.  For most cases canonical names
 587  work but sometimes they do not (notably 'utf-8-strict').
 588  
 589  Therefore as of Encode version 2.21, a new method C<mime_name()> is added.
 590  
 591    use Encode;
 592    my $enc = find_encoding('UTF-8');
 593    warn $enc->name;      # utf-8-strict
 594    warn $enc->mime_name; # UTF-8
 595  
 596  See also:  L<Encode::Encoding>
 597  
 598  =head1 Encoding via PerlIO
 599  
 600  If your perl supports I<PerlIO> (which is the default), you can use a
 601  PerlIO layer to decode and encode directly via a filehandle.  The
 602  following two examples are totally identical in their functionality.
 603  
 604    # via PerlIO
 605    open my $in,  "<:encoding(shiftjis)", $infile  or die;
 606    open my $out, ">:encoding(euc-jp)",   $outfile or die;
 607    while(<$in>){ print $out $_; }
 608  
 609    # via from_to
 610    open my $in,  "<", $infile  or die;
 611    open my $out, ">", $outfile or die;
 612    while(<$in>){
 613      from_to($_, "shiftjis", "euc-jp", 1);
 614      print $out $_;
 615    }
 616  
 617  Unfortunately, not all encodings are PerlIO-savvy.  You can check
 618  if your encoding is supported by PerlIO by calling the C<perlio_ok>
 619  method.
 620  
 621    Encode::perlio_ok("hz");             # False
 622    find_encoding("euc-cn")->perlio_ok;  # True where PerlIO is available
 623  
 624    use Encode qw(perlio_ok);            # exported upon request
 625    perlio_ok("euc-jp")
 626  
 627  Fortunately, all encodings that come with Encode core are PerlIO-savvy
 628  except for hz and ISO-2022-kr.  For gory details, see
 629  L<Encode::Encoding> and L<Encode::PerlIO>.
 630  
 631  =head1 Handling Malformed Data
 632  
 633  The optional I<CHECK> argument tells Encode what to do when it
 634  encounters malformed data.  Without CHECK, Encode::FB_DEFAULT ( == 0 )
 635  is assumed.
 636  
 637  As of version 2.12 Encode supports coderef values for CHECK.  See below.
 638  
 639  =over 2
 640  
 641  =item B<NOTE:> Not all encodings support this feature
 642  
 643  Some encodings ignore I<CHECK> argument.  For example,
 644  L<Encode::Unicode> ignores I<CHECK> and it always croaks on error.
 645  
 646  =back
 647  
 648  Now here is the list of I<CHECK> values available
 649  
 650  =over 2
 651  
 652  =item I<CHECK> = Encode::FB_DEFAULT ( == 0)
 653  
 654  If I<CHECK> is 0, (en|de)code will put a I<substitution character> in
 655  place of a malformed character.  When you encode, E<lt>subcharE<gt>
 656  will be used.  When you decode the code point C<0xFFFD> is used.  If
 657  the data is supposed to be UTF-8, an optional lexical warning
 658  (category utf8) is given.
 659  
 660  =item I<CHECK> = Encode::FB_CROAK ( == 1)
 661  
 662  If I<CHECK> is 1, methods will die on error immediately with an error
 663  message.  Therefore, when I<CHECK> is set to 1,  you should trap the
 664  error with eval{} unless you really want to let it die.
 665  
 666  =item I<CHECK> = Encode::FB_QUIET
 667  
 668  If I<CHECK> is set to Encode::FB_QUIET, (en|de)code will immediately
 669  return the portion of the data that has been processed so far when an
 670  error occurs. The data argument will be overwritten with everything
 671  after that point (that is, the unprocessed part of data).  This is
 672  handy when you have to call decode repeatedly in the case where your
 673  source data may contain partial multi-byte character sequences,
 674  (i.e. you are reading with a fixed-width buffer). Here is a sample
 675  code that does exactly this:
 676  
 677    my $buffer = ''; my $string = '';
 678    while(read $fh, $buffer, 256, length($buffer)){
 679      $string .= decode($encoding, $buffer, Encode::FB_QUIET);
 680      # $buffer now contains the unprocessed partial character
 681    }
 682  
 683  =item I<CHECK> = Encode::FB_WARN
 684  
 685  This is the same as above, except that it warns on error.  Handy when
 686  you are debugging the mode above.
 687  
 688  =item perlqq mode (I<CHECK> = Encode::FB_PERLQQ)
 689  
 690  =item HTML charref mode (I<CHECK> = Encode::FB_HTMLCREF)
 691  
 692  =item XML charref mode (I<CHECK> = Encode::FB_XMLCREF)
 693  
 694  For encodings that are implemented by Encode::XS, CHECK ==
 695  Encode::FB_PERLQQ turns (en|de)code into C<perlqq> fallback mode.
 696  
 697  When you decode, C<\xI<HH>> will be inserted for a malformed character,
 698  where I<HH> is the hex representation of the octet  that could not be
 699  decoded to utf8.  And when you encode, C<\x{I<HHHH>}> will be inserted,
 700  where I<HHHH> is the Unicode ID of the character that cannot be found
 701  in the character repertoire of the encoding.
 702  
 703  HTML/XML character reference modes are about the same, in place of
 704  C<\x{I<HHHH>}>, HTML uses C<&#I<NNN>;> where I<NNN> is a decimal number and
 705  XML uses C<&#xI<HHHH>;> where I<HHHH> is the hexadecimal number.
 706  
 707  In Encode 2.10 or later, C<LEAVE_SRC> is also implied.
 708  
 709  =item The bitmask
 710  
 711  These modes are actually set via a bitmask.  Here is how the FB_XX
 712  constants are laid out.  You can import the FB_XX constants via
 713  C<use Encode qw(:fallbacks)>; you can import the generic bitmask
 714  constants via C<use Encode qw(:fallback_all)>.
 715  
 716                       FB_DEFAULT FB_CROAK FB_QUIET FB_WARN  FB_PERLQQ
 717   DIE_ON_ERR    0x0001             X
 718   WARN_ON_ERR   0x0002                               X
 719   RETURN_ON_ERR 0x0004                      X        X
 720   LEAVE_SRC     0x0008                                        X
 721   PERLQQ        0x0100                                        X
 722   HTMLCREF      0x0200
 723   XMLCREF       0x0400
 724  
 725  =back
 726  
 727  =over 2
 728  
 729  =item Encode::LEAVE_SRC
 730  
 731  If the C<Encode::LEAVE_SRC> bit is not set, but I<CHECK> is, then the second
 732  argument to C<encode()> or C<decode()> may be assigned to by the functions. If
 733  you're not interested in this, then bitwise-or the bitmask with it.
 734  
 735  =back
 736  
 737  =head2 coderef for CHECK
 738  
 739  As of Encode 2.12 CHECK can also be a code reference which takes the
 740  ord value of the unmapped character as an argument and returns a string
 741  that represents the fallback character.  For instance,
 742  
 743    $ascii = encode("ascii", $utf8, sub{ sprintf "<U+%04X>", shift });
 744  
 745  Acts like FB_PERLQQ but E<lt>U+I<XXXX>E<gt> is used instead of
 746  \x{I<XXXX>}.
 747  
 748  =head1 Defining Encodings
 749  
 750  To define a new encoding, use:
 751  
 752      use Encode qw(define_encoding);
 753      define_encoding($object, 'canonicalName' [, alias...]);
 754  
 755  I<canonicalName> will be associated with I<$object>.  The object
 756  should provide the interface described in L<Encode::Encoding>.
 757  If more than two arguments are provided then additional
 758  arguments are taken as aliases for I<$object>.
 759  
 760  See L<Encode::Encoding> for more details.
 761  
 762  =head1 The UTF8 flag
 763  
 764  Before the introduction of Unicode support in perl, The C<eq> operator
 765  just compared the strings represented by two scalars. Beginning with
 766  perl 5.8, C<eq> compares two strings with simultaneous consideration of
 767  I<the UTF8 flag>. To explain why we made it so, I will quote page 402 of
 768  C<Programming Perl, 3rd ed.>
 769  
 770  =over 2
 771  
 772  =item Goal #1:
 773  
 774  Old byte-oriented programs should not spontaneously break on the old
 775  byte-oriented data they used to work on.
 776  
 777  =item Goal #2:
 778  
 779  Old byte-oriented programs should magically start working on the new
 780  character-oriented data when appropriate.
 781  
 782  =item Goal #3:
 783  
 784  Programs should run just as fast in the new character-oriented mode
 785  as in the old byte-oriented mode.
 786  
 787  =item Goal #4:
 788  
 789  Perl should remain one language, rather than forking into a
 790  byte-oriented Perl and a character-oriented Perl.
 791  
 792  =back
 793  
 794  Back when C<Programming Perl, 3rd ed.> was written, not even Perl 5.6.0
 795  was born and many features documented in the book remained
 796  unimplemented for a long time.  Perl 5.8 corrected this and the introduction
 797  of the UTF8 flag is one of them.  You can think of this perl notion as of a
 798  byte-oriented mode (UTF8 flag off) and a character-oriented mode (UTF8
 799  flag on).
 800  
 801  Here is how Encode takes care of the UTF8 flag.
 802  
 803  =over 2
 804  
 805  =item *
 806  
 807  When you encode, the resulting UTF8 flag is always off.
 808  
 809  =item *
 810  
 811  When you decode, the resulting UTF8 flag is on unless you can
 812  unambiguously represent data.  Here is the definition of
 813  dis-ambiguity.
 814  
 815  After C<$utf8 = decode('foo', $octet);>,
 816  
 817    When $octet is...   The UTF8 flag in $utf8 is
 818    ---------------------------------------------
 819    In ASCII only (or EBCDIC only)            OFF
 820    In ISO-8859-1                              ON
 821    In any other Encoding                      ON
 822    ---------------------------------------------
 823  
 824  As you see, there is one exception, In ASCII.  That way you can assume
 825  Goal #1.  And with Encode Goal #2 is assumed but you still have to be
 826  careful in such cases mentioned in B<CAVEAT> paragraphs.
 827  
 828  This UTF8 flag is not visible in perl scripts, exactly for the same
 829  reason you cannot (or you I<don't have to>) see if a scalar contains a
 830  string, integer, or floating point number.   But you can still peek
 831  and poke these if you will.  See the section below.
 832  
 833  =back
 834  
 835  =head2 Messing with Perl's Internals
 836  
 837  The following API uses parts of Perl's internals in the current
 838  implementation.  As such, they are efficient but may change.
 839  
 840  =over 2
 841  
 842  =item is_utf8(STRING [, CHECK])
 843  
 844  [INTERNAL] Tests whether the UTF8 flag is turned on in the STRING.
 845  If CHECK is true, also checks the data in STRING for being well-formed
 846  UTF-8.  Returns true if successful, false otherwise.
 847  
 848  As of perl 5.8.1, L<utf8> also has utf8::is_utf8().
 849  
 850  =item _utf8_on(STRING)
 851  
 852  [INTERNAL] Turns on the UTF8 flag in STRING.  The data in STRING is
 853  B<not> checked for being well-formed UTF-8.  Do not use unless you
 854  B<know> that the STRING is well-formed UTF-8.  Returns the previous
 855  state of the UTF8 flag (so please don't treat the return value as
 856  indicating success or failure), or C<undef> if STRING is not a string.
 857  
 858  =item _utf8_off(STRING)
 859  
 860  [INTERNAL] Turns off the UTF8 flag in STRING.  Do not use frivolously.
 861  Returns the previous state of the UTF8 flag (so please don't treat the
 862  return value as indicating success or failure), or C<undef> if STRING is
 863  not a string.
 864  
 865  =back
 866  
 867  =head1 UTF-8 vs. utf8 vs. UTF8
 868  
 869    ....We now view strings not as sequences of bytes, but as sequences
 870    of numbers in the range 0 .. 2**32-1 (or in the case of 64-bit
 871    computers, 0 .. 2**64-1) -- Programming Perl, 3rd ed.
 872  
 873  That has been the perl's notion of UTF-8 but official UTF-8 is more
 874  strict; its range is much narrower (0 .. 10FFFF), and some sequences are
 875  not allowed (e.g. those used in surrogate pairs, 0xFFFE, et al).
 876  
 877  Now that is overruled by Larry Wall himself.
 878  
 879    From: Larry Wall <larry@wall.org>
 880    Date: December 04, 2004 11:51:58 JST
 881    To: perl-unicode@perl.org
 882    Subject: Re: Make Encode.pm support the real UTF-8
 883    Message-Id: <20041204025158.GA28754@wall.org>
 884    
 885    On Fri, Dec 03, 2004 at 10:12:12PM +0000, Tim Bunce wrote:
 886    : I've no problem with 'utf8' being perl's unrestricted uft8 encoding,
 887    : but "UTF-8" is the name of the standard and should give the
 888    : corresponding behaviour.
 889    
 890    For what it's worth, that's how I've always kept them straight in my
 891    head.
 892    
 893    Also for what it's worth, Perl 6 will mostly default to strict but
 894    make it easy to switch back to lax.
 895    
 896    Larry
 897  
 898  Do you copy?  As of Perl 5.8.7, B<UTF-8> means strict, official UTF-8
 899  while B<utf8> means liberal, lax, version thereof.  And Encode version
 900  2.10 or later thus groks the difference between C<UTF-8> and C<utf8>.
 901  
 902    encode("utf8",  "\x{FFFF_FFFF}", 1); # okay
 903    encode("UTF-8", "\x{FFFF_FFFF}", 1); # croaks
 904  
 905  C<UTF-8> in Encode is actually a canonical name for C<utf-8-strict>.
 906  Yes, the hyphen between "UTF" and "8" is important.  Without it Encode
 907  goes "liberal"
 908  
 909    find_encoding("UTF-8")->name # is 'utf-8-strict'
 910    find_encoding("utf-8")->name # ditto. names are case insensitive
 911    find_encoding("utf_8")->name  # ditto. "_" are treated as "-"
 912    find_encoding("UTF8")->name  # is 'utf8'.
 913  
 914  The UTF8 flag is internally called UTF8, without a hyphen. It indicates
 915  whether a string is internally encoded as utf8, also without a hyphen.
 916  
 917  =head1 SEE ALSO
 918  
 919  L<Encode::Encoding>,
 920  L<Encode::Supported>,
 921  L<Encode::PerlIO>,
 922  L<encoding>,
 923  L<perlebcdic>,
 924  L<perlfunc/open>,
 925  L<perlunicode>, L<perluniintro>, L<perlunifaq>, L<perlunitut>,
 926  L<utf8>,
 927  the Perl Unicode Mailing List E<lt>perl-unicode@perl.orgE<gt>
 928  
 929  =head1 MAINTAINER
 930  
 931  This project was originated by Nick Ing-Simmons and later maintained
 932  by Dan Kogai E<lt>dankogai@dan.co.jpE<gt>.  See AUTHORS for a full
 933  list of people involved.  For any questions, use
 934  E<lt>perl-unicode@perl.orgE<gt> so we can all share.
 935  
 936  While Dan Kogai retains the copyright as a maintainer, the credit
 937  should go to all those involved.  See AUTHORS for those who submitted
 938  code.
 939  
 940  =head1 COPYRIGHT
 941  
 942  Copyright 2002-2006 Dan Kogai E<lt>dankogai@dan.co.jpE<gt>
 943  
 944  This library is free software; you can redistribute it and/or modify
 945  it under the same terms as Perl itself.
 946  
 947  =cut


Generated: Tue Mar 17 22:47:18 2015 Cross-referenced by PHPXref 0.7.1