[ Index ]

PHP Cross Reference of Unnamed Project

title

Body

[close]

/se3-unattended/var/se3/unattended/install/linuxaux/opt/perl/lib/5.10.0/i586-linux-thread-multi/Encode/ -> Guess.pm (source)

   1  package Encode::Guess;
   2  use strict;
   3  use warnings;
   4  use Encode qw(:fallbacks find_encoding);
   5  our $VERSION = do { my @r = ( q$Revision: 2.2 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r };
   6  
   7  my $Canon = 'Guess';
   8  sub DEBUG () { 0 }
   9  our %DEF_SUSPECTS = map { $_ => find_encoding($_) } qw(ascii utf8);
  10  $Encode::Encoding{$Canon} = bless {
  11      Name     => $Canon,
  12      Suspects => {%DEF_SUSPECTS},
  13  } => __PACKAGE__;
  14  
  15  use base qw(Encode::Encoding);
  16  sub needs_lines { 1 }
  17  sub perlio_ok   { 0 }
  18  
  19  our @EXPORT         = qw(guess_encoding);
  20  our $NoUTFAutoGuess = 0;
  21  our $UTF8_BOM       = pack( "C3", 0xef, 0xbb, 0xbf );
  22  
  23  sub import {    # Exporter not used so we do it on our own
  24      my $callpkg = caller;
  25      for my $item (@EXPORT) {
  26          no strict 'refs';
  27          *{"$callpkg\::$item"} = \&{"$item"};
  28      }
  29      set_suspects(@_);
  30  }
  31  
  32  sub set_suspects {
  33      my $class = shift;
  34      my $self = ref($class) ? $class : $Encode::Encoding{$Canon};
  35      $self->{Suspects} = {%DEF_SUSPECTS};
  36      $self->add_suspects(@_);
  37  }
  38  
  39  sub add_suspects {
  40      my $class = shift;
  41      my $self = ref($class) ? $class : $Encode::Encoding{$Canon};
  42      for my $c (@_) {
  43          my $e = find_encoding($c) or die "Unknown encoding: $c";
  44          $self->{Suspects}{ $e->name } = $e;
  45          DEBUG and warn "Added: ", $e->name;
  46      }
  47  }
  48  
  49  sub decode($$;$) {
  50      my ( $obj, $octet, $chk ) = @_;
  51      my $guessed = guess( $obj, $octet );
  52      unless ( ref($guessed) ) {
  53          require Carp;
  54          Carp::croak($guessed);
  55      }
  56      my $utf8 = $guessed->decode( $octet, $chk );
  57      $_[1] = $octet if $chk;
  58      return $utf8;
  59  }
  60  
  61  sub guess_encoding {
  62      guess( $Encode::Encoding{$Canon}, @_ );
  63  }
  64  
  65  sub guess {
  66      my $class = shift;
  67      my $obj   = ref($class) ? $class : $Encode::Encoding{$Canon};
  68      my $octet = shift;
  69  
  70      # sanity check
  71      return unless defined $octet and length $octet;
  72  
  73      # cheat 0: utf8 flag;
  74      if ( Encode::is_utf8($octet) ) {
  75          return find_encoding('utf8') unless $NoUTFAutoGuess;
  76          Encode::_utf8_off($octet);
  77      }
  78  
  79      # cheat 1: BOM
  80      use Encode::Unicode;
  81      unless ($NoUTFAutoGuess) {
  82          my $BOM = pack( 'C3', unpack( "C3", $octet ) );
  83          return find_encoding('utf8')
  84            if ( defined $BOM and $BOM eq $UTF8_BOM );
  85          $BOM = unpack( 'N', $octet );
  86          return find_encoding('UTF-32')
  87            if ( defined $BOM and ( $BOM == 0xFeFF or $BOM == 0xFFFe0000 ) );
  88          $BOM = unpack( 'n', $octet );
  89          return find_encoding('UTF-16')
  90            if ( defined $BOM and ( $BOM == 0xFeFF or $BOM == 0xFFFe ) );
  91          if ( $octet =~ /\x00/o )
  92          {    # if \x00 found, we assume UTF-(16|32)(BE|LE)
  93              my $utf;
  94              my ( $be, $le ) = ( 0, 0 );
  95              if ( $octet =~ /\x00\x00/o ) {    # UTF-32(BE|LE) assumed
  96                  $utf = "UTF-32";
  97                  for my $char ( unpack( 'N*', $octet ) ) {
  98                      $char & 0x0000ffff and $be++;
  99                      $char & 0xffff0000 and $le++;
 100                  }
 101              }
 102              else {                            # UTF-16(BE|LE) assumed
 103                  $utf = "UTF-16";
 104                  for my $char ( unpack( 'n*', $octet ) ) {
 105                      $char & 0x00ff and $be++;
 106                      $char & 0xff00 and $le++;
 107                  }
 108              }
 109              DEBUG and warn "$utf, be == $be, le == $le";
 110              $be == $le
 111                and return
 112                "Encodings ambiguous between $utf BE and LE ($be, $le)";
 113              $utf .= ( $be > $le ) ? 'BE' : 'LE';
 114              return find_encoding($utf);
 115          }
 116      }
 117      my %try = %{ $obj->{Suspects} };
 118      for my $c (@_) {
 119          my $e = find_encoding($c) or die "Unknown encoding: $c";
 120          $try{ $e->name } = $e;
 121          DEBUG and warn "Added: ", $e->name;
 122      }
 123      my $nline = 1;
 124      for my $line ( split /\r\n?|\n/, $octet ) {
 125  
 126          # cheat 2 -- \e in the string
 127          if ( $line =~ /\e/o ) {
 128              my @keys = keys %try;
 129              delete @try{qw/utf8 ascii/};
 130              for my $k (@keys) {
 131                  ref( $try{$k} ) eq 'Encode::XS' and delete $try{$k};
 132              }
 133          }
 134          my %ok = %try;
 135  
 136          # warn join(",", keys %try);
 137          for my $k ( keys %try ) {
 138              my $scratch = $line;
 139              $try{$k}->decode( $scratch, FB_QUIET );
 140              if ( $scratch eq '' ) {
 141                  DEBUG and warn sprintf( "%4d:%-24s ok\n", $nline, $k );
 142              }
 143              else {
 144                  use bytes ();
 145                  DEBUG
 146                    and warn sprintf( "%4d:%-24s not ok; %d bytes left\n",
 147                      $nline, $k, bytes::length($scratch) );
 148                  delete $ok{$k};
 149              }
 150          }
 151          %ok or return "No appropriate encodings found!";
 152          if ( scalar( keys(%ok) ) == 1 ) {
 153              my ($retval) = values(%ok);
 154              return $retval;
 155          }
 156          %try = %ok;
 157          $nline++;
 158      }
 159      $try{ascii}
 160        or return "Encodings too ambiguous: ", join( " or ", keys %try );
 161      return $try{ascii};
 162  }
 163  
 164  1;
 165  __END__
 166  
 167  =head1 NAME
 168  
 169  Encode::Guess -- Guesses encoding from data
 170  
 171  =head1 SYNOPSIS
 172  
 173    # if you are sure $data won't contain anything bogus
 174  
 175    use Encode;
 176    use Encode::Guess qw/euc-jp shiftjis 7bit-jis/;
 177    my $utf8 = decode("Guess", $data);
 178    my $data = encode("Guess", $utf8);   # this doesn't work!
 179  
 180    # more elaborate way
 181    use Encode::Guess;
 182    my $enc = guess_encoding($data, qw/euc-jp shiftjis 7bit-jis/);
 183    ref($enc) or die "Can't guess: $enc"; # trap error this way
 184    $utf8 = $enc->decode($data);
 185    # or
 186    $utf8 = decode($enc->name, $data)
 187  
 188  =head1 ABSTRACT
 189  
 190  Encode::Guess enables you to guess in what encoding a given data is
 191  encoded, or at least tries to.  
 192  
 193  =head1 DESCRIPTION
 194  
 195  By default, it checks only ascii, utf8 and UTF-16/32 with BOM.
 196  
 197    use Encode::Guess; # ascii/utf8/BOMed UTF
 198  
 199  To use it more practically, you have to give the names of encodings to
 200  check (I<suspects> as follows).  The name of suspects can either be
 201  canonical names or aliases.
 202  
 203  CAVEAT: Unlike UTF-(16|32), BOM in utf8 is NOT AUTOMATICALLY STRIPPED.
 204  
 205   # tries all major Japanese Encodings as well
 206    use Encode::Guess qw/euc-jp shiftjis 7bit-jis/;
 207  
 208  If the C<$Encode::Guess::NoUTFAutoGuess> variable is set to a true
 209  value, no heuristics will be applied to UTF8/16/32, and the result
 210  will be limited to the suspects and C<ascii>.
 211  
 212  =over 4
 213  
 214  =item Encode::Guess->set_suspects
 215  
 216  You can also change the internal suspects list via C<set_suspects>
 217  method. 
 218  
 219    use Encode::Guess;
 220    Encode::Guess->set_suspects(qw/euc-jp shiftjis 7bit-jis/);
 221  
 222  =item Encode::Guess->add_suspects
 223  
 224  Or you can use C<add_suspects> method.  The difference is that
 225  C<set_suspects> flushes the current suspects list while
 226  C<add_suspects> adds.
 227  
 228    use Encode::Guess;
 229    Encode::Guess->add_suspects(qw/euc-jp shiftjis 7bit-jis/);
 230    # now the suspects are euc-jp,shiftjis,7bit-jis, AND
 231    # euc-kr,euc-cn, and big5-eten
 232    Encode::Guess->add_suspects(qw/euc-kr euc-cn big5-eten/);
 233  
 234  =item Encode::decode("Guess" ...)
 235  
 236  When you are content with suspects list, you can now
 237  
 238    my $utf8 = Encode::decode("Guess", $data);
 239  
 240  =item Encode::Guess->guess($data)
 241  
 242  But it will croak if:
 243  
 244  =over
 245  
 246  =item *
 247  
 248  Two or more suspects remain
 249  
 250  =item *
 251  
 252  No suspects left
 253  
 254  =back
 255  
 256  So you should instead try this;
 257  
 258    my $decoder = Encode::Guess->guess($data);
 259  
 260  On success, $decoder is an object that is documented in
 261  L<Encode::Encoding>.  So you can now do this;
 262  
 263    my $utf8 = $decoder->decode($data);
 264  
 265  On failure, $decoder now contains an error message so the whole thing
 266  would be as follows;
 267  
 268    my $decoder = Encode::Guess->guess($data);
 269    die $decoder unless ref($decoder);
 270    my $utf8 = $decoder->decode($data);
 271  
 272  =item guess_encoding($data, [, I<list of suspects>])
 273  
 274  You can also try C<guess_encoding> function which is exported by
 275  default.  It takes $data to check and it also takes the list of
 276  suspects by option.  The optional suspect list is I<not reflected> to
 277  the internal suspects list.
 278  
 279    my $decoder = guess_encoding($data, qw/euc-jp euc-kr euc-cn/);
 280    die $decoder unless ref($decoder);
 281    my $utf8 = $decoder->decode($data);
 282    # check only ascii and utf8
 283    my $decoder = guess_encoding($data);
 284  
 285  =back
 286  
 287  =head1 CAVEATS
 288  
 289  =over 4
 290  
 291  =item *
 292  
 293  Because of the algorithm used, ISO-8859 series and other single-byte
 294  encodings do not work well unless either one of ISO-8859 is the only
 295  one suspect (besides ascii and utf8).
 296  
 297    use Encode::Guess;
 298    # perhaps ok
 299    my $decoder = guess_encoding($data, 'latin1');
 300    # definitely NOT ok
 301    my $decoder = guess_encoding($data, qw/latin1 greek/);
 302  
 303  The reason is that Encode::Guess guesses encoding by trial and error.
 304  It first splits $data into lines and tries to decode the line for each
 305  suspect.  It keeps it going until all but one encoding is eliminated
 306  out of suspects list.  ISO-8859 series is just too successful for most
 307  cases (because it fills almost all code points in \x00-\xff).
 308  
 309  =item *
 310  
 311  Do not mix national standard encodings and the corresponding vendor
 312  encodings.
 313  
 314    # a very bad idea
 315    my $decoder
 316       = guess_encoding($data, qw/shiftjis MacJapanese cp932/);
 317  
 318  The reason is that vendor encoding is usually a superset of national
 319  standard so it becomes too ambiguous for most cases.
 320  
 321  =item *
 322  
 323  On the other hand, mixing various national standard encodings
 324  automagically works unless $data is too short to allow for guessing.
 325  
 326   # This is ok if $data is long enough
 327   my $decoder =  
 328    guess_encoding($data, qw/euc-cn
 329                             euc-jp shiftjis 7bit-jis
 330                             euc-kr
 331                             big5-eten/);
 332  
 333  =item *
 334  
 335  DO NOT PUT TOO MANY SUSPECTS!  Don't you try something like this!
 336  
 337    my $decoder = guess_encoding($data, 
 338                                 Encode->encodings(":all"));
 339  
 340  =back
 341  
 342  It is, after all, just a guess.  You should alway be explicit when it
 343  comes to encodings.  But there are some, especially Japanese,
 344  environment that guess-coding is a must.  Use this module with care. 
 345  
 346  =head1 TO DO
 347  
 348  Encode::Guess does not work on EBCDIC platforms.
 349  
 350  =head1 SEE ALSO
 351  
 352  L<Encode>, L<Encode::Encoding>
 353  
 354  =cut
 355  


Generated: Tue Mar 17 22:47:18 2015 Cross-referenced by PHPXref 0.7.1