[ Index ] |
PHP Cross Reference of Unnamed Project |
[Summary view] [Print] [Text view]
1 package Encode::CN::HZ; 2 3 use strict; 4 use warnings; 5 6 use vars qw($VERSION); 7 $VERSION = do { my @r = ( q$Revision: 2.4 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; 8 9 use Encode qw(:fallbacks); 10 11 use base qw(Encode::Encoding); 12 __PACKAGE__->Define('hz'); 13 14 # HZ is a combination of ASCII and escaped GB, so we implement it 15 # with the GB2312(raw) encoding here. Cf. RFCs 1842 & 1843. 16 17 # not ported for EBCDIC. Which should be used, "~" or "\x7E"? 18 19 sub needs_lines { 1 } 20 21 sub decode ($$;$) { 22 my ( $obj, $str, $chk ) = @_; 23 24 my $GB = Encode::find_encoding('gb2312-raw'); 25 my $ret = ''; 26 my $in_ascii = 1; # default mode is ASCII. 27 28 while ( length $str ) { 29 if ($in_ascii) { # ASCII mode 30 if ( $str =~ s/^([\x00-\x7D\x7F]+)// ) { # no '~' => ASCII 31 $ret .= $1; 32 33 # EBCDIC should need ascii2native, but not ported. 34 } 35 elsif ( $str =~ s/^\x7E\x7E// ) { # escaped tilde 36 $ret .= '~'; 37 } 38 elsif ( $str =~ s/^\x7E\cJ// ) { # '\cJ' == LF in ASCII 39 1; # no-op 40 } 41 elsif ( $str =~ s/^\x7E\x7B// ) { # '~{' 42 $in_ascii = 0; # to GB 43 } 44 else { # encounters an invalid escape, \x80 or greater 45 last; 46 } 47 } 48 else { # GB mode; the byte ranges are as in RFC 1843. 49 no warnings 'uninitialized'; 50 if ( $str =~ s/^((?:[\x21-\x77][\x21-\x7E])+)// ) { 51 $ret .= $GB->decode( $1, $chk ); 52 } 53 elsif ( $str =~ s/^\x7E\x7D// ) { # '~}' 54 $in_ascii = 1; 55 } 56 else { # invalid 57 last; 58 } 59 } 60 } 61 $_[1] = '' if $chk; # needs_lines guarantees no partial character 62 return $ret; 63 } 64 65 sub cat_decode { 66 my ( $obj, undef, $src, $pos, $trm, $chk ) = @_; 67 my ( $rdst, $rsrc, $rpos ) = \@_[ 1 .. 3 ]; 68 69 my $GB = Encode::find_encoding('gb2312-raw'); 70 my $ret = ''; 71 my $in_ascii = 1; # default mode is ASCII. 72 73 my $ini_pos = pos($$rsrc); 74 75 substr( $src, 0, $pos ) = ''; 76 77 my $ini_len = bytes::length($src); 78 79 # $trm is the first of the pair '~~', then 2nd tilde is to be removed. 80 # XXX: Is better C<$src =~ s/^\x7E// or die if ...>? 81 $src =~ s/^\x7E// if $trm eq "\x7E"; 82 83 while ( length $src ) { 84 my $now; 85 if ($in_ascii) { # ASCII mode 86 if ( $src =~ s/^([\x00-\x7D\x7F])// ) { # no '~' => ASCII 87 $now = $1; 88 } 89 elsif ( $src =~ s/^\x7E\x7E// ) { # escaped tilde 90 $now = '~'; 91 } 92 elsif ( $src =~ s/^\x7E\cJ// ) { # '\cJ' == LF in ASCII 93 next; 94 } 95 elsif ( $src =~ s/^\x7E\x7B// ) { # '~{' 96 $in_ascii = 0; # to GB 97 next; 98 } 99 else { # encounters an invalid escape, \x80 or greater 100 last; 101 } 102 } 103 else { # GB mode; the byte ranges are as in RFC 1843. 104 if ( $src =~ s/^((?:[\x21-\x77][\x21-\x7F])+)// ) { 105 $now = $GB->decode( $1, $chk ); 106 } 107 elsif ( $src =~ s/^\x7E\x7D// ) { # '~}' 108 $in_ascii = 1; 109 next; 110 } 111 else { # invalid 112 last; 113 } 114 } 115 116 next if !defined $now; 117 118 $ret .= $now; 119 120 if ( $now eq $trm ) { 121 $$rdst .= $ret; 122 $$rpos = $ini_pos + $pos + $ini_len - bytes::length($src); 123 pos($$rsrc) = $ini_pos; 124 return 1; 125 } 126 } 127 128 $$rdst .= $ret; 129 $$rpos = $ini_pos + $pos + $ini_len - bytes::length($src); 130 pos($$rsrc) = $ini_pos; 131 return ''; # terminator not found 132 } 133 134 sub encode($$;$) { 135 my ( $obj, $str, $chk ) = @_; 136 137 my $GB = Encode::find_encoding('gb2312-raw'); 138 my $ret = ''; 139 my $in_ascii = 1; # default mode is ASCII. 140 141 no warnings 'utf8'; # $str may be malformed UTF8 at the end of a chunk. 142 143 while ( length $str ) { 144 if ( $str =~ s/^([[:ascii:]]+)// ) { 145 my $tmp = $1; 146 $tmp =~ s/~/~~/g; # escapes tildes 147 if ( !$in_ascii ) { 148 $ret .= "\x7E\x7D"; # '~}' 149 $in_ascii = 1; 150 } 151 $ret .= pack 'a*', $tmp; # remove UTF8 flag. 152 } 153 elsif ( $str =~ s/(.)// ) { 154 my $s = $1; 155 my $tmp = $GB->encode( $s, $chk ); 156 last if !defined $tmp; 157 if ( length $tmp == 2 ) { # maybe a valid GB char (XXX) 158 if ($in_ascii) { 159 $ret .= "\x7E\x7B"; # '~{' 160 $in_ascii = 0; 161 } 162 $ret .= $tmp; 163 } 164 elsif ( length $tmp ) { # maybe FALLBACK in ASCII (XXX) 165 if ( !$in_ascii ) { 166 $ret .= "\x7E\x7D"; # '~}' 167 $in_ascii = 1; 168 } 169 $ret .= $tmp; 170 } 171 } 172 else { # if $str is malformed UTF8 *and* if length $str != 0. 173 last; 174 } 175 } 176 $_[1] = $str if $chk; 177 178 # The state at the end of the chunk is discarded, even if in GB mode. 179 # That results in the combination of GB-OUT and GB-IN, i.e. "~}~{". 180 # Parhaps it is harmless, but further investigations may be required... 181 182 if ( !$in_ascii ) { 183 $ret .= "\x7E\x7D"; # '~}' 184 $in_ascii = 1; 185 } 186 return $ret; 187 } 188 189 1; 190 __END__ 191 192 =head1 NAME 193 194 Encode::CN::HZ -- internally used by Encode::CN 195 196 =cut
title
Description
Body
title
Description
Body
title
Description
Body
title
Body
Generated: Tue Mar 17 22:47:18 2015 | Cross-referenced by PHPXref 0.7.1 |