标签:
最常见的词语二分法:
$str = '这是我的网站www.7di.net!';
// If the input is GB2312-encoded, convert it to UTF-8 first:
// $str = iconv('GB2312', 'UTF-8', $str);
$result = spStr($str);
print_r($result);

/**
 * Bigram ("binary") word segmentation for UTF-8 Chinese text.
 *
 * Punctuation is turned into separators, runs of digits and runs of ASCII
 * letters are extracted as whole tokens, and every remaining segment is cut
 * into overlapping two-character tokens.
 *
 * @param string $str UTF-8 encoded input text.
 *
 * @return array Tokens in this order: digit runs, ASCII letter runs, then
 *               the bigrams (a one-character segment yields that character).
 */
function spStr($str)
{
    $search = array(
        // ASCII punctuation and control characters.
        ',', '/', '\\', '.', ';', ':', '"', '!', '~', '`', '^', '(', ')', '?', '-',
        "\t", "\n", "'", '<', '>', "\r", '$', '&', '%', '#', '@', '+', '=',
        '{', '}', '[', ']',
        // CJK / full-width punctuation.
        // NOTE(review): the published listing repeated the ASCII forms of
        // : ) ( . , ! ; [ ] - and the space here, which looks like full-width
        // characters flattened by the page scraper; the full-width forms are
        // restored on that assumption - confirm against the original article.
        ':', ')', '(', '.', '。', ',', '!', ';', '“', '”', '‘', '’',
        '[', ']', '、', '—', "\xE3\x80\x80" /* ideographic space U+3000 */,
        '《', '》', '-', '…', '【', '】',
    );

    $str = str_replace($search, ' ', $str);

    // Pull out ASCII words and numbers before blanking them, so that only
    // the non-alphanumeric text is left for bigram splitting.
    preg_match_all('/[a-zA-Z]+/', $str, $estr);
    preg_match_all('/[0-9]+/', $str, $nstr);
    $str = preg_replace('/[0-9a-zA-Z]+/', ' ', $str);
    $str = preg_replace('/\s{2,}/', ' ', $str);

    $cstr = array();
    foreach (explode(' ', trim($str)) as $segment) {
        if ($segment === '') {
            continue;
        }

        // Split on character boundaries rather than assuming 3 bytes per
        // character, so 1-, 2- and 4-byte characters are never cut apart.
        $chars = preg_split('//u', $segment, -1, PREG_SPLIT_NO_EMPTY);
        if ($chars === false) {
            // Not valid UTF-8: no character boundaries to work with.
            continue;
        }

        $count = count($chars);
        if ($count === 1) {
            $cstr[] = $chars[0];
            continue;
        }

        for ($i = 0; $i + 1 < $count; $i++) {
            // A pair is emitted only when its second character is multi-byte
            // (the per-character form of the original high-byte check).
            if (strlen($chars[$i + 1]) > 1) {
                $cstr[] = $chars[$i] . $chars[$i + 1];
            }
        }
    }

    $estr = isset($estr[0]) ? $estr[0] : array();
    $nstr = isset($nstr[0]) ? $nstr[0] : array();

    return array_merge($nstr, $estr, $cstr);
}
执行结果是:
Array ( [0] => 7 [1] => www [2] => di [3] => net [4] => 这是 [5] => 是我 [6] => 我的 [7] => 的网 [8] => 网站 )
接下来,将以上结果转换为区位码,PHP代码是:
// $result is the token list returned by spStr(); default to an empty list
// so this snippet also runs when no tokens are available.
$tokens = isset($result) ? $result : array();

$code = array();
foreach ($tokens as $s) {
    $gb = iconv('UTF-8', 'GB2312', $s);
    if ($gb === false) {
        // The token has a character GB2312 cannot represent; keep the
        // token unchanged instead of passing `false` on to gbCode().
        $code[] = $s;
        continue;
    }
    $code[] = gbCode($gb);
}
$code = implode(' ', $code);
echo $code;

/**
 * Convert a GB2312-encoded string into its area/position (quwei) code.
 *
 * Each two-byte character becomes four decimal digits: the lead byte minus
 * 160 followed by the trail byte minus 160, both zero-padded to two digits.
 *
 * @param string $str GB2312-encoded bytes.
 *
 * @return string The concatenated codes, or $str unchanged when it is not
 *                made up entirely of two-byte characters in 0xA1-0xFE
 *                (for example ASCII words and numbers).
 */
function gbCode($str)
{
    // Whole byte pairs only; the D modifier stops `$` from matching before
    // a trailing newline, so no offset past the end is ever read.
    if (!preg_match('/^(?:[\xa1-\xfe]{2})+$/D', $str)) {
        return $str;
    }

    $return = '';
    $len = strlen($str);
    for ($i = 0; $i < $len; $i += 2) {
        $return .= sprintf('%02d%02d', ord($str[$i]) - 160, ord($str[$i + 1]) - 160);
    }

    return $return;
}
标签:
原文地址:http://www.cnblogs.com/shouce/p/5460467.html