标签:bin tmp code use close hand class highlight 时间
思路是:先设定每个子文本的最大长度(字数),把大文本分割成多个子文本分别统计,最后再把各子文本的统计结果合并。
词频的统计方式是:把当前位置的字与前一位置的字组合成二元组(bigram),以该组合为键存入 hash 并累加计数。
代码如下
use Encode; ##编码解码
# Driver: count character bigrams of train.txt in chunks, merge the chunk
# files into bi.txt, remove the chunk files, then count single characters.

# Print the start time ("time /t" prints the current time under Windows cmd.exe).
system("time /t");

# Maximum number of characters counted before BiCount flushes its table to a
# temporary file.
$g_MaxBiNum=1000000;

# BiCount records the names of the temporary files it writes in @BiTmp.
BiCount("train.txt");
MergeBi(\@BiTmp,"bi.txt");

# The temporary chunk files are not needed once they have been merged.
foreach my $TmpFile (@BiTmp){
    unlink($TmpFile) or warn "Cannot remove $TmpFile: $!\n";
}

# Print the end time of the bigram pass.
system("time /t");

# Count single-character frequencies. The parentheses are required: the sub is
# defined further down the file, so a bare `TrainWordToNum;` would be compiled
# as a string constant and the sub would never be called.
TrainWordToNum();
# BiCount($File)
#
# Counts character bigrams ("previous_current") in $File. All whitespace is
# removed from each line first, and the previous character is carried over
# from one line to the next, so bigrams also span line breaks.
#
# A byte with its high bit set is treated as the first byte of a two-byte
# character (presumably GBK text, since TrainWordToNum decodes the same
# training file as GBK); any other byte is a one-byte character.
#
# Whenever more than $g_MaxBiNum characters have been counted, the current
# table is written to "tmp_<n>" as sorted "bigram<TAB>count" lines and reset.
# The remaining table is written after the last line as well, so the final
# (or only) partial chunk is not lost.
#
# Side effects: resets the package variable @BiTmp and fills it with the
# names of the temporary files written, in order.
# Returns: the list of temporary file names (same content as @BiTmp).
# Dies if $File cannot be read or a temporary file cannot be written.
sub BiCount
{
    my ($File) = @_;
    our ($g_MaxBiNum, @BiTmp);

    # Fall back to the script's default chunk size when the global is unset;
    # an undefined limit would otherwise flush after every single character.
    my $MaxZi  = defined $g_MaxBiNum ? $g_MaxBiNum : 1000000;
    my $BiFile = "tmp";
    my $ID     = 0;     # sequence number of the next temporary file
    my $ZiNum  = 0;     # characters counted since the last flush
    my $H1     = "";    # previous character ("" before the first one)
    my %hashBi;         # bigram => count for the current chunk
    @BiTmp = ();

    open(my $In, '<', $File) or die "Cannot open $File: $!\n";
    while (my $Line = <$In>) {
        chomp $Line;
        $Line =~ s/\s+//g;

        # Walk the line by offset instead of repeatedly re-copying its tail.
        my $Total = length($Line);
        my $Pos   = 0;
        while ($Pos < $Total) {
            my $Len = (ord(substr($Line, $Pos, 1)) & 0x80) ? 2 : 1;
            my $H2  = substr($Line, $Pos, $Len);
            $hashBi{$H1."_".$H2}++ if $H1 ne "";
            $H1   = $H2;
            $Pos += $Len;
            $ZiNum++;

            if ($ZiNum > $MaxZi) {
                push(@BiTmp, _WriteBiChunk($BiFile."_".$ID, \%hashBi));
                %hashBi = ();
                $ZiNum  = 0;
                $ID++;
            }
        }
    }
    close($In);

    # Flush whatever was counted after the last full chunk.
    if (%hashBi) {
        push(@BiTmp, _WriteBiChunk($BiFile."_".$ID, \%hashBi));
    }

    return @BiTmp;
}

# Writes one bigram table to $Name as "bigram<TAB>count" lines sorted by
# bigram (string order) and returns $Name. Dies on I/O failure.
sub _WriteBiChunk
{
    my ($Name, $RefCount) = @_;

    open(my $Out, '>', $Name) or die "Cannot write $Name: $!\n";
    foreach my $Bi (sort keys %{$RefCount}) {
        print {$Out} "$Bi\t$RefCount->{$Bi}\n";
    }
    close($Out) or die "Cannot close $Name: $!\n";

    print "$Name done!\n";
    return $Name;
}
# MergeBi($RefBiFileList, $Merged)
#
# Merges several bigram count files into $Merged.
#
#   $RefBiFileList  array reference of input file names; each file holds
#                   "bigram<TAB>count" lines sorted by bigram in string order
#                   (the format BiCount writes)
#   $Merged         name of the output file
#
# One pending record is kept per input file. The smallest pending bigram is
# written with the sum of its counts over all files that currently offer it,
# and the next record is then read from exactly those files.
#
# Lines that do not look like "bigram<TAB>count" are skipped. The output file
# is closed before returning, so it is complete on disk when the caller goes
# on to use it. Dies if a file cannot be opened or the output cannot be
# written.
sub MergeBi
{
    my ($RefBiFileList, $Merged) = @_;

    open(my $Out, '>', $Merged) or die "Cannot write $Merged: $!\n";

    my @Handles;    # input handles, indexed like @{$RefBiFileList}
    my %Pending;    # bigram => { input index => count }

    foreach my $Idx (0 .. $#{$RefBiFileList}) {
        my $Name = $RefBiFileList->[$Idx];
        open(my $Fh, '<', $Name) or die "Cannot open $Name: $!\n";
        push(@Handles, $Fh);
        _ReadBiRecord($Fh, $Idx, \%Pending);
    }

    while (%Pending) {
        # At most one pending key per input file, so this sort stays small.
        my ($Smallest) = sort keys %Pending;
        my $FromFile   = delete $Pending{$Smallest};

        my $Num = 0;
        $Num += $_ for values %{$FromFile};
        print {$Out} "$Smallest\t$Num\n";

        # Refill only from the inputs whose record was just consumed.
        foreach my $Idx (keys %{$FromFile}) {
            _ReadBiRecord($Handles[$Idx], $Idx, \%Pending);
        }
    }

    close($_) for @Handles;
    close($Out) or die "Cannot close $Merged: $!\n";
    return;
}

# Reads the next "bigram<TAB>count" record from $Fh into
# $RefPending->{bigram}{$Idx}, skipping lines that do not match.
# Returns 1 if a record was stored, 0 at end of file.
sub _ReadBiRecord
{
    my ($Fh, $Idx, $RefPending) = @_;

    while (defined(my $Line = <$Fh>)) {
        if ($Line =~ /(\S+)\t(\d+)/) {
            $RefPending->{$1}{$Idx} = $2;
            return 1;
        }
    }
    return 0;
}
# TrainWordToNum([$File])
#
# Counts how often each character occurs in a GBK-encoded text file.
#
#   $File   optional input file name; defaults to "train.txt"
#
# Every line is chomped and decoded from GBK so that it can be split into
# whole characters rather than bytes. Each character is encoded back to GBK
# and used as the key whose counter is incremented in the package hash
# %Word2Num. Counts accumulate across calls because the hash is not reset.
#
# Returns: a reference to %Word2Num.
# Dies if the file cannot be opened.
sub TrainWordToNum{
    my ($File) = @_;
    $File = "train.txt" unless defined $File;
    our %Word2Num;

    open(my $In, '<', $File) or die "Cannot open $File: $!\n";
    while (my $Line = <$In>) {
        chomp $Line;
        # /./g yields one element per decoded character.
        foreach my $Char (decode("GBK", $Line) =~ /./g) {
            $Word2Num{ encode("GBK", $Char) }++;
        }
    }
    close($In);

    return \%Word2Num;
}
标签:bin tmp code use close hand class highlight 时间
原文地址:http://www.cnblogs.com/cagercoding/p/6910829.html