码迷,mamicode.com
首页 > 其他好文 > 详细

scws自定义分词库

时间:2016-08-18 16:04:34      阅读:379      评论:0      收藏:0      [点我收藏+]

标签:

CleverCode发现scws分词的效率挺高,研究了一下自定义分词库。

1  安装scws

安装详解:http://blog.csdn.net/clevercode/article/details/52204124。


2 没有添加自定义分词库

2.1 php代码

# vim parseWord.php

<?php

/**
 * Segment a GBK-encoded string into words with SCWS.
 *
 * The text is converted to UTF-8 and segmented against dict.utf8.xdb located
 * in the directory named by the scws.default.fpath ini setting. Loading the
 * custom dictionary (mydict.xdb) is intentionally left commented out below.
 *
 * @param string $str text to segment, expected to be GBK-encoded
 * @return array one element per successful get_result() call; empty when the
 *               input is empty or cannot be converted to UTF-8
 */
function parse($str)
{
    // Nothing to segment: answer without opening an SCWS handle.
    if((string)$str === '')
    {
        return array();
    }

    // iconv() returns false on failure; do not hand that to SCWS.
    $utf8Str = iconv("GBK","UTF-8//IGNORE",$str);
    if($utf8Str === false || $utf8Str === '')
    {
        return array();
    }

    $cws = scws_new();
    // The dictionary and the text are UTF-8, whereas SCWS falls back to gbk
    // unless scws.default.charset is configured, so set the charset explicitly.
    $cws->set_charset('utf8');
    $dictPath = ini_get('scws.default.fpath').'/dict.utf8.xdb';
    $cws->set_dict($dictPath);

    // Custom dictionary: uncomment the add_dict() call below to load it.
    $myDictPath = ini_get('scws.default.fpath').'/mydict.xdb';
    if(file_exists($myDictPath))
    {
        //$cws->add_dict($myDictPath);
    }
    $cws->set_ignore(true);

    $cws->send_text($utf8Str);
    $resArr = array();
    // get_result() hands back one batch per call and a falsy value when done.
    while($tmp = $cws->get_result())
    {
        $resArr[] = $tmp;
    }
    $cws->close();

    return $resArr;
}

/**
 * Demo entry point: segments a fixed sample phrase with parse() and prints
 * the input followed by the print_r() dump of the result, the dump being
 * converted from UTF-8 to GBK before it is echoed.
 */
function start()
{
    $sample = '里约奥运洪荒之力';
    $dump = print_r(parse($sample), true);

    echo '<p>输入:', $sample, '</p>', "\r\n";
    echo '<p>分词:', iconv("UTF-8","GBK//IGNORE",$dump);
}

// Run the segmentation demo when this script is executed.
start();

?>


2.2 对【里约奥运洪荒之力】分词的结果

<p>输入:里约奥运洪荒之力</p>
<p>分词:Array
(
    [0] => Array
        (
            [0] => Array
                (
                    [word] => 里约
                    [off] => 0
                    [len] => 6
                    [idf] => 15.119999885559
                    [attr] => ns
                )

            [1] => Array
                (
                    [word] => 奥运
                    [off] => 6
                    [len] => 6
                    [idf] => 4.8800001144409
                    [attr] => n
                )

            [2] => Array
                (
                    [word] => 洪荒
                    [off] => 12
                    [len] => 6
                    [idf] => 8.0500001907349
                    [attr] => n
                )

            [3] => Array
                (
                    [word] => 之
                    [off] => 18
                    [len] => 3
                    [idf] => 0
                    [attr] => r
                )

            [4] => Array
                (
                    [word] => 力
                    [off] => 21
                    [len] => 3
                    [idf] => 0
                    [attr] => n
                )

        )

)

3 添加自定义词库

3.1 生成词库

# vim genMyDict.php

<?php
/**
 * Builds the custom SCWS dictionary.
 *
 * Writes the word list to a temporary text file, compiles it with
 * scws-gen-dict and moves the result over mydict.xdb. All three files live in
 * the directory named by the scws.default.fpath ini setting.
 */
class MyDict
{
    // Whether msgLog() echoes its messages.
    private $isLogStdOut = true;

    // Temporary plain-text word list handed to scws-gen-dict.
    private $myNewDictTxt;

    // Freshly compiled dictionary, before it replaces the live one.
    private $myNewDictXdb;

    // Live custom dictionary (mydict.xdb).
    private $myDictXdb;

    /**
     * Run the whole build: resolve paths, drop stale temporary files, write
     * the word list and compile/install the dictionary.
     *
     * @return void
     */
    public function run()
    {
        $this->init();

        $this->deleteOldFile();

        $words = $this->getMyWordData();

        $this->write2File($words, $this->myNewDictTxt);

        $this->genMyDict();
    }

    /**
     * Resolve the three working paths from the scws.default.fpath ini setting.
     *
     * @return void
     */
    private function init()
    {
        $path = ini_get('scws.default.fpath');
        $this->myNewDictTxt = $path.'/myNewDict.txt';
        $this->myNewDictXdb = $path.'/myNewDict.xdb';
        $this->myDictXdb = $path.'/mydict.xdb';
    }

    /**
     * Return the custom words to put into the dictionary.
     *
     * @return array list of words (source-file encoding, converted from GBK
     *               when written)
     */
    public function getMyWordData()
    {
        $words = array('里约奥运','洪荒之力');
        return $words;
    }

    /**
     * Remove temporary files left behind by a previous run.
     *
     * @return void
     */
    public function deleteOldFile()
    {
        $this->msgLog('INFO',"清除老文件");
        $this->removeFile($this->myNewDictTxt);
        $this->removeFile($this->myNewDictXdb);
    }

    /**
     * Delete one file if it exists, without going through a shell so that
     * paths containing spaces or shell metacharacters are handled correctly.
     *
     * @param string $path file to delete; unset or empty paths are ignored
     * @return void
     */
    private function removeFile($path)
    {
        if(!is_string($path) || $path === '')
        {
            return;
        }
        if((is_file($path) || is_link($path)) && !unlink($path))
        {
            $this->msgLog('WARNING', 'cannot delete '.$path);
        }
    }

    /**
     * Append the words to the text dictionary, one tab-separated record per
     * line: word, two numeric weights (10.00 each) and the attribute "n".
     * NOTE(review): the numeric columns are presumably TF and IDF as in the
     * SCWS text dictionary format - confirm against the SCWS documentation.
     *
     * @param array  $words words to write, converted from GBK to UTF-8
     * @param string $path  text file to append to
     * @return void
     */
    private function write2File(array $words, $path)
    {
        $buffer = '';
        foreach($words as $word)
        {
            $utf8Word = trim(mb_convert_encoding($word, 'utf-8', 'gbk'));
            if($utf8Word === '')
            {
                continue;
            }
            $line = sprintf("%s\t%.2f\t%.2f\t%.2s\n", $utf8Word, 10.00, 10.00, "n");
            // The log is echoed in GBK, hence the conversion back.
            $this->msgLog("INFO",mb_convert_encoding($line, 'gbk', 'utf-8'));
            $buffer .= $line;
        }

        // No usable word: leave the file untouched, as a per-word append would.
        if($buffer === '')
        {
            return;
        }
        if(file_put_contents($path, $buffer, FILE_APPEND) === false)
        {
            $this->msgLog('ERROR', 'cannot write '.$path);
        }
    }

    /**
     * Compile the text dictionary with scws-gen-dict and, only if that
     * succeeded, move the result over the live dictionary. Temporary files
     * are removed in either case.
     *
     * @return void
     */
    private function genMyDict()
    {
        $path = ini_get('scws.default.fpath');

        $this->msgLog('INFO',"生成myNewDict.xdb");
        // Every argument is escaped so unusual paths cannot break the command.
        $cmd = escapeshellarg($path.'/../bin/scws-gen-dict')
            .' -c utf8'
            .' -i '.escapeshellarg($this->myNewDictTxt)
            .' -o '.escapeshellarg($this->myNewDictXdb);
        $output = array();
        $status = 0;
        exec($cmd, $output, $status);

        if($status === 0 && is_file($this->myNewDictXdb))
        {
            $this->msgLog('INFO',"替换词典");
            if(!rename($this->myNewDictXdb, $this->myDictXdb))
            {
                $this->msgLog('ERROR', 'cannot replace '.$this->myDictXdb);
            }
        }
        else
        {
            // Keep the current dictionary rather than installing a broken one.
            $this->msgLog('ERROR', 'scws-gen-dict failed (exit status '.$status.'), dictionary not replaced');
        }

        $this->msgLog('INFO',"清除临时文件");
        $this->removeFile($this->myNewDictTxt);
        $this->removeFile($this->myNewDictXdb);
    }

    /**
     * Echo a log line of the form "<unix time>, [<level>]: <message>\r\n"
     * when logging is enabled.
     *
     * @param string $level  level label, e.g. INFO/WARNING/ERROR
     * @param string $logStr message text
     * @access public
     * @return void
     */
    public function msgLog($level,$logStr)
    {
        if($this->isLogStdOut)
        {
            $t = time();
            $logHdr = $t.", [".$level."]: ";
            $logStr = $logHdr.$logStr."\r\n";
            echo $logStr;
        }
    }

}

/**
 * Script entry point: creates a MyDict instance and runs the dictionary build.
 */
function start()
{
    $builder = new MyDict();
    $builder->run();
}

// Build the custom dictionary when this script is executed.
start();
?>


生成词典后的结果

技术分享


3.2 添加自定义词典

去掉 parseWord.php 的 parse() 函数中 $cws->add_dict($myDictPath); 这一行前面的注释符号(//),再次执行 php parseWord.php。结果如下,“里约奥运”和“洪荒之力”都被当成了完整的词。

<p>输入:里约奥运洪荒之力</p>
<p>分词:Array
(
    [0] => Array
        (
            [0] => Array
                (
                    [word] => 里约奥运
                    [off] => 0
                    [len] => 12
                    [idf] => 10
                    [attr] => n
                )

            [1] => Array
                (
                    [word] => 洪荒之力
                    [off] => 12
                    [len] => 12
                    [idf] => 10
                    [attr] => n
                )

        )

)



scws自定义分词库

标签:

原文地址:http://blog.csdn.net/clevercode/article/details/52241159

(0)
(0)
   
举报
评论 一句话评论(0
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!