码迷,mamicode.com
首页 > 编程语言 > 详细

***PHP多线程pthreads 实现QQ号码爬虫

时间:2015-07-23 19:33:11      阅读:129      评论:0      收藏:0      [点我收藏+]

标签:

通过空间历史浏览,爬出查看你空间的人(一般限制20人,除非开通黄钻),然后再爬出这20人的浏览记录,依次向下爬,你可以控制爬行深度。
这里仅仅给出爬虫代码片段,你可以进一步优化,将QQ分类存储。通过QQ相互浏览关系,可以通过绘图工具绘制好友网络,等等。

代码涉及pthreads 如果不清楚请阅读:《PHP 高级编程之多线程》
http://netkiller.github.io/journal/thread.php.html

 

<?php
/*
Homepage: http://netkiller.github.io
Author: Neo <netkiller@msn.com>
*/
// Abort early when the pthreads extension is missing: the Worker, Stackable
// and Pool classes used below are provided by it.
if(!extension_loaded('pthreads')) die ('Please install pthreads');
 
// Snoopy is the HTTP client used by Crawler::qzone().
include_once('Snoopy.class.php');
 
/**
 * Worker class handed to the Pool; intended to hold a database handle that
 * tasks can obtain through getInstance().
 *
 * The connection set-up inside run() is commented out, so nothing in this
 * class ever assigns self::$dbh.
 */
class CrawlerWorker extends Worker {
 
    // Database handle returned by getInstance(); only the disabled sample
    // code in run() would assign it.
    // NOTE(review): presumably static so that each worker thread keeps its
    // own handle - confirm against the pthreads documentation.
    protected  static $dbh;
    // Intentionally empty: the worker takes no constructor arguments.
    public function __construct() {
 
    }
    // Worker entry point. The body is empty because the sample connection
    // code below is disabled.
    public function run(){
    /*
        Disabled sample: open a persistent PDO MySQL connection and keep it
        in self::$dbh.

        $dbhost = 'db.example.com';         // database server
        $dbuser = 'example.com';            // database user name
        $dbpw = 'password';                 // database password
        $dbname = 'example';                // database name
 
        self::$dbh  = new PDO("mysql:host=$dbhost;port=3306;dbname=$dbname", $dbuser, $dbpw, array(
            PDO::MYSQL_ATTR_INIT_COMMAND => 'SET NAMES \'UTF8\'',
            PDO::MYSQL_ATTR_COMPRESS => true,
            PDO::ATTR_PERSISTENT => true
            )
        );
    */
    }
    // Returns self::$dbh (null while the connection code in run() stays
    // disabled).
    // NOTE(review): Crawler::run() calls this through $this->worker even
    // though it is protected; this appears to rely on pthreads v2 method
    // modifier semantics - confirm before changing the visibility.
    protected function getInstance(){
        return self::$dbh;
    }
 
}
 
/* the collectable class implements machinery for Pool::collect */
/**
 * Pool task that crawls Qzone visitor lists outward from one seed QQ number.
 *
 * Level 1 fetches the visitors of the seed, level 2 fetches the visitors of
 * each of those accounts, and so on until $depth levels have been entered.
 * Every fetched list is printed with print_r().
 */
class Crawler extends Stackable {
    /**
     * Number of recursion levels to enter. The deepest level is announced
     * but its QQ numbers are not fetched.
     */
    public $depth = 3;

    /**
     * @param string|int $qq Seed QQ number the crawl starts from.
     */
    public function __construct($qq) {
        $this->qq = $qq;
    }

    /**
     * Task entry point: crawls outward from the seed QQ number.
     */
    public function run() {
        try {
            // Hook for persisting results; the handle is null while the
            // connection code in CrawlerWorker::run() is disabled.
            $dbh  = $this->worker->getInstance();
            $this->recursion(array($this->qq));
        }
        catch(PDOException $e) {
            // Record which seed failed and why.
            $error = sprintf("%s,%s\n", $this->qq, $e->getMessage());
            file_put_contents("mobile_error.log", $error, FILE_APPEND);
        }
    }

    /**
     * Fetches the visitor list of every QQ number in $qqs and recurses into
     * each list until $this->depth levels have been entered.
     *
     * The depth is carried in $level instead of shared state, so sibling
     * branches and other tasks on the same worker cannot disturb each other.
     *
     * @param array $qqs   QQ numbers to visit at this level.
     * @param int   $level Levels already entered above this call; external
     *                     callers normally omit it.
     * @return void
     */
    public function recursion($qqs, $level = 0){
        $level++;
        printf("Level: %s\n", $level);
        // Random pause between 10 ms and 1 s to spread out the requests.
        usleep(mt_rand(10000,1000000));
        if($level >= $this->depth){
            return;
        }

        foreach($qqs as $uin) {
            $lst = $this->qzone($uin);
            print_r($lst);
            $this->recursion($lst, $level);
        }
    }

    /**
     * Requests the visitor list of one QQ number from the mobile Qzone
     * endpoint and extracts the visitors' QQ numbers.
     *
     * NOTE(review): g_tk, t and sid in the URL look like values captured
     * from a logged-in browser session - presumably they must be replaced
     * with valid ones for the request to succeed.
     *
     * @param string|int $qq QQ number whose visitors are requested.
     * @return array Visitor QQ numbers; empty when the request fails or the
     *               response does not contain data->list.
     */
    public function qzone($qq){
        // The QQ number may come from a previous remote response, so encode it.
        $url = 'http://m.qzone.com/mqz_get_visitor?g_tk=1191852101&res_mode=0&res_uin='.urlencode($qq).'&offset=0&count=100&page=1&format=json&t=1401762986882&sid=dODKVcYv6azjN87cxXQ5mao1xgakYjHg18c8aa5e0201%3D%3D';
        $snoopy = new Snoopy;

        // Optional proxy:
        //$snoopy->proxy_host = "my.proxy.host";
        //$snoopy->proxy_port = "8080";

        // Browser identification and referer sent with the request.
        $snoopy->agent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)";
        $snoopy->referer = "http://m.qzone.com/";

        // Extra raw request header.
        $snoopy->rawheaders["Pragma"] = "no-cache";

        // Redirect and link handling.
        $snoopy->maxredirs = 2;
        $snoopy->offsiteok = false;
        $snoopy->expandlinks = false;

        if(!$snoopy->fetchtext($url)){
            print "Snoopy: error while fetching document: ".$snoopy->error."\n";
            // Always hand an array back so recursion() can iterate it.
            return array();
        }

        $results = array();
        $tmp = json_decode($snoopy->results);

        // Only walk the list when the response really has the expected shape.
        if(is_object($tmp) && isset($tmp->data->list) && is_array($tmp->data->list)){
            foreach( $tmp->data->list as $lst ){
                if(isset($lst->uin)){
                    $results[] = $lst->uin;
                }
            }
        }
        return $results;
    }
}
 
// Pool of up to 100 CrawlerWorker threads; the empty array is the list of
// constructor arguments passed to each worker.
$pool = new Pool(100, \CrawlerWorker::class, []);
 
#foreach (range(1000, 100000) as $number) {
#   $pool->submit(new Crawler($number));
#}
 
// Seed QQ numbers: each submitted task crawls outward from one account.
$pool->submit(new Crawler('13721218'));
$pool->submit(new Crawler('291379'));
// Add further seeds the same way, and so on:
//$pool->submit(new Crawler('xxx'));
//$pool->submit(new Crawler('nnn'));
 
// Shut the workers down once the submitted tasks have been processed.
$pool->shutdown();
?>

 

***PHP多线程pthreads 实现QQ号码爬虫

标签:

原文地址:http://www.cnblogs.com/kenshinobiy/p/4671244.html

(0)
(0)
   
举报
评论 一句话评论(0
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!