码迷,mamicode.com
首页 > Web开发 > 详细

PHP 正则抓取页面函数整理

时间:2014-08-30 20:17:39      阅读:244      评论:0      收藏:0      [点我收藏+]

标签:blog   os   使用   io   strong   ar   for   数据   div   

整理了一些抓取页面的函数，方便以后使用。

 

 

// Fetch a page
/**
 * Download the resource at the given URL through cURL.
 *
 * @param string $url Address to request.
 * @return string|bool The response body, or false when curl_exec() fails.
 */
function getcontents($url) {
    $connectTimeoutSeconds = 5;

    $handle = curl_init();
    // The URL goes last so that, should cURL reject it, the other two
    // options have already been applied.
    curl_setopt_array($handle, array(
        CURLOPT_RETURNTRANSFER => 1, // hand the body back instead of printing it
        CURLOPT_CONNECTTIMEOUT => $connectTimeoutSeconds,
        CURLOPT_URL            => $url,
    ));

    return curl_exec($handle);
}

// Extract the cells of an HTML table
/**
 * Convert the HTML of a table into a two-dimensional array of cell texts.
 *
 * Opening <table>/<tr>/<td> tags are removed, each closing </tr> and </td>
 * is turned into a marker, all remaining tags are stripped, and the text is
 * then split on those markers. Only <td> cells produce columns; any text
 * after the last </td> of a row or the last </tr> of the table is discarded.
 * Spaces inside cell text are removed as well.
 *
 * @param string $table HTML fragment containing the table markup.
 * @return array List of rows, each row being a list of cell strings; an
 *               empty array when the input contains no closed row.
 */
function get_td_array($table) {
    // Drop the opening tags together with their attributes.
    $table = preg_replace('~<table[^>]*?>~i', '', $table);
    $table = preg_replace('~<tr[^>]*?>~i', '', $table);
    $table = preg_replace('~<td[^>]*?>~i', '', $table);

    // Turn the closing tags into split markers. Case-insensitive, so that
    // </TR> and </TD> are handled like the opening tags above.
    $table = str_ireplace('</tr>', '{tr}', $table);
    $table = str_ireplace('</td>', '{td}', $table);

    // Strip every remaining HTML tag.
    $table = preg_replace('~<[/!]*?[^<>]*?>~i', '', $table);

    // Remove whitespace: a line break followed by further whitespace, then
    // every plain space.
    $table = preg_replace('~[\r\n]\s+~', '', $table);
    // NOTE(review): the original stripped " " twice; the second call was
    // presumably meant for the &nbsp; entity -- confirm against real input.
    $table = str_replace(array(' ', '&nbsp;'), '', $table);

    // Start from an empty list so a table without rows yields array()
    // instead of an undefined variable.
    $td_array = array();

    $rows = explode('{tr}', $table);
    array_pop($rows); // fragment after the last row marker
    foreach ($rows as $tr) {
        $td = explode('{td}', $tr);
        array_pop($td); // fragment after the last cell marker
        $td_array[] = $td;
    }
    return $td_array;
}


// Submit data with POST to simulate a form submission while scraping a page
/**
 * Send the given fields to a URL as a URL-encoded POST body through cURL.
 *
 * @param string $url      Address to post to.
 * @param array  $postData Field name => scalar value pairs to send.
 * @return string|bool The response body, or false when $url is empty or
 *                     curl_exec() fails.
 */
function curlPost($url, $postData = array()) {
    if (empty($url)) {
        return false;
    }

    // Encode the names as well as the values, so that a name containing
    // "&", "=" or a space cannot corrupt the request body.
    $pairs = array();
    foreach ($postData as $key => $value) {
        $pairs[] = urlencode((string) $key) . '=' . urlencode($value);
    }
    // implode() gives an empty string for no fields; trimming the trailing
    // "&" with substr() returned false for that case on PHP < 8.
    $body = implode('&', $pairs);

    $ch = curl_init();

    $timeout = 5;
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1); // hand the body back instead of printing it
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout);

    curl_setopt($ch, CURLOPT_POST, 1);
    curl_setopt($ch, CURLOPT_HEADER, 0); // keep response headers out of the returned body
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_POSTFIELDS, $body);

    return curl_exec($ch);
}

 

// Data to submit
// $url, $totalPage, $prepage and $page are not defined in this excerpt; they
// must be set before this point.
$postData = array(
    // NOTE(review): iconv() treats the literal as GBK bytes, which is only
    // correct if this file itself is saved in GBK -- confirm.
    'region_fullname' => iconv('GBK', 'UTF-8', '黄山'),
    // Single-quoted, so these field names literally start with "$";
    // presumably the target form expects them -- verify.
    '$total' => $totalPage,
    '$pgsz' => $prepage,
    '$pg' => $page,
);
$contents = curlPost($url, $postData);

  

 

PHP 正则抓取页面函数整理

标签:blog   os   使用   io   strong   ar   for   数据   div   

原文地址:http://www.cnblogs.com/threemore/p/3946737.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!