标签:
0x01
spider 抓取 URL:使用 file_get_contents()/fopen() 函数读取页面内容,再用正则表达式匹配出其中的链接(这大概是最简单的方式了,先记录下来,后续再补充)。
0x02
用法:php spider.php www.baidu.com,结果保存在当前目录下名为 www.baidu.com 的文件中。
Code:
-----------------
<?php
// 2015-4-16 @developd
// Fetch a web page over HTTP and record every href target found in it.
// The results are appended to a file named after the URL argument.
// Usage: php spider.php url

if ($argc !== 2) {
    echo "No Target...\r\n";
    echo "Usage:php spider.php url \r\n";
    echo "eg:php spider.php www.baidu.com \r\n";
    exit();
}

$url = $argv[1];
if (empty($url)) {
    echo "URL Error \r\n";
    exit();
}

// The bare argument (without scheme) doubles as the output file name.
$filename = $url;
$url = "http://" . $url;

// Create the output file up front so it exists even when no links are found.
if (!file_exists($filename)) {
    $file = fopen($filename, "a+");
    if ($file === false) {
        echo "Cannot open output file: {$filename}\r\n";
        exit(1);
    }
    fclose($file);
}

// Scheme + host part of the URL, used to absolutise root-relative links.
// Searching from offset 8 skips the two slashes of "http://"; when the URL
// has no path component there is no further slash and the whole URL is the site.
$pathStart = strpos($url, '/', 8);
$site = ($pathStart === false) ? $url : substr($url, 0, $pathStart);

// Read the page; bail out instead of looping on an invalid handle.
$fp = fopen($url, 'r');
if ($fp === false) {
    echo "URL Error \r\n";
    exit(1);
}
$contents = stream_get_contents($fp);
fclose($fp);
if ($contents === false) {
    $contents = '';
}

// Ungreedy match of href="..." / href='...' attribute values.
$pattern = "|href=['\"]?([^'\"]+)['\"]|U";
$regArr = [];
if (preg_match_all($pattern, $contents, $regArr, PREG_SET_ORDER) === false) {
    $regArr = [];
}

$lines = '';
foreach ($regArr as $i => $match) {
    $link = $match[1];
    if (substr($link, 0, 2) === '//') {
        // Protocol-relative link: it already names its host, only the scheme is missing.
        $link = 'http:' . $link;
    } elseif (substr($link, 0, 1) === '/') {
        // Root-relative link: prefix with the scheme and host of the fetched page.
        $link = $site . $link;
    }
    $lines .= "URL" . ($i + 1) . ":" . $link . PHP_EOL;
}

// Append all collected lines in one write; nothing is written when no link matched.
$res = ($lines === '') ? 0 : file_put_contents($filename, $lines, FILE_APPEND);

if (empty($res)) {
    echo "No URLS...\r\n";
} else {
    echo "Get URLS success...\r\n";
}
?>
-----------------
标签:
原文地址:http://www.cnblogs.com/developd/p/4441191.html