标签:地址 amp ase www oca not stat https ...
实验楼的 PHP 比赛题:网页数据提取。
题目的地址:https://www.shiyanlou.com/contests/lou5/challenges
以下代码是题目的答案
<?php
// Announce the response charset only while headers can still be sent
// (e.g. not after earlier output), so this never raises a warning.
if (!headers_sent()) {
    header("Content-Type:text/html;charset=utf-8");
}

/**
 * Extracts course records from a saved course-list HTML page and stores them
 * in the `course_data` MySQL table.
 *
 * Each course contributes four values, kept as parallel lists in $data:
 *   - cnames: text of <div class="course-name">
 *   - cdescs: text of <div class="course-desc">
 *   - ctypes: the Chinese characters found in <div class="course-footer">,
 *             or "免费" when the footer contains none
 *   - nlongs: "true" / "false", whether the name is longer than 16 characters
 *
 * Progress messages are echoed as HTML fragments while the work is done.
 */
class Crawler
{
    /** @var string HTML currently being parsed (whole file, then the <body> contents). */
    private $content = '';

    /** @var array Parsed columns keyed by 'cnames', 'cdescs', 'ctypes', 'nlongs'. */
    private $data = [];

    /** @var mysqli|null Connection used by saveData(); null when no save is in progress. */
    static private $mysql;

    public function __construct()
    {
        echo "开始爬取内容....";
    }

    /**
     * Reads the HTML document to parse.
     *
     * @param string $file_path Path (or stream-wrapper URL) accepted by file_get_contents().
     *
     * @throws RuntimeException When the document cannot be read.
     */
    public function loadFile($file_path)
    {
        echo "正在加载文件";

        // file_get_contents() reports failures as warnings; turn them into an
        // exception for the duration of the call so a missing file is not
        // silently parsed as an empty document.
        set_error_handler(static function ($severity, $message) {
            throw new RuntimeException($message);
        });
        try {
            $content = file_get_contents($file_path);
        } finally {
            restore_error_handler();
        }

        if ($content === false) {
            throw new RuntimeException("Unable to read " . $file_path);
        }

        $this->content = $content;
    }

    /**
     * Narrows $content to the inner HTML of the <body> element.
     *
     * The content is left untouched when no <body> element is found (or the
     * match fails), so the later parse steps still run on the full document.
     */
    public function parseCourseBody()
    {
        // preg_match (not preg_match_all) and capture group 1: $content must
        // stay a string because every later step uses it as a regex subject.
        if (preg_match('/<body[^>]*>(.*)<\/body>/is', $this->content, $matches)) {
            $this->content = $matches[1];
        }
    }

    /**
     * Runs the whole pipeline: parse every column, then write the rows to the database.
     *
     * @throws RuntimeException When saving fails (see saveData()).
     */
    public function parseContent()
    {
        echo "开始解析内容...<br/>";
        $this->parseCourseBody();
        $this->parseTitle();
        $this->parseDesc();
        $this->parseType();
        $this->titleIsLong();
        $this->saveData();
        echo "解析内容结束!<br/>";
    }

    /**
     * Inserts one `course_data` row per parsed course name.
     *
     * Rows are written through a prepared statement, so quotes or other SQL
     * metacharacters in the scraped text cannot alter the query. A column
     * whose list has no entry for a given course is stored as NULL.
     *
     * @throws RuntimeException When the mysqli extension is missing, or (as
     *                          mysqli_sql_exception) when connecting or inserting fails.
     */
    public function saveData()
    {
        echo "存入数据库...<br/>";

        if (!extension_loaded('mysqli')) {
            throw new RuntimeException("The mysqli extension is required to save data.");
        }

        $cnames = isset($this->data['cnames']) ? $this->data['cnames'] : [];
        $cdescs = isset($this->data['cdescs']) ? $this->data['cdescs'] : [];
        $ctypes = isset($this->data['ctypes']) ? $this->data['ctypes'] : [];
        $nlongs = isset($this->data['nlongs']) ? $this->data['nlongs'] : [];

        // Make mysqli throw on every failure regardless of the PHP version's
        // default reporting mode. Note: this setting is process-wide.
        mysqli_report(MYSQLI_REPORT_ERROR | MYSQLI_REPORT_STRICT);

        self::$mysql = new mysqli("localhost", "root", "root", "databases");
        $stmt = null;
        try {
            self::$mysql->set_charset("utf8");
            $stmt = self::$mysql->prepare(
                "insert into `course_data`(`cname`,`cdesc`,`ctype`,`nlong`) values(?,?,?,?)"
            );

            foreach ($cnames as $key => $cname) {
                $cdesc = isset($cdescs[$key]) ? $cdescs[$key] : null;
                $ctype = isset($ctypes[$key]) ? $ctypes[$key] : null;
                $nlong = isset($nlongs[$key]) ? $nlongs[$key] : null;

                $stmt->bind_param("ssss", $cname, $cdesc, $ctype, $nlong);
                $stmt->execute();
            }
        } finally {
            // Release the statement and the connection even when an insert fails.
            if ($stmt instanceof mysqli_stmt) {
                $stmt->close();
            }
            self::$mysql->close();
            self::$mysql = null;
        }
    }

    /**
     * Collects the course names into $data['cnames'].
     */
    public function parseTitle()
    {
        echo "解析课程标题...<br/>";
        $this->data['cnames'] = $this->extractDivs('course-name');
    }

    /**
     * Collects the course descriptions into $data['cdescs'].
     */
    public function parseDesc()
    {
        echo "解析课程简介...<br/>";
        $this->data['cdescs'] = $this->extractDivs('course-desc');
    }

    /**
     * Collects the course types into $data['ctypes'].
     *
     * The type is the concatenation of all CJK characters (U+4E00..U+9FA5)
     * found in the course footer; a footer without any is recorded as "免费".
     */
    public function parseType()
    {
        echo "解析课程类型...<br/>";

        $ctypes = [];
        foreach ($this->extractDivs('course-footer') as $footer) {
            if (preg_match_all('/[\x{4e00}-\x{9fa5}]/u', $footer, $match)) {
                $ctypes[] = implode("", $match[0]);
            } else {
                $ctypes[] = "免费";
            }
        }

        $this->data['ctypes'] = $ctypes;
    }

    /**
     * Flags each parsed course name as "true" (more than 16 characters) or
     * "false" and stores the flags in $data['nlongs'].
     *
     * Must run after parseTitle(); without parsed names the list is empty.
     */
    public function titleIsLong()
    {
        echo "判断课程名是否超长...<br/>";

        $cnames = isset($this->data['cnames']) ? $this->data['cnames'] : [];

        $nlongs = [];
        foreach ($cnames as $value) {
            // Explicit encoding: count characters, not bytes, whatever the
            // configured internal encoding is.
            $nlongs[] = mb_strlen($value, 'UTF-8') > 16 ? "true" : "false";
        }

        $this->data['nlongs'] = $nlongs;
    }

    /**
     * Returns the trimmed inner HTML of every <div> whose opening tag starts
     * with class="$class", in document order.
     *
     * The match stops at the first closing </div>, so content of nested divs
     * is cut short at that point.
     *
     * @param string $class Exact value of the class attribute to look for.
     *
     * @return string[] Empty when nothing matches.
     */
    private function extractDivs($class)
    {
        $pattern = '/<div class="' . preg_quote($class, '/') . '"[^>]*>(.*?)<\/div>/is';
        if (!preg_match_all($pattern, $this->content, $matches)) {
            return [];
        }

        return array_map('trim', $matches[1]);
    }
}

$Crawler = new Crawler();
try {
    $Crawler->loadFile("test.html");
    $Crawler->parseContent();
} catch (Exception $e) {
    // Report the failure in the page instead of dying with an uncaught exception.
    echo "处理失败:" . htmlspecialchars($e->getMessage(), ENT_QUOTES, 'UTF-8') . "<br/>";
}

/*
Table structure

cname (varchar): full course name
cdesc (varchar): course description
ctype (varchar): course type; one of 免费 (free), 会员 (member), 训练营 (bootcamp)
nlong (enum('true','false')): whether the course name is too long;
      'true' when the name exceeds 16 characters, otherwise 'false'

create table `course_data`(
    `id` int(11) not null auto_increment,
    `cname` varchar(255) default null,
    `cdesc` varchar(255) default null,
    `ctype` varchar(255) default null,
    `nlong` enum('true','false') default null,
    primary key (`id`)
)engine=InnoDB default charset=utf8;
*/
标签:地址 amp ase www oca not stat https ...
原文地址:http://www.cnblogs.com/yxhblogs/p/6878366.html