码迷,mamicode.com
首页 > Web开发 > 详细

实验楼的php比赛题,网页数据提取。

时间:2017-05-19 15:12:18      阅读:235      评论:0      收藏:0      [点我收藏+]

标签:地址   amp   ase   www   oca   not   stat   https   ...   

实验楼的php比赛题,网页数据提取。

题目的地址:https://www.shiyanlou.com/contests/lou5/challenges

以下代码是题目的答案

<?php
// Declare the response as UTF-8 HTML before any output is echoed.
header("Content-Type:text/html;charset=utf-8");
/**
 * Extracts course names, descriptions and types from a saved course-list
 * HTML page and stores them in the `course_data` MySQL table.
 *
 * Typical use: loadFile() followed by parseContent(). Progress messages are
 * echoed as HTML fragments while the work runs.
 */
class Crawler{
	/** @var string Raw HTML; narrowed to the <body> contents by parseCourseBody(). */
	private $content = '';
	/** @var array Parsed columns keyed by 'cnames', 'cdescs', 'ctypes' and 'nlongs'. */
	private $data = [];
	/** @var mysqli|null Connection opened (and closed again) by saveData(). */
	static private $mysql;

	public function __construct(){
		echo "开始爬取内容....";
	}

	/**
	 * Reads the HTML document that the parse* methods will work on.
	 *
	 * @param string $file_path Path (or URL wrapper) accepted by file_get_contents().
	 * @throws RuntimeException When the file cannot be read.
	 */
	public function loadFile($file_path){
		echo "正在加载文件";
		$html = file_get_contents($file_path);
		if($html === false){
			throw new RuntimeException("Unable to read file: ".$file_path);
		}
		$this->content = $html;
	}

	/**
	 * Narrows the loaded document to the contents of its <body> element.
	 * The content is left untouched when no <body> element is found.
	 */
	public function parseCourseBody(){
		// preg_match (not preg_match_all) keeps $this->content a string, which
		// the later parse* regexes require.
		if(preg_match('/<body[^>]*>(.*)<\/body>/is', $this->content, $matches)){
			$this->content = $matches[1];
		}
	}

	/**
	 * Runs the whole pipeline: parse every field, then store the rows.
	 *
	 * @throws RuntimeException When the database step fails (see saveData()).
	 */
	public function parseContent(){
		echo "开始解析内容...<br/>";
		$this->parseCourseBody();
		$this->parseTitle();
		$this->parseDesc();
		$this->parseType();
		$this->titleIsLong();
		$this->saveData();
		echo "解析内容结束!<br/>";
	}

	/**
	 * Inserts one `course_data` row per parsed course name.
	 *
	 * Rows are written through a prepared statement, so the scraped text is
	 * never interpolated into the SQL. Columns missing for a given course
	 * fall back to '' (description), "免费" (type) and "false" (nlong).
	 *
	 * @throws RuntimeException When connecting, preparing or executing fails.
	 */
	public function saveData(){
		echo "存入数据库...<br/>";
		$cnames = isset($this->data['cnames']) ? $this->data['cnames'] : [];
		$cdescs = isset($this->data['cdescs']) ? $this->data['cdescs'] : [];
		$ctypes = isset($this->data['ctypes']) ? $this->data['ctypes'] : [];
		$nlongs = isset($this->data['nlongs']) ? $this->data['nlongs'] : [];

		// The legacy mysql_* API was removed in PHP 7; mysqli is its replacement.
		self::$mysql = new mysqli("localhost","root","root","databases");
		if(self::$mysql->connect_error){
			throw new RuntimeException("Database connection failed: ".self::$mysql->connect_error);
		}
		try{
			self::$mysql->set_charset("utf8");
			$stmt = self::$mysql->prepare("insert into `course_data`(`cname`,`cdesc`,`ctype`,`nlong`) values(?,?,?,?)");
			if($stmt === false){
				throw new RuntimeException("Unable to prepare insert: ".self::$mysql->error);
			}
			// bind_param binds by reference: the variables are reassigned on
			// every iteration before execute() reads them.
			$cname = $cdesc = $ctype = $nlong = '';
			$stmt->bind_param("ssss", $cname, $cdesc, $ctype, $nlong);
			foreach ($cnames as $key => $value) {
				$cname = $value;
				$cdesc = isset($cdescs[$key]) ? $cdescs[$key] : '';
				$ctype = isset($ctypes[$key]) ? $ctypes[$key] : "免费";
				$nlong = isset($nlongs[$key]) ? $nlongs[$key] : "false";
				if(!$stmt->execute()){
					throw new RuntimeException("Insert failed: ".$stmt->error);
				}
			}
			$stmt->close();
		}finally{
			self::$mysql->close();
			self::$mysql = null;
		}
	}

	/**
	 * Collects the text of every <div class="course-name"> into data['cnames'].
	 */
	public function parseTitle(){
		echo "解析课程标题...<br/>";
		$this->data['cnames'] = $this->extractDivs("course-name");
	}

	/**
	 * Collects the text of every <div class="course-desc"> into data['cdescs'].
	 */
	public function parseDesc(){
		echo "解析课程简介...<br/>";
		$this->data['cdescs'] = $this->extractDivs("course-desc");
	}

	/**
	 * Derives the course type from every <div class="course-footer">.
	 *
	 * The type is the run of CJK characters found inside the footer; a footer
	 * without any is recorded as "免费". Results go to data['ctypes'].
	 */
	public function parseType(){
		echo "解析课程类型...<br/>";
		$ctypes = [];
		foreach ($this->extractDivs("course-footer") as $footer) {
			if(preg_match_all('/[\x{4e00}-\x{9fa5}]/u', $footer, $match)){
				$ctypes[] = join("",$match[0]);
			}else{
				$ctypes[] = "免费";
			}
		}
		$this->data['ctypes'] = $ctypes;
	}

	/**
	 * Flags each parsed course name as "true"/"false" depending on whether it
	 * is longer than 16 characters. Results go to data['nlongs'].
	 */
	public function titleIsLong(){
		echo "判断课程名是否超长...<br/>";
		$cnames = isset($this->data['cnames']) ? $this->data['cnames'] : [];
		$nlongs = [];
		foreach ($cnames as $value) {
			// Count characters, not bytes: the names are UTF-8 Chinese text.
			$nlongs[] = mb_strlen($value, "UTF-8") > 16 ? "true" : "false";
		}
		$this->data['nlongs'] = $nlongs;
	}

	/**
	 * Returns the inner HTML of every <div> whose class attribute is $class.
	 *
	 * The capture group drops the opening tag even when it carries further
	 * attributes. Matching stops at the first closing </div>.
	 *
	 * @param string $class Exact value of the class attribute to look for.
	 * @return array List of inner-HTML strings; empty when nothing matches.
	 */
	private function extractDivs($class){
		$regex = '/<div class="'.preg_quote($class, '/').'"[^>]*>(.*?)<\/div>/is';
		if(preg_match_all($regex, $this->content, $matches)){
			return $matches[1];
		}
		return [];
	}
}
// Script entry point: build a crawler, load the local test.html page, then
// run the parse pipeline (parseContent() also writes the rows to the database).
$Crawler = new Crawler();
$Crawler->loadFile("test.html");
$Crawler->parseContent();

/**
 表结构
cname(varchar):完整的课程名
cdesc(varchar):课程描述
ctype(varchar):课程类型,值为 免费,会员,训练营。
nlong(enum('true','false')):课程名是否过长,课程名称超过16字符的时候为 true,否则为 false

create table `course_data`(
	`id` int(11) not null auto_increment,
	`cname` varchar(255) default null,
	`cdesc` varchar(255) default null,
	`ctype` varchar(255) default null,
	`nlong` enum('true','false') default null,
	primary key (`id`)
)engine=InnoDB default charset=utf8;
*/

  

实验楼的php比赛题,网页数据提取。

标签:地址   amp   ase   www   oca   not   stat   https   ...   

原文地址:http://www.cnblogs.com/yxhblogs/p/6878366.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!