标签:src xpath 技术 open body 闭包 sub cep dom
<?php

/* Reference article: https://www.iamle.com/archives/2202.html */

require_once __DIR__.'/vendor/autoload.php';

use GuzzleHttp\Client;
use Symfony\Component\DomCrawler\Crawler;

// Run the scraper and print the result as JSON, keeping non-ASCII text readable.
print_r(json_encode(Spider(), JSON_UNESCAPED_UNICODE));

/**
 * Download one Douban movie page and extract structured fields from it with XPath.
 *
 * Extraction is best-effort: the fields are read in a fixed order and the first
 * XPath expression that matches nothing stops the extraction. Whatever was
 * collected up to that point is still returned, and the reason is written to the
 * PHP error log. Failures of the HTTP request itself are not caught here.
 *
 * Side effect: echoes an opening <pre> tag before the page is downloaded.
 *
 * @return array Collected fields. Possible keys: name, cover, director (list),
 *               writer, mactor (list), rdate (list), introduction, and actor
 *               (list of arrays with the keys name, role and avatar).
 */
function Spider()
{
    // Page to scrape.
    $url = 'https://movie.douban.com/subject/25812712/?from=showing';

    // Download the page content.
    $client = new Client([
        'timeout' => 10,
        'headers' => [
            'User-Agent' => 'Mozilla/5.0 (compatible; Baiduspider-render/2.0; +http://www.baidu.com/search/spider.html)',
        ],
    ]);

    // Keep the output formatting intact when the result is viewed in a browser.
    echo '<pre>';
    $response = $client->request('GET', $url)->getBody()->getContents();

    // Structured data extracted via XPath is accumulated in this array.
    $data = [];
    $crawler = new Crawler();
    $crawler->addHtmlContent($response);

    try {
        // Movie title. Elements that carry an id are the easiest to address with XPath.
        $data['name'] = $crawler->filterXPath('//*[@id="content"]/h1/span[1]')->text();

        // Poster image URL.
        $data['cover'] = $crawler->filterXPath('//*[@id="mainpic"]/a/img/@src')->text();

        // Directors: a "/"-separated string, split into a list of trimmed names.
        $directorText = $crawler->filterXPath('//*[@id="info"]/span[1]/span[2]')->text();
        $data['director'] = array_map('trim', explode('/', $directorText));

        // Screenwriter. Stored under its own key; the original wrote this into
        // 'cover' and thereby overwrote the poster URL.
        $data['writer'] = $crawler->filterXPath('//*[@id="info"]/span[2]/span[2]/a')->text();

        // Leading actors: a "/"-separated string, split into a list of trimmed names.
        $leadText = $crawler
            ->filterXPath('//*[@id="info"]/span[contains(@class,"actor")]/span[contains(@class,"attrs")]')
            ->text();
        $data['mactor'] = array_map('trim', explode('/', $leadText));

        // Release dates: pulled out of the info block's text with a regular
        // expression that matches entries of the form YYYY-MM-DD(region).
        $infoText = $crawler->filterXPath('//*[@id="info"]')->text();
        preg_match_all("/(\d{4})-(\d{2})-(\d{2})\(.*?\)/", $infoText, $releaseDates);
        $data['rdate'] = $releaseDates[0];

        // Synopsis. Demonstrates selecting by class instead of by id.
        $data['introduction'] = trim($crawler->filterXPath('//div[contains(@class,"indent")]/span')->text());

        // Cast. This expression yields several nodes, so each() is used to visit
        // them; $data is captured by reference so the closure can append to it.
        $crawler
            ->filterXPath('//ul[contains(@class,"celebrities-list from-subject")]/li')
            ->each(function (Crawler $node, $i) use (&$data) {
                $actor = [];
                $actor['name'] = $node
                    ->filterXPath('//div[contains(@class,"info")]/span[contains(@class,"name")]/a')
                    ->text();
                $actor['role'] = $node
                    ->filterXPath('//div[contains(@class,"info")]/span[contains(@class,"role")]')
                    ->text();

                // The avatar lives in an inline style such as
                // "background-image: url(https://.../5253.jpg)"; extract the image URL.
                $style = $node->filterXPath('//a/div[contains(@class,"avatar")]/@style')->text();
                $found = preg_match(
                    "/((https|http|ftp|rtsp|mms)?:\/\/)[^\s]+\.(jpg|jpeg|gif|png)/",
                    $style,
                    $match
                );
                // No image URL in the style attribute: record null rather than
                // reading a missing array offset.
                $actor['avatar'] = ($found === 1) ? $match[0] : null;

                $data['actor'][] = $actor;
            });
    } catch (\Exception $e) {
        // Best-effort scrape: keep what was collected so far, but leave a trace
        // of why the remaining fields are missing instead of failing silently.
        error_log('Spider: extraction stopped early: '.$e->getMessage());
    }

    return $data;
}
标签:src xpath 技术 open body 闭包 sub cep dom
原文地址:https://www.cnblogs.com/bneglect/p/11684694.html