标签:一个 use top 显示乱码 int sel href 中文 webkit
"""Scrape the Maoyan movie TOP100 board and print one INSERT statement per film."""
import re
import sys
import time

import requests
from bs4 import BeautifulSoup

# The film id is embedded in the link's ``data-val`` attribute as ``movieId:<digits>``.
_MOVIE_ID_RE = re.compile(r"movieId:(\d+)")
# A label is separated from its value by a colon. The page is Chinese, so the
# separator may be the full-width form; accept both.
_LABEL_SEPARATOR_RE = re.compile(r"[:：]")


class SpiderMaoyan(object):
    """Walk the paginated TOP100 board and emit the extracted fields of every film."""

    def __init__(self):
        # The board is paginated through the ``offset`` query parameter,
        # which advances in steps of 10.
        self.url = "http://maoyan.com/board/4?offset={0}"

        # Send a browser-like User-Agent; without one the site may refuse the request.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/66.0.3359.139 Safari/537.36"
        }

        # Names of the extracted fields, in the order the values are produced.
        self.fields = ("id", "name", "movieUrl", "imgUrl", "star", "releaseTime", "score")

        # Seconds to wait for the server before giving up on a request.
        self.timeout = 10

    @staticmethod
    def _paragraph_text(tag, css_class):
        """Return the stripped text of ``<p class=css_class>`` inside *tag*, or "" if absent."""
        paragraph = tag.find("p", class_=css_class)
        if paragraph is None:
            return ""
        # get_text() is used instead of .string because .string is None as soon
        # as the paragraph has more than one child node.
        return paragraph.get_text().strip()

    @staticmethod
    def _strip_label(text):
        """Drop a leading ``label:`` / ``label：`` prefix and return the remaining value."""
        return _LABEL_SEPARATOR_RE.split(text)[-1].strip()

    def _parse_movie(self, tag):
        """Extract one film from a ``<dd>`` element.

        Returns a tuple ordered like ``self.fields``, or ``None`` when *tag*
        does not look like a film entry (no name link or no film id).
        """
        name_paragraph = tag.find("p", class_="name")
        link = name_paragraph.select_one("a") if name_paragraph is not None else None
        if link is None:
            return None

        id_match = _MOVIE_ID_RE.search(link.get("data-val") or "")
        if id_match is None:
            return None

        movie_id = id_match.group(1)
        name = link.get_text()
        movie_url = "http://maoyan.com" + link.get("href", "")

        # Tip: the image address carries a resolution suffix after "@"; taking
        # .split("@")[0] would give the full-size image instead.
        image = tag.find("img", class_="board-img")
        img_url = image.get("data-src") if image is not None else None

        star = self._strip_label(self._paragraph_text(tag, "star"))
        release_time = self._strip_label(self._paragraph_text(tag, "releasetime"))
        score = self._paragraph_text(tag, "score")

        return (movie_id, name, movie_url, img_url, star, release_time, score)

    def handler(self, offset=0):
        """Fetch every board page from *offset* up to (but excluding) 100 and print the films.

        Prints the HTTP reason phrase and exits the process with code 999 as
        soon as a page does not answer with status 200.
        """
        while offset < 100:
            response = requests.get(
                self.url.format(offset), headers=self.headers, timeout=self.timeout
            )
            if response.status_code != 200:
                print(response.reason)
                # sys.exit instead of the bare exit() helper, which is only
                # installed by the ``site`` module.
                sys.exit(999)

            print("INFO -> Current URL: <%s>" % response.url)

            # Decode the body as UTF-8 directly. Re-encoding response.text with
            # response.encoding fails when the server declares no charset.
            response.encoding = "utf-8"

            # Every film on the page sits in its own <dd> element.
            soup = BeautifulSoup(response.text, "html5lib")
            for tag in soup.select("dd"):
                values = self._parse_movie(tag)
                if values is None:
                    continue

                # This is where the record would be written to storage.
                # NOTE: the statement is only printed for demonstration; values
                # are not escaped. Use parameterized queries for real inserts
                # and check that fields and values pair up.
                print(
                    "INSERT INTO TABLE_NAME (%s) VALUE %s;" % (", ".join(self.fields), values)
                )

            # Advance to the next page.
            offset += 10

            # Pause between requests to stay polite to the server.
            time.sleep(.9)


if __name__ == "__main__":
    spider = SpiderMaoyan()
    spider.handler()
执行效果如下:
INFO -> Current URL: <http://maoyan.com/board/4?offset=0>
INSERT INTO TABLE_NAME (id, name, movieUrl, imgUrl, star, releaseTime, score) VALUE ('1203', '霸王别姬', 'http://maoyan.com/films/1203', 'http://p1.meituan.net/movie/20803f59291c47e1e116c11963ce019e68711.jpg@160w_220h_1e_1c', '张国荣,张丰毅,巩俐', '1993-01-01(中国香港)', '9.6');
INSERT INTO TABLE_NAME (id, name, movieUrl, imgUrl, star, releaseTime, score) VALUE ('1297', '肖申克的救赎', 'http://maoyan.com/films/1297', 'http://p0.meituan.net/movie/__40191813__4767047.jpg@160w_220h_1e_1c', '蒂姆·罗宾斯,摩根·弗里曼,鲍勃·冈顿', '1994-10-14(美国)', '9.5');
INSERT INTO TABLE_NAME (id, name, movieUrl, imgUrl, star, releaseTime, score) VALUE ('2641', '罗马假日', 'http://maoyan.com/films/2641', 'http://p0.meituan.net/movie/23/6009725.jpg@160w_220h_1e_1c', '格利高利·派克,奥黛丽·赫本,埃迪·艾伯特', '1953-09-02(美国)', '9.1');
INSERT INTO TABLE_NAME (id, name, movieUrl, imgUrl, star, releaseTime, score) VALUE ('4055', '这个杀手不太冷', 'http://maoyan.com/films/4055', 'http://p0.meituan.net/movie/fc9d78dd2ce84d20e53b6d1ae2eea4fb1515304.jpg@160w_220h_1e_1c', '让·雷诺,加里·奥德曼,娜塔莉·波特曼', '1994-09-14(法国)', '9.5');
INSERT INTO TABLE_NAME (id, name, movieUrl, imgUrl, star, releaseTime, score) VALUE ('1247', '教父', 'http://maoyan.com/films/1247', 'http://p0.meituan.net/movie/92/8212889.jpg@160w_220h_1e_1c', '马龙·白兰度,阿尔·帕西诺,詹姆斯·凯恩', '1972-03-24(美国)', '9.3');
INSERT INTO TABLE_NAME (id, name, movieUrl, imgUrl, star, releaseTime, score) VALUE ('267', '泰坦尼克号', 'http://maoyan.com/films/267', 'http://p0.meituan.net/movie/11/324629.jpg@160w_220h_1e_1c', '莱昂纳多·迪卡普里奥,凯特·温丝莱特,比利·赞恩', '1998-04-03', '9.5');
INSERT INTO TABLE_NAME (id, name, movieUrl, imgUrl, star, releaseTime, score) VALUE ('123', '龙猫', 'http://maoyan.com/films/123', 'http://p0.meituan.net/movie/c8f224ca9939cd9dd58f709c9c4deb0924422.jpg@160w_220h_1e_1c', '日高法子,坂本千夏,糸井重里', '1988-04-16(日本)', '9.2');
INSERT INTO TABLE_NAME (id, name, movieUrl, imgUrl, star, releaseTime, score) VALUE ('837', '唐伯虎点秋香', 'http://maoyan.com/films/837', 'http://p0.meituan.net/movie/62/109878.jpg@160w_220h_1e_1c', '周星驰,巩俐,郑佩佩', '1993-07-01(中国香港)', '9.2');
INSERT INTO TABLE_NAME (id, name, movieUrl, imgUrl, star, releaseTime, score) VALUE ('2760', '魂断蓝桥', 'http://maoyan.com/films/2760', 'http://p1.meituan.net/movie/94c3a84626fd7650d6891088c4b88e5c27012.jpg@160w_220h_1e_1c', '费雯·丽,罗伯特·泰勒,露塞尔·沃特森', '1940-05-17(美国)', '9.2');
INSERT INTO TABLE_NAME (id, name, movieUrl, imgUrl, star, releaseTime, score) VALUE ('1212', '千与千寻', 'http://maoyan.com/films/1212', 'http://p0.meituan.net/movie/9bf7d7b81001a9cf8adbac5a7cf7d766132425.jpg@160w_220h_1e_1c', '柊瑠美,入野自由,夏木真理', '2001-07-20(日本)', '9.3');
INFO -> Current URL: <http://maoyan.com/board/4?offset=10>
INSERT INTO TABLE_NAME (id, name, movieUrl, imgUrl, star, releaseTime, score) VALUE ('7431', '乱世佳人', 'http://maoyan.com/films/7431', 'http://p0.meituan.net/movie/13/6960141.jpg@160w_220h_1e_1c', '费雯·丽,克拉克·盖博,奥利维娅·德哈维兰', '1939-12-15(美国)', '9.1');
INSERT INTO TABLE_NAME (id, name, movieUrl, imgUrl, star, releaseTime, score) VALUE ('9025', '喜剧之王', 'http://maoyan.com/films/9025', 'http://p0.meituan.net/movie/59/2366463.jpg@160w_220h_1e_1c', '周星驰,莫文蔚,张柏芝', '1999-02-13(中国香港)', '9.2');
INSERT INTO TABLE_NAME (id, name, movieUrl, imgUrl, star, releaseTime, score) VALUE ('1228', '天空之城', 'http://maoyan.com/films/1228', 'http://p0.meituan.net/movie/24/4495986.jpg@160w_220h_1e_1c', '寺田农,鹫尾真知子,龟山助清', '1992', '9.1');
INSERT INTO TABLE_NAME (id, name, movieUrl, imgUrl, star, releaseTime, score) VALUE ('14556', '大闹天宫', 'http://maoyan.com/films/14556', 'http://p0.meituan.net/movie/d1de085b6899fd2e661fc17da8b72a1b17287.jpg@160w_220h_1e_1c', '邱岳峰,毕克,富润生', '1965-12-31', '9.0');
标签:一个 use top 显示乱码 int sel href 中文 webkit
原文地址:https://www.cnblogs.com/wangxiaoqiangs/p/8998298.html