码迷,mamicode.com
首页 > 编程语言 > 详细

python多线程爬取-今日头条的街拍数据(附源码加思路注释)

时间:2018-09-02 02:14:26      阅读:220      评论:0      收藏:0      [点我收藏+]

标签:auto   str   入口   mongo   线程   请求方式   try   url   format   

  这里用的是 json + re + requests + BeautifulSoup,并通过 multiprocessing.Pool(进程池,实际是多进程而不是多线程)并发抓取。


"""Crawl Toutiao search results for a keyword and download gallery images.

Pipeline: fetch one page of the search index (JSON), pull the article URLs
out of it, fetch each article page, extract the embedded gallery JSON, and
save every gallery image to ``./头条``.  Index pages are fanned out over a
``multiprocessing`` pool (worker processes, not threads).
"""
import json
import os
import re
from multiprocessing.pool import Pool

import requests
from bs4 import BeautifulSoup
# NOTE(review): nothing in this script visibly uses a name from ``config``;
# the import is kept because settings (e.g. for the MongoDB step mentioned
# in ``main``) may live there -- confirm before removing.
from config import *
from requests import RequestException

# Seconds to wait for any single HTTP request.  Without a timeout a stalled
# connection would block its worker forever and ``pool.map`` would never
# return.
REQUEST_TIMEOUT = 10

# Matches the gallery payload embedded in an article page's inline script.
_GALLERY_PATTERN = re.compile(r'gallery: JSON.parse\("(.*?)"\),', re.S)

# Characters that cannot safely appear in a file name.
_UNSAFE_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|\r\n]')


def get_page_index(offset, keyword):
    """Fetch one page of the search index.

    :param offset: result offset of the page to fetch
    :param keyword: search keyword
    :return: the response body as text, or None on a non-200 status or any
        request error
    """
    data = {
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': 20,
        'cur_tab': 1,
        'from': 'search_tab',
    }
    # Passing ``params=`` lets requests build the query string; it is
    # equivalent to appending ``urlencode(data)`` to the URL by hand.
    url = 'https://www.toutiao.com/search_content/'
    try:
        response = requests.get(url, params=data, timeout=REQUEST_TIMEOUT)
    except RequestException:
        return None
    if response.status_code == 200:
        return response.text
    return None


def parse_page_index(html):
    """Yield the article URL of every entry in a search-index JSON document.

    :param html: JSON text as returned by ``get_page_index`` (may be None)
    :return: generator of ``article_url`` values; an entry without that key
        yields None, so callers should skip falsy values
    """
    if not html:
        # get_page_index returns None on failure; json.loads(None) would
        # raise TypeError.
        return
    try:
        data = json.loads(html)
    except ValueError:
        # Not valid JSON (e.g. an error page was returned instead).
        return
    if not isinstance(data, dict):
        return
    # The 'data' key can be present but null, hence the ``or []``.
    for item in data.get('data') or []:
        if isinstance(item, dict):
            yield item.get('article_url')


def get_page_detail(url):
    """Fetch an article page.

    :param url: article URL
    :return: the page HTML as text, or None on a non-200 status or any
        request error
    """
    # Send a browser User-Agent with the request.
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
    }
    try:
        response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    except RequestException:
        return None
    if response.status_code == 200:
        return response.text
    return None


def parse_page_detail(html, url):
    """Extract the gallery from an article page and download its images.

    :param html: article page HTML
    :param url: URL the page was fetched from (echoed in the result)
    :return: ``{'title', 'url', 'images'}`` when the page embeds a gallery,
        otherwise None
    """
    soup = BeautifulSoup(html, 'lxml')
    title_tags = soup.select('title')
    # Use the last <title> element, falling back to a fixed name so a page
    # without a title no longer raises on an unbound variable.
    title = title_tags[-1].get_text() if title_tags else 'untitled'

    match = _GALLERY_PATTERN.search(html)
    if not match:
        return None

    # The payload is a JavaScript string literal; dropping every backslash
    # turns it back into plain JSON text.
    payload = match.group(1).replace('\\', '')
    try:
        data = json.loads(payload)
    except ValueError:
        return None
    if not data or not isinstance(data, dict):
        return None

    images = [item.get('url') for item in data.get('sub_images') or []]
    for image in images:
        download_image(image, title)
    return {
        'title': title,
        'url': url,
        'images': images,
    }


def download_image(url, title):
    """Download one image and hand its bytes to ``save_to_image``.

    :param url: image URL
    :param title: article title, used to build the file name
    :return: None
    """
    print('正在下载', url)
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
    except RequestException:
        return None
    if response.status_code == 200:
        save_to_image(response.content, title)
    return None


# Number of images saved so far by this process; appended to the title to
# keep file names distinct.  Each pool worker has its own copy.
count = 0


def save_to_image(content, title):
    """Write image bytes to ``./头条/<title><count>.jpg``.

    :param content: raw image bytes
    :param title: article title used as the file-name prefix
    :return: None
    """
    global count
    # Titles are free text; replace characters that are path separators or
    # otherwise invalid in file names.
    safe_title = _UNSAFE_FILENAME_CHARS.sub('_', title).strip()
    name = safe_title + str(count)
    file_path = './头条/{}.{}'.format(name, 'jpg')
    # open() does not create missing directories.
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    with open(file_path, 'wb') as f:
        count += 1
        f.write(content)


def main(offset):
    """Crawl one index page: fetch it, then process every article it lists.

    :param offset: result offset of the index page
    :return: None
    """
    index_html = get_page_index(offset, '街拍')
    for url in parse_page_index(index_html):
        if not url:
            continue
        detail_html = get_page_detail(url)
        if not detail_html:
            continue
        result = parse_page_detail(detail_html, url)
        if result:
            print(result)
            # A persistence step (the original mentions save_to_mongo)
            # would go here.


GROUP_START = 1
GROUP_END = 20
if __name__ == '__main__':
    groups = [i * 20 for i in range(GROUP_START, GROUP_END)]
    # map() blocks until every offset is processed; the context manager
    # then shuts the worker processes down.
    with Pool() as pool:
        pool.map(main, groups)

 

python多线程爬取-今日头条的街拍数据(附源码加思路注释)

标签:auto   str   入口   mongo   线程   请求方式   try   url   format   

原文地址:https://www.cnblogs.com/yunlongaimeng/p/9572148.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!