# coding=utf-8
import re

import requests
from lxml import etree

#
#
# class DonewsSpider(object):
#     """
#     Scrape the column news from www.donews.com
#     """
#     def __init__(self):
#         self.start_url = 'http://www.donews.com/idonews/'
#         self.headers = {
#             'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.119 Safari/537.36',
#         }
#         self.proxies = {
#             "https": "115.230.51.12:46939",
#         }
#         self.total_list = []
#
#     def get_page(self, url):
#         response = requests.get(url, headers=self.headers, proxies=self.proxies)
#         # response.encoding = response.apparent_encoding
#         return response.text
#
#     def parse(self, page_data):
#         detail_url_list = []
#         html = etree.HTML(page_data)
#         el_objs = html.xpath('/html/body/div[5]/div[2]/div/dl')
#         for el in el_objs:
#             detail_url = el.xpath('./dd/h3/a/@href')[0]
#             print(detail_url)
#             if 'http' in detail_url:
#                 detail_url_list.append(detail_url)
#         print(detail_url_list)
#         return detail_url_list
#
#     def parse_detail(self, detail_data):
#         data_dict = {}
#         content_ = ''
#         html = etree.HTML(detail_data)
#         data_dict['title_'] = html.xpath('//*[@id="main"]/div[1]/h2/text()')[0]
#         content_objs = html.xpath('//*[@id="main"]/div[1]/div[2]')
#         for p_ in content_objs:
#             content_ += p_.xpath('./p/text()')[0]
#         data_dict['content_'] = content_
#         data_dict['writer_'] = html.xpath('//*[@id="main"]/div[1]/div[1]/p/span[1]/text()')[0]
#         return data_dict
#
#     def run(self):
#         page_data = self.get_page(self.start_url)
#         detail_url_list = self.parse(page_data)
#         for url in detail_url_list:
#             detail_data = self.get_page(url)
#             data_dict = self.parse_detail(detail_data)
#             print(data_dict)
#             self.total_list.append(data_dict)
#         print(self.total_list)
#         # Click to load the "pull down for more" data
#         # ... while reading the JS it turned out the data is already served
#         # through AJAX from a single URL;
#
# if __name__ == '__main__':
#     dn_spider = DonewsSpider()
#     dn_spider.run()
#
# Extract every detail_url from the AJAX (JS) response.
# The scraping logic follows; the author originally left it as a flat,
# unencapsulated script (by their own admission, out of laziness).
# Request settings shared by every HTTP call made by this script.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.119 Safari/537.36',
}
proxies = {
    "https": "115.230.51.12:46939",
}

# AJAX endpoint that serves the column article list one page at a time.
AJAX_URL_TEMPLATE = "http://www.donews.com/column/more_article_ajax?page=%s"

# The original loop incremented its counter before the first request, so
# paging has always started at page 2 (presumably because page 1 is already
# rendered on the landing page -- TODO confirm).
FIRST_PAGE = 2

# Seconds to wait for a response before giving up; without a timeout a stalled
# proxy would block the script forever.
REQUEST_TIMEOUT = 10

# Detail links are matched in their escaped form ("\/" instead of "/"), exactly
# as the original pattern did.  The dots are now escaped so that they only
# match a literal ".".
DETAIL_URL_PATTERN = re.compile(
    r'http:\\/\\/www\.donews\.com\\/article\\/detail\\/\d+\\/\d+\.html'
)

# Results are kept at module level, as in the original script, so they remain
# inspectable after a run.
detail_url_list = []
data_list = []


def extract_detail_urls(page_data):
    """Return the article detail URLs contained in one AJAX response body.

    ``page_data`` may be text or UTF-8 encoded bytes.  Links are searched for
    in their escaped form (``http:\\/\\/www.donews.com\\/...``) and returned
    with the backslashes removed, in order of appearance.  An empty list is
    returned when nothing matches.
    """
    if isinstance(page_data, bytes):
        # Match on text only: a str pattern cannot be applied to bytes.
        page_data = page_data.decode('utf-8', 'replace')
    return [
        escaped.replace('\\', '')
        for escaped in DETAIL_URL_PATTERN.findall(page_data)
    ]


def collect_detail_urls(first_page=FIRST_PAGE, max_pages=None):
    """Walk the AJAX list pages and return every new detail URL found.

    Paging stops when a request fails, when a page yields no URL that has not
    been seen before (end of the listing), or after ``max_pages`` pages when
    that limit is given.  The running list is printed after each page, as the
    original script did.
    """
    collected = []
    seen = set()
    page = first_page
    while max_pages is None or page - first_page < max_pages:
        try:
            response = requests.get(
                AJAX_URL_TEMPLATE % page,
                headers=headers,
                proxies=proxies,
                timeout=REQUEST_TIMEOUT,
            )
            response.raise_for_status()
        except requests.RequestException as exc:
            print('list page %s could not be fetched: %s' % (page, exc))
            break

        fresh = []
        for link in extract_detail_urls(response.text):
            if link not in seen:
                seen.add(link)
                fresh.append(link)
        if not fresh:
            # Nothing new on this page: the listing is exhausted.
            break

        collected.extend(fresh)
        print(collected)
        page += 1
    return collected


def first_or_empty(node, expression):
    """Return the first XPath result for ``expression`` or '' if none."""
    hits = node.xpath(expression)
    return hits[0] if hits else ''


def parse_detail(detail_page_data):
    """Extract title, body text and writer from one article page.

    Returns a dict with the keys ``title_``, ``content_`` and ``writer_``.
    A field whose node is missing is returned as an empty string instead of
    raising ``IndexError``.
    """
    data_dict = {'title_': '', 'content_': '', 'writer_': ''}
    if not detail_page_data or not detail_page_data.strip():
        return data_dict
    html = etree.HTML(detail_page_data)
    if html is None:
        # Defensive: no usable document tree was produced for this page.
        return data_dict

    data_dict['title_'] = first_or_empty(
        html, '//*[@id="main"]/div[1]/h2/text()')
    # Join every paragraph; the original kept only the first text node.
    data_dict['content_'] = ''.join(
        html.xpath('//*[@id="main"]/div[1]/div[2]/p/text()'))
    data_dict['writer_'] = first_or_empty(
        html, '//*[@id="main"]/div[1]/div[1]/p/span[1]/text()')
    return data_dict


def main():
    """Collect all detail URLs, scrape each article and print the results."""
    detail_url_list.extend(collect_detail_urls())
    print('链接获取完毕,提取页面数据')
    for detail_url in detail_url_list:
        try:
            response = requests.get(
                detail_url,
                headers=headers,
                proxies=proxies,
                timeout=REQUEST_TIMEOUT,
            )
            response.raise_for_status()
        except requests.RequestException as exc:
            # One unreachable article should not abort the whole run.
            print('skipping %s: %s' % (detail_url, exc))
            continue
        data_list.append(parse_detail(response.text))
    print(data_list)
    return data_list


if __name__ == '__main__':
    main()