This makes a good practice project for XPath: crawl the Guba forum (guba.eastmoney.com) for post titles and posting times from June 2016 through December 2016.
The code is as follows:
import csv

import requests
from requests.exceptions import RequestException
from lxml import etree


def get_one_index_page(url):
    """
    Fetch the HTML source of the requested page.
    :param url:
    :return:
    """
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                          '(KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36',
        }
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None


def parse_page(text):
    """
    Parse the page source.
    :param text:
    :return:
    """
    html = etree.HTML(text)
    title_name = html.xpath("//span[@class='l3 a3']/a/text()")
    time = html.xpath("//span[@class='l5 a5']/text()")
    list_new_time = []
    # Skip the first time entry; the page only shows month-day, so prepend the year.
    for i in time[1:]:
        list_new_time.append('2016-' + i)
    return zip(title_name, list_new_time)


def change_page(number):
    """
    Build the list-page URL for the given page number.
    :param number:
    :return:
    """
    base_url = 'http://guba.eastmoney.com/'
    url = base_url + 'list,zssh000016,f_%d.html' % number
    return url


def save_to_csv(result, filename):
    """
    Append one row to the CSV file.
    :param result:
    :param filename:
    :return:
    """
    # newline='' avoids blank lines on Windows; utf-8 keeps Chinese titles intact.
    with open(filename, 'a', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(result)


def main():
    for i in range(265, 272):
        url = change_page(i)
        text = get_one_index_page(url)
        result = parse_page(text)
        for j in result:
            save_to_csv(j, 'data_new.csv')


if __name__ == '__main__':
    main()
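As a quick sanity check (not part of the original post), the saved CSV can be read back with the standard csv module. This is only a minimal sketch; it assumes data_new.csv was produced by the script above, with each row holding (title, time) in that order.

import csv

# Read back data_new.csv and print the first few rows for inspection.
# Assumes each row is (title, time), as written by save_to_csv above.
with open('data_new.csv', newline='', encoding='utf-8') as f:
    for row_number, (title, post_time) in enumerate(csv.reader(f)):
        print(post_time, title)
        if row_number >= 4:  # only show the first five rows
            break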
Original post: https://www.cnblogs.com/lattesea/p/11746486.html