# Tags: auth div nbsp urllib col url int code ber
# -*- coding: utf-8 -*-
import re
import urllib
from contextlib import closing

try:
    import urllib2
except ImportError:
    # Python 3 exposes the same Request/urlopen API under urllib.request.
    import urllib.request as urllib2

# Browser-like User-Agent sent with every request.
_USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64)'

# Capture groups, in order:
#   1. text of the <h2> inside the "author clearfix" div
#   2. text of the first <span> after the following <div>
#   3. everything between that span and the "stats" div
#      (only inspected to detect image posts)
#   4. text of the first "number" element inside the "stats" div
# re.S lets '.' span newlines, since one entry covers many lines of HTML.
_DUANZI_PATTERN = re.compile(
    '<div class="author clearfix">.*?<h2>(.*?)</h2>'
    '.*?<div.*?span>(.*?)</span>'
    '(.*?)<div class="stats">.*?"number">(.*?)</i>',
    re.S,
)


def parse_duanzi(html):
    """Extract the text-only entries from a listing page's HTML.

    Args:
        html: Decoded HTML text of one listing page.

    Returns:
        A list of ``[author, content, number]`` lists, in page order, using
        the raw captured text (no stripping or tag removal).  Entries whose
        markup between the content span and the stats div contains ``img``
        are skipped.
    """
    return [
        [author, content, number]
        for author, content, between, number in _DUANZI_PATTERN.findall(html)
        if "img" not in between
    ]


def get_duanzi(url):
    """Fetch one listing page and print its text-only entries.

    For each entry the author, the content and the stats number are printed
    on separate lines, exactly as captured from the page.

    Args:
        url: Address of the page to download.

    Returns:
        The list of ``[author, content, number]`` entries that were printed.

    Raises:
        urllib2.URLError: If the page cannot be fetched.
        UnicodeDecodeError: If the response body is not valid UTF-8.
    """
    request = urllib2.Request(url, headers={'User-Agent': _USER_AGENT})
    # closing() guarantees the connection is released even if read() or
    # decode() raises; Python 2 responses are not context managers.
    with closing(urllib2.urlopen(request)) as response:
        html = response.read().decode('utf-8')

    store = parse_duanzi(html)
    for author, content, number in store:
        # Single-argument print(...) behaves identically on Python 2 and 3.
        print(author)
        print(content)
        print(number)
    return store
# Scrape the jokes from the first 13 pages of Qiushibaike's "hot" list.
for page in range(1, 14):
    url = 'http://www.qiushibaike.com/hot/page/' + str(page)
    get_duanzi(url)
# Tags: auth div nbsp urllib col url int code ber
# Original article: https://www.cnblogs.com/bashaowei/p/8830968.html