import re

page = 20
f = open('test', 'r')
html = f.read()
f.close()

# Extract the title
title = re.search('<title>(.*?)</title>', html, re.S).group(1)
print title

# Extract the link addresses
links = re.findall("href='(.*?)'", html, re.S)
for i in links:
    print i
# Grab the needed info (match the big enclosing block first, then the details inside it)
text_field = re.findall('<ul>(.*?)</ul>', html, re.S)[0]
the_text = re.findall("'>(.*?)</a>", text_field, re.S)
for every_text in the_text:
    print every_text
# Rewrite the page number in the URL
url = 'http://www.jikexueyuan.com/course/android/?pageNum=2'
for i in range(2, page + 1):
    new_link = re.sub(r'pageNum=\d+', 'pageNum=%d' % i, url)
    print new_link
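# Since only the page number varies, plain string formatting is a simpler
# alternative to re.sub here (a minimal sketch of the same URL pattern):
base = 'http://www.jikexueyuan.com/course/android/?pageNum=%d'
for i in range(2, page + 1):
    print base % i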
# Example: download images and save them
f = open('test1', 'r')
html = f.read()
print html
f.close()

# Match the image addresses
url = re.findall('img src="(.*?)" class="lessonimg"', html, re.S)
i = 0
import requests, time
for s in url:
    time.sleep(1)
    print u'Start downloading: %s' % s
    print u'Downloading image {0}'.format(i)
    pic = requests.get(s)
    with open('pic\\' + str(i) + '.jpg', 'wb') as f:
        f.write(pic.content)
    i += 1
else:
    # the else branch of a for loop runs once the loop completes normally
    print u'Download finished'
    print u'Finished at: {0}'.format(time.strftime('%Y-%m-%d %H:%M:%S'))
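# The loop above assumes a pic directory already exists next to the script;
# a slightly more defensive sketch (the directory creation and status check
# are my additions, not in the original):
import os
if not os.path.isdir('pic'):
    os.makedirs('pic')              # create the output folder if missing
for i, s in enumerate(url):
    time.sleep(1)
    pic = requests.get(s)
    if pic.status_code == 200:      # only save successful responses
        with open(os.path.join('pic', '%d.jpg' % i), 'wb') as f:
            f.write(pic.content)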
import requests

url1 = "http://jp.tingroom.com/yuedu/yd300p/"
headers = {'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36"}
html1 = requests.get(url1, headers=headers)
html1.encoding = 'utf-8'
import re
title = re.findall('color: #039;">(.*?)</a>', html1.text, re.S)
for i in title:
    print i
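# Setting .encoding by hand works when the charset is known in advance; when
# it is not, requests can guess it from the body (sketch, same response object):
html1.encoding = html1.apparent_encoding  # charset detected from the content
print html1.encoding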
import requests, re

url = "https://www.crowdfunder.com/deals"
html = requests.get(url)            # plain GET, for comparison with the POST below
data = {'entities_only': 'true', 'page': '1'}
post = requests.post(url, data=data)
title = re.findall('"card-title">(.*?)</div>', post.text, re.S)
for i in title:
    print i
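# To walk several pages of this endpoint, only the 'page' field in the form
# data needs to change; a minimal sketch under that assumption:
for page_num in range(1, 4):
    data = {'entities_only': 'true', 'page': str(page_num)}
    post = requests.post(url, data=data)
    for i in re.findall('"card-title">(.*?)</div>', post.text, re.S):
        print i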
import re, requests, sys
reload(sys)
sys.setdefaultencoding('utf8')
# force the default encoding to utf8 (works around gbk issues on Windows)
class Spider(object):
    def __init__(self):
        print u'Start downloading this page.......'

    # Fetch the page source
    def getsource(self, url):
        html = requests.get(url).text
        return html

    def changepage(self, url, page):  # build the URL for each page
        now_page = int(re.search(r'pageNum=(\d+)', url, re.S).group(1))
        page_group = []
        for i in range(now_page, page + 1):
            link = re.sub(r'pageNum=\d+', 'pageNum=%s' % i, url)
            page_group.append(link)
        return page_group

    def geteveryclass(self, source):  # grab the block for each course
        source = re.findall('<li deg="" .*?</li>', source, re.S)
        return source

    def getinfo(self, sinfo):  # pull the needed fields out of one course block
        info = {}
        info['title'] = re.search('target="_blank">(.*?)</a>', sinfo, re.S).group(1)
        info['content'] = re.search('<h2><p>(.*?)</p>', sinfo, re.S).group(1)
        timeandeven = re.findall('<em>(.*?)</em>', sinfo, re.S)
        info['classtime'] = timeandeven[0]
        info['classlevel'] = timeandeven[1]
        info['learnnum'] = re.search('"learn-number">(.*?)</em>', sinfo, re.S).group(1)
        return info

    def saveinfo(self, info):  # save everything to a txt file
        f = open('info.txt', 'w')
        for each in info:
            f.writelines('title:' + each['title'] + '\n')
            f.writelines('content:' + each['content'] + '\n')
            f.writelines('classtime:' + each['classtime'] + '\n')
            f.writelines('classlevel:' + each['classlevel'] + '\n')
            f.writelines('learnnum:' + each['learnnum'] + '\n\n')
        f.close()
if __name__ == '__main__':
    classinfo = []
    url = 'http://www.jikexueyuan.com/course/?pageNum=1'
    jikespider = Spider()
    all_links = jikespider.changepage(url, 1)
    for link in all_links:
        print u'Processing page: ' + link
        html = jikespider.getsource(link)
        everyclass = jikespider.geteveryclass(html)
        for each in everyclass:
            info = jikespider.getinfo(each)
            classinfo.append(info)
    jikespider.saveinfo(classinfo)
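# saveinfo relies on the setdefaultencoding hack above to write unicode through
# a plain open(); a sketch of a more explicit variant using codecs (my
# suggestion, not part of the original):
import codecs
def saveinfo_utf8(info):
    f = codecs.open('info.txt', 'w', 'utf-8')   # explicit utf-8, no global hack needed
    for each in info:
        f.write('title:' + each['title'] + '\n')
    f.close()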
# -*- coding: utf-8 -*-
__author__ = 'Administrator'
from lxml import etree
# XPath study notes
"""
//定位根节点
/往下寻找
/text()提取文本内容
/@xxxx提取属性内容
"""
html = '''
<!DOCTYPE html>
<html>
<head lang="en">
<meta charset="UTF-8">
<title>测试-常规用法</title>
</head>
<body>
<div id="content">
<ul id="useful">
<li>这是第一条信息</li>
<li>这是第二条信息</li>
<li>这是第三条信息</li>
</ul>
<ul id="useless">
<li>不需要的信息1</li>
<li>不需要的信息2</li>
<li>不需要的信息3</li>
</ul>
<div id="url">
<a href="http://jikexueyuan.com">极客学院</a>
<a href="http://jikexueyuan.com/course/" title="极客学院课程库">点我打开课程库</a>
</div>
</div>
</body>
</html>
'''
selector = etree.HTML(html)

# Extract text
content = selector.xpath('//ul[@id="useless"]/li/text()')
for i in content:
    print i

# Extract attribute values
link = selector.xpath('//div[@id="url"]/a/@href')
for i in link:
    print i

# Extract the one title attribute
title = selector.xpath('//div[@id="url"]/a/@title')
for i in title:
    print i
# Handy special XPath usages
## Elements whose attribute shares a common prefix
# starts-with(@attribute, common_prefix)
# for example:
"""
<div id='a'>1</div>
<div id='b'>2</div>
<div id='c'>3</div>
"""
html = '''
<html>
<head lang="en">
<meta charset="UTF-8">
<title>测试-特殊用法</title>
</head>
<body>
<div id="test-1">需要的内容1</div>
<div id="test-2">需要的内容2</div>
<div id="testfault">需要的内容3</div>
</body>
</html>
'''
selector = etree.HTML(html)
content = selector.xpath('//div[starts-with(@id,"test")]/text()')
for i in content:
    print i
## Text nested inside child tags
# string(.)
"""
<div id='anc'>
<font color='red'>abc</font>
</div>
"""
html = '''
<!DOCTYPE html>
<html>
<head lang="en">
<meta charset="UTF-8">
<title></title>
</head>
<body>
<div id="test3">
我左青龙,
<span id="tiger">
右白虎,
<ul>上朱雀,
<li>下玄武。</li>
</ul>老牛在当中,
</span>龙头在胸口。
</div>
</body>
</html>
'''
selector = etree.HTML(html)
data = selector.xpath('//div[@id="test3"]')[0]
info = data.xpath('string(.)')
content = info.replace('\n', '').replace(' ', '')
print content
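# If extra whitespace is the only problem, XPath can clean it up itself;
# normalize-space is standard XPath 1.0 (it trims and collapses runs of
# whitespace to single spaces, so it is close to, not identical to, the
# chained replace above):
print data.xpath('normalize-space(.)')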
# Python parallelization: intro and demo
from multiprocessing.dummy import Pool
import time, requests

def getHtml(url):
    html = requests.get(url)
    return html.text

urls = []
for i in range(1, 21):
    page = 'http://tieba.baidu.com/p/3522395719?pn=' + str(i)
    urls.append(page)

# sequential baseline
time1 = time.time()
for i in urls:
    print i
    getHtml(i)
time2 = time.time()
print u'Sequential:', str(time2 - time1)
# thread pool with 4 workers
pool = Pool(4)
time3 = time.time()
results = pool.map(getHtml, urls)
pool.close()
pool.join()
time4 = time.time()
print u'Parallel:', str(time4 - time3)
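# multiprocessing.dummy is the thread-pool twin of the process Pool, with an
# identical map/close/join API; for CPU-bound work the one-line import swap
# below would use real processes instead (sketch; on Windows this must run
# under an if __name__ == '__main__' guard):
from multiprocessing import Pool
pool = Pool(4)
results = pool.map(getHtml, urls)
pool.close()
pool.join()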
# -*- coding: utf8 -*-
from lxml import etree
from multiprocessing.dummy import Pool as ThreadPool
import requests
import json
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
'''Delete content.txt before re-running: the file is opened in append mode, so old output accumulates.'''
def towrite(contentdict):
    # f is the shared file handle opened in the main block below
    f.writelines(u'Reply time: ' + str(contentdict['topic_reply_time']) + '\n')
    f.writelines(u'Reply content: ' + unicode(contentdict['topic_reply_content']) + '\n')
    f.writelines(u'Replied by: ' + contentdict['user_name'] + '\n\n')

def spider(url):
    html = requests.get(url)
    selector = etree.HTML(html.text)
    content_field = selector.xpath('//div[@class="l_post l_post_bright "]')
    item = {}
    for each in content_field:
        reply_info = json.loads(each.xpath('@data-field')[0].replace('&quot;', ''))
        author = reply_info['author']['user_name']
        content = each.xpath('div[@class="d_post_content_main"]/div/cc/div[@class="d_post_content j_d_post_content "]/text()')[0]
        reply_time = reply_info['content']['date']
        print content
        print reply_time
        print author
        item['user_name'] = author
        item['topic_reply_content'] = content
        item['topic_reply_time'] = reply_time
        towrite(item)
if __name__ == '__main__':
    pool = ThreadPool(4)
    f = open('content.txt', 'a')
    page = []
    for i in range(1, 21):
        newpage = 'http://tieba.baidu.com/p/3522395718?pn=' + str(i)
        page.append(newpage)
    results = pool.map(spider, page)
    pool.close()
    pool.join()
    f.close()
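# One caveat: four threads all write to the shared file through towrite, so
# lines from different pages can interleave. A minimal fix is to serialize
# the writes with a lock (the lock is my addition, not in the original):
import threading
write_lock = threading.Lock()
def towrite(contentdict):
    with write_lock:    # only one thread writes at a time
        f.writelines(u'Reply time: ' + str(contentdict['topic_reply_time']) + '\n')
        f.writelines(u'Reply content: ' + unicode(contentdict['topic_reply_content']) + '\n')
        f.writelines(u'Replied by: ' + contentdict['user_name'] + '\n\n')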
Original article: http://www.cnblogs.com/mhxy13867806343/p/4580609.html