import re

page = 20
f = open('test', 'r')
html = f.read()
f.close()

# Extract the title
title = re.search('<title>(.*?)</title>', html, re.S).group(1)
print title

# Extract the link addresses
links = re.findall("href='(.*?)'", html, re.S)
for i in links:
    print i
# Grab the needed info (match the big enclosing block first, then the details inside it)
text_field = re.findall('<ul>(.*?)</ul>', html, re.S)[0]
the_text = re.findall("'>(.*?)</a>", text_field, re.S)
for every_text in the_text:
    print every_text
# Rewrite the page number in the URL
url = 'http://www.jikexueyuan.com/course/android/?pageNum=2'
for i in range(2, page + 1):
    new_link = re.sub(r'pageNum=\d+', 'pageNum=%d' % i, url)
    print new_link
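# Since only the page number varies, plain string formatting is a simpler
# alternative to re.sub here (a minimal sketch of the same URL pattern):
base = 'http://www.jikexueyuan.com/course/android/?pageNum=%d'
for i in range(2, page + 1):
    print base % i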
# Example: download images and save them
f = open('test1', 'r')
html = f.read()
print html
f.close()

# Match the image addresses
url = re.findall('img src="(.*?)" class="lessonimg"', html, re.S)
i = 0
import requests, time
for s in url:
    time.sleep(1)
    print u'Start downloading: %s' % s
    print u'Downloading image {0}'.format(i)
    pic = requests.get(s)
    with open('pic\\' + str(i) + '.jpg', 'wb') as f:
        f.write(pic.content)
    i += 1
else:
    # the else branch of a for loop runs once the loop completes normally
    print u'Download finished'
    print u'Finished at: {0}'.format(time.strftime('%Y-%m-%d %H:%M:%S'))
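# The loop above assumes a pic directory already exists next to the script;
# a slightly more defensive sketch (the directory creation and status check
# are my additions, not in the original):
import os
if not os.path.isdir('pic'):
    os.makedirs('pic')              # create the output folder if missing
for i, s in enumerate(url):
    time.sleep(1)
    pic = requests.get(s)
    if pic.status_code == 200:      # only save successful responses
        with open(os.path.join('pic', '%d.jpg' % i), 'wb') as f:
            f.write(pic.content)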
import requests

url1 = "http://jp.tingroom.com/yuedu/yd300p/"
headers = {'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36"}
html1 = requests.get(url1, headers=headers)
html1.encoding = 'utf-8'
import re
title = re.findall('color: #039;">(.*?)</a>', html1.text, re.S)
for i in title:
    print i
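# Setting .encoding by hand works when the charset is known in advance; when
# it is not, requests can guess it from the body (sketch, same response object):
html1.encoding = html1.apparent_encoding  # charset detected from the content
print html1.encoding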
import requests, re

url = "https://www.crowdfunder.com/deals"
html = requests.get(url)            # plain GET, for comparison with the POST below
data = {'entities_only': 'true', 'page': '1'}
post = requests.post(url, data=data)
title = re.findall('"card-title">(.*?)</div>', post.text, re.S)
for i in title:
    print i
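# To walk several pages of this endpoint, only the 'page' field in the form
# data needs to change; a minimal sketch under that assumption:
for page_num in range(1, 4):
    data = {'entities_only': 'true', 'page': str(page_num)}
    post = requests.post(url, data=data)
    for i in re.findall('"card-title">(.*?)</div>', post.text, re.S):
        print i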
import re, requests, sys
reload(sys)
sys.setdefaultencoding('utf8')
# force the default encoding to utf8 (works around gbk issues on Windows)
class Spider(object):
    def __init__(self):
        print u'Start downloading this page.......'

    # Fetch the page source
    def getsource(self, url):
        html = requests.get(url).text
        return html

    def changepage(self, url, page):  # build the URL for each page
        now_page = int(re.search(r'pageNum=(\d+)', url, re.S).group(1))
        page_group = []
        for i in range(now_page, page + 1):
            link = re.sub(r'pageNum=\d+', 'pageNum=%s' % i, url)
            page_group.append(link)
        return page_group

    def geteveryclass(self, source):  # grab the block for each course
        source = re.findall('<li deg="" .*?</li>', source, re.S)
        return source

    def getinfo(self, sinfo):  # pull the needed fields out of one course block
        info = {}
        info['title'] = re.search('target="_blank">(.*?)</a>', sinfo, re.S).group(1)
        info['content'] = re.search('<h2><p>(.*?)</p>', sinfo, re.S).group(1)
        timeandeven = re.findall('<em>(.*?)</em>', sinfo, re.S)
        info['classtime'] = timeandeven[0]
        info['classlevel'] = timeandeven[1]
        info['learnnum'] = re.search('"learn-number">(.*?)</em>', sinfo, re.S).group(1)
        return info

    def saveinfo(self, info):  # save everything to a txt file
        f = open('info.txt', 'w')
        for each in info:
            f.writelines('title:' + each['title'] + '\n')
            f.writelines('content:' + each['content'] + '\n')
            f.writelines('classtime:' + each['classtime'] + '\n')
            f.writelines('classlevel:' + each['classlevel'] + '\n')
            f.writelines('learnnum:' + each['learnnum'] + '\n\n')
        f.close()
if __name__ == '__main__':
    classinfo = []
    url = 'http://www.jikexueyuan.com/course/?pageNum=1'
    jikespider = Spider()
    all_links = jikespider.changepage(url, 1)
    for link in all_links:
        print u'Processing page: ' + link
        html = jikespider.getsource(link)
        everyclass = jikespider.geteveryclass(html)
        for each in everyclass:
            info = jikespider.getinfo(each)
            classinfo.append(info)
    jikespider.saveinfo(classinfo)
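# saveinfo relies on the setdefaultencoding hack above to write unicode through
# a plain open(); a sketch of a more explicit variant using codecs (my
# suggestion, not part of the original):
import codecs
def saveinfo_utf8(info):
    f = codecs.open('info.txt', 'w', 'utf-8')   # explicit utf-8, no global hack needed
    for each in info:
        f.write('title:' + each['title'] + '\n')
    f.close()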
# -*- coding: utf-8 -*-
__author__ = 'Administrator'
from lxml import etree
# XPath study notes
"""
//定位根节点
/往下寻找
/text()提取文本内容
/@xxxx提取属性内容
"""
html = '''
<!DOCTYPE html>
<html>
<head lang="en">
<meta charset="UTF-8">
<title>测试-常规用法</title>
</head>
<body>
<div id="content">
<ul id="useful">
<li>这是第一条信息</li>
<li>这是第二条信息</li>
<li>这是第三条信息</li>
</ul>
<ul id="useless">
<li>不需要的信息1</li>
<li>不需要的信息2</li>
<li>不需要的信息3</li>
</ul>
<div id="url">
<a href="http://jikexueyuan.com">极客学院</a>
<a href="http://jikexueyuan.com/course/" title="极客学院课程库">点我打开课程库</a>
</div>
</div>
</body>
</html>
'''
selector = etree.HTML(html)

# Extract text
content = selector.xpath('//ul[@id="useless"]/li/text()')
for i in content:
    print i

# Extract attribute values
link = selector.xpath('//div[@id="url"]/a/@href')
for i in link:
    print i

# Extract the one title attribute
title = selector.xpath('//div[@id="url"]/a/@title')
for i in title:
    print i
# Handy special XPath usages
## Elements whose attribute shares a common prefix
# starts-with(@attribute, common_prefix)
# for example:
"""
<div id='a'>1</div>
<div id='b'>2</div>
<div id='c'>3</div>
"""
html = '''
<html>
<head lang="en">
<meta charset="UTF-8">
<title>测试-特殊用法</title>
</head>
<body>
<div id="test-1">需要的内容1</div>
<div id="test-2">需要的内容2</div>
<div id="testfault">需要的内容3</div>
</body>
</html>
'''
selector = etree.HTML(html)
content = selector.xpath('//div[starts-with(@id,"test")]/text()')
for i in content:
    print i
## Text nested inside child tags
# string(.)
"""
<div id='anc'>
<font color='red'>abc</font>
</div>
"""
html = '''
<!DOCTYPE html>
<html>
<head lang="en">
<meta charset="UTF-8">
<title></title>
</head>
<body>
<div id="test3">
我左青龙,
<span id="tiger">
右白虎,
<ul>上朱雀,
<li>下玄武。</li>
</ul>老牛在当中,
</span>龙头在胸口。
</div>
</body>
</html>
'''
selector = etree.HTML(html)
data = selector.xpath('//div[@id="test3"]')[0]
info = data.xpath('string(.)')
content = info.replace('\n', '').replace(' ', '')
print content
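# If extra whitespace is the only problem, XPath can clean it up itself;
# normalize-space is standard XPath 1.0 (it trims and collapses runs of
# whitespace to single spaces, so it is close to, not identical to, the
# chained replace above):
print data.xpath('normalize-space(.)')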
# Python parallelization: intro and demo
from multiprocessing.dummy import Pool
import time, requests

def getHtml(url):
    html = requests.get(url)
    return html.text

urls = []
for i in range(1, 21):
    page = 'http://tieba.baidu.com/p/3522395719?pn=' + str(i)
    urls.append(page)

# sequential baseline
time1 = time.time()
for i in urls:
    print i
    getHtml(i)
time2 = time.time()
print u'Sequential:', str(time2 - time1)
# thread pool with 4 workers
pool = Pool(4)
time3 = time.time()
results = pool.map(getHtml, urls)
pool.close()
pool.join()
time4 = time.time()
print u'Parallel:', str(time4 - time3)
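# multiprocessing.dummy is the thread-pool twin of the process Pool, with an
# identical map/close/join API; for CPU-bound work the one-line import swap
# below would use real processes instead (sketch; on Windows this must run
# under an if __name__ == '__main__' guard):
from multiprocessing import Pool
pool = Pool(4)
results = pool.map(getHtml, urls)
pool.close()
pool.join()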
# -*- coding: utf8 -*-
from lxml import etree
from multiprocessing.dummy import Pool as ThreadPool
import requests
import json
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
'''Delete content.txt before re-running: the file is opened in append mode, so old output accumulates.'''
def towrite(contentdict):
    # f is the shared file handle opened in the main block below
    f.writelines(u'Reply time: ' + str(contentdict['topic_reply_time']) + '\n')
    f.writelines(u'Reply content: ' + unicode(contentdict['topic_reply_content']) + '\n')
    f.writelines(u'Replied by: ' + contentdict['user_name'] + '\n\n')

def spider(url):
    html = requests.get(url)
    selector = etree.HTML(html.text)
    content_field = selector.xpath('//div[@class="l_post l_post_bright "]')
    item = {}
    for each in content_field:
        reply_info = json.loads(each.xpath('@data-field')[0].replace('&quot;', ''))
        author = reply_info['author']['user_name']
        content = each.xpath('div[@class="d_post_content_main"]/div/cc/div[@class="d_post_content j_d_post_content "]/text()')[0]
        reply_time = reply_info['content']['date']
        print content
        print reply_time
        print author
        item['user_name'] = author
        item['topic_reply_content'] = content
        item['topic_reply_time'] = reply_time
        towrite(item)
if __name__ == '__main__':
    pool = ThreadPool(4)
    f = open('content.txt', 'a')
    page = []
    for i in range(1, 21):
        newpage = 'http://tieba.baidu.com/p/3522395718?pn=' + str(i)
        page.append(newpage)
    results = pool.map(spider, page)
    pool.close()
    pool.join()
    f.close()
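# One caveat: four threads all write to the shared file through towrite, so
# lines from different pages can interleave. A minimal fix is to serialize
# the writes with a lock (the lock is my addition, not in the original):
import threading
write_lock = threading.Lock()
def towrite(contentdict):
    with write_lock:    # only one thread writes at a time
        f.writelines(u'Reply time: ' + str(contentdict['topic_reply_time']) + '\n')
        f.writelines(u'Reply content: ' + unicode(contentdict['topic_reply_content']) + '\n')
        f.writelines(u'Replied by: ' + contentdict['user_name'] + '\n\n')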
Original article: http://www.cnblogs.com/mhxy13867806343/p/4580609.html