Python-18：多线程扒取百度贴吧帖子内容源码

时间：2017-06-21 14:00:11 阅读：134 评论：0 收藏：0 [点我收藏+]

标签：null port 文件的 result cto .com 功能 get page

源码中附注释，直接放源码哈。

#-*-coding:utf8-*-
from lxml import etree
from multiprocessing.dummy import Pool as ThreadPool
import requests
import json

# Python 2 only: make UTF-8 the interpreter's default encoding so implicit
# str <-> unicode conversions of Chinese text do not fail with an ASCII codec
# error. reload(sys) is needed because site.py removes setdefaultencoding
# from the sys module at startup.
import sys

if sys.version_info[0] < 3:
    # Python 3 has no reload() builtin and already defaults to UTF-8.
    reload(sys)  # noqa: F821
    sys.setdefaultencoding('utf-8')

'''重新运行之前请删除content.txt，因为文件操作使用追加方式，会导致内容太多。'''

# Write one reply record to the output file in the format shown below.
def towrite(contentdict, out=None):
    """Write a single reply as a three-line record followed by a blank line.

    The record has the form::

        回帖时间:<topic_reply_time>
        回帖内容:<topic_reply_content>
        回帖人:<user_name>

    :param contentdict: mapping with the keys ``topic_reply_time``,
        ``topic_reply_content`` and ``user_name``.
    :param out: optional object with a ``write`` method to receive the
        record. When omitted, the module-level file object ``f`` is used,
        which is what existing callers rely on.
    :raises KeyError: if one of the three keys is missing.
    """
    target = f if out is None else out
    # Formatting through a unicode template avoids the implicit ASCII
    # coercion that str() performs on Python 2.
    record = (
        u'回帖时间:{0}\n'
        u'回帖内容:{1}\n'
        u'回帖人:{2}\n\n'
    ).format(
        contentdict['topic_reply_time'],
        contentdict['topic_reply_content'],
        contentdict['user_name'],
    )
    # The whole record is emitted with a single write() call instead of
    # calling writelines() on a string three times.
    target.write(record)
# HTTP request headers sent with every page download; only a desktop Chrome
# User-Agent string is set.
_header = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 6.1; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/45.0.2454.101 Safari/537.36'
    ),
}
# Scrape the page at the given URL and record every reply found on it.
def spider(url):
    """Download ``url``, extract each reply on the page and pass it to ``towrite``.

    For every post container matched on the page, the author name and the
    post date are read from the JSON stored in the element's ``data-field``
    attribute, and the reply text is taken from the first direct text node of
    the post body. Each reply is printed and then handed to ``towrite`` as a
    dict with the keys ``user_name``, ``topic_reply_content`` and
    ``topic_reply_time``.

    :param url: address of the page to download.
    """
    html = requests.get(url, headers=_header)
    print(url)
    selector = etree.HTML(html.text)
    # One <div> per post. The class attribute is compared as an exact string,
    # so the trailing spaces inside the quotes are significant.
    content_field = selector.xpath('//div[@class="l_post j_l_post l_post_bright  "]')
    for each in content_field:
        # Sample of the JSON carried by the data-field attribute:
        #   {
        #     "author": {
        #       "user_id": 830583117, "user_name": "huluxiao855",
        #       "name_u": "huluxiao855&ie=utf-8", "user_sex": 0,
        #       "portrait": "4db168756c757869616f3835358131", "is_like": 1,
        #       "level_id": 4, "level_name": "\u719f\u6089\u82f9\u679c",
        #       "cur_score": 31, "bawu": 0, "props": null},
        #     "content": {
        #       "post_id": 62881461599, "is_anonym": false,
        #       "open_id": "tbclient", "open_type": "apple",
        #       "date": "2015-01-11 22:09", "vote_crypt": "", "post_no": 203,
        #       "type": "0", "comment_num": 1, "ptype": "0",
        #       "is_saveface": false, "props": null, "post_index": 0,
        #       "pb_tpoint": null}
        #   }
        data_field = each.xpath('@data-field')
        if not data_field:
            # Without the metadata there is no author or date to record.
            continue
        # NOTE(review): the parser presumably hands back attribute values
        # already unescaped, in which case this replace() is a no-op; it is
        # kept to preserve the original behavior -- confirm before removing.
        reply_info = json.loads(data_field[0].replace('&quot', ''))
        # reply_info is a dict shaped like the sample above.
        author = reply_info['author']['user_name']
        reply_time = reply_info['content']['date']
        text_nodes = each.xpath('div[@class="d_post_content_main"]/div/cc/div[@class="d_post_content j_d_post_content "]/text()')
        # A post body with no direct text node yields an empty list here;
        # record it with empty content instead of raising IndexError.
        content = text_nodes[0] if text_nodes else u''
        print(content)
        print(reply_time)
        print(author)
        # Build a fresh dict per post rather than mutating one shared dict.
        item = {
            'user_name': author,
            'topic_reply_content': content,
            'topic_reply_time': reply_time,
        }
        towrite(item)


# About the ``if __name__ == '__main__'`` guard below:
# when a .py file is executed directly, ``__name__ == '__main__'`` is True in
# that file; when the file is imported from another .py file, ``__name__`` is
# the module's own name instead of ``__main__``.
# This is also useful for debugging: code placed under the guard is skipped
# when other modules import this one, but still runs when the module is
# executed directly to investigate a problem.
if __name__ == '__main__':
    import io

    # Build the list of page URLs to scrape: pn=1 .. pn=20.
    page = ['http://tieba.baidu.com/p/3522395718?pn=' + str(i) for i in range(1, 21)]

    # Mode 'a' appends to content.txt. The encoding is stated explicitly so
    # the output does not depend on the interpreter's default encoding.
    # ``f`` is bound at module level on purpose: towrite() writes to it.
    with io.open('content.txt', 'a', encoding='utf-8') as f:
        # Pool of 4 worker threads.
        pool = ThreadPool(4)
        try:
            # Run spider() over every page URL using the worker threads.
            pool.map(spider, page)
        finally:
            # Always shut the pool down, even if a page raised, so worker
            # threads are not left behind and the file is closed afterwards.
            pool.close()
            pool.join()

标签：null port 文件的 result cto .com 功能 get page

原文地址：http://www.cnblogs.com/jiyongxin/p/7058715.html

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年07月29日 (22)
2021年07月28日 (40)
2021年07月27日 (32)
2021年07月26日 (79)
2021年07月23日 (29)
2021年07月22日 (30)
2021年07月21日 (42)
2021年07月20日 (16)
2021年07月19日 (90)
2021年07月16日 (35)

周排行