码迷,mamicode.com
首页 > 其他好文 > 详细

天涯帖子备份

时间:2017-09-14 18:50:31      阅读:295      评论:0      收藏:0      [点我收藏+]

标签:int   set   encoding   head   star   main   add   nav   str   

from bs4 import BeautifulSoup
import urllib.request as request
import os
import time
import threading

# A thread page URL is url_s + <thread id> + "-" + <page number> + ".shtml";
# url_e is the suffix that selects page 1.
url_s = 'http://bbs.tianya.cn/m/post-develop-'
url_e = '-1.shtml'

# Single (name, value) header pair; it is wrapped in a list and assigned to
# the opener's ``addheaders`` before each request.
headers = ('User-Agent', 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11')


def del_extract(dd, name=None):
    """Call ``extract()`` on every element of *dd*.

    *dd* must support ``len()`` and iteration. ``name`` is accepted but is
    not used by this function. Returns ``None``.
    """
    if len(dd) == 0:
        return
    for node in dd:
        node.extract()

def cut_str(s):
    """Drop the first three characters of the ``href`` of the first item in *s*.

    *s* is a sequence of mapping-like elements. Only ``s[0]`` is touched, and
    it is modified in place. Nothing happens when *s* is empty, when the
    first element has no ``'href'`` entry, or when that entry is empty.

    NOTE(review): the three characters are presumably a leading ``/m/`` so
    that pager links resolve relative to the saved files - confirm against
    the live markup.
    """
    if len(s) == 0:
        return
    first = s[0]
    try:
        href = first['href']
    except (KeyError, TypeError):
        # Best effort: an element without a usable href is left untouched.
        return
    if len(href) > 0:
        first['href'] = href[3:]
    
def get_total_page(url_site):
    """Return the page count advertised by the page at *url_site*.

    The page is fetched with the module-level ``headers``, and the first
    element with class ``post`` and id ``j-post-content`` is searched for a
    ``u-btn last-btn`` element. The count is the integer between the last
    ``-`` and the first ``.`` of that element's ``href``.

    Returns:
        The parsed page number, or 0 when no ``u-btn last-btn`` element is
        present.

    Raises:
        IndexError: if the page has no matching post container.
        ValueError: if the href does not end in a numeric page component.
    """
    opener = request.build_opener()
    opener.addheaders = [headers]
    # Close the HTTP response as soon as the body has been read.
    with opener.open(url_site) as response:
        content = response.read()
    soup = BeautifulSoup(content, "html.parser")
    d = soup.find_all(class_='post', id="j-post-content")[0]

    data = d.find_all(class_='u-btn last-btn')
    if len(data) == 0:
        return 0
    last_href = data[0]['href']
    return int(str(last_href).split(".")[0].split("-")[-1])
    
    
    
def dealwith_page(url_site):
    """Download one page, strip unwanted elements, and save it under ``m/``.

    The page at *url_site* is fetched with the module-level ``headers`` and
    parsed. Script tags and several elements selected by class are removed,
    the pager links inside the post container get their ``href`` shortened
    via ``cut_str``, and the prettified document is written as UTF-8 to
    ``m/<last path segment of url_site>``.

    Raises:
        IndexError: if the page has no element with class ``post`` and id
            ``j-post-content``.
    """
    opener = request.build_opener()
    opener.addheaders = [headers]
    # Close the HTTP response as soon as the body has been read.
    with opener.open(url_site) as response:
        content = response.read()

    soup = BeautifulSoup(content, "html.parser")

    # Document-wide removals.
    del_extract(soup.find_all("script"))
    del_extract(soup.find_all(class_="ty-m-nav"))
    del_extract(soup.find_all(class_="meta f-cf"))
    del_extract(soup.find_all(class_="ft"))
    d = soup.find_all(class_='post', id="j-post-content")[0]

    # Removals inside the post container.
    del_extract(d.find_all(class_="u-like"))
    del_extract(d.find_all(class_="post-func-close"))
    del_extract(d.find_all(class_="u-like hot-list"))

    # Shorten the href of each pager control.
    cut_str(d.find_all(class_='u-btn off first-btn'))
    cut_str(d.find_all(class_='u-btn pre-btn'))
    cut_str(d.find_all(class_='page-txt'))
    cut_str(d.find_all(class_='u-btn last-btn'))
    cut_str(d.find_all(class_='u-btn next-btn'))

    name = str(url_site).split('/')[-1]
    content = soup.prettify()
    # exist_ok avoids FileExistsError when several worker threads try to
    # create the output directory at the same time.
    os.makedirs("m", exist_ok=True)
    with open(r"m/" + name, 'w', encoding="utf-8") as fw:
        fw.write(content)
    


def main_fun():
    """Prompt for a thread id and save every page of that thread.

    The page count is read from page 1 via ``get_total_page``. Each page is
    then handled by ``dealwith_page`` in its own thread, and a new thread is
    started only while at most 10 threads are alive. The function returns
    after every worker has finished.
    """
    print("please input the id of tianyaer (eg.2165689):")
    # Surrounding whitespace would otherwise end up inside the URL.
    url_t = input("> ").strip()
    url_page = url_s + url_t + url_e
    total = get_total_page(url_page)

    # get_total_page reports 0 when the page has no "last page" button; page 1
    # was fetched successfully in that case, so still save that one page.
    page_count = max(total, 1)

    workers = []
    for n in range(page_count):
        url_page = url_s + str(url_t) + str("-") + str(n + 1) + ".shtml"
        print(url_page)
        # Throttle: wait until the number of live threads drops to 10 or less.
        while threading.active_count() > 10:
            time.sleep(1)

        t1 = threading.Thread(target=dealwith_page, args=(url_page,), daemon=True)
        t1.start()
        workers.append(t1)

    # Daemon threads are killed when the main thread exits, so wait for every
    # download to finish before returning.
    for worker in workers:
        worker.join()
        
# Run the interactive backup only when executed as a script, not on import.
if __name__ == "__main__":
    main_fun()

 

天涯帖子备份

标签:int   set   encoding   head   star   main   add   nav   str   

原文地址:http://www.cnblogs.com/waefk/p/7521607.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!