标签:速度 stack fail image ping 线程 数据库连接 ace cursor
# -*- coding: utf-8 -*-
"""Multi-threaded crawler that records page titles of wap.xigushi.com in MySQL.

Worker threads pull site-relative links from a shared queue, download each
page, enqueue every not-yet-seen site-relative link found in it, and insert
the page's URL and title into the ``data1`` table.  The main thread stops the
script once a whole polling interval passes without any progress.
"""
import datetime
import html
import re
import time
from multiprocessing.dummy import Lock, Pool, Queue  # dummy = thread-backed API
from urllib.request import urlopen

import pymysql
import requests as rq
from bs4 import BeautifulSoup

unescape = html.unescape  # helper for decoding HTML character entities

BASE_URL = 'http://wap.xigushi.com'
WORKER_COUNT = 10          # number of crawler threads
IDLE_CHECK_SECONDS = 60    # polling interval used to detect that crawling stopped
REQUEST_TIMEOUT = 30       # seconds; keeps a stalled server from hanging a worker
LINK_RE = re.compile(r'href="(/.*?)"')  # site-relative links only

tasks = Queue()        # links waiting to be crawled
tasks_pass = set()     # links that have already been queued
results = {}           # result holder (kept for compatibility; not used here)
count = 0              # number of pages crawled and stored so far
state_lock = Lock()    # guards tasks_pass and count across worker threads
tasks.put('/index.html')       # seed the queue with the home page
tasks_pass.add('/index.html')  # ... and mark it as already queued


def _fetch_page(url):
    """Download *url* once and return ``(soup, text)``.

    ``soup`` is the parsed document and ``text`` is the body decoded according
    to the charset declared in a ``<meta charset=...>`` tag (GBK for
    gb2312/gbk, UTF-8 otherwise).  Undecodable bytes are replaced rather than
    raising, because some pages contain byte sequences that are invalid in
    their declared encoding.

    Raises ``requests.RequestException`` on network errors and on HTTP error
    statuses.
    """
    response = rq.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    raw = response.content
    soup = BeautifulSoup(raw, "lxml")
    # Not every page declares a charset on its first <meta>, so look for one
    # that actually carries the attribute instead of indexing blindly.
    meta = soup.find('meta', attrs={'charset': True})
    declared = meta['charset'].lower() if meta is not None else ''
    codec = 'gbk' if declared in ('gb2312', 'gbk') else 'utf8'
    return soup, raw.decode(codec, errors='replace')


def _enqueue_links(web, queue):
    """Put every not-yet-seen, non-image site link found in *web* on *queue*."""
    for link in LINK_RE.findall(web):
        # Filter on the candidate link itself, not on the page being crawled.
        if 'images' in link:
            print(link, '---------------------------skipping--------------------------------------------')
            continue
        with state_lock:
            if link in tasks_pass:
                continue
            tasks_pass.add(link)
        queue.put(link)


def _save_page(url, title):
    """Insert ``(url, title)`` into ``data1``, ignoring duplicate rows."""
    # utf8mb4 on the connection avoids UnicodeEncodeError for characters that
    # GBK cannot represent (e.g. U+30FB).
    # NOTE(review): the table/column charset should be able to store these
    # characters as well -- confirm against the schema.
    db = pymysql.connect(host="localhost", user="testuser", password="test123",
                         database="TESTDB", charset='utf8mb4')
    try:
        with db.cursor() as dbc:
            dbc.execute("insert ignore into data1(url,title) values(%s,%s);",
                        (url, title))
        db.commit()
    finally:
        db.close()


def main(tasks):
    """Worker loop: crawl links taken from *tasks* until the pool is terminated.

    Args:
        tasks: queue of site-relative links (e.g. ``'/index.html'``); newly
            discovered links are put back on the same queue.

    The loop never returns.  A failure on one page is reported and the worker
    moves on to the next link instead of dying.
    """
    global count
    while True:
        url = BASE_URL + tasks.get()  # blocks until a link is available
        try:
            soup, web = _fetch_page(url)
            _enqueue_links(web, tasks)
            text = soup.title.get_text() if soup.title is not None else ''
            print(datetime.datetime.now(), ' ', url, ' ', text)
            _save_page(url, text)
        except Exception as exc:
            # Thread boundary: an uncaught error (HTTP 503, bad markup, DB
            # hiccup) would silently end this worker, so report and carry on.
            print(url, 'failed:', repr(exc))
            continue
        with state_lock:
            count += 1
            done = count
        if done % 100 == 0:
            print(u'%s done.' % done)


def _run():
    """Start the workers and stop once an interval passes without progress."""
    # ``main`` is passed as the pool initializer, so each of the WORKER_COUNT
    # threads runs the crawl loop directly and never picks up pool jobs.
    pool = Pool(WORKER_COUNT, main, (tasks,))
    last_progress = None
    while True:
        time.sleep(IDLE_CHECK_SECONDS)
        with state_lock:
            progress = (len(tasks_pass), count)
        # Watch crawled pages as well as discovered links, so the script does
        # not stop while queued pages are still being processed.
        if progress == last_progress:
            break
        last_progress = progress

    pool.terminate()
    print("terminated normally")


if __name__ == '__main__':
    _run()
encoding error : input conversion failed due to input error, bytes 0xB1 0x80 0xB5 0xC4
为什么改了这么多次还是会报错?这显然是 GBK 转 UTF-8 的编码问题,可是我已经根据网页声明的 charset 做了判断,仍然有一些网页的编码比较混乱。
UnicodeEncodeError: 'gbk' codec can't encode character '\u30fb' in position 86: illegal multibyte sequence
urllib.error.HTTPError: HTTP Error 503: Forwarding failure
标签:速度 stack fail image ping 线程 数据库连接 ace cursor
原文地址:http://www.cnblogs.com/vorphan/p/7468727.html