标签:
# -*- coding: utf-8 -*-
"""Fetch Qzone blog posts over HTTP and save their plain text to disk.

This script targets Python 2: it relies on ``urllib2``, ``StringIO`` and
``sys.setdefaultencoding``. Only single-argument ``print(...)`` calls are
used so the syntax itself also parses under Python 3.
"""
# The star import stays first so that the explicit imports below win on any
# name clash. ``doGet`` is not defined in this file; presumably it is
# provided by HttpRequestModule -- TODO confirm.
from HttpRequestModule import *
import os
import json
import traceback
import codecs
from lxml import etree
import StringIO
import gzip
import sys
import urllib2

# Python 2 only: make implicit str <-> unicode conversions use UTF-8.
reload(sys)
sys.setdefaultencoding('utf-8')

# First two bytes of every gzip stream (RFC 1952).
_GZIP_MAGIC = b"\x1f\x8b"


def write_file(file_name, file_data, encoding):
    """Write ``file_data`` to ``file_name`` in the fixed output directory.

    Args:
        file_name: Name of the file to create (joined onto the output dir).
        file_data: Text to write; nothing is written when it is empty.
        encoding: Codec name passed to ``codecs.open``.
    """
    if not file_data:
        print("file_data is zero")
        return
    file_dir = r"D:\fs\test_data\qqzone"
    file_path = os.path.join(file_dir, file_name)
    print(file_path)
    with codecs.open(file_path, "w", encoding) as f:
        f.write(file_data)


def decodeJson(json_string):
    """Parse ``json_string`` and return the decoded object, or None on error.

    Parse failures are printed rather than raised.
    """
    try:
        return json.loads(json_string)
    except (TypeError, ValueError) as err:
        print('TypeError or ValueError:{0}'.format(err))
    except Exception:
        # Anything unexpected: report the full traceback, still return None.
        print(traceback.format_exc())
    return None


def getUserBlogList():
    """Return the list of blog entries reported by the blog-list endpoint.

    Returns:
        The ``data['list']`` value of the decoded response, or an empty list
        when the request returns nothing, the payload cannot be parsed, or
        the server reports an error / zero posts.
    """
    # NOTE(review): hostUin, g_tk, statYear etc. are hard-coded request values.
    diary_url = 'http://b1.qzone.qq.com/cgi-bin/blognew/get_abs?hostUin=859226880&blogType=0&cateName=&cateHex=&statYear=2015&reqInfo=7&pos=0&num=15&sortType=0&absType=0&source=0&rand=0.6346770680975169&ref=qzone&g_tk=1611717761&verbose=1'
    data = doGet(diary_url)
    if not data:
        print("data len is 0")
        return []
    # Drop the first 10 and last 2 characters before JSON parsing.
    # Presumably this strips a JSONP wrapper such as `_Callback(` ... `);`
    # -- TODO confirm against a live response.
    data_json = data[10:len(data) - 2]
    decode_json = decodeJson(data_json.decode("gbk"))
    if decode_json is None:
        print("decode_json is None")
        return []
    if decode_json['code'] != 0:
        # format() instead of "+": the code is compared to an int above, and
        # str + int raises TypeError.
        print("server response code is {0}".format(decode_json['code']))
        return []
    data = decode_json['data']
    if data['totalNum'] <= 0:
        print("server response totalnum is {0}".format(data['totalNum']))
        return []
    return data['list']


def getUserBlog(uin, blogid):
    """Download the raw page for one blog post.

    Args:
        uin: Account number of the blog owner, substituted into the URL.
        blogid: Identifier of the post, substituted into the URL.

    Returns:
        The response body exactly as received (it may be gzip-compressed,
        since gzip is advertised in Accept-Encoding), or "" when the request
        fails with a URLError.
    """
    url = 'http://b1.qzone.qq.com/cgi-bin/blognew/blog_output_data?uin=%(uin)s&blogid=%(blogid)s&styledm=ctc.qzonestyle.gtimg.cn&imgdm=ctc.qzs.qq.com&bdm=b.qzone.qq.com&mode=2&numperpage=15&timestamp=1437033537&dprefix=&inCharset=gb2312&outCharset=gb2312&ref=qzone' % {'uin': uin, 'blogid': blogid}
    # NOTE(review): deflate and sdch are advertised here, but the caller only
    # knows how to undo gzip.
    my_headers = {
        "Accept-Encoding": "gzip,deflate,sdch",
        "Accept-Language": "zh-CN,zh;q=0.8,en;q=0.6",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.104 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Referer": "http://ctc.qzs.qq.com/qzone/newblog/blogcanvas.html",
    }
    request = urllib2.Request(url, headers=my_headers)
    try:
        response = urllib2.urlopen(request)
    except urllib2.URLError as e:
        # HTTPError (a URLError subclass) carries .code; plain URLError
        # carries .reason.
        if hasattr(e, 'code'):
            print('The server couldn\'t fulfill the request. errorcode:{0}'.format(e.code))
        elif hasattr(e, 'reason'):
            print('We failed to reach a server. reason:{0}'.format(e.reason))
        return ""
    try:
        return response.read()
    finally:
        # Release the connection even if read() raises.
        response.close()


def getText(elem):
    """Return all text inside ``elem`` with each text node stripped and joined."""
    return ''.join(node.strip() for node in elem.itertext())


def gzdecode(data):
    """Decompress a gzip-compressed byte string and return the result."""
    compressedstream = StringIO.StringIO(data)
    gziper = gzip.GzipFile(fileobj=compressedstream)
    try:
        return gziper.read()  # read the decompressed data
    finally:
        gziper.close()


def test(blogid):
    """Download one blog post, extract its text and save it as ``<blogid>.txt``.

    The page is decoded as UTF-8 first and as GBK second; the first attempt
    that yields a ``blogDetailDiv`` element and writes successfully wins.
    Failures of either attempt are printed, not raised.

    Args:
        blogid: Identifier of the post, as a string.
    """
    print(blogid)
    blog_data = getUserBlog('859226880', blogid)
    if not blog_data:
        # getUserBlog returns "" on failure; there is nothing to parse.
        print("blog {0}: empty response, skipped".format(blogid))
        return
    # Only gunzip when the payload really is gzip: a server that ignores
    # Accept-Encoding would otherwise make GzipFile raise and abort the run.
    if blog_data[:2] == _GZIP_MAGIC:
        blog_data = gzdecode(blog_data)

    # (page encoding, output file encoding, separator char, failure tag)
    # NOTE(review): the output encoding is the opposite of the page encoding
    # in both attempts; kept exactly as in the original -- confirm intent.
    attempts = (
        ('utf-8', 'gbk', "*", "111"),
        ('gbk', 'utf-8', "_", "222"),
    )
    for page_encoding, file_encoding, rule_char, tag in attempts:
        try:
            content = blog_data.decode(page_encoding)
            tree = etree.HTML(content)
            node = tree.xpath("//div[@id='blogDetailDiv']")[0]
            tgt_data = getText(node)
            print(rule_char * 30)
            print(tgt_data)
            write_file(blogid + '.txt', tgt_data, file_encoding)
            return
        except Exception as ex:
            # Best-effort: report the real exception type and try the next
            # encoding.
            print("{0} {1} : {2}".format(tag, type(ex).__name__, ex))


def main():
    """Save one fixed post, then every post returned by getUserBlogList()."""
    print("main")
    test("1288281044")
    for blog_item in getUserBlogList():
        blogId = blog_item['blogId']
        print(blogId)
        test(str(blogId))


if __name__ == '__main__':
    main()
标签:
原文地址:http://my.oschina.net/u/557955/blog/479522