#!/usr/bin/env python
# coding: utf-8
# Crawls the JD.com book top list (pages 1-5), extracts each book's
# detail-page URL and title with a regular expression, and saves the
# fetched HTML under ./jd/.
import urllib2, re, sys, os
#from bs4 import BeautifulSoup

reload(sys)
sys.setdefaultencoding('gbk')

# Apparently left over from an earlier 58.com scraper; unused by this script.
province = "上海"
city = "上海"
fileHeader = '\xEF\xBB\xBF'  # UTF-8 BOM so spreadsheet tools detect the encoding
colums = '省直辖市^城市^行政区^商圈^名称^地址^联系人^联系电话^URL^公司介绍^'

def getCompany():
    # Walk the first five pages of the top list.
    for page in range(1, 5 + 1):
        url1 = "http://book.jd.com/booktop-4-6929-%s.html" % (page)
        print "\n##################:", url1
        httpCrawler(url1, page)

def httpCrawler(url, page):
    content = httpRequest(url)
    # Each entry looks like: <dt class='p-name'><a href='...' title="..." target='_blank'>
    List = re.findall(r'<dt class=\'p-name\'>(.*?)<a href=\'(.*?)\' title="(.*?)" target=\'_blank\'', content, re.S)
    no = len(List)
    print no
    for i in range(0, no):  # 0 ~ no-1
        url = List[i][1]
        name = List[i][2]
        print "\ndownload one page:", url, "\n", name
        if not os.path.exists('./jd'):
            os.mkdir(r'./jd')
        content = httpRequest(url)
        # Save as "<running number>.<title>.html"; strip "/" so the title
        # cannot form an unintended subdirectory.
        f = open(u'jd/%s.%s' % ((page - 1) * 20 + i + 1, name.replace("/", "")) + '.html', 'w+')
        f.write(content)
        f.close()
        print "ok"

def httpRequest(url):
    # Fetch a URL with browser-like headers and return the raw body.
    req_header = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:38.0) Gecko/20100101 Firefox/38.0',
        'Accept': 'text/html;q=0.9,*/*;q=0.8',
        #'Accept-Language': 'en-US,en;q=0.5',
        #'Accept-Encoding': 'gzip',
        #'Referer': 'http://www.baidu.com'
    }
    req_timeout = 15
    req = urllib2.Request(url, None, req_header)
    resp = urllib2.urlopen(req, None, req_timeout)
    html = resp.read()
    print "resp:", resp
    return html

def writeHeader(fileheader, colums):
    # Leftover CSV-header writer from the 58.com scraper; not called here.
    if not os.path.exists('./58'):
        os.mkdir(r'./58')
    f = open('./58/daikuan.csv', 'w')
    f.write(fileheader)
    f.write(colums)
    f.close()

if __name__ == '__main__':
    #writeHeader(fileHeader, colums)
    getCompany()
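The script above is Python 2 only (print statements, urllib2, the reload(sys) encoding hack). As a minimal sketch of how the httpRequest helper could look on Python 3, assuming the same headers and 15-second timeout, using urllib.request; the names http_request and REQ_HEADER are illustrative, not from the original script:

    #!/usr/bin/env python3
    # Hypothetical Python 3 port of httpRequest; not part of the original script.
    import urllib.request

    REQ_HEADER = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:38.0) Gecko/20100101 Firefox/38.0',
        'Accept': 'text/html;q=0.9,*/*;q=0.8',
    }

    def http_request(url, timeout=15):
        # Same browser-like headers and timeout as the Python 2 version.
        req = urllib.request.Request(url, headers=REQ_HEADER)
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            # Returns raw bytes; decode or write in binary mode as needed.
            return resp.read()

Note that on Python 3, resp.read() returns bytes, so the saved pages should be opened with 'wb' rather than 'w+', and the sys.setdefaultencoding hack is neither needed nor available.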
Original article: http://www.cnblogs.com/timdes/p/4765312.html