标签:
#-*- coding: UTF-8 -*-
# Print the CPU model name of every product link on one page of the
# zj.zol.com.cn parts listing.
import re
from contextlib import closing

try:  # Python 3
    from urllib.request import urlopen
except ImportError:  # Python 2
    from urllib import urlopen

PAGE_URL = (
    'http://zj.zol.com.cn/index.php?c=Ajax_ParamResponse&a=GetGoods'
    '&subcateId=28&type=0&priceId=noPrice&page=2'
    '&manuId=&paramStr=&keyword=&locationId=1&queryType=0'
)

# Series markers that are re-attached in front of the model number, checked
# in this order; the first one found in the link text wins.
CORE_SERIES = ("i3", "i5", "i7")


def parse_model_name(text):
    """Build "<manufacturer> <model>" from the text of a product link.

    The manufacturer is everything before the first space. The model is the
    first run of ASCII letters, digits, hyphens and spaces that follows a
    space. If the text mentions i3, i5 or i7, that marker is prefixed to the
    model; if the model is exactly "APU", the next such run (when there is
    one) is appended to it.

    Returns None when the text has no manufacturer or no model part, instead
    of raising IndexError.
    """
    # Non-greedy: stops at the first space.
    manufacturers = re.findall(r'(.+?) ', text)
    details = re.findall(r' ([0-9a-zA-Z -]+)', text)
    if not manufacturers or not details:
        return None

    model = details[0]
    for series in CORE_SERIES:
        if series in text:
            model = series + " " + model
            break

    # "APU" alone is only the family name; the actual model is the next run.
    if model == "APU" and len(details) > 1:
        model += " " + details[1]

    return manufacturers[0] + " " + model


def fetch_product_links(url):
    """Download url and return the anchor tags selected as product links."""
    # Imported here so parse_model_name stays importable without bs4.
    from bs4 import BeautifulSoup

    with closing(urlopen(url)) as response:
        html = response.read()
    soup = BeautifulSoup(html, "html.parser")
    # The attribute value matched here literally contains backslash-escaped
    # quotes. Presumably the endpoint returns HTML embedded in JSON; verify
    # against a live response.
    return soup.find_all("a", attrs={"target": "\\\"_blank\\\""})


def print_models(url=PAGE_URL):
    """Print "modal:<name>" for every product link whose text can be parsed."""
    for tag in fetch_product_links(url):
        if not tag.contents:
            continue
        model = parse_model_name(tag.contents[0])
        if model is None:
            continue
        print("modal:" + model)


if __name__ == "__main__":
    print_models()
#-*- coding: UTF-8 -*-
# Print a running counter and the clock speed of every product link on one
# page of the zj.zol.com.cn parts listing.
import re
from contextlib import closing

try:  # Python 3
    from urllib.request import urlopen
except ImportError:  # Python 2
    from urllib import urlopen

PAGE_URL = (
    'http://zj.zol.com.cn/index.php?c=Ajax_ParamResponse&a=GetGoods'
    '&subcateId=28&type=0&priceId=noPrice&page=2'
    '&manuId=&paramStr=&keyword=&locationId=1&queryType=0'
)


def parse_clock_speed(markup):
    """Return the clock speed (for example "3.5GHz") found in a tag's markup.

    Only characters 100 to 499 of markup are searched. The value must follow
    the literal text  title='\\"  and consist of digits and dots ending in
    "GHz". Returns "Data Missed" when nothing matches.
    """
    window = markup[100:500]
    matches = re.findall(r'title=\'\\\"([0-9.]+GHz)', window)
    if not matches:
        return "Data Missed"
    return matches[0]


def fetch_product_links(url):
    """Download url and return the anchor tags selected as product links."""
    # Imported here so parse_clock_speed stays importable without bs4.
    from bs4 import BeautifulSoup

    with closing(urlopen(url)) as response:
        html = response.read()
    soup = BeautifulSoup(html, "html.parser")
    # The attribute value matched here literally contains backslash-escaped
    # quotes. Presumably the endpoint returns HTML embedded in JSON; verify
    # against a live response.
    return soup.find_all("a", attrs={"target": "\\\"_blank\\\""})


def print_clock_speeds(url=PAGE_URL):
    """Print, for each product link, its 1-based position and its clock speed."""
    for cnt, tag in enumerate(fetch_product_links(url), 1):
        print(cnt)
        print(parse_clock_speed(str(tag)))


if __name__ == "__main__":
    print_clock_speeds()
# Walk the zj.zol.com.cn parts listing page by page until the page number
# reported in the response no longer equals the page that was requested.
import re
from contextlib import closing

try:  # Python 3
    from urllib.request import urlopen
except ImportError:  # Python 2
    from urllib import urlopen

URL_LEFT = (
    'http://zj.zol.com.cn/index.php?c=Ajax_ParamResponse&a=GetGoods'
    '&subcateId=28&type=0&priceId=noPrice&page='
)
URL_RIGHT = '&manuId=&paramStr=&keyword=&locationId=1&queryType=0'


def parse_page_index(body):
    """Return the integer after  page":  in the first 50 characters of body.

    Returns None when no page number is found there, instead of raising
    IndexError.
    """
    matches = re.findall(r'page":([0-9]+)', body[0:50])
    if not matches:
        return None
    return int(matches[0])


def fetch_page(page):
    """Download listing page number page and return it as a BeautifulSoup."""
    # Imported here so parse_page_index stays importable without bs4.
    from bs4 import BeautifulSoup

    url = URL_LEFT + str(page) + URL_RIGHT
    with closing(urlopen(url)) as response:
        html = response.read()
    return BeautifulSoup(html, "html.parser")


def walk_pages():
    """Request pages 1, 2, 3, ... and print "yes<n>" for each accepted page.

    Stops, printing "no<n>", at the first page whose response reports a
    different page number or none at all. Presumably the server answers with
    another page number once the request is past the last page; verify
    against a live response.
    """
    page = 1
    while True:
        soup = fetch_page(page)
        if parse_page_index(str(soup)) != page:
            print("no" + str(page))
            break

        tags = soup.find_all("a", attrs={"target": "\\\"_blank\\\""})
        for tag in tags:
            # Per-tag processing is elided in this excerpt.
            pass

        print("yes" + str(page))
        page += 1


if __name__ == "__main__":
    walk_pages()
# Print every record of a CSV file with its fields separated by ", ".
import csv
import sys


def open_csv(path, mode='r'):
    """Open path in the form the csv module expects on the running Python."""
    if sys.version_info[0] >= 3:
        # Python 3: csv works on text and wants newline translation disabled.
        return open(path, mode, newline='')
    # Python 2: csv works on byte streams.
    return open(path, mode + 'b')


def format_rows(lines):
    """Yield each CSV record found in lines as its fields joined by ", ".

    lines is any iterable of CSV text lines, such as an open file.
    """
    for row in csv.reader(lines, dialect='excel'):
        yield ', '.join(row)


def print_csv_rows(path='excel_2010_ms-dos.csv'):
    """Print the formatted records of the CSV file at path, one per line."""
    with open_csv(path) as csvfile:
        for line in format_rows(csvfile):
            print(line)


if __name__ == "__main__":
    print_csv_rows()
#-*- coding: UTF-8 -*-
# Scrape every page of the zj.zol.com.cn CPU listing and write the model name
# and clock speed of each product to Config.csv.
import csv
import re
import sys
from contextlib import closing

try:  # Python 3
    from urllib.request import urlopen
except ImportError:  # Python 2
    from urllib import urlopen

URL_LEFT = (
    'http://zj.zol.com.cn/index.php?c=Ajax_ParamResponse&a=GetGoods'
    '&subcateId=28&type=0&priceId=noPrice&page='
)
URL_RIGHT = '&manuId=&paramStr=&keyword=&locationId=1&queryType=0'

# Series markers that are re-attached in front of the model number, checked
# in this order; the first one found in the link text wins.
CORE_SERIES = ("i3", "i5", "i7")


def parse_page_index(body):
    """Return the integer after  page":  in the first 50 characters of body.

    Returns None when no page number is found there, instead of raising
    IndexError.
    """
    matches = re.findall(r'page":([0-9]+)', body[0:50])
    if not matches:
        return None
    return int(matches[0])


def parse_model_name(text):
    """Build "<manufacturer> <model>" from the text of a product link.

    The manufacturer is everything before the first space. The model is the
    first run of ASCII letters, digits, hyphens and spaces that follows a
    space. If the text mentions i3, i5 or i7, that marker is prefixed to the
    model; if the model is exactly "APU", the next such run (when there is
    one) is appended to it.

    Returns None when the text has no manufacturer or no model part, instead
    of raising IndexError.
    """
    # Non-greedy: stops at the first space.
    manufacturers = re.findall(r'(.+?) ', text)
    details = re.findall(r' ([0-9a-zA-Z -]+)', text)
    if not manufacturers or not details:
        return None

    model = details[0]
    for series in CORE_SERIES:
        if series in text:
            model = series + " " + model
            break

    # "APU" alone is only the family name; the actual model is the next run.
    if model == "APU" and len(details) > 1:
        model += " " + details[1]

    return manufacturers[0] + " " + model


def parse_clock_speed(markup):
    """Return the clock speed (for example "3.5GHz") found in a tag's markup.

    Only characters 100 to 499 of markup are searched. The value must follow
    the literal text  title='\\"  and consist of digits and dots ending in
    "GHz". Returns "Data Missed" when nothing matches.
    """
    matches = re.findall(r'title=\'\\\"([0-9.]+GHz)', markup[100:500])
    if not matches:
        return "Data Missed"
    return matches[0]


def fetch_page(page):
    """Download listing page number page and return it as a BeautifulSoup."""
    # Imported here so the parsing helpers stay importable without bs4.
    from bs4 import BeautifulSoup

    url = URL_LEFT + str(page) + URL_RIGHT
    with closing(urlopen(url)) as response:
        html = response.read()
    return BeautifulSoup(html, "html.parser")


def collect_cpu_catalog():
    """Scrape pages 1, 2, 3, ... and return (models, specs) as parallel lists.

    Prints "yes<n>" for each accepted page. Stops, printing "no<n>", at the
    first page whose response reports a different page number or none at
    all. Presumably the server answers with another page number once the
    request is past the last page; verify against a live response.
    """
    models = []
    specs = []
    page = 1
    while True:
        soup = fetch_page(page)
        if parse_page_index(str(soup)) != page:
            print("no" + str(page))
            break

        # The attribute value matched here literally contains
        # backslash-escaped quotes.
        for tag in soup.find_all("a", attrs={"target": "\\\"_blank\\\""}):
            if not tag.contents:
                continue
            model = parse_model_name(tag.contents[0])
            if model is None:
                # Skip the spec as well so both lists stay aligned.
                continue
            models.append(model)
            specs.append(parse_clock_speed(str(tag)))

        print("yes" + str(page))
        page += 1
    return models, specs


def open_csv(path, mode):
    """Open path in the form the csv module expects on the running Python."""
    if sys.version_info[0] >= 3:
        # Python 3: csv works on text and wants newline translation disabled.
        return open(path, mode, newline='')
    # Python 2: csv works on byte streams.
    return open(path, mode + 'b')


def write_catalog(models, specs, path='Config.csv'):
    """Write a header row followed by one "CPU" record per model to path."""
    with open_csv(path, 'w') as csvfile:
        writer = csv.writer(csvfile, dialect='excel')
        # Header row.
        # NOTE(review): the header names four columns but each record below
        # has three fields, so Config_MinorSpecs is never filled in.
        writer.writerow(['Config_Type', 'Config_Modal', 'Config_Specs', 'Config_MinorSpecs'])
        for model, spec in zip(models, specs):
            writer.writerow(['CPU', model, spec])


if __name__ == "__main__":
    write_catalog(*collect_cpu_catalog())
标签:
原文地址:http://www.cnblogs.com/moonache/p/5333300.html