标签:init sqrt 中文 rand except level item color blog
改动:
新增功能 :可选择只看天猫或淘宝
代码模块化封装:参数配置和用户输入单独放在 setting.py 文件中管理;主函数只负责接收参数,具体参数值在 setting.py 中配置。
main.py代码:
1 # -*- coding: utf-8 -*- 2 import urllib 3 import urllib2 4 import re 5 import time 6 import random 7 import os 8 from math import log 9 from math import log10 10 from math import sqrt 11 import sys 12 13 import setting 14 15 ‘‘‘在Python自己IDE上要注释掉一下两行‘‘‘ 16 reload(sys) 17 sys.setdefaultencoding(‘utf8‘) # python2.x的的defaultencoding是ascii 18 19 class counter(object): 20 #计数器 21 def __init__(self): 22 self.count = 0 23 self.try_time = 0 24 self.fail_time = 0 25 self.url_list = [] 26 self.new_flag = True 27 self.results=[] 28 self.priSu=0 29 self.descSu=0 30 self.tm_tb = ‘‘ 31 32 def print_counter(self): 33 print ‘try_time:‘, self.try_time, " get_count:" , self.count, " fail_time:",self.fail_time 34 35 counter1 = counter() 36 37 38 def post_request(url): 39 ‘‘‘ 40 #使用代理 41 proxy = {‘http‘:‘27.24.158.155:84‘} 42 proxy_support = urllib2.ProxyHandler(proxy) 43 # opener = urllib2.build_opener(proxy_support,urllib2.HTTPHandler(debuglevel=1)) 44 opener = urllib2.build_opener(proxy_support) 45 urllib2.install_opener(opener) 46 ‘‘‘ 47 48 #构造随机头部文件访问请求 49 User_Agents=["Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0", 50 "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36", 51 "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50", # 52 "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11", 53 "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11O" 54 ] 55 random_User_Agent = random.choice(User_Agents) 56 #print random_User_Agent 57 58 req =urllib2.Request(url) #!! 
59 60 req.add_header("User-Agent",random_User_Agent) 61 req.add_header("GET",url) 62 req.add_header("Referer",url) 63 return req 64 65 66 def recommend_rate(price,description,delivery,service,comments): 67 #描述为绝对值 68 av_p=counter1.priSu/counter1.count 69 av_d=counter1.descSu/counter1.count 70 rate=(description/av_d)**20*(description+delivery+service)*(av_p/(price))**0.1+log((comments+5),1000) 71 #print ‘all count=‘,counter1.count 72 #print "avrage price=",av_p,‘;‘,av_p/(price),‘;price‘,price,‘;comments=‘,comments,‘;descrip=‘,description 73 #print ‘rate=‘,rate,‘(price)yinzi‘,(av_p/(price))**0.1,‘descrip_yinzi‘,(description/av_d)**20,‘comments_factor=‘,log((comments+50),100) 74 return rate 75 76 77 def product_rank(list): 78 for x in list: 79 #0开始为 x0商品名 、x1图片链接、x2商品链接、x3价格、x4评论数、 x5店铺名、 x6快递情况、x7描述相符情况3个、x8服务情况 80 rate=recommend_rate(x[3],x[7],x[6],x[8],x[4]) 81 x.append(rate) 82 83 84 def get_user_rate(item_url): 85 #暂时未使用该功能 86 ‘‘‘获取卖家信用情况;未登录情况不能访问,或者需要在头部文件中加入cookie。。。;‘‘‘ 87 html=urllib2.urlopen(item_url) 88 #"//rate.taobao.com/user-rate-282f910f3b70f2128abd0ee9170e6428.htm" 89 regrex_rate=‘"(//.*?user\-rate.*?)"‘ 90 codes= re.findall(regrex_rate,html.read()) 91 html.close() 92 93 user_rate_url= ‘http:‘+codes[0] 94 print ‘uu‘, user_rate_url 95 96 user_rate_html = urllib2.urlopen(user_rate_url) 97 print user_rate_html.read() 98 #title="4.78589分" 99 desc_regex=u‘title="(4.[0-9]{5}).*?‘ 100 de_pat=re.compile(desc_regex) 101 102 descs = re.findall(de_pat,user_rate_html.read()) 103 print len(descs) 104 item_url=‘https://item.taobao.com/item.htm?id=530635294653&ns=1&abbucket=0#detail‘ 105 #get_user_rate(item_url) 106 ‘‘‘获取卖家信用情况;未登录情况不能访问。。。暂时 无用‘‘‘ 107 108 109 def makeNewdir(savePath): 110 while os.path.exists(savePath): 111 savePath = savePath+‘%s‘%random.randrange(1,10) 112 #print "the path exist,we‘ll make a new one" 113 try: 114 os.makedirs(savePath) 115 print ‘ok,file_path we reserve results: %s‘%savePath 116 print ‘保存的路径为:‘.decode(‘utf-8‘) 117 except: 118 
print "failed to make file path\nplease restart program" 119 print ‘创建文件夹失败,请重新启动程序‘.decode(‘utf-8‘) 120 121 122 def get_praised_good(url, file_open, keyword, counts, descripHrequ, servHrequ, descripNrequ): 123 #从给定的淘宝链接中 获取符合条件的商品list 124 html = urllib2.urlopen(post_request(url)) 125 code = html.read() 126 html.close() 127 128 regrex2=ur‘raw_title":"(.*?)","pic_url":"(.*?)","detail_url":"(.*?)","view_price":"(.*?)".*?"comment_count":"(.*?)".*?"nick":"(.*?)".*?"delivery":\[(.*?),(.*?),(.*?)\],"description":\[(.*?),(.*?),(.*?)\],"service":\[(.*?),(.*?),(.*?)\]‘ 129 #每一个匹配项 返回 15个 字符串 130 #x[0]开始为 x0商品名 、x1图片链接、x2商品链接、x3价格、x4评论数、 x5店铺名、 x6快递情况3个、x9描述相符情况3个、x12服务情况3个 131 pat = re.compile(regrex2) 132 meet_code = re.findall(regrex2,code)# 133 134 for x in meet_code: 135 if counter1.count>=counts : 136 print "have get enough pruducts" 137 break 138 description_higher=int(x[10])*float(x[11])/100 139 service_higher=int(x[13])*float(x[14])/100 140 try: 141 x4=int(x[4]) #description_count 142 except: 143 x4=0 144 145 #如果 只要淘宝 非天猫 146 if counter1.tm_tb == ‘taobao‘: 147 if counter1.tm_tb not in x[2].split(‘.‘): 148 break 149 150 if (description_higher>=descripHrequ) and (service_higher>=servHrequ) and x4>=descripNrequ: 151 if re.findall(keyword,x[0]) : # 中文keyword在结果中匹配问题暂时没有解决,,直接加在搜索词里吧 152 x0=x[0].replace(‘ ‘,‘‘).replace(‘/‘,‘‘) 153 detail_url=‘http:‘+x[2].decode(‘unicode-escape‘).encode(‘utf-8‘) 154 x1=‘http:‘+x[1].decode(‘unicode-escape‘).encode(‘utf-8‘) 155 #print type(x) 156 if detail_url in counter1.url_list: 157 counter1.new_flag=False 158 print ‘no more new met products‘ 159 print counter1.url_list 160 print detail_url 161 break 162 counter1.url_list.append(detail_url) 163 counter1.try_time+=1 164 counter1.count+=1 165 166 x11=float(x[11])/100 167 x9=float(x[9])/100 168 x12=float(x[12])/100 169 x6=float(x[6])/100 170 x3=float(x[3]) 171 counter1.priSu += x3 172 counter1.descSu += x9 173 x5=unicode(x[5],‘utf-8‘) 174 175 result_list=[] 176 result_list.append(x0) 177 
result_list.append(x1) 178 result_list.append(detail_url) 179 result_list.append(x3) 180 result_list.append(x4) 181 result_list.append(x5) 182 result_list.append(x6) 183 result_list.append(x9) 184 result_list.append(x12) 185 #0开始为 x0商品名 、x1图片链接、x2商品链接、x3价格、x4评论数、 x5店铺名、 x6快递情况、x7描述相符情况、x8服务情况 186 counter1.results.append(result_list) 187 188 189 def save_downpic(lis,file_open,savePath): 190 ‘‘‘从商品list下载图片到reserve_file_path,并写入信息至fileopen‘‘‘ 191 #0开始为 x0商品名 、x1图片链接、x2商品链接、x3价格、x4评论数、 x5店铺名、 x6快递情况、x7描述相符情况、x8服务情况、x9:rate 192 len_list=len(lis) 193 print len_list 194 cc=0 195 for x in lis: 196 try : 197 urllib.urlretrieve(x[1],savePath+‘\\%s___‘%cc +unicode(x[0],‘utf-8‘)+‘.jpg‘) 198 199 txt_name = savePath+‘\\‘+ ‘%s__‘%cc+ ‘custome_description_%s __‘%x[7] +‘__comments_%s_‘%x[4]+ ‘___price_%srmb___‘%x[3] +x[5] +‘.txt‘ 200 201 file_o = open(txt_name,‘a‘) 202 file_o.write(x[2]) 203 file_o.close() 204 205 print ‘\nget_one_possible_fine_goods:\n‘,‘good_name:‘,x[0].decode(‘utf-8‘) 206 print ‘rate=‘,x[9] 207 print ‘price:‘,x[3],x[5].decode(‘utf-8‘) 208 print ‘custome_description:‘,x[7],‘--‘,‘described_number:‘,x[4],‘ service:‘,x[8] 209 print x[2].decode(‘utf-8‘),‘\ngood_pic_url:‘,x[1].decode(‘utf-8‘) 210 211 print txt_name 212 print cc+1,"th" 213 214 file_open.write(u‘%s__‘%cc +u‘%s‘%x[0]+‘\nprice:‘+str(x[3])+‘¥,‘+‘\n‘+str(x[2])+‘ \n‘+str(x[5])+‘\ncustomer_description:‘+str(x[7])+‘described_number:‘+str(x[4])+‘\n\n\n‘) 215 216 217 218 print ‘get one -^-‘ 219 except : 220 print "failed to down picture or creat txt" 221 counter1.fail_time += 1 222 cc+=1 223 time.sleep(0.5) 224 225 226 def get_all_praised_goods(serchProd,counts,savePath ,keyword, price_min=0,price_max=0,descripHrequ =0,servHrequ=0 ,descripNrequ=0): 227 #边里搜索结果每一页 228 #initial url and page number 229 initial_url=‘https://s.taobao.com/search?q=‘+serchProd 230 231 if counter1.tm_tb == ‘tmall‘: 232 initial_url = initial_url + ‘&filter_tianmao=tmall‘ 233 234 if price_min : 235 if price_min < price_max : 236 
initial_url = initial_url+‘&filter=reserve_price%5B‘+‘%s‘%price_min+‘%2C‘ +‘%s‘%price_max 237 initial_url = initial_url +‘%5D&s=‘ 238 239 #tian_mall = ‘https://list.tmall.com/search_product.htm?q=‘ 240 241 print "initial_url",initial_url 242 page_n=0 243 reserve_file=savePath+r‘\found_goods.txt‘ 244 file_open=open(reserve_file,‘a‘) 245 246 file_open.write(‘****************************\n‘) 247 file_open.write(time.ctime()) 248 file_open.write(‘\n****************************\n‘) 249 250 while counter1.new_flag and counter1.count<counts : 251 252 url_1=initial_url+‘%s‘%(44*page_n) 253 #print initial_url 254 print ‘url_1:‘, url_1 255 #print ‘ss‘,initial_url+‘%s‘%(44*page_n) 256 page_n += 1 257 258 get_praised_good(url_1,file_open,keyword,counts,descripHrequ,servHrequ ,descripNrequ) 259 print "let web network rest for 2s lest make traffic jams " 260 time.sleep(2) 261 # except: 262 print "%s"%page_n,"pages have been searched" 263 if page_n >=11 : 264 print "check keyword,maybe too restrict" 265 break 266 print url_1 267 product_rank(counter1.results) 268 269 counter1.results.sort(key=lambda x :x[9],reverse=True) 270 271 save_downpic(counter1.results,file_open,savePath) 272 273 # 274 for a in counter1.results: 275 for b in a : 276 file_open.write(unicode(str(b),‘utf-8‘)) 277 file_open.write(‘\t‘) 278 file_open.write(‘\n\n‘) 279 280 file_open.close() 281 counter1.print_counter() 282 283 284 def main(): 285 print ‘说明:\n本程序用于在淘宝上搜索商品时主动通过 价格范围、商品描述、服务态度、评论数来筛选商品;\n筛选出来的商品图片下载保存到磁盘(默认桌面新建find_worty_goods文件夹)并建立同序号开头的txt文件,图片显示商品,其旁的txt文件名显示价格等关键信息,txt里保存商品的淘宝链接‘.decode(‘utf-8‘) 286 if setting.userDefine: #自己输入 配置参数-筛选要求 287 setting.inputPara() 288 #否则 使用setting中的配置参数 289 290 serchProd = setting.serchProd #淘宝搜索词 291 keyword = setting.keyword #raw_input().decode("gbk").encode("utf-8") #个人限定词,商品名字必须包含,防止淘宝推荐了其他相关词 (正则表达式). 
为任意表示不作限制 292 price_min = setting.price_min #价格区间 293 price_max = setting.price_max 294 descripHrequ = setting.descripHrequ # % 默认高于average, 输出结果大于此值 295 servHrequ = setting.servHrequ # % 默认高于average, 输出结果大于此值 296 descripNrequ = setting.descripNrequ 297 counts = setting.counts #要求选出多少个商品 298 counter1.tm_tb = setting.tm_tb #不区分天猫淘宝则,字符串为空,,只要天猫 则 =‘tmall‘ ,只要淘宝 = ‘taobao‘ 299 300 #savePath=r"C:\Users\Administrator\Desktop\Python scrapy\find_worthy_goods\results"#结果保存路径 301 savePath=u"results%s"%serchProd #结果保存路径 302 makeNewdir(savePath) 303 304 get_all_praised_goods(serchProd, counts, savePath, keyword, price_min, price_max ,descripHrequ ,servHrequ ,descripNrequ) 305 306 307 if __name__=="__main__" : 308 main() 309 310
setting.py代码:
# -*- coding: utf-8 -*- userDefine = False #筛选要求设置 serchProd=‘背包‘ #淘宝搜索词 keyword=‘‘ #raw_input().decode("gbk").encode("utf-8") #个人限定词,商品名字必须包含,防止淘宝推荐了其他相关词 (正则表达式). 为任意表示不作限制 price_min=22 #价格区间 price_max=100 descripHrequ=0 # % 默认高于average, 输出结果大于此值 servHrequ=0 # % 默认高于average, 输出结果大于此值 descripNrequ=6 counts=25 #要求选出多少个商品 tm_tb =‘tmall‘ #不区分天猫淘宝则,字符串为空,,只要天猫 则 =‘tmall‘ ,只要淘宝 = ‘taobao‘ def inputPara(): ‘‘‘ 用户选择是否自定义要求,根据要求进行获取商品,并按推荐排序输出‘‘‘ print "please input reserch _goods_name" global serchProd , keyword , price_min, price_max, descripHrequ , servHrequ, descripNrequ ,counts ,tm_tb serchProd=raw_input().replace(‘ ‘,‘‘) #淘宝搜索词 ,并去除中间意外输入的空格 if serchProd: print "if customise price_range ,decriptiom require .etc.\ninput Y/N \n default by : no price limit avarage than descriptiom,get 50 products \n 默认要求为:无价格限制,商品描述、快递、服务高于均值,获取50个商品。自定义要求请输入 ‘Y’ (区分大小写)".decode(‘utf-8‘) if raw_input() == ‘Y‘: print "\nplease input _minimal price and _maximal price; \ndefault by 0,10000\nnext by ‘enter‘key input nothing means by default,the same below " print ‘请输入价格范围 ;默认0-10000 ;两项用半角逗号","分隔 按回车键确认;什么也不输入代表使用默认值 ‘.decode(‘utf-8‘) try: price_min, price_max=input() except: print ‘not input or wrong number,use default range‘ price_min, price_max = 0 ,10000 # print ‘是否要求 只看天猫/正品保障 还是只看淘宝 \n 只看天猫输入 tmall ,只看淘宝输入taobao,都看则回车略过‘ try: tm_tb=raw_input().decode("gbk").encode("utf-8") #个人限定词,商品名字必须包含,防止淘宝推荐了其他相关词 (正则表达式). 为任意表示不作限制 except: tm_tb=‘‘ # # # print "please input _keyword that goods name must include:\n(more than one keyword must use Regular Expression); default by no kewords" try: keyword=raw_input().decode("gbk").encode("utf-8") #个人限定词,商品名字必须包含,防止淘宝推荐了其他相关词 (正则表达式). 
为任意表示不作限制 except: keyword=‘‘ # print "\nplease input _description_higher_percent_require and _service_higher__percent_require\n range:(-100,100) ; \ndefault by 0,0 I.e better than average" print ‘请输入商品描述、服务高于平均值的百分比-100 ~100‘.decode(‘utf-8‘) # % 默认高于average, 输出结果大于此值 try: descripHrequ,servHrequ=input() except: print ‘not input or wrong number,use default range‘ descripHrequ = 0 # % 默认高于average, 输出结果大于此值 servHrequ = 0 # print "\nplease input description count limit, default more than 5\n" ,‘输入最低商品评价数,默认大于5‘.decode(‘utf-8‘) try: descripNrequ=input() except : print ‘not input or wrong number,use default range‘ descripNrequ=5 # # print "\nIF customise file reserve path, Y or N \ndefault/sample as: C:\\Users\\Administrator\\Desktop\\find_worthy_goods\\results " # print ‘是否自定义保存文件目录 Y or N‘.decode(‘utf-8‘) # if raw_input()==‘Y‘: # print "please input path that you want to reserve; \n " # savePath = raw_input() # else: # #savePath=r"C:\Users\Administrator\Desktop\find_worthy_goods\results"#结果保存路径 # print "\nplease input how many results you want, default by 50\n" ,‘您要获取的商品数目,默认50‘.decode(‘utf-8‘) try: counts=input() except : counts=50 else : counts =50 keyword = ‘‘ tm_tb = ‘‘ price_min ,price_max ,descripHrequ ,servHrequ ,descripNrequ = 0,0,0,0,0 else: print "no search goods,please restart" print ‘没有输入商品名称,请重新启动程序‘.decode(‘utf-8‘)
标签:init sqrt 中文 rand except level item color blog
原文地址:http://www.cnblogs.com/willowj/p/6266507.html