码迷,mamicode.com
首页 > 编程语言 > 详细

python推荐淘宝物美价廉商品

时间:2016-12-30 23:26:39      阅读:715      评论:0      收藏:0      [点我收藏+]

标签:代码   ade   获取   sample   while   gecko   urlopen   回车   exist   

完成的目标:

  输入搜索的商品 以及 淘宝的已评价数目、店铺的商品描述(包括如实描述、服务态度、快递的5.0打分);

  按要求,筛选出要求数量的结果,并按“物美价廉算法”排序后输出    

思路:

1,利用淘宝搜索 “https://s.taobao.com/search?” 的价格 filter 先进行价格筛选,得到结果的网站

2,用urllib打开结果网站,构造正则表达式匹配出各个商品结果的 价格、已评价数量、店铺的如实描述等信息;

  并把结果保存至二维数组里。

3,利用商品及店铺信息,用“物美价廉算法”给各个商品打分

4,按打分排序, 各个信息总结果按排序输出到新建txt文件里;

  并将各个商品图片下载到文件及建立相同排序开头的txt(其名字包好简要的商品信息),这样图片和商品信息同时能在一个文件夹里用大图排列看到。

5,可以把输入的参数(价格范围等要求)以函数输入,用 pyinstaller 把整个 py 程序打包为 EXE 就可以发布了。

 

源代码如下

  1 # -*- coding: utf-8 -*-
  2 import urllib
  3 import urllib2
  4 import re
  5 import time 
  6 import random
  7 import os
  8 from math import log
  9 from math import log10
 10 from math import sqrt
 11 import sys
 12 
 13 reload(sys)  
 14 sys.setdefaultencoding(utf8) 
 15 
 16 
 17 class counter(object):
 18     #计数器
 19     def __init__(self):
 20         self.count  = 0
 21         self.try_time = 0
 22         self.fail_time = 0
 23         self.url_list = []
 24         self.new_flag = True
 25         self.results=[]
 26         self.p=0
 27         self.d=0
 28     def print_counter(self):
 29         print try_time:, self.try_time,   "  get_count:" , self.count,   "  fail_time:",self.fail_time
 30 counter1=counter()
 31 
 32 def post_request(url):
 33     #构造随机头部文件访问请求
 34     User_Agents=["Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0",#
 35     "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36",
 36     "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50", #
 37     "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
 38     "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11O"
 39     ]
 40     random_User_Agent = random.choice(User_Agents)
 41     #print random_User_Agent
 42 
 43     req =urllib2.Request(url) #!!
 44 
 45     req.add_header("User-Agent",random_User_Agent)
 46     req.add_header("GET",url)
 47     req.add_header("Referer",url)
 48     return req
 49 
 50 def recommend_rate(price,description,delivery,service,comments):
 51     #描述为绝对值
 52     av_p=counter1.p/counter1.count
 53     av_d=counter1.d/counter1.count
 54     rate=(description/av_d)**20*(description+delivery+service)*(av_p/(price))**0.1+log((comments+5),1000)
 55     print all count=,counter1.count
 56     print "avrage price=",av_p,;,av_p/(price),;price,price,;comments=,comments,;descrip=,description
 57     print rate=,rate,(price)yinzi,(av_p/(price))**0.1,descrip_yinzi,(description/av_d)**20,comments_factor=,log((comments+50),100)
 58     return rate
 59 
 60 def product_rank(list):
 61     for x in list:
 62         #0开始为  x0商品名 、x1图片链接、x2商品链接、x3价格、x4评论数、 x5店铺名、 x6快递情况、x7描述相符情况3个、x8服务情况 
 63         rate=recommend_rate(x[3],x[7],x[6],x[8],x[4])
 64         x.append(rate)
 65 
 66 def get_user_rate(item_url):
 67     ‘‘‘获取卖家信用情况;未登录情况不能访问。。。‘‘‘
 68     html=urllib2.urlopen(item_url)
 69     #"//rate.taobao.com/user-rate-282f910f3b70f2128abd0ee9170e6428.htm"
 70     regrex_rate="(//.*?user\-rate.*?)"
 71     codes= re.findall(regrex_rate,html.read())
 72     html.close()
 73 
 74     user_rate_url= http:+codes[0]
 75     print uu, user_rate_url
 76 
 77     user_rate_html = urllib2.urlopen(user_rate_url)
 78     print user_rate_html.read()
 79     #title="4.78589分"
 80     desc_regex=utitle="(4.[0-9]{5}).*?
 81     de_pat=re.compile(desc_regex)
 82     
 83     descs = re.findall(de_pat,user_rate_html.read())
 84     print len(descs)
 85     item_url=https://item.taobao.com/item.htm?id=530635294653&ns=1&abbucket=0#detail
 86 #get_user_rate(item_url)
 87 ‘‘‘获取卖家信用情况;未登录情况不能访问。。。暂时 无用‘‘‘
 88 
 89 def get_praised_good(url,file_open,keyword,counts):
 90     #从给定的淘宝链接中 获取符合条件的商品list
 91     html=urllib2.urlopen(post_request(url))
 92     code=html.read()
 93     html.close()
 94 
 95     regrex2=urraw_title":"(.*?)","pic_url":"(.*?)","detail_url":"(.*?)","view_price":"(.*?)".*?"comment_count":"(.*?)".*?"nick":"(.*?)".*?"delivery":\[(.*?),(.*?),(.*?)\],"description":\[(.*?),(.*?),(.*?)\],"service":\[(.*?),(.*?),(.*?)\] 
 96     #每一个匹配项 返回  15个 字符串 
 97     #x[0]开始为  x0商品名 、x1图片链接、x2商品链接、x3价格、x4评论数、 x5店铺名、 x6快递情况3个、x9描述相符情况3个、x12服务情况3个
 98     #x3,x6,x9,x12,x4
 99     pat=re.compile(regrex2)
100     meet_code=re.findall(regrex2,code)#
101 
102     for x in meet_code:
103         if counter1.count>=counts :
104             print "have get enough pruducts"
105             break
106         description_higher=int(x[10])*float(x[11])/100
107         service_higher=int(x[13])*float(x[14])/100
108         try:
109             x4=int(x[4])#description_count
110         except:
111             x4=0
112         if  (description_higher>=description_higher_require) and (service_higher>=service_higher_require) and x4>=description_count_require:
113             if re.findall(keyword,x[0]) :#():
114             #try:
115                 detail_url=http:+x[2].decode(unicode-escape).encode(utf-8)
116                 x1=http:+x[1].decode(unicode-escape).encode(utf-8)
117                 #print type(x)
118                 if detail_url  in counter1.url_list:
119                     counter1.new_flag=False
120                     print no more new met products
121                     print counter1.url_list
122                     print detail_url
123                     break
124                 counter1.url_list.append(detail_url)
125                 counter1.try_time+=1
126                 counter1.count+=1
127 
130                 x11=float(x[11])/100
131                 x9=float(x[9])/100
132                 x12=float(x[12])/100
133                 x6=float(x[6])/100
134                 x3=float(x[3])
135                 counter1.p+=x3
136                 counter1.d+=x9
137                 x5=unicode(x[5],utf-8)
138                 
139                 
140                 result_list=[]
141                 result_list.append(x[0])
142                 result_list.append(x1)
143                 result_list.append(detail_url)
144                 result_list.append(x3)
145                 result_list.append(x4)
146                 result_list.append(x5)
147                 result_list.append(x6)
148                 #result_list.append(x7)
149                 result_list.append(x9)
150                 result_list.append(x12)
151                 #result_list.append(rate)
152                 #0开始为  x0商品名 、x1图片链接、x2商品链接、x3价格、x4评论数、 x5店铺名、 x6快递情况、x7描述相符情况、x8服务情况 
153                 counter1.results.append(result_list)
154 ‘‘‘
155                 txt_name=reserve_file_path+‘\\‘+ ‘%s__‘%counter1.count+ ‘custome_description_%s __‘%x9 +‘__comments_%s_‘%x[4]+ ‘___price_%srmb___‘%x[3] +x5 +‘.txt‘
156                 
157                 file_o=open(txt_name,‘a‘)
158                 file_o.write(detail_url)
159                 file_o.close()
160             
161                 print ‘\nget_one_possible_fine_goods:\n‘,‘good_name:‘,x[0].decode(‘utf-8‘)
162                 print ‘price:‘,x[3].decode(‘utf-8‘),x[5].decode(‘utf-8‘)
163                 print ‘custome_description:‘,x9,‘--‘,x11,‘%higher than average;‘,‘described_nomber:‘,x[4],‘  service:‘,float(x[12])/100
164                 print detail_url.decode(‘utf-8‘),‘\ngood_pic_url:‘,x1.decode(‘utf-8‘)
165 
166                 print txt_name
167                 
168                 file_open.write(‘%s__‘%counter1.count +x[0]+‘\nprice:‘+x[3]+‘¥,‘+‘\n‘+detail_url+‘  \n‘+x[5]+‘\ncustomer_description:‘+str(x9)+‘; ‘+str(x11)+‘%higher than average   described_number:‘+x[4]+‘\n\n\n‘)
169                 
170                 print ‘get one -^-‘
171                 ‘‘‘
172             #except:
173                 #print ‘lose one picture‘
174                 #counter1.fail_time+=1
175                 
176 
177 def save_downpic(lis,file_open,reserve_file_path):
178     ‘‘‘从商品list下载图片到reserve_file_path,并写入信息至fileopen‘‘‘
179     #0开始为  x0商品名 、x1图片链接、x2商品链接、x3价格、x4评论数、 x5店铺名、 x6快递情况、x7描述相符情况、x8服务情况、x9:rate
180     len_list=len(lis)
181     print len_list
182     cc=0        
183     for x in lis:
184         
185         try:
186             urllib.urlretrieve(x[1],reserve_file_path+\\%s___%cc +unicode(x[0],utf-8)+.jpg)
187 
188             txt_name=reserve_file_path+\\+ %s__%cc+ custome_description_%s __%x[7] +__comments_%s_%x[4]+ ___price_%srmb___%x[3] +x[5] +.txt
189                     
190             file_o=open(txt_name,a)
191             file_o.write(x[2])
192             file_o.close()
193             
194             print \nget_one_possible_fine_goods:\n,good_name:,x[0].decode(utf-8)
195             print rate=,x[9]
196             print price:,x[3],x[5]#.decode(‘utf-8‘)
197             print custome_description:,x[7],--,described_number:,x[4],  service:,x[8]
198             print x[2].decode(utf-8),\ngood_pic_url:,x[1].decode(utf-8)
199 
200             print txt_name
201             print cc+1,"th"
202 
203             file_open.write(u%s__%cc +u%s%x[0]+\nprice:+str(x[3])+¥,+\n+str(x[2])+  \n+str(x[5])+\ncustomer_description:+str(x[7])+described_number:+str(x[4])+\n\n\n)
204             
205             print get one -^-
206         except :
207             pass
208         cc+=1
209         time.sleep(0.5)
210 
211 def get_all_praised_goods(reserch_goods,counts,reserve_file_path,price_min,price_max):
212     #边里搜索结果每一页
213     #initial url and page number
214     initial_url=https://s.taobao.com/search?q=+reserch_goods+&filter=reserve_price%5B+%s%price_min+%2C+%s%price_max+%5D&s=
215     print "initial_url",initial_url
216     page_n=0
217     reserve_file=reserve_file_path+r\found_goods.txt
218     file_open=open(reserve_file,a)
219 
220     file_open.write(****************************\n)
221     file_open.write(time.ctime())
222     file_open.write(\n****************************\n)
223 
224     while counter1.new_flag and counter1.count<counts :
225         
226         url_1=initial_url+%s%(44*page_n)
227         #print initial_url
228         print url_1:, url_1
229         #print ‘ss‘,initial_url+‘%s‘%(44*page_n)
230         page_n+=1
231 
232         get_praised_good(url_1,file_open,keyword,counts)
233         time.sleep(2)
234         # except:
235         print "%s"%page_n,"pages have been searched"            
236         if page_n>=11 :
237             print "check keyword,maybe too restrict"
238             break
239     print url_1        
240     product_rank(counter1.results)
241 
242     counter1.results.sort(key=lambda x :x[9],reverse=True)        
243 
244     save_downpic(counter1.results,file_open,reserve_file_path)
245 
246     file_open.close()
247     counter1.print_counter()
248 
249 if __name__=="__main__":
250 
251     reserch_goods=英伦男外套     #淘宝搜索词
252     keyword=‘‘#raw_input().decode("gbk").encode("utf-8")  #个人限定词,商品名字必须包含,防止淘宝推荐了其他相关词 (正则表达式). 为任意表示不作限制
253     price_min=22            #价格区间
254     price_max=300
255     description_higher_require=0   # %   默认高于average, 输出结果大于此值
256     service_higher_require=0      # %  默认高于average, 输出结果大于此值
257     description_count_require=6
258 
259     counts=50
260 
261 
262     reserve_file_path=r"C:\Users\Administrator\Desktop\Python scrapy\find_worthy_goods\results"#结果保存路径
263     while os.path.exists(reserve_file_path):
264         reserve_file_path =reserve_file_path+%s%random.randrange(0,100)
265     if not os.path.exists(reserve_file_path):
266         os.makedirs(reserve_file_path)
267     
268     
269     get_all_praised_goods(reserch_goods,counts,reserve_file_path,price_min,price_max)
270     
271     #print counter1.results
272     #
273     #print counter1.results
274 
275     ‘‘‘ 以下用输入函数输入要求的刷选条件
276     print ‘说明:\n本程序用于在淘宝上搜索商品时主动通过 价格范围、商品描述、服务态度、评论数来筛选商品;\n筛选出来的商品图片下载保存到磁盘(默认桌面新建find_worty_goods文件夹)并建立同序号开头的txt文件,图片显示商品,其旁的txt文件名显示价格等关键信息,txt里保存商品的淘宝链接‘.decode(‘utf-8‘)
277     print "please input reserch _goods_name"
278     print "请输入搜索商品名称;注意不能有空格,下同".decode(‘utf-8‘)
279     reserch_goods=raw_input()     #淘宝搜索词
280     if reserch_goods:
281         # #
282         # print "please input _keyword that goods name must include:\n(more than one keyword must use Regular Expression); default by no kewords"
283         # try:
284         #     keyword=raw_input().decode("gbk").encode("utf-8")      #个人限定词,商品名字必须包含,防止淘宝推荐了其他相关词 (正则表达式). 为任意表示不作限制
285         # except:
286         #     keyword=‘青‘
287         # #    
288         keyword=‘.‘
289         print "\nplease input  _minimal price and _maximal price;   \ndefault by 0,10000\nnext by ‘enter‘key input nothing means by default,the same below "
290         print ‘请输入价格范围 ;默认0-10000 ;两项用半角逗号","分隔 按回车键确认;什么也不输入代表使用默认值 ‘.decode(‘utf-8‘)
291         try:
292             price_min, price_max=input()
293         except:
294             print ‘not input or wrong number,use default range‘
295             price_min, price_max = 0 ,10000
296         #    
297         print "\nplease input  _description_higher_percent_require and _service_higher__percent_require\n range:(-100,100) ;   \ndefault by 0,0  I.e better than average"
298         print ‘请输入商品描述、服务高于平均值的百分比-100 ~100‘.decode(‘utf-8‘)
299              # %   默认高于average, 输出结果大于此值
300         try:
301             description_higher_require,service_higher_require=input()              
302         except:
303             print ‘not input or wrong number,use default range‘
304             description_higher_require = 0  # %  默认高于average, 输出结果大于此值
305             service_higher_require = 0
306         #    
307         print "\nplease input description count limit,  default more than 1"
308         print ‘输入最低商品评价数,默认大于1‘.decode(‘utf-8‘)
309         try:
310             description_count_require=input()
311         except :
312             print ‘not input or wrong number,use default range‘
313             description_count_require=1
314         #
315             
316         print "\nIF customise file reserve path, Y or N  \ndefault/sample as:  C:\\Users\\Administrator\\Desktop\\find_worthy_goods\\results "
317         print ‘是否自定义保存文件目录 Y or N‘.decode(‘utf-8‘)
318         if raw_input()==‘Y‘:
319             print "please input path that you want to reserve;  \n "    
320             reserve_file_path = raw_input()
321         else:
322             reserve_file_path=r"C:\Users\Administrator\Desktop\find_worthy_goods\results"#结果保存路径
323         
324         while os.path.exists(reserve_file_path):
325             reserve_file_path = reserve_file_path+‘%s‘%random.randrange(1,10)
326             #print "the path exist,we‘ll make a new one"
327         try:
328             os.makedirs(reserve_file_path)
329             print ‘ok,file_path we reserve results:  %s‘%reserve_file_path
330             print ‘保存的路径为:‘.decode(‘utf-8‘)
331         except:
332             print "failed to make file path\nplease restart program"
333             print ‘创建文件夹失败,请重新启动程序‘.decode(‘utf-8‘)
334             
335         #
336         print "\nplease input how many results you want,  default by 50"
337         print ‘您要获取的商品数目,默认50‘.decode(‘utf-8‘)
338         try:
339             counts=input()
340         except :
341             counts=50
342         #
343         
344         get_all_praised_goods(reserch_goods,counts,reserve_file_path,price_min,price_max)
345         print ‘\n‘
346         counter1.print_counter()
347         print "finished,please look up in %s"%reserve_file_path
348         print ‘下载完成‘.decode(‘utf-8‘)
349 
350         print counter1.results
351         input()
352     else:
353         print "no search goods"
354         print ‘没有输入商品名称‘.decode(‘utf-8‘)
355     ‘‘‘
356 #下一步保存图片,以文件名为商品图片名字,并以序号开头
357     #同时,输出 价格、商家名,商品描述、服务等 到 txt文本
358     #在商品图片看中后,便可按序号查找 
359     #按描述、服务评价高于平均,购物体验应该可以的

 

python推荐淘宝物美价廉商品

标签:代码   ade   获取   sample   while   gecko   urlopen   回车   exist   

原文地址:http://www.cnblogs.com/willowj/p/6238406.html

(0)
(0)
   
举报
评论 一句话评论(0
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!