标签:doc set time content offset get lis tle encoding
clear_data.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Export child-related Dianping reviews to a Word document.

Reads ``code_dict.txt`` (one Python dict literal per line), keeps every review
whose ``comment`` contains one of the words in ``key_word`` and writes the
matches, followed by their pictures, to ``dianping.docx``.
"""
import ast
import io

import requests
from docx import Document
from docx.shared import Inches, Pt
from docx.oxml.ns import qn

key_word = ['小孩', '儿童', '儿子', '女儿', '小孩子', '娃', '嬉戏', '亲子', '玩具', '宝宝', '宝贝']

FONT_NAME = '宋体'
FONT_SIZE = Pt(15)
# Seconds to wait for a picture download before giving up.
REQUEST_TIMEOUT = 30


def _style_run(run, bold=False):
    """Apply the shared font settings to *run*, optionally making it bold."""
    run.font.name = FONT_NAME
    # The East Asian font must be set separately or Word ignores it for CJK text.
    run._element.rPr.rFonts.set(qn('w:eastAsia'), FONT_NAME)
    run.font.size = FONT_SIZE
    if bold:
        run.bold = True


childer = []
with open('code_dict.txt', 'r', encoding='utf-8') as f:
    for one_line in f:
        one_line = one_line.strip()
        if not one_line:
            continue
        # literal_eval only accepts Python literals, so a crafted line in the
        # data file cannot execute code the way eval() would allow.
        record = ast.literal_eval(one_line)
        if any(w in record['comment'] for w in key_word):
            childer.append(record)

print(len(childer))
print(childer[1:10])

doc = Document()
for review in childer:
    print(review)
    name = review.get('name')
    comment = review.get('comment')
    review_time = review.get('time')
    star = review.get('star')
    pic = review.get('pic')

    paragraph = doc.add_paragraph()
    paragraph.paragraph_format.line_spacing = Pt(22)
    _style_run(paragraph.add_run('用户:' + name + '\n'))
    # Only the rating line is rendered in bold.
    _style_run(paragraph.add_run('评分:' + str(star) + '\n'), bold=True)
    _style_run(paragraph.add_run('时间:' + review_time + '\n'))
    _style_run(paragraph.add_run('评论:' + comment + '\n'))

    for picture_url in pic or []:
        response = requests.get(picture_url, timeout=REQUEST_TIMEOUT)
        # add_picture accepts a file-like object, so no temporary file is needed.
        doc.add_picture(io.BytesIO(response.content), width=Inches(2.5))

doc.save('dianping.docx')
dazhong.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Scraper for the review pages of a dianping.com shop.

Review text on the page is partly hidden behind empty
``<span class="xxxxx"></span>`` tags.  The class name maps, through a CSS file,
to a pixel offset inside an SVG file that contains the real characters.  This
module rebuilds that mapping and substitutes the characters back before the
page is parsed.
"""
import datetime
import random
import time
import re
# from selenium.webdriver.chrome.options import Options
# from selenium import webdriver
from lxml import etree
import requests


class DianpingComment:
    """Crawl all review pages of one shop and pass each review to ``_data_pipeline``.

    Subclasses can override ``_data_pipeline`` to store the parsed reviews.
    """

    # Horizontal step between characters of an SVG row: the j-th character
    # of a row is stored under x offset ``-j * font_size``.
    font_size = 14
    # A row's y offset key is ``start_y`` minus the row's y value in the SVG.
    start_y = 23
    # Seconds before an HTTP request is abandoned instead of hanging forever.
    request_timeout = 30

    def __init__(self, shop_id, cookies, delay=7, handle_ban=False):
        """Prepare headers, cookies and the first page URL.

        :param shop_id: shop identifier inserted into the review URL.
        :param cookies: raw cookie string of the form ``"name=value; name=value"``.
        :param delay: base number of seconds slept before each page request
            (a random jitter of up to 2 seconds either way is applied).
        :param handle_ban: captcha handling is not implemented; when true a
            message is printed and nothing else happens.
        """
        self.shop_id = shop_id
        self._delay = delay
        self._cookies = self._format_cookies(cookies)
        self._css_headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
        }
        self._default_headers = {
            'Connection': 'keep-alive',
            'Host': 'www.dianping.com',
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
        }
        self._cur_request_url = 'http://www.dianping.com/shop/{}/review_all/p1'.format(shop_id)
        if handle_ban:
            print('不想写跳过验证了')
            # self._browser = self._init_browser()
            # self._handle_ban()

    def run(self):
        """Build the class-name -> character mapping, then crawl every review page."""
        self._css_link = self._get_css_link(self._cur_request_url)
        self._font_dict = self._get_font_dict(self._css_link)
        self._get_conment_page()

    def _delay_func(self):
        """Sleep for roughly ``delay`` seconds (+/- 2), never a negative duration."""
        # Clamp both bounds so a small or negative ``delay`` cannot make
        # time.sleep() raise ValueError.
        shortest = max(self._delay - 2, 0)
        longest = max(self._delay + 2, shortest)
        delay_time = random.uniform(shortest, longest)
        print('睡一会', delay_time)
        time.sleep(delay_time)

    # NOTE: unfinished selenium-based captcha handling, kept for reference.
    # def _init_browser(self):
    #     """
    #     Initialise the browser
    #     """
    #     chrome_options = Options()
    #     chrome_options.add_argument('--headless')
    #     chrome_options.add_argument('--disable-gpu')
    #     browser = webdriver.Chrome(chrome_options=chrome_options)
    #     browser.get(self._cur_request_url)
    #     for name, value in self._cookies.items():
    #         browser.add_cookie({'name': name, 'value': value})
    #     browser.refresh()
    #     return browser

    # def _handle_ban(self):
    #     """
    #     Handle the verification shown when crawling too fast
    #     """
    #     try:
    #         self._browser.refresh()
    #         time.sleep(1)
    #         button = self._browser.find_element_by_id('yodaBox')
    #         move_x_offset = self._browser.find_element_by_id('yodaBoxWrapper').size['width']
    #         webdriver.ActionChains(self._browser).drag_and_drop_by_offset(
    #             button, move_x_offset, 0).perform()
    #     except:
    #         pass

    def _format_cookies(self, cookies):
        """Convert a ``"name=value; name=value"`` string into a dict.

        Only the first ``=`` separates name from value, so values that contain
        ``=`` are kept whole.  Empty segments (for example after a trailing
        ``;``) and segments without ``=`` are skipped.  As before, all spaces
        are removed from the string first.
        """
        parsed = {}
        for item in cookies.replace(' ', '').split(';'):
            name, separator, value = item.partition('=')
            if separator and name:
                parsed[name] = value
        return parsed

    def _get_conment_page(self):
        """Fetch each review page, restore hidden characters and parse it.

        Follows the ``NextPage`` link until a page has none.
        """
        while self._cur_request_url:
            self._delay_func()
            print('[{now_time}] {msg}'.format(now_time=datetime.datetime.now(), msg=self._cur_request_url))
            res = requests.get(
                self._cur_request_url,
                headers=self._default_headers,
                cookies=self._cookies,
                timeout=self.request_timeout,
            )
            # One pass with a callback: spans whose class has no known glyph
            # are left untouched instead of raising KeyError, and the glyph is
            # inserted literally rather than parsed as a replacement template.
            html = re.sub(
                r'<span class="([a-zA-Z0-9]{5,6})"></span>',
                lambda match: self._font_dict.get(match.group(1), match.group(0)),
                res.text,
            )
            doc = etree.HTML(html)
            self._parse_comment_page(doc)
            self._default_headers['Referer'] = self._cur_request_url
            next_links = doc.xpath('.//a[@class="NextPage"]/@href')
            if next_links:
                self._cur_request_url = 'http://www.dianping.com' + next_links[0]
            else:
                self._cur_request_url = None

    def _data_pipeline(self, data):
        """Handle one parsed review dict; the default just prints it."""
        print(data)

    @staticmethod
    def _first_stripped(values, default=''):
        """Return the first xpath result with surrounding whitespace removed, or *default*."""
        return values[0].strip('\n\r \t') if values else default

    def _parse_comment_page(self, doc):
        """Extract every review on the page and pass it to ``_data_pipeline``.

        Each review is a dict with the keys ``name``, ``comment``, ``star``,
        ``pic`` (list of picture URLs) and ``time``.  A missing name or time
        becomes an empty string so one odd item does not abort the crawl.
        """
        for li in doc.xpath('//*[@class="reviews-items"]/ul/li'):
            name = self._first_stripped(li.xpath('.//a[@class="name"]/text()'))

            star = 0
            star_classes = li.xpath('.//span[contains(./@class, "sml-str")]/@class')
            if star_classes:
                star_match = re.search(r'sml-rank-stars sml-str(.*?) star', star_classes[0])
                if star_match:
                    star = star_match.group(1)

            review_time = self._first_stripped(li.xpath('.//span[@class="time"]/text()'))

            pics = []
            for pic in li.xpath('.//*[@class="review-pictures"]/ul/li'):
                print(pic.xpath('.//a/@href'))
                big_picture = pic.xpath('.//a/img/@data-big')
                if big_picture:
                    pics.append(big_picture[0])

            # Try the "review-words Hide" div first; fall back to the plain
            # "review-words" div when it yields no text.
            comment = ''.join(li.xpath('.//div[@class="review-words Hide"]/text()')).strip('\n\r \t')
            if not comment:
                comment = ''.join(li.xpath('.//div[@class="review-words"]/text()')).strip('\n\r \t')

            data = {
                'name': name,
                'comment': comment,
                'star': star,
                'pic': pics,
                'time': review_time,
            }
            self._data_pipeline(data)

    def _get_css_link(self, url):
        """Request the first review page and return the URL of its CSS file.

        :raises AssertionError: if the page contains no matching stylesheet link.
        """
        res = requests.get(
            url,
            headers=self._default_headers,
            cookies=self._cookies,
            timeout=self.request_timeout,
        )
        html = res.text
        # css_link = re.search(r'<link re.*?css.*?href="(.*?svgtextcss.*?)">', html)
        css_link = re.findall(r'<link rel="stylesheet" type="text/css" href="//s3plus.meituan.net/v1/(.*?)">', html)
        if not css_link:
            # Raised explicitly because a bare ``assert`` disappears under ``python -O``.
            raise AssertionError('no s3plus.meituan.net stylesheet link found in {}'.format(url))
        return 'http://s3plus.meituan.net/v1/' + css_link[0]

    def _get_font_dict(self, url):
        """Return a dict mapping CSS class names to the characters they stand for.

        :param url: URL of the CSS file returned by ``_get_css_link``.
        :raises AssertionError: if the CSS references no background image.
        :raises IndexError: if the CSS references fewer than two background images.
        """
        res = requests.get(url, headers=self._css_headers, timeout=self.request_timeout)
        html = res.text
        background_image_link = re.findall(r'background-image: url\((.*?)\);', html)
        print('带有svg的链接', background_image_link)
        if not background_image_link:
            raise AssertionError('no background-image url found in {}'.format(url))
        if len(background_image_link) < 2:
            raise IndexError('expected at least two background-image urls in {}'.format(url))
        # The second background image is the one used for the lookup.
        background_image_link = 'http:' + background_image_link[1]

        html = re.sub(r'span.*?\}', '', html)
        # (class name, x offset, y offset) triples found in the CSS.
        group_offset_list = re.findall(r'\.([a-zA-Z0-9]{5,6}).*?round:(.*?)px (.*?)px;', html)
        print('css中class对应坐标', group_offset_list)
        # Characters of the SVG, keyed by y offset and then by x offset.
        font_dict_by_offset = self._get_font_dict_by_offset(background_image_link)
        print('解析svg成字典', font_dict_by_offset)

        font_dict = {}
        for class_name, x_offset, y_offset in group_offset_list:
            # Offsets look like "-98.0"; go through float so the decimal part
            # is dropped without mangling digits elsewhere in the number.
            row = font_dict_by_offset.get(int(float(y_offset)))
            if not row:
                continue
            character = row.get(int(float(x_offset)))
            if character is not None:
                font_dict[class_name] = character
        return font_dict

    def _get_font_dict_by_offset(self, url):
        """Download the SVG at *url* and return ``{y_offset: {x_offset: character}}``.

        At least two SVG layouts exist (only two have been seen so far); both
        are handled by ``_parse_font_svg``.
        """
        res = requests.get(url, headers=self._css_headers, timeout=self.request_timeout)
        return self._parse_font_svg(res.text)

    def _parse_font_svg(self, svg):
        """Turn SVG source text into ``{y_offset: {x_offset: character}}``."""
        y_list = re.findall(r'd="M0 (\d+?) ', svg)
        if y_list:
            # Layout 1: the y values come from path data and the text from
            # <textPath> elements, paired up in document order.
            rows = zip(y_list, re.findall(r'<textPath .*?>(.*?)<', svg))
        else:
            # Layout 2: each <text> element carries its own y attribute.
            rows = re.findall(r'<text.*?y="(.*?)">(.*?)<', svg)

        font_dict = {}
        for y, string in rows:
            font_dict[self.start_y - int(y)] = {
                -index * self.font_size: character
                for index, character in enumerate(string)
            }
        return font_dict


if __name__ == "__main__":
    pass
demo.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Crawl the reviews of one Dianping shop and append them to ``code_dict.txt``."""
from dazhong import DianpingComment

# SECURITY NOTE(review): this is a logged-in session cookie committed to source.
# Consider loading it from the environment or an untracked file instead.
COOKIES = '_lxsdk_cuid=1699b152d90c8-04b0ee8b481697-541f3415-1fa400-1699b152d91c8; _lxsdk=1699b152d90c8-04b0ee8b481697-541f3415-1fa400-1699b152d91c8; _hc.v=992d8c67-a9b0-ee61-c6cf-ed9b42cfe11f.1553085051; _thirdu.c=136cbfec8b174105c45f6628ce431df6; ctu=cc29f77c02b4556c6a1db1c67c5c10e084f7f63d00208c59788c11a4845348aa; cy=160; cye=zhengzhou; thirdtoken=e0dfd5bf-3cc9-482c-a559-ecb5a5408581; dper=13f0e16d38f4829e80270687b88c4ce8d36d333a6f525bc6be3dec9bbc60b1d7f44f8b47a413dc1c18f3ef5fed921594f3c5161e72d50fed52f3006625babe559507c56bb8b77d1f9dd95d104ffb3cdba1c49805e34df17c99e3ba781183b850; ll=7fd06e815b796be3df069dec7836c3df; ua=aJay13; ctu=a5f067d1428ce75e417e53634b352a7767a63503c85b2d59c0c70ae996add3e701d656899061b0eddfa568430b723553; _lxsdk_s=1699df6ef73-4f6-781-d9c%7C%7C719'


class Customer(DianpingComment):
    """Review crawler that also persists every review to ``code_dict.txt``."""

    def _data_pipeline(self, data):
        """Print *data* and append its ``str()`` form as one line of the output file."""
        print(data)
        # Plain append mode is enough: the file is only written, never read here.
        with open('code_dict.txt', 'a', encoding='utf-8') as f:
            f.write(str(data) + '\n')


if __name__ == "__main__":
    dianping = Customer('1726435', cookies=COOKIES)
    dianping.run()
标签:doc set time content offset get lis tle encoding
原文地址:https://www.cnblogs.com/-stewart/p/12739173.html