码迷,mamicode.com
首页 > 其他好文 > 详细

2019/6/27号————检查

时间:2019-06-27 19:27:00      阅读:140      评论:0      收藏:0      [点我收藏+]

标签:node   sch   one   urllib   code   打印   star   html   strftime   

-----------------------------------------------------spider_Un.py---------------------------------------------------------------------------------------------------
import requests
import time
from lxml import etree

def get_html(url, max_retries=50):
    """Fetch *url* and return its decoded HTML text.

    Retries with a short pause while the server answers with a non-200
    status (at most *max_retries* extra attempts — the original retried
    via unbounded recursion, which could overflow the stack). Returns
    None when the request raises or retries are exhausted.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'}
    for _ in range(max_retries + 1):
        try:
            res = requests.get(url, headers=headers)
            # Infer the encoding from the body so Chinese pages decode correctly.
            res.encoding = res.apparent_encoding
            if res.status_code == 200:
                return res.text
            # Non-200 (e.g. rate limited): back off briefly and retry.
            time.sleep(0.1)
        except Exception as e:
            # Report the failure and give up (caller receives None).
            print("问题是", e)
            return None
    return None

def parse(html):
    """Print *html*, then extract and print the second inline <script>
    text found under the results container div.

    Raises IndexError if fewer than two script nodes are present
    (unchanged from the original behavior).
    """
    print(html)
    r = etree.HTML(html)
    # The school list is embedded in the page's second inline script.
    node_list = r.xpath("//div[@class='container']//script/text()")[1]
    print(node_list)
    print(len(node_list))

def url_join():
    """Build the list of paginated search URLs.

    Pages are addressed by a start offset that steps by 20
    (0, 20, 40, ... 2740), giving 138 URLs in total.
    """
    url_start = 'https://gaokao.chsi.com.cn/sch/search--ss-on,searchType-1,option-qg,start-'
    url_end = '.dhtml'
    return [url_start + str(20 * i - 20) + url_end for i in range(1, 139)]
if __name__ == '__main__':
    # Currently crawls only the first results page; to walk every page,
    # iterate over url_join() and call get_html/parse per URL.
    url = 'https://gaokao.chsi.com.cn/sch/search--ss-on,searchType-1,option-qg,start-0.dhtml'
    html = get_html(url)
    parse(html)
--------------------------------------------------------------------------------------------------------------------------------weixin.py---------------------------------------
#Weichat

import scrapy
import urllib.parse
from news_project.middlewares import Deal_Content
from news_project.items import NewsProjectItem
from lxml import etree
import js2py
import time
import requests
import re
import bs4

class Weichat(scrapy.Spider):
    """Spider that finds the '宝鸡招商局' WeChat account via Sogou's WeChat
    search, pulls its article list out of an inline JS object, and scrapes
    each article page into a NewsProjectItem."""
    name = 'Weichat'
    base = 'https://mp.weixin.qq.com'
    # Best not to include the scheme (http://) in allowed_domains entries.
    allowed_domains = ['weixin.sogou.com']
    start_urls = ['http://weixin.sogou.com']

    def parse(self, response):
        """Query Sogou's WeChat-account search for the target account."""
        url_1 = 'https://weixin.sogou.com/weixin?type=1&query={}&ie=utf8&s_from=input&_sug_=y&_sug_type_='.format('宝鸡招商局')
        yield scrapy.Request(url=url_1, callback=self.detail_parse, dont_filter=True)

    def detail_parse(self, response):
        """Follow the first search hit to the account's profile page."""
        baoji_url = response.xpath("//a[@uigs='account_name_0']/@href").extract_first()
        print('baoji_url', baoji_url)
        print('baoji_url', type(baoji_url))
        yield scrapy.Request(url=baoji_url, callback=self.baoji_parse, dont_filter=True)

    def baoji_parse(self, response):
        """Extract the article list (msgList) embedded in an inline script
        on the profile page, then request the newest article."""
        selector = etree.HTML(response.text)
        print("------------------宝鸡招商局---------------------", response.text)
        # The article list lives in the second non-nonce inline script as a
        # JS assignment to msgList; wrap it in a function so js2py returns it.
        script = selector.xpath('.//script[not(@nonce) and @type="text/javascript"]/text()')
        script = script[1]
        script = script.replace('seajs.use("sougou/profile.js");', '')
        script = 'function getList(){' + script + 'return msgList \n}'
        script = script.replace('amp;', '')
        getList = js2py.eval_js(script)
        js = getList()
        # NOTE(review): eval() on page-derived text is unsafe; js.to_dict()
        # or ast.literal_eval would be safer — kept for behavioral parity.
        js = eval(str(js))
        lis = js.get('list')
        firstLinks = []
        otherStyleTimes = []
        for li in lis:
            # Publish timestamp -> "YYYY-mm-dd HH:MM:SS" for the database.
            datimes = li['comm_msg_info']['datetime']
            timeArray = time.localtime(datimes)
            otherStyleTime = time.strftime("%Y-%m-%d %H:%M:%S", timeArray)

            # Build the real article URL from the relative content_url.
            try:
                content_url = li.get('app_msg_ext_info').get('content_url')
                print(content_url)
                firstLink = self.base + content_url
            except (AttributeError, TypeError, IndexError):
                # app_msg_ext_info missing (e.g. a CAPTCHA page): .get()
                # returns None and the chained .get raises AttributeError —
                # the original only caught IndexError, which never fires here.
                firstLink = None
                print('CAPTCHA!')
            firstLinks.append(firstLink)
            otherStyleTimes.append(otherStyleTime)
        print('firstLinks, otherStyleTimes***********************', firstLinks, otherStyleTimes)
        yield scrapy.Request(url=firstLinks[0], callback=self.baoji_detail_parse, meta={'time': otherStyleTimes[0]}, dont_filter=True)

    def baoji_detail_parse(self, response):
        """Scrape one article page into a NewsProjectItem."""
        item = NewsProjectItem()
        content = ''
        meta = response.meta
        print("response.url", response.url)
        detailPage = bs4.BeautifulSoup(response.text, "html.parser")
        # Article title, stripped of whitespace/punctuation noise.
        title = detailPage.title.text.replace('\n', '').replace('\r', '').replace(' ', '').replace('!', '').replace('|', '')
        print('title', title)

        sections = detailPage.findAll('section', class_='_editor')
        # Concatenate the editor sections, skipping the last three
        # (presumably boilerplate footer sections — TODO confirm).
        for section in sections[:-3]:
            content = content + section.text.replace('\n', '').replace('\r', '').replace(' ', '')

        print("meta['time']**************", meta['time'])
        print("content---------------", content)

        item['title_url'] = response.url

        # Article body node. Renamed from 'etree' so the local no longer
        # shadows the imported lxml.etree module.
        body = response.xpath('//div[@id="js_content"]')
        tagContet = ''.join(body.extract())

        content = ''.join(body.xpath('.//text()').extract())
        img_urls = body.xpath('.//img/@src').extract()

        # Map each relative image src to its absolute URL.
        img_urls_dict = {}
        for url in img_urls:
            if "http://网站" not in url:
                url1 = urllib.parse.urljoin(response.url, url)
                img_urls_dict[url] = url1

        print("*******img_urls_dict****", img_urls_dict)
        item['content'], item['tags'] = Deal_Content.handleText(content, tagContet, img_urls_dict, title)
        print("************item['tags']********************", item['tags'])

        # Strip the trailing styled footer when present; the original
        # indexed [0] unconditionally and crashed with IndexError when
        # the pattern did not match.
        footer = re.findall("font-size: 18px;(.*)", item['content'])
        if footer:
            item['content'] = item['content'].replace(footer[0], '')

        item['title'] = title
        item['time'] = meta['time']

        id, pid = Deal_Content.sql_read(response.url)
        item['id'] = id
        item['pid'] = pid
        item['type_cn'] = "省市级"
        # news: which site/homepage this article came from.
        item['news'] = '宝鸡招商局'
        # type_no doubles as the category id.
        item['type_no'] = 18

        yield item
----------------------------------------------------------js.py-----------------------------------------------------------------------------------------


2019/6/27号————检查

标签:node   sch   one   urllib   code   打印   star   html   strftime   

原文地址:https://www.cnblogs.com/yuanjia8888/p/11099010.html

(0)
(0)
   
举报
评论 一句话评论(0
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!