标签:encoding 类型 news this png soup pass 获取 mod
"""Crawler for gzcc campus news: walks the list page, prints per-article
details and click counts, then demos a few regex extractions.

Reconstructed from a collapsed blog paste; typographic quotes were replaced
with straight quotes so the script actually parses.
"""
import requests
from bs4 import BeautifulSoup
import string
import time
import datetime
import re

# Browser-like User-Agent so the site does not reject the crawler.
head = {
    'user-agent': ('Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                   '(KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'),
}


# 获取文章详情 — print details for every news item on the list page.
def getNewDetail(newsrrl):
    """Print title/time/source/link/click-count and the full metadata + body
    for each article linked from the news list page.

    newsrrl: a requests.Response for the list page. (BUG FIX: the original
    ignored this parameter and read the global ``r`` instead.)
    """

    # 点击次数 — fetch the click count for one article URL.
    def getClickCount(newUrl):
        """Return the click count (as a string) for the article at *newUrl*.

        BUG FIX: the original matched against the enclosing-scope variable
        ``d`` instead of its own parameter.
        """
        # 获取新闻编号 — the news id is the digits between '_<date>/' and '.'
        news_id = "".join(re.findall(r'_\d+/(.*?)\.', newUrl, re.S))
        # 生成点击次数的URL
        count_url = ('http://oa.gzcc.cn/api.php?op=count&id='
                     + news_id + '&modelid=80')
        resp = requests.get(count_url, headers=head)
        # 获取点击次数 — the response is a JS snippet ending in "...html('<n>');"
        return resp.text.split('.html')[-1].lstrip("(')").rstrip("');")

    soup = BeautifulSoup(newsrrl.text, 'html.parser')
    for li in soup.select('li'):
        # Skip list items that are not news entries.
        if not li.select('.news-list-title'):
            continue
        a = li.select('.news-list-title')[0].text           # title
        b = li.select('.news-list-info')[0].contents[0].text  # publish time
        c = li.select('.news-list-info')[0].contents[1].text  # source
        d = li.select('a')[0].attrs['href']                 # detail-page URL
        hist = getClickCount(d)
        print("标题:" + a + '\n' + "时间:" + b + '\n' + "来源:" + c + '\n'
              + "链接:" + d + '\n' + "点击:" + hist + '\n\n')
        print()

        # Fetch and parse the detail page (do not shadow the outer ``soup``).
        rlink = requests.get(d, headers=head)
        rlink.encoding = 'utf-8'
        detail_soup = BeautifulSoup(rlink.text, 'html.parser')
        e = detail_soup.select(".show-info")[0].text

        # Print every metadata token except the trailing click-count
        # placeholder, then our freshly fetched count.
        fields = e.split()
        for field in fields[:-1]:
            print(field, end=' ')
        print("点击:" + hist + "次")
        print()
        print()

        # 时间类型转换 — parse the publish time into a datetime.
        # NOTE(review): str.lstrip strips a *character set*, not a prefix;
        # it works here only because the text begins exactly with '发布时间:'.
        dt = e.lstrip('发布时间:')[:19]
        dt = datetime.datetime.strptime(dt, '%Y-%m-%d %H:%M:%S')
        print("datetime类型时间:", end=' ')
        print(dt)
        print()

        # Optional metadata fields — printed only when present.
        # (``> 0`` kept from the original: a label at index 0 is skipped,
        # which cannot happen because the text starts with 发布时间.)
        for label in ('作者:', '审核:', '来源:', '摄影:'):
            if e.find(label) > 0:
                value = e[e.find(label):].split()[0].lstrip(label)
                print(label, end=' ')
                print(value)
                print()

        # 点击次数
        if e.find('点击:') > 0:
            print("点击:", end=' ')
            print(hist)
            for _ in range(5):
                print()
            print()

        # 打印文章主体 — article body from the detail page.
        print(detail_soup.select("#content")[0].text)
        print()
        print()
        print()


if __name__ == "__main__":
    # Fetch the news list page and crawl every article on it.
    r = requests.get("http://news.gzcc.cn/html/xiaoyuanxinwen/", headers=head)
    r.encoding = 'utf-8'
    soup = BeautifulSoup(r.text, 'html.parser')
    getNewDetail(r)

    # 电话 — extract phone numbers like 020-12345678 from the page text.
    telephone = re.findall(r'(\d{3,4})\-(\d{6,8})', soup.text, re.S)
    print(telephone)
    print()

    # 邮箱 — demo: extract an e-mail address with a regex.
    email = '308800902@qq.com'
    eroll = r'([0-9a-zA-Z_]{0,19}@[0-9a-zA-Z_]{0,19}(?:\.\w{2,3}){0,2})'
    efinadll = re.findall(eroll, email)
    print(efinadll)
    print()

    # 英文分词 — demo: split English text on whitespace and punctuation.
    estr = '''Personal information such as names, birthdays, nicknames, pet's names, social security numbers, and the like should never, ever, ever be used because these are way too obvious and too easy to crack. The more you avoid using things like this as your passwords, the more secure your login areas will be.'''
    print(re.split(r"[\s,.?!]+", estr))
标签:encoding 类型 news this png soup pass 获取 mod
原文地址:https://www.cnblogs.com/wban48/p/8782518.html