
Simple Web Scraping Operations

2020-02-09


Today I learned some simple Python web-scraping operations.

1. Creating a folder and writing a file:

import os

def mkdir(path):
    if os.path.exists(path):  # check whether the folder already exists, otherwise makedirs may raise an error
        print("The path already exists.")
    else:
        os.makedirs(path)  # create the folder
        print("Done.")

def write(path, text):  # renamed parameter so it no longer shadows the built-in str
    with open(path, "w+") as file:  # write the file
        file.write(text)

def main():
    mkdir("test")
    write("test/test.txt", "hello world")

if __name__ == "__main__":
    main()
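Incidentally, on Python 3.2+ os.makedirs can do the existence check itself via its exist_ok parameter, and pathlib covers the file write; a minimal alternative sketch:

import os
from pathlib import Path

os.makedirs("test", exist_ok=True)  # no FileExistsError if the folder is already there
Path("test/test.txt").write_text("hello world")  # create and write the file in one call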

2. Getting a page's source (if the site is reachable):

from bs4 import BeautifulSoup
import requests

def main():
    html = requests.get("https://www.baidu.com")  # fetch this URL
    html.encoding = "utf-8"  # the page is in Chinese, so set the character set explicitly
    soup = BeautifulSoup(html.text, "lxml")  # parse the HTML (BeautifulSoup is a parser, not a regular expression)
    print(soup.prettify())  # pretty-print the source (formatting only, no data is removed)

if __name__ == "__main__":
    main()
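In practice requests.get can hang on a slow site or come back with an error page; a minimal sketch with a timeout and a status check (timeout, raise_for_status and apparent_encoding are all standard requests features):

import requests

response = requests.get("https://www.baidu.com", timeout=10)  # give up after 10 seconds
response.raise_for_status()  # raise an HTTPError for 4xx/5xx responses
response.encoding = response.apparent_encoding  # guess the charset from the body itself
print(response.text[:200])  # first 200 characters of the source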

3. Getting elements by tag from a page's source (if the site is reachable):

import requests
from bs4 import BeautifulSoup

def write_to_file(content):
    with open("save.txt", "a", encoding="utf-8") as f:
        f.write(content)

def get_blog_info(url):
    html = requests.get(url)
    soup = BeautifulSoup(html.text, "lxml")
    print(soup.title)  # all sorts of elements can be accessed like this
    print("=" * 100)
    print(type(soup.title))
    print("=" * 100)
    print(type(soup.title.string))
    print("=" * 100)
    print(soup.title.string)
    print("=" * 100)
    print(soup.head)
    print("=" * 100)
    print(soup.p)

def main():
    blog_url = "https://www.cnblogs.com/sgh1023"
    get_blog_info(blog_url)

if __name__ == "__main__":
    main()
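Note that soup.p only returns the first matching tag; find_all returns every match, which lets you walk the whole tree. A minimal sketch that lists all the links on the same blog page:

import requests
from bs4 import BeautifulSoup

html = requests.get("https://www.cnblogs.com/sgh1023")
soup = BeautifulSoup(html.text, "lxml")
for a in soup.find_all("a", href=True):  # every <a> tag that has an href attribute
    print(a.get_text(strip=True), a["href"])  # link text and its target URL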

4. Downloading a single image (if it is reachable):

import requests
import os

tot = 0
path = "save"

def mkdir(path):
    if os.path.exists(path):
        return
    else:
        os.makedirs(path)

def save(content):
    global tot, path
    mkdir(path)
    with open(path + "/" + str(tot) + ".png", "wb+") as file:  # the with block closes the file automatically
        file.write(content)
        tot = tot + 1

def download_image(url):  # download the image; not guaranteed to succeed
    print("Now downloading...", tot)
    response = requests.get(url)
    save(response.content)
    print("Done!")

def main():
    download_image("https://www.baidu.com/img/pc_1c6e30772d5e4103103bd460913332f9.png")

if __name__ == "__main__":
    main()
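response.content loads the whole file into memory, which is fine for a small PNG but not for large downloads. A minimal streaming sketch (stream=True and iter_content are standard requests features; download_large is just a name I made up here):

import requests

def download_large(url, filename):
    with requests.get(url, stream=True, timeout=30) as response:
        response.raise_for_status()
        with open(filename, "wb") as file:
            for chunk in response.iter_content(chunk_size=8192):  # write 8 KB at a time
                file.write(chunk)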

5. Downloading all the images on a web page:

import requests
import urllib.request
import os
from bs4 import BeautifulSoup

tot = 0
path = "save"

def mkdir(path):
    if os.path.exists(path):
        return
    else:
        os.makedirs(path)

def save(content):
    global tot, path
    mkdir(path)
    with open(path + "/" + str(tot) + ".png", "wb+") as file:
        file.write(content)
        tot = tot + 1
######################################################################
def get_html_content(url):  # fetch the page source
    req = urllib.request.Request(url)  # add a header to pose as a Chrome browser (the UA string is copied code)
    req.add_header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36")
    response = urllib.request.urlopen(req)
    html = response.read()
    return html

def url_exist(url):  # check whether the URL is reachable
    try:
        requests.get(url, timeout=10)
        return True
    except requests.RequestException:
        return False

def download_image(url):  # download one image
    print("Now downloading...", tot, url)
    if url_exist(url):  # check whether the URL is reachable
        response = requests.get(url)
        save(response.content)
        print("Done!")
    else:
        print("Unavailable!")
######################################################################
def process(src):  # crude URL normalization
    if src.startswith("http"):  # already an absolute URL
        return src
    elif src.startswith("//"):  # protocol-relative URL
        return "https:" + src
    return "https://www.bilibili.com" + src  # site-relative path: join it with the site root

def get_image(url):
    soup = BeautifulSoup(get_html_content(url), "lxml")
    items = soup.find_all("img", {"src": True})
    for i in items:
        download_image(process(i["src"]))

def main():
    url = "https://www.bilibili.com"
    get_image(url)

if __name__ == "__main__":
    main()

Of course, the arguments to find_all depend on the page at hand; a few common patterns are sketched below.
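For instance (the tag names and attributes in this snippet are made-up illustrations, not taken from any particular site):

from bs4 import BeautifulSoup

html = '<div class="post"><img src="a.png" data-big="b.png"><a href="/x">x</a></div>'
soup = BeautifulSoup(html, "lxml")

print(soup.find_all("img", {"src": True}))    # <img> tags that have a src attribute
print(soup.find_all("div", class_="post"))    # filter by CSS class
print(soup.find_all(["a", "img"]))            # several tag names at once
print(soup.select("div.post img[data-big]"))  # CSS-selector equivalent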



Original post: https://www.cnblogs.com/GreenDuck/p/12287016.html
