码迷,mamicode.com
首页 > 其他好文 > 详细

go--单任务版爬虫

时间:2018-07-07 17:57:35      阅读:144      评论:0      收藏:0      [点我收藏+]

标签:text   nil   net   back   print   prot   lock   package   xxxx   

1.获取初始页面内容

 

package main

import (
	"net/http"
	"fmt"
	"io/ioutil"
)

// main downloads one Tieba thread page and prints its raw HTML to stdout.
//
// It panics if the request cannot be sent or the body cannot be read in full,
// and prints the status code and returns when the server does not answer 200.
func main() {
	// http.Get returns two values: the response and an error.
	res, err := http.Get("https://tieba.baidu.com/p/5524106374?red_tag=0000236673")
	if err != nil {
		panic(err)
	}
	defer res.Body.Close()
	// Any status other than 200 means something went wrong.
	if res.StatusCode != http.StatusOK {
		fmt.Println("err statuscode:", res.StatusCode)
		return
	}
	// Read the whole body. A read error must not be ignored: otherwise a
	// truncated page would be printed as if the download had succeeded.
	page, err := ioutil.ReadAll(res.Body)
	if err != nil {
		panic(err)
	}
	fmt.Println(string(page))

	// An alternative is DumpResponse from net/http/httputil. It reads the
	// response body for you and also includes the status line and headers:
	//
	//	all, _ = httputil.DumpResponse(res, true)
	//	fmt.Println(string(all))
	//
	// Header part of a sample run:
	//
	//	HTTP/1.1 200 OK
	//	Transfer-Encoding: chunked
	//	Connection: keep-alive
	//	Content-Type: text/html; charset=UTF-8
	//	Date: Sat, 07 Jul 2018 05:20:48 GMT
	//	P3p: CP=" OTI DSP COR IVA OUR IND COM "
	//	Server: Apache
	//	Set-Cookie: TIEBA_USERTYPE=d8c56c898382fa778148475e; expires=Thu, 31-Dec-2020 15:59:59 GMT; path=/; domain=tieba.baidu.com
	//	Set-Cookie: wise_device=0; path=/
	//	Set-Cookie: BAIDUID=3826C6F501EC8C114AC77215BBE0DA64:FG=1; expires=Sun, 07-Jul-19 05:20:48 GMT; max-age=31536000; path=/; domain=.baidu.com; version=1
	//	Tracecode: 12484498910460795914070713
	//	Tracecode: 12484498910470965258070713
	//	Vary: Accept-Encoding
	//	X-Xss-Protection: 1; mode=block
}

 

  

2.使用正则表达式解析,并提取url

 

package main

import (
	"net/http"
	"fmt"
	"io/ioutil"
	"regexp"
)

// main downloads one Tieba thread page, extracts the image URLs found in
// src="..." attributes and prints them as a single slice.
//
// It panics if the request cannot be sent or the body cannot be read in full,
// and prints the status code and returns when the server does not answer 200.
func main() {
	res, err := http.Get("https://tieba.baidu.com/p/5524106374?red_tag=0000236673")
	if err != nil {
		panic(err)
	}
	defer res.Body.Close()
	if res.StatusCode != http.StatusOK {
		fmt.Println("err statuscode:", res.StatusCode)
		return
	}
	// Read the whole body. A read error must not be ignored: otherwise the
	// URLs would be extracted from a truncated page without any warning.
	all, err := ioutil.ReadAll(res.Body)
	if err != nil {
		panic(err)
	}

	// A known URL of the kind we want. The page also contains other images
	// that satisfy the regular expression, and those are told apart here
	// purely by length: only captures as long as this sample are kept.
	const sample = "https://imgsa.baidu.com/forum/w%3D580/sign=605377bf04b30f24359aec0bf897d192/eb55981bb051f8199dc2df94d1b44aed2c73e7d5.jpg"

	// Matches every image link of the form src="http://xxxxxx.jpg" (or .png).
	re := regexp.MustCompile(`src="(http[^"]+?(?:jpg|png))"`)
	// FindAllString would return only the whole match, including the src="..."
	// wrapper. FindAllStringSubmatch returns one slice per match, holding the
	// whole match at index 0 and the text of the capture group at index 1,
	// so the loop below picks index 1 to get the bare URL.
	matches := re.FindAllStringSubmatch(string(all), -1)
	urls := make([]string, 0, len(matches))
	for _, m := range matches {
		// Skip images whose URL length differs from the sample.
		if len(m[1]) != len(sample) {
			continue
		}
		urls = append(urls, m[1])
	}
	fmt.Println(urls)
}

//[https://imgsa.baidu.com/forum/w%3D580/sign=51dfed7aafc27d1ea5263bcc2bd7adaf/29aa8064034f78f072d6c52d72310a55b1191cd2.jpg 
// https://imgsa.baidu.com/forum/w%3D580/sign=b76619654c10b912bfc1f6f6f3fcfcb5/0b74e395d143ad4bab64181e89025aafa50f0669.jpg 
// https://imgsa.baidu.com/forum/w%3D580/sign=dc2cfe017dc6a7efb926a82ecdfbafe9/6e799a3533fa828b9d23c63df61f4134960a5ab7.jpg 
// https://imgsa.baidu.com/forum/w%3D580/sign=605377bf04b30f24359aec0bf897d192/eb55981bb051f8199dc2df94d1b44aed2c73e7d5.jpg 
// https://imgsa.baidu.com/forum/w%3D580/sign=2f3f86cc444a20a4311e3ccfa0539847/644711f33a87e95059032db21b385343faf2b4a4.jpg 
// https://imgsa.baidu.com/forum/w%3D580/sign=e30f414125dda3cc0be4b82831e83905/4b6297315c6034a8f6e6605fc0134954082376b4.jpg 
// https://imgsa.baidu.com/forum/w%3D580/sign=50b8de14898ba61edfeec827713597cc/af59f91b0ef41bd54ce3302a5ada81cb38db3d00.jpg 
// https://imgsa.baidu.com/forum/w%3D580/sign=3f9442bff8deb48ffb69a1d6c01e3aef/2f586a34970a304ee7717824dac8a786c8175c61.jpg 
// https://imgsa.baidu.com/forum/w%3D580/sign=17f88df316950a7b75354ecc3ad3625c/a98c2146f21fbe09309315d160600c338544adea.jpg 
// https://imgsa.baidu.com/forum/w%3D580/sign=4bae33e30224ab18e016e13f05f8e69a/1dbb35178a82b901535815f6788da9773b12efc2.jpg]

 

  

package main

import (
	"net/http"
	"fmt"
	"io/ioutil"
	"regexp"
)
// get_pic_url wraps the fetch-and-extract steps into one reusable function:
// it downloads the page at link, applies rule to the body and returns the
// text of the first capture group of every match whose length equals the
// length of target.
//
//   - link: URL of the page to fetch.
//   - rule: regular expression to match against the page. It must contain a
//     capture group, because the extracted URL is taken from group 1.
//   - target: a sample of the kind of link wanted; only its length is used,
//     as a filter on the captured URLs.
//
// The returned slice is never nil and holds the kept URLs, as strings, in
// the order they appear in the page.
//
// The function panics when the request fails, when the status code is not
// 200 (after printing it), when the body cannot be read in full, or when
// rule is not a valid regular expression.
func get_pic_url(link, rule, target string) []interface{} {
	res, err := http.Get(link)
	if err != nil {
		panic(err)
	}
	defer res.Body.Close()
	if res.StatusCode != http.StatusOK {
		fmt.Println("err statuscode:", res.StatusCode)
		panic("出错了")
	}
	// A read error must not be ignored: otherwise a truncated page would
	// quietly produce an incomplete (or empty) result.
	all, err := ioutil.ReadAll(res.Body)
	if err != nil {
		panic(err)
	}
	re := regexp.MustCompile(rule)
	matches := re.FindAllStringSubmatch(string(all), -1)
	urls := make([]interface{}, 0, len(matches))
	for _, m := range matches {
		// m[0] is the whole match, m[1] the captured URL.
		if len(m[1]) != len(target) {
			continue
		}
		urls = append(urls, m[1])
	}
	return urls
}


// main runs get_pic_url against one Tieba thread and prints every returned
// image URL on its own line.
func main() {
	const (
		// Thread page to scan.
		link = "https://tieba.baidu.com/p/4244799788?red_tag=2313275030"
		// Captures the value of every src attribute that ends in .jpg.
		rule = `src="(.+?\.jpg)"`
		// Sample of a wanted image URL, passed to get_pic_url as target.
		target = "https://imgsa.baidu.com/forum/w%3D580/sign=a80a7ab75eee3d6d22c687c373166d41/862df7246b600c337e73b7d81d4c510fd9f9a163.jpg"
	)

	for _, picURL := range get_pic_url(link, rule, target) {
		fmt.Println(picURL)
	}
}

//https://imgsa.baidu.com/forum/w%3D580/sign=4e328e1f8094a4c20a23e7233ef51bac/0b837a899e510fb3979e1887de33c895d0430ced.jpg
//https://imgsa.baidu.com/forum/w%3D580/sign=4b14d9aba11ea8d38a22740ca70b30cf/57efcc11728b4710c1fc93dcc4cec3fdfd032399.jpg
//https://imgsa.baidu.com/forum/w%3D580/sign=dc1c05ce3a6d55fbc5c6762e5d234f40/1b6a6a600c3387441173466c560fd9f9d72aa03f.jpg
//https://imgsa.baidu.com/forum/w%3D580/sign=72361f898526cffc692abfba89004a7d/5e528601a18b87d6e20cc20f000828381f30fd27.jpg
//https://imgsa.baidu.com/forum/w%3D580/sign=9627ab23968fa0ec7fc764051696594a/3a847acb0a46f21f4afe3743f1246b600d33aea3.jpg
//https://imgsa.baidu.com/forum/w%3D580/sign=9f46baed7c899e51788e3a1c72a6d990/1a78aec379310a5519a5b7e4b04543a9832610ef.jpg
//https://imgsa.baidu.com/forum/w%3D580/sign=0dceca0bd30735fa91f04eb1ae500f9f/462a024f78f0f736091524110d55b319eac41381.jpg
//https://imgsa.baidu.com/forum/w%3D580/sign=4da8783f92cad1c8d0bbfc2f4f3f67c4/2eb3fd039245d688a9d9dab4a3c27d1ed31b2491.jpg
//https://imgsa.baidu.com/forum/w%3D580/sign=6459fdcc0ffa513d51aa6cd60d6c554c/e708f31fbe096b6303f236400b338744eaf8ac82.jpg
//https://imgsa.baidu.com/forum/w%3D580/sign=5cd8795f992f07085f052a08d924b865/675d622762d0f7034a81fecc0ffa513d2697c5ba.jpg
//https://imgsa.baidu.com/forum/w%3D580/sign=77e1f18f29dda3cc0be4b82831e83905/7da177c6a7efce1ba1c1a0daa851f3deb58f6582.jpg
//https://imgsa.baidu.com/forum/w%3D580/sign=bddeeefe9b82d158bb8259b9b00b19d5/4ac8c8177f3e6709a5ea2def3cc79f3df9dc5582.jpg
//https://imgsa.baidu.com/forum/w%3D580/sign=ab1df271f4deb48ffb69a1d6c01e3aef/7f16d009b3de9c8287bf919f6b81800a18d84392.jpg
//https://imgsa.baidu.com/forum/w%3D580/sign=c527aa16a8c379317d688621dbc5b784/15578718367adab4f73a73538cd4b31c8601e483.jpg
//https://imgsa.baidu.com/forum/w%3D580/sign=0bce68f3d239b6004dce0fbfd9503526/407a5882b2b7d0a212f46f8dccef76094b369aab.jpg
//https://imgsa.baidu.com/forum/w%3D580/sign=64b851138044ebf86d716437e9f8d736/d7c9e850352ac65c1fe00e63fcf2b21192138a83.jpg
//https://imgsa.baidu.com/forum/w%3D580/sign=2e1d5d1b5cb5c9ea62f303ebe538b622/e199902397dda144b7b207a2b5b7d0a20cf48631.jpg
//https://imgsa.baidu.com/forum/w%3D580/sign=727db5e4b04543a9f51bfac42e178a7b/c33d8ad4b31c870160efd9f6207f9e2f0708ff10.jpg
//https://imgsa.baidu.com/forum/w%3D580/sign=77dd52d3d0ca7bcb7d7bc7278e086b3f/d9d5023b5bb5c9ea101d68f3d239b6003bf3b386.jpg
//https://imgsa.baidu.com/forum/w%3D580/sign=76dc96b5d643ad4ba62e46c8b2035a89/aee78326cffc1e17256de7214d90f603728de99b.jpg
//https://imgsa.baidu.com/forum/w%3D580/sign=09f988ea7ccb0a4685228b315b63f63e/ee755ab5c9ea15ce33d98a19b1003af33a87b2bd.jpg
//https://imgsa.baidu.com/forum/w%3D580/sign=d41bcbcef71f3a295ac8d5c6a925bce3/3c16cdbf6c81800a4197e31eb63533fa828b477a.jpg
//https://imgsa.baidu.com/forum/w%3D580/sign=00cd89a8d33f8794d3ff4826e21a0ead/d8b4e7cd7b899e519d3b500445a7d933c8950d33.jpg
//https://imgsa.baidu.com/forum/w%3D580/sign=0ebf913fb9096b6381195e583c328733/f88037d3d539b6003e0a59a7ee50352ac75cb7ac.jpg
//https://imgsa.baidu.com/forum/w%3D580/sign=b8db0843f1246b607b0eb27cdbf91a35/3877b7003af33a87ef82560ac15c10385243b585.jpg
//https://imgsa.baidu.com/forum/w%3D580/sign=e4baea7493eef01f4d1418cdd0fe99e0/71ccd058ccbf6c81d03f7ff8bb3eb13533fa4060.jpg
//https://imgsa.baidu.com/forum/w%3D580/sign=6ff190046c600c33f079dec02a4d5134/5b4e3bf33a87e95031b8a07c17385343faf2b4c5.jpg
//https://imgsa.baidu.com/forum/w%3D580/sign=099bb22732fae6cd0cb4ab693fb20f9e/ee0179f0f736afc3e7816c75b419ebc4b64512db.jpg
//https://imgsa.baidu.com/forum/w%3D580/sign=4b966f04ac18972ba33a00c2d6cc7b9d/a6b5faedab64034fe3709116a8c379310a551d2b.jpg
//https://imgsa.baidu.com/forum/w%3D580/sign=44dc433f92cad1c8d0bbfc2f4f3f67c4/2eb3fd039245d688a0ade1b4a3c27d1ed31b24c5.jpg
//https://imgsa.baidu.com/forum/w%3D580/sign=4bff61282f381f309e198da199004c67/8f9ef603918fa0ecdd2a371d219759ee3c6ddb84.jpg
//https://imgsa.baidu.com/forum/w%3D580/sign=0447538859df8db1bc2e7c6c3923dddb/6fc5a71ea8d3fd1f1913f02a374e251f95ca5f52.jpg
//https://imgsa.baidu.com/forum/w%3D580/sign=1d2eae1d77cf3bc7e800cde4e100babd/9050d31b0ef41bd5d749bae456da81cb39db3d52.jpg
//https://imgsa.baidu.com/forum/w%3D580/sign=1d1af6afa7ec08fa260013af69ee3d4d/96850b46f21fbe09817a91046c600c338744ad4d.jpg
//https://imgsa.baidu.com/forum/w%3D580/sign=57bfc6ab80d6277fe912323018391f63/fc91a8ec8a136327b26e9023968fa0ec09fac7ea.jpg
//https://imgsa.baidu.com/forum/w%3D580/sign=a745d5db44166d223877159c76220945/6a0aeaf81a4c510f2b47b00a6759252dd52aa5ea.jpg
//https://imgsa.baidu.com/forum/w%3D580/sign=db4dd5319413b07ebdbd50003cd69113/4e827dd98d1001e919be461fbf0e7bec55e797ea.jpg

  

 

go--单任务版爬虫

标签:text   nil   net   back   print   prot   lock   package   xxxx   

原文地址:https://www.cnblogs.com/traditional/p/9277459.html

(0)
(0)
   
举报
评论 一句话评论(0
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!