码迷,mamicode.com
首页 > 其他好文 > 详细

go-百度贴吧-纵向爬取

时间:2019-12-07 14:50:47      阅读:93      评论:0      收藏:0      [点我收藏+]

标签:reg   scan   ring   代码   turn   compile   pack   test   author   

百度贴吧纵向爬取

上一篇是横向爬取,这一篇是纵向爬取,具体做法请看下面的代码。

package main

import (
    "fmt"
    "io"
    "net/http"
    "os"
    "regexp"
    "strconv"
)

// HttpGetDB issues an HTTP GET for url and returns the complete response body
// as a string.
//
// On any failure (the request itself, or a read error other than io.EOF) it
// returns an empty result together with the error, so a caller never receives
// a truncated page paired with a nil error.
func HttpGetDB(url string) (result string, err error) {
	resp, err := http.Get(url)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()

	// Accumulate raw bytes and convert to a string once at the end; appending
	// to a string inside the loop would re-copy everything read so far on
	// every iteration.
	var body []byte
	buf := make([]byte, 4096)
	for {
		n, readErr := resp.Body.Read(buf)
		// A Read may return data together with an error (including io.EOF),
		// so the data is consumed before the error is inspected.
		body = append(body, buf[:n]...)
		if readErr == io.EOF {
			return string(body), nil
		}
		if readErr != nil {
			return "", readErr
		}
	}
}

// Save2file writes the captured names for page idx to the file "第<idx>页.txt"
// in the current working directory, creating or truncating it.
//
// fileName is a list of regexp submatch rows; element [1] of each row (the
// first capture group) is written on its own line after a "名称" header line.
// Rows that have no capture group are skipped. Any create, write or close
// error is printed and, for create/write errors, the function stops early.
func Save2file(idx int, fileName [][]string) {
	path := "第" + strconv.Itoa(idx) + "页" + ".txt"
	f, err := os.Create(path)
	if err != nil {
		fmt.Println("os.Create err", err)
		return
	}
	defer func() {
		// Close can surface a delayed write failure, so it is reported too.
		if cerr := f.Close(); cerr != nil {
			fmt.Println("f.Close err", cerr)
		}
	}()

	if _, err := f.WriteString("名称\n"); err != nil {
		fmt.Println("f.WriteString err", err)
		return
	}
	for _, match := range fileName {
		// Guard the index: a row without a capture group has nothing to save.
		if len(match) < 2 {
			continue
		}
		if _, err := f.WriteString(match[1] + "\n"); err != nil {
			fmt.Println("f.WriteString err", err)
			return
		}
	}
}

// lastReplierRe captures the value of the title attribute that follows the
// "最后回复人:" prefix on each replier span of a listing page. It is compiled
// once at package scope instead of on every page fetch.
var lastReplierRe = regexp.MustCompile(`<span class="tb_icon_author_rely j_replyer" title="最后回复人:(?s:(.*?))"`)

// SpiderPageDB fetches listing page idx (1-based, 50 entries per page) of the
// "vue" forum, extracts every captured replier name and hands them to
// Save2file.
//
// idx is always sent on page when the function finishes, on failure as well
// as on success. The receiver waits for exactly one value per started page,
// so skipping the send on an error would leave it blocked forever. The send
// therefore means "page processed", not "page saved"; a fetch failure is
// printed before the send.
func SpiderPageDB(idx int, page chan int) {
	defer func() { page <- idx }()

	url := "https://tieba.baidu.com/f?kw=vue&ie=utf-8&pn=" + strconv.Itoa((idx-1)*50)

	result, err := HttpGetDB(url)
	if err != nil {
		fmt.Println("HttpGet2 err", err)
		return
	}

	Save2file(idx, lastReplierRe.FindAllStringSubmatch(result, -1))
}

// toWork crawls pages start through end (inclusive) concurrently, starting one
// SpiderPageDB goroutine per page, and prints a completion line for each page
// as its index arrives on the shared channel. It returns once one value per
// started page has been received. If end < start no page is crawled.
func toWork(start, end int) {
	fmt.Printf("正在爬取%d到%d页。。。\n", start, end)

	page := make(chan int)
	for i := start; i <= end; i++ {
		go SpiderPageDB(i, page)
	}

	for i := start; i <= end; i++ {
		// Printf, not Print: the message contains a %d directive that must be
		// substituted with the finished page index.
		fmt.Printf("第%d页爬取完成\n", <-page)
	}
}

// main prompts for a start page and an end page on standard input and crawls
// that inclusive range with toWork.
//
// Input that cannot be parsed as an integer, or a range that violates the
// prompted constraints (start >= 1, end >= start), is reported and nothing is
// crawled.
func main() {
	var start, end int

	fmt.Print("请输入起始页(>=1):")
	if _, err := fmt.Scan(&start); err != nil {
		fmt.Println("fmt.Scan err", err)
		return
	}

	fmt.Print("请输入终止页(>=start):")
	if _, err := fmt.Scan(&end); err != nil {
		fmt.Println("fmt.Scan err", err)
		return
	}

	// Enforce what the prompts promise; a start page below 1 would otherwise
	// be turned into a negative page offset in the request URL.
	if start < 1 || end < start {
		fmt.Println("页码范围无效: 起始页必须>=1, 终止页必须>=起始页")
		return
	}

	toWork(start, end)
}

go-百度贴吧-纵向爬取

标签:reg   scan   ring   代码   turn   compile   pack   test   author   

原文地址:https://www.cnblogs.com/ygjzs/p/12001364.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!