码迷,mamicode.com
首页 > 其他好文 > 详细

go-爬段子

时间:2019-12-07 14:22:15      阅读:92      评论:0      收藏:0      [点我收藏+]

标签:read   let   tps   content   UNC   --   art   tco   package   

爬取搞笑的段子,横向爬取+纵向爬取

横向爬取:按页码依次遍历各个列表页;纵向爬取:进入每个列表页中的段子链接,抓取每条段子的标题和内容。

package main

import (
    "fmt"
    "io"
    "net/http"
    "os"
    "regexp"
    "strconv"
    "strings"
)

// HttpGet issues an HTTP GET request for url and returns the complete
// response body as a string.
//
// An error from the request itself or from reading the body is returned to
// the caller, and result is empty in that case. The HTTP status code is not
// inspected, so the body of a non-2xx response is returned like any other.
func HttpGet(url string) (result string, err error) {
    resp, err := http.Get(url)
    if err != nil {
        return "", err
    }
    defer resp.Body.Close()

    // io.Copy reads until EOF and reports every other read error, including
    // one that arrives together with zero bytes (e.g. a connection dropped
    // mid-body). The builder avoids re-copying the accumulated text on each
    // chunk, which repeated string concatenation would do.
    var body strings.Builder
    if _, err := io.Copy(&body, resp.Body); err != nil {
        return "", err
    }
    return body.String(), nil
}

// SaveJoke2File writes the jokes of listing page idx to the file
// "第<idx>页.txt" in the current working directory, creating or truncating it.
//
// Each entry is written as the title on one line, the content on the next,
// followed by a line of dashes. fileTitle drives the output: a title with no
// matching element in fileContent is written with empty content instead of
// panicking, and surplus fileContent elements are ignored.
//
// Failures to create, write or close the file are printed to standard output;
// nothing is returned to the caller.
func SaveJoke2File(idx int, fileTitle, fileContent []string) {
    path := "第" + strconv.Itoa(idx) + "页.txt"
    f, err := os.Create(path)
    if err != nil {
        fmt.Println("err:", err)
        return
    }
    // Close can surface a delayed write failure, so its error is reported too.
    defer func() {
        if cerr := f.Close(); cerr != nil {
            fmt.Println("err:", cerr)
        }
    }()

    for i, title := range fileTitle {
        content := ""
        if i < len(fileContent) {
            content = fileContent[i]
        }
        entry := title + "\n" + content + "\n" + "-----------------------------\n"
        if _, err := f.WriteString(entry); err != nil {
            // Stop at the first failed write; later writes would fail as well.
            fmt.Println("err:", err)
            return
        }
    }
}

// jokeLinkRe captures the href of each joke title link on a listing page.
// It is compiled once at package scope instead of on every Spiderpage call.
var jokeLinkRe = regexp.MustCompile(`<h1 class="f18"><a href="(?s:(.*?))"`)

// Spiderpage crawls one listing page: it downloads listing page idx, follows
// every joke link found on it via SpiderJokePage, and hands the collected
// titles and contents to SaveJoke2File.
//
// idx is sent on page exactly once when the function returns, whether or not
// the crawl succeeded, so a receiver waiting for one value per started page
// can never block forever. Errors are printed to standard output; a joke page
// that fails is skipped.
func Spiderpage(idx int, page chan int) {
    // Signal completion on every exit path. Previously a failed download
    // returned without sending, leaving the receiver waiting indefinitely.
    defer func() { page <- idx }()

    url := "https://m.pengfue.com/xiaohua_" + strconv.Itoa(idx) + ".html"

    result, err := HttpGet(url)
    if err != nil {
        fmt.Println("httpget err", err)
        return
    }

    links := jokeLinkRe.FindAllStringSubmatch(result, -1)

    fileTitle := make([]string, 0, len(links))
    fileContent := make([]string, 0, len(links))

    for _, link := range links {
        // link[1] is the captured href of the joke's detail page.
        title, content, err := SpiderJokePage(link[1])
        if err != nil {
            fmt.Println("err:", err)
            continue
        }
        fileTitle = append(fileTitle, title)
        fileContent = append(fileContent, content)
    }
    SaveJoke2File(idx, fileTitle, fileContent)
}

// toWork crawls listing pages start through end (inclusive) concurrently.
//
// One Spiderpage goroutine is started per page. The function then waits for
// one value on the shared channel per started goroutine, printing a completion
// line for each page number received. If end is less than start, nothing is
// started and the function returns right after the announcement.
func toWork(start, end int) {
    fmt.Printf("正在爬取%d到%d页。。。\n", start, end)

    done := make(chan int)

    // Start one crawler per page, counting how many were launched.
    launched := 0
    for pageNo := start; pageNo <= end; pageNo++ {
        go Spiderpage(pageNo, done)
        launched++
    }

    // Collect exactly one completion signal per launched crawler.
    for ; launched > 0; launched-- {
        finished := <-done
        fmt.Printf("第%d个页面爬取完成\n", finished)
    }
}

// Patterns and cleaners used by SpiderJokePage. They are built once at package
// scope; regexp.Regexp and strings.Replacer are safe for concurrent use, which
// matters because joke pages are crawled from several goroutines.
var (
    jokeTitleRe   = regexp.MustCompile(`<title>(?s:(.*?))</title>`)
    jokeContentRe = regexp.MustCompile(`<div class="con-txt">(?s:(.*?))</div>`)

    // Titles lose spaces and newlines.
    jokeTitleCleaner = strings.NewReplacer(" ", "", "\n", "")
    // Content additionally loses tab characters.
    jokeContentCleaner = strings.NewReplacer(" ", "", "\n", "", "\t", "")
)

// SpiderJokePage downloads a single joke page and extracts its title and text.
//
// The title is the text of the first <title> element and the content is the
// body of the first <div class="con-txt"> element. The page may contain more
// than one match; only the first is used. Either value is the empty string
// when its element is not found. Spaces and newlines are stripped from both;
// tabs and "&nbsp;" entities are also stripped from the content.
//
// A download error is returned unchanged, with empty title and content.
func SpiderJokePage(url string) (title, content string, err error) {
    result, err := HttpGet(url)
    if err != nil {
        return "", "", err
    }

    if m := jokeTitleRe.FindStringSubmatch(result); m != nil {
        title = jokeTitleCleaner.Replace(m[1])
    }

    if m := jokeContentRe.FindStringSubmatch(result); m != nil {
        // The previous code tried to remove a run of four spaces after every
        // space had already been removed, which could never match; the
        // indentation it was meant to strip is tab characters.
        content = jokeContentCleaner.Replace(m[1])
        // Entities are removed after whitespace, as before, so an entity that
        // only forms once whitespace is gone is still stripped.
        content = strings.Replace(content, "&nbsp;", "", -1)
    }
    return title, content, nil
}

// main prompts for the first and last listing page numbers on standard input
// and crawls that inclusive range with toWork.
//
// Input that cannot be read as an integer, or a range whose start is greater
// than its end, is reported and the program exits without crawling.
func main() {
    var start, end int

    fmt.Print("请输入起始页。。。")
    if _, err := fmt.Scan(&start); err != nil {
        // Without this check a bad value would silently leave start at 0.
        fmt.Println("err:", err)
        return
    }

    fmt.Print("请输入终止页。。。")
    if _, err := fmt.Scan(&end); err != nil {
        fmt.Println("err:", err)
        return
    }

    if start > end {
        fmt.Printf("起始页(%d)不能大于终止页(%d)\n", start, end)
        return
    }

    toWork(start, end)
}

go-爬段子

标签:read   let   tps   content   UNC   --   art   tco   package   

原文地址:https://www.cnblogs.com/ygjzs/p/12001375.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!