Golang 爬取Discord Invite界面

Discord Invite 页面的内容是延迟加载的，因此在抓取之前需要先等待页面加载完毕。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
package scraper

import (
"context"
"errors"
"github.com/PuerkitoBio/goquery"
"github.com/chromedp/chromedp"
"log"
"strconv"
"strings"
"time"
)

// Counts holds the member statistics read from a Discord invite page.
type Counts struct {
	// DiscordUrl is the URL the counts belong to.
	DiscordUrl string
	// ApproximatePresenceCount is the number of members currently online.
	ApproximatePresenceCount int
	// ApproximateMemberCount is the total number of members.
	ApproximateMemberCount int
}

// GetHttpHtmlContent loads url in a headless Chrome instance driven by
// chromedp and returns the outer HTML of one element of the rendered page.
//
// selector is passed to chromedp.WaitVisible, so the function blocks until a
// matching element is visible; this is what lets it cope with pages that fill
// in their content after the initial load. sel is then evaluated as a
// JavaScript path (chromedp.ByJSPath), for example
// `document.querySelector("form")`, and the outer HTML of the resulting node
// is returned.
//
// A non-nil error is returned, together with an empty string, when the browser
// cannot be started or when navigation, the visibility wait and the HTML
// extraction do not all complete within the 10-second timeout.
func GetHttpHtmlContent(url string, selector string, sel interface{}) (string, error) {
	// Start from chromedp's defaults and layer the scraper-specific flags on
	// top. DefaultExecAllocatorOptions is sliced at full capacity, so append
	// copies it instead of writing into the package-level value.
	options := append(chromedp.DefaultExecAllocatorOptions[:],
		chromedp.Flag("headless", true), // set to false to watch the browser while debugging
		chromedp.Flag("blink-settings", "imagesEnabled=false"),
		chromedp.UserAgent(`Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36`),
	)

	// The allocator's cancel function must be called as well, otherwise the
	// allocator context is never released.
	allocCtx, cancelAlloc := chromedp.NewExecAllocator(context.Background(), options...)
	defer cancelAlloc()

	chromeCtx, cancelChrome := chromedp.NewContext(allocCtx, chromedp.WithLogf(log.Printf))
	defer cancelChrome()

	// Run with no actions to create the Chrome instance up front: start-up
	// then happens before the timeout context below exists, and a launch
	// failure is reported immediately instead of being ignored.
	if err := chromedp.Run(chromeCtx); err != nil {
		return "", err
	}

	// Upper bound for navigating and waiting for the element; raise it for
	// pages that take longer to render.
	timeoutCtx, cancelTimeout := context.WithTimeout(chromeCtx, 10*time.Second)
	defer cancelTimeout()

	var htmlContent string
	err := chromedp.Run(timeoutCtx,
		chromedp.Navigate(url),
		chromedp.WaitVisible(selector),
		chromedp.OuterHTML(sel, &htmlContent, chromedp.ByJSPath),
	)
	if err != nil {
		return "", err
	}
	return htmlContent, nil
}

// GetSpecialData extracts the online and total member counts from an HTML
// fragment.
//
// htmlContent is parsed with goquery and the combined text of every element
// matching selector is handed to parseMemberCounts.
//
// It returns (online, total, nil) on success. On failure both counts are 0 and
// the error describes either the HTML parse failure or why the text could not
// be interpreted as two integers.
func GetSpecialData(htmlContent string, selector string) (int, int, error) {
	dom, err := goquery.NewDocumentFromReader(strings.NewReader(htmlContent))
	if err != nil {
		return 0, 0, err
	}
	return parseMemberCounts(dom.Find(selector).Text())
}

// parseMemberCounts turns text such as "1,234 人在线5,678 位成员" into
// (online, total). The text must reduce to exactly two integer tokens.
func parseMemberCounts(text string) (int, int, error) {
	// Drop thousands separators, and turn the Chinese-locale labels
	// ("人在线" = online, "位成员" = members) into spaces rather than deleting
	// them, so the two numbers stay separated even when the page puts no
	// whitespace between a number and its label.
	cleaned := strings.NewReplacer(",", "", "人在线", " ", "位成员", " ").Replace(text)

	// Splitting on whitespace makes the result independent of leading or
	// trailing spaces; slicing by byte offsets would silently cut a digit off
	// the total whenever the text does not end in exactly one space.
	fields := strings.Fields(cleaned)
	if len(fields) != 2 {
		return 0, 0, errors.New("expected online and total member counts, got " + strconv.Quote(text))
	}

	onlineMembers, err := strconv.Atoi(fields[0])
	if err != nil {
		return 0, 0, err
	}
	totalMembers, err := strconv.Atoi(fields[1])
	if err != nil {
		return 0, 0, err
	}
	return onlineMembers, totalMembers, nil
}

// GetMembers scrapes the Discord invite page at url and returns its online and
// total member counts.
//
// The element path used here matches the page as shown to a visitor who is
// not logged in. A non-nil error is returned, with a nil *Counts, when the page
// cannot be fetched or when the counts cannot be parsed from it.
func GetMembers(url string) (*Counts, error) {
	// Path of the element that wraps the counts (logged-out layout only).
	const selector = "body > div > div.app-3xd6d0 > div > div > div > div > div > form > div > div"
	// The same element expressed as a JS path, derived from selector so the
	// two cannot drift apart.
	jsPath := `document.querySelector("` + selector + `")`

	html, err := GetHttpHtmlContent(url, selector, jsPath)
	if err != nil {
		return nil, err
	}

	onlineMembers, totalMembers, err := GetSpecialData(html, "span")
	if err != nil {
		// Without this check a parse failure would be reported as a
		// successful result with both counts set to zero.
		return nil, err
	}

	return &Counts{
		DiscordUrl:               url,
		ApproximatePresenceCount: onlineMembers,
		ApproximateMemberCount:   totalMembers,
	}, nil
}