转载于:

goquery 是什么

goquery 是用 Go 实现的一个类似于 jQuery 的库,它封装了 Go 标准库 net/html 和 CSS 库 cascadia,提供了与 jQuery 相近的接口。

Go 著名的爬虫框架 colly 就是基于 goquery 实现的。

goquery 能用来干什么

goquery 提供了与 jQuery 相近的接口,可以对爬取到的 HTML 进行过滤以获得自己想要的数据。

goquery quick start

Document 是 goquery 包的核心类之一,创建一个 Document 是使用 goquery 的第一步:

// Document is the entry-point type of goquery. It embeds *Selection, so every
// Selection method (for example Find) can be called directly on a Document.
type Document struct {
    *Selection
    // Url is the URL associated with the document.
    // NOTE(review): presumably only populated by the URL/response based constructors — confirm.
    Url      *url.URL
    // rootNode is the HTML node the document is rooted at (NewDocumentFromNode takes such a root).
    rootNode *html.Node
}

// NewDocumentFromNode creates a Document from an existing *html.Node used as
// its root. It is the only constructor listed here that returns no error.
func NewDocumentFromNode(root *html.Node) *Document 
// NewDocument creates a Document from a URL string.
// NOTE(review): presumably fetches and parses the page at url — confirm against the goquery docs.
func NewDocument(url string) (*Document, error)
// NewDocumentFromReader creates a Document from the HTML read from r
// (the examples in this article pass a strings.Reader holding the markup).
func NewDocumentFromReader(r io.Reader) (*Document, error)
// NewDocumentFromResponse creates a Document from an *http.Response.
// NOTE(review): presumably parses the response body — confirm against the goquery docs.
func NewDocumentFromResponse(res *http.Response) (*Document, error)
其中 `rootNode` 是 HTML 文档的根节点;`Url` 字段作用不大,只有通过 `NewDocument` 或 `NewDocumentFromResponse` 创建文档时才会被赋值。
有了 `Document` 类之后,就可以使用它从 `Selection` 类继承来的 `Find` 函数筛选出想要的数据,例如:
// TestFind parses a small HTML fragment and prints the text of every element
// matched by the tag selector "div".
func TestFind(t *testing.T) {
    src := `<body>
                <div>DIV1</div>
                <div>DIV2</div>
                <span>SPAN</span>
            </body>
            `

    dom, err := goquery.NewDocumentFromReader(strings.NewReader(src))
    if err != nil {
        // Fail through the testing framework; log.Fatalln would call os.Exit
        // and abort the whole test binary.
        t.Fatalf("parse HTML: %v", err)
    }

    dom.Find("div").Each(func(_ int, sel *goquery.Selection) {
        fmt.Println(sel.Text())
    })
}
------------运行结果--------------
=== RUN   TestFind
DIV1
DIV2

玩转goquery.Find()

`Find` 函数是 goquery 中最常用的函数之一,下面逐一介绍 `Find` 函数配合各类选择器的用法。

查找多个标签

使用 `,`(逗号)分隔多个标签,即可同时查找多个标签:
// TestMultiFind prints the text of every element matched by the
// comma-separated selector "div,span".
func TestMultiFind(t *testing.T) {
    src := `<body>
                <div>DIV1</div>
                <div>DIV2</div>
                <span>SPAN</span>
            </body>
            `

    dom, err := goquery.NewDocumentFromReader(strings.NewReader(src))
    if err != nil {
        // Fail through the testing framework; log.Fatalln would call os.Exit
        // and abort the whole test binary.
        t.Fatalf("parse HTML: %v", err)
    }

    dom.Find("div,span").Each(func(_ int, sel *goquery.Selection) {
        fmt.Println(sel.Text())
    })
}
------------运行结果--------------
=== RUN   TestMultiFind
DIV1
DIV2
SPAN

Id 选择器

使用 `#` 加上元素的 id 来按 id 查找元素:
// TestFind_IdSelector prints the text of the element matched by the id
// selector "#div1".
func TestFind_IdSelector(t *testing.T) {
    src := `<body>
                <div id="div1">DIV1</div>
                <div>DIV2</div>
                <span>SPAN</span>
            </body>
            `

    dom, err := goquery.NewDocumentFromReader(strings.NewReader(src))
    if err != nil {
        // Fail through the testing framework; log.Fatalln would call os.Exit
        // and abort the whole test binary.
        t.Fatalf("parse HTML: %v", err)
    }

    dom.Find("#div1").Each(func(_ int, sel *goquery.Selection) {
        fmt.Println(sel.Text())
    })
}
------------运行结果--------------
=== RUN   TestFind_IdSelector
DIV1

Class 选择器

使用 `.` 加上类名来按 class 查找元素:
// TestFind_ClassSelector prints the text of every element matched by the
// class selector ".name".
func TestFind_ClassSelector(t *testing.T) {
    src := `<body>
                <div>DIV1</div>
                <div class="name">DIV2</div>
                <span>SPAN</span>
            </body>
            `

    dom, err := goquery.NewDocumentFromReader(strings.NewReader(src))
    if err != nil {
        // Fail through the testing framework; log.Fatalln would call os.Exit
        // and abort the whole test binary.
        t.Fatalf("parse HTML: %v", err)
    }

    dom.Find(".name").Each(func(_ int, sel *goquery.Selection) {
        fmt.Println(sel.Text())
    })
}
------------运行结果--------------
=== RUN   TestFind_ClassSelector
DIV2

属性选择器

使用 `[]` 按属性查找元素:
// TestFind_AttributeSelector prints the text of every <div> that carries a
// lang attribute, using the attribute selector "div[lang]".
func TestFind_AttributeSelector(t *testing.T) {
    src := `<body>
                <div>DIV1</div>
                <div lang="zh">DIV2</div>
                <span>SPAN</span>
            </body>
            `

    dom, err := goquery.NewDocumentFromReader(strings.NewReader(src))
    if err != nil {
        // Fail through the testing framework; log.Fatalln would call os.Exit
        // and abort the whole test binary.
        t.Fatalf("parse HTML: %v", err)
    }

    dom.Find("div[lang]").Each(func(_ int, sel *goquery.Selection) {
        fmt.Println(sel.Text())
    })
}
------------运行结果--------------
=== RUN   TestFind_AttributeSelector
DIV2

属性选择器也支持表达式过滤,比如:

// TestFind_AttributeSelector_2 prints the text of every <div> whose lang
// attribute equals "zh", using the attribute selector "div[lang=zh]".
func TestFind_AttributeSelector_2(t *testing.T) {
    src := `<body>
                <div>DIV1</div>
                <div lang="zh">DIV2</div>
                <div lang="en">DIV3</div>
                <span>SPAN</span>
            </body>
            `

    dom, err := goquery.NewDocumentFromReader(strings.NewReader(src))
    if err != nil {
        // Fail through the testing framework; log.Fatalln would call os.Exit
        // and abort the whole test binary.
        t.Fatalf("parse HTML: %v", err)
    }

    dom.Find("div[lang=zh]").Each(func(_ int, sel *goquery.Selection) {
        fmt.Println(sel.Text())
    })
}
------------运行结果--------------
=== RUN   TestFind_AttributeSelector_2
DIV2
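
| 选择器 | 说明 |
| --- | --- |
| `Find("div[lang]")` | 筛选含有 lang 属性的 div 元素 |
| `Find("div[lang=zh]")` | 筛选 lang 属性为 zh 的 div 元素 |
| `Find("div[lang!=zh]")` | 筛选 lang 属性不等于 zh 的 div 元素 |
| `Find("div[lang\|=zh]")` | 筛选 lang 属性为 zh 或者以 zh- 开头的 div 元素 |
| `Find("div[lang*=zh]")` | 筛选 lang 属性包含 zh 这个字符串的 div 元素 |
| `Find("div[lang~=zh]")` | 筛选 lang 属性包含 zh 这个单词的 div 元素,单词之间以空格分开 |
| `Find("div[lang$=zh]")` | 筛选 lang 属性以 zh 结尾的 div 元素,区分大小写 |
| `Find("div[lang^=zh]")` | 筛选 lang 属性以 zh 开头的 div 元素,区分大小写 |

多个属性选择器可以组合使用,例如 `Find("div[id][lang=zh]")` 筛选同时含有 id 属性且 lang 属性为 zh 的 div 元素。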

子节点选择器

使用 `>` 查找某个节点的直接子节点:
// TestFind_ChildrenSelector prints the text of every <span> that is a direct
// child of <body>, using the child selector "body>span".
func TestFind_ChildrenSelector(t *testing.T) {
    src := `<body>
                <div>DIV1</div>
                <div>DIV2</div>
                <span>SPAN</span>
            </body>
            `

    dom, err := goquery.NewDocumentFromReader(strings.NewReader(src))
    if err != nil {
        // Fail through the testing framework; log.Fatalln would call os.Exit
        // and abort the whole test binary.
        t.Fatalf("parse HTML: %v", err)
    }

    dom.Find("body>span").Each(func(_ int, sel *goquery.Selection) {
        fmt.Println(sel.Text())
    })
}
------------运行结果--------------
=== RUN   TestFind_ChildrenSelector
SPAN

此外,`+` 表示相邻兄弟选择器(匹配紧跟在其后的下一个兄弟节点),`~` 表示通用兄弟选择器(匹配同一父节点下位于其后的所有兄弟节点)。

内容过滤器

过滤文本

使用 `:contains($text)` 过滤出文本中包含指定内容的元素:
// TestFind_ContentFilter_Contains prints the text of every <div> whose text
// contains "V2", using the content filter "div:contains(V2)".
func TestFind_ContentFilter_Contains(t *testing.T) {
    src := `<body>
                <div>DIV1</div>
                <div>DIV2</div>
                <span>SPAN</span>
            </body>
            `

    dom, err := goquery.NewDocumentFromReader(strings.NewReader(src))
    if err != nil {
        // Fail through the testing framework; log.Fatalln would call os.Exit
        // and abort the whole test binary.
        t.Fatalf("parse HTML: %v", err)
    }

    dom.Find("div:contains(V2)").Each(func(_ int, sel *goquery.Selection) {
        fmt.Println(sel.Text())
    })
}
------------运行结果--------------
=== RUN   TestFind_ContentFilter_Contains
DIV2

过滤节点

使用 `:has($selector)` 过滤出包含指定后代节点的元素:

// TestFind_ContentFilter_Has prints the text of every <span> that contains a
// <div>, using the content filter "span:has(div)".
func TestFind_ContentFilter_Has(t *testing.T) {
    src := `<body>
                <span>SPAN1</span>
                <span>
                    SPAN2
                    <div>DIV</div>
                </span>
            </body>
            `

    dom, err := goquery.NewDocumentFromReader(strings.NewReader(src))
    if err != nil {
        // Fail through the testing framework; log.Fatalln would call os.Exit
        // and abort the whole test binary.
        t.Fatalf("parse HTML: %v", err)
    }

    dom.Find("span:has(div)").Each(func(_ int, sel *goquery.Selection) {
        fmt.Println(sel.Text())
    })
}
------------运行结果--------------
=== RUN   TestFind_ContentFilter_Has
SPAN2
DIV

此外,`:first-child`、`:first-of-type` 分别筛选出父节点的第一个子节点、同类型兄弟节点中的第一个;
`:last-child`、`:last-of-type` 则对应最后一个;`:nth-child(n)`、`:nth-of-type(n)` 用于筛选第 n 个(n 从 1 开始)。

goquery 源码分析

`Find` 函数的源码如下:
// Find compiles selector into a Matcher, runs findWithMatcher over the
// receiver's nodes with it, and returns the resulting nodes wrapped in a new
// Selection created by pushStack (with the receiver as its predecessor).
func (s *Selection) Find(selector string) *Selection {
    matcher := compileMatcher(selector)
    matched := findWithMatcher(s.Nodes, matcher)
    return pushStack(s, matched)
}
`Find` 函数内部调用了 `pushStack` 函数,其实现如下:
// pushStack wraps nodes in a new Selection that shares fromSel's document and
// records fromSel as the selection it was derived from.
func pushStack(fromSel *Selection, nodes []*html.Node) *Selection {
    return &Selection{
        Nodes:    nodes,
        document: fromSel.document,
        prevSel:  fromSel,
    }
}
`pushStack` 函数用 `nodes` 参数创建了一个新的 `Selection`。`pushStack` 返回的 `Selection` 类型定义如下:
// Selection is a set of HTML nodes together with the Document they belong to
// and a link back to the Selection it was derived from.
type Selection struct {
    // Nodes holds the nodes that make up this selection.
    Nodes    []*html.Node
    // document is the owning Document; pushStack copies it from the source selection.
    document *Document
    prevSel  *Selection // the previous Selection this one was derived from (set by pushStack)
}
`pushStack` 函数的 `nodes` 参数就是筛选的结果;在 `Find` 函数中,这个 `nodes` 参数来自 `findWithMatcher` 函数的返回值:
// findWithMatcher visits the direct children of every node in nodes and, for
// each child that is an element node, collects whatever m.MatchAll returns for
// it. The per-node results are merged by mapNodes.
func findWithMatcher(nodes []*html.Node, m Matcher) []*html.Node {
    collect := func(_ int, parent *html.Node) []*html.Node {
        var found []*html.Node
        for child := parent.FirstChild; child != nil; child = child.NextSibling {
            // Only element nodes are handed to the matcher.
            if child.Type != html.ElementNode {
                continue
            }
            found = append(found, m.MatchAll(child)...)
        }
        return found
    }
    return mapNodes(nodes, collect)
}
`findWithMatcher` 函数又调用了 `mapNodes` 函数:
// mapNodes calls f with the index and value of every node in nodes and merges
// the non-empty results through appendWithoutDuplicates, which shares a single
// node set across all calls.
func mapNodes(nodes []*html.Node, f func(int, *html.Node) []*html.Node) (result []*html.Node) {
    seen := make(map[*html.Node]bool)
    for idx, node := range nodes {
        mapped := f(idx, node)
        if len(mapped) == 0 {
            continue // nothing to merge for this node
        }
        result = appendWithoutDuplicates(result, mapped, seen)
    }
    return result
}
`mapNodes` 函数的参数 `f` 会依次作用于每个节点,返回的 `[]*html.Node` 经去重后合并到结果中。这里传入的 `f func(int, *html.Node) []*html.Node` 就是下面这个函数:
func(i int, n *html.Node) (result []*html.Node) {
    for c := n.FirstChild; c != nil; c = c.NextSibling {
        if c.Type == html.ElementNode {
            result = append(result, m.MatchAll(c)...)
        }
    }
    return
}

![img.png](img.png)
该函数遍历 `html.Node` 节点的每个子节点,并对其中的元素节点调用 `MatchAll` 函数。`MatchAll` 函数的定义如下:
// Matcher is the selector abstraction used by Find: compileMatcher returns
// one, and findWithMatcher calls MatchAll on it for each element child.
type Matcher interface {
    // Match takes a single node and returns a bool.
    // NOTE(review): presumably reports whether the node satisfies the selector — confirm.
    Match(*html.Node) bool
    // MatchAll takes a single node and returns a slice of nodes.
    // NOTE(review): presumably the matching nodes in that node's subtree — confirm.
    MatchAll(*html.Node) []*html.Node
    // Filter takes a slice of nodes and returns a slice of nodes.
    // NOTE(review): presumably the subset that satisfies the selector — confirm.
    Filter([]*html.Node) []*html.Node
}

// compileMatcher compiles the selector string s with cascadia.Compile and
// returns the compiled selector as a Matcher. If compilation fails, it returns
// an invalidMatcher instead of surfacing the error.
func compileMatcher(s string) Matcher {
    if compiled, err := cascadia.Compile(s); err == nil {
        return compiled
    }
    return invalidMatcher{}
}
`MatchAll` 函数属于 `Matcher` 接口,`compileMatcher(s string)` 通过 `cascadia` 库返回一个 `Matcher` 实现类,其中参数 `s` 就是调用 `dom.Find("div")` 时传入的选择器字符串。

图解源码

Find函数



总结

本文介绍了 `Find` 函数配合各类选择器的用法,并结合源码分析了 `Find` 函数的实现。