Since posting my original question, I have tried the following packages, but neither has solved the problem: neither seems to return the child nodes or nested tags from inside the body. For example, given this document:

<!DOCTYPE html>
<html>
    <head>
        <title>
            Title of the document
        </title>
    </head>
    <body>
        body content 
        <p>more content</p>
    </body>
</html> 
Packages tried:
  • encoding/xml (the standard library XML package)
  • golang.org/x/net/html

The overall goal is to obtain a string whose content looks like this:

<body>
    body content 
    <p>more content</p>
</body>

This can be solved by recursively searching the parsed tree for the body node, using the html package, and then rendering the HTML starting from that node.

package main

import (
    "bytes"
    "errors"
    "fmt"
    "golang.org/x/net/html"
    "io"
    "strings"
)

// getBody returns the <body> element of the node tree rooted at doc.
//
// The tree is searched depth-first in document order, and the search stops at
// the first element node whose Data is "body". An error is returned when doc
// is nil or when no such element exists in the tree.
func getBody(doc *html.Node) (*html.Node, error) {
    var find func(*html.Node) *html.Node
    find = func(n *html.Node) *html.Node {
        if n.Type == html.ElementNode && n.Data == "body" {
            return n
        }
        for c := n.FirstChild; c != nil; c = c.NextSibling {
            // Propagate a hit immediately so the rest of the tree is not walked.
            if found := find(c); found != nil {
                return found
            }
        }
        return nil
    }

    // Guard the root: find dereferences its argument unconditionally.
    if doc != nil {
        if body := find(doc); body != nil {
            return body, nil
        }
    }
    return nil, errors.New("Missing <body> in the node tree")
}

// renderNode serializes n, together with its descendants, into an HTML string.
//
// The markup is written to an in-memory buffer via html.Render. The error
// returned by html.Render is discarded, so the buffer's contents are returned
// unconditionally.
func renderNode(n *html.Node) string {
    out := new(bytes.Buffer)
    var sink io.Writer = out
    // The signature has no error result, so the render error is dropped here.
    _ = html.Render(sink, n)
    return out.String()
}

// main parses the sample document held in htm, locates its <body> element and
// prints that element's rendered HTML.
//
// A failure while parsing or while locating the body is printed instead of
// being silently ignored, and the program then returns without rendering.
func main() {
    doc, err := html.Parse(strings.NewReader(htm))
    if err != nil {
        fmt.Println("parsing HTML:", err)
        return
    }

    bn, err := getBody(doc)
    if err != nil {
        fmt.Println("locating body:", err)
        return
    }

    fmt.Println(renderNode(bn))
}

// htm is the sample HTML document that main parses.
const htm = `<!DOCTYPE html>
<html>
<head>
    <title></title>
</head>
<body>
    body content
    <p>more content</p>
</body>
</html>`

这篇关于Golang解析HTML,提取<body>…</body>标签中所有内容的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!