熟悉了《Golang 网络爬虫框架gocolly/colly 一》和《Golang 网络爬虫框架gocolly/colly 二》之后,就可以爬取网络上的大部分数据了。本文接下来将爬取中证指数有限公司提供的行业市盈率(http://www.csindex.com.cn/zh-CN/downloads/industry-price-earnings-ratio)。
在这里插入图片描述
定义数据结构体:

// ZhjhHyShyl is one row of the industry price-earnings-ratio table, together
// with its sub-industry rows. The field names are pinyin abbreviations of the
// Chinese column names used as JSON keys.
type ZhjhHyShyl struct {
	Hydm string        `json:"行业代码"` // industry code
	Hymc string        `json:"行业名称"` // industry name
	Zxsj *float64      `json:"最新数据"` // latest value; pointer, so nil is encoded as JSON null
	Gpjs int           `json:"股票家数"` // number of stocks
	Ksjs int           `json:"亏损家数"` // number of loss-making companies
	Jygy *float64      `json:"近一个月"` // value over the last month; nil is encoded as null
	Jsgy *float64      `json:"近三个月"` // value over the last three months; nil is encoded as null
	Jlgy *float64      `json:"近六个月"` // value over the last six months; nil is encoded as null
	Jyn  *float64      `json:"近一年"`  // value over the last year; nil is encoded as null
	Zhy  []*ZhjhHyShyl `json:"细分行业"` // sub-industries
}

接下来为gocolly的调用做准备:将用户代理(User-Agent)设置为浏览器的标识。下面的字符串取自Windows 10上的Edge浏览器(其中同时带有Chrome和Safari标记),该值可以通过Fiddler工具抓包查看:

// Send a desktop-browser User-Agent with every request made through c.
// NOTE(review): the string ends in an Edge token, so it identifies as Edge rather than plain Chrome.
c.UserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 Edge/16.16299"

在这里插入图片描述
还可以参照Fiddler抓到的浏览器请求,在代码中(例如在OnRequest回调里调用r.Headers.Set)设置更多的Request Header,将爬虫工具伪装成浏览器。

接下来按F12打开浏览器的开发者工具,查看目标数据所在的元素,拷贝其jQuery(CSS)选择器,然后改成相对路径。
在这里插入图片描述
完整的数据抓取代码如下:

package main

import (
	"encoding/json"
	"fmt"
	"log"
	"strconv"
	"strings"

	"github.com/PuerkitoBio/goquery"
	"github.com/gocolly/colly"
)

// ZhjhHyShyl is one industry row of the CSRC-classification industry
// price-earnings-ratio table, together with its sub-industry rows. The field
// names are pinyin abbreviations of the Chinese column names used as JSON keys.
type ZhjhHyShyl struct {
	Hydm string        `json:"行业代码"` // industry code
	Hymc string        `json:"行业名称"` // industry name
	Zxsj *float64      `json:"最新数据"` // latest value; left nil (JSON null) when the cell is not a number
	Gpjs int           `json:"股票家数"` // number of stocks
	Ksjs int           `json:"亏损家数"` // number of loss-making companies
	Jygy *float64      `json:"近一个月"` // value over the last month; nil when the cell is not a number
	Jsgy *float64      `json:"近三个月"` // value over the last three months; nil when the cell is not a number
	Jlgy *float64      `json:"近六个月"` // value over the last six months; nil when the cell is not a number
	Jyn  *float64      `json:"近一年"`  // value over the last year; nil when the cell is not a number
	Zhy  []*ZhjhHyShyl `json:"细分行业"` // sub-industries
}

func main() {
	var err error
	c := colly.NewCollector()
	c.UserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 Edge/16.16299"
	zjhHyShyl := make([]*ZhjhHyShyl, 0)
	c.OnRequest(func(r *colly.Request) {
		fmt.Printf("%+v\r\n%+v\r\n", *r, *(r.Headers))
	})
	c.OnHTML("td>table.list-div-table>tbody>tr", func(e *colly.HTMLElement) {
		hyShy := ZhjhHyShyl{
			Hydm: e.ChildText("td:first-child"),
			Hymc: e.ChildText("td:nth-child(2)"),
		}
		zxsj, err := strconv.ParseFloat(e.ChildText("td:nth-child(3)"), 64)
		if err == nil {
			hyShy.Zxsj = &zxsj
		}
		gpjs, err := strconv.ParseInt(e.ChildText("td:nth-child(4)"), 10, 32)
		if err == nil {
			hyShy.Gpjs = int(gpjs)
		}
		ksjs, err := strconv.ParseInt(e.ChildText("td:nth-child(5)"), 10, 32)
		if err == nil {
			hyShy.Ksjs = int(ksjs)
		}
		jygy, err := strconv.ParseFloat(e.ChildText("td:nth-child(6)"), 64)
		if err == nil {
			hyShy.Jygy = &jygy
		}
		jsgy, err := strconv.ParseFloat(e.ChildText("td:nth-child(7)"), 64)
		if err == nil {
			hyShy.Jsgy = &jsgy
		}
		jlgy, err := strconv.ParseFloat(e.ChildText("td:nth-child(8)"), 64)
		if err == nil {
			hyShy.Jlgy = &jlgy
		}
		jyn, err := strconv.ParseFloat(e.ChildText("td:nth-child(9)"), 64)
		if err == nil {
			hyShy.Jyn = &jyn
		}
		zjhHyShyl = append(zjhHyShyl, &hyShy)
		hyShy.Zhy = make([]*ZhjhHyShyl, 0)
		e.DOM.Parent().Parent().Next().Find("table.list-div-table>tbody>tr").Each(func(_ int, s *goquery.Selection) {
			zhy := ZhjhHyShyl{
				Hydm: strings.Trim(s.Find("td:nth-child(1)").Text(), "\r\n\t "),
				Hymc: strings.Trim(s.Find("td:nth-child(2)").Text(), "\r\n\t "),
			}
			zxsj, err := strconv.ParseFloat(strings.Trim(s.Find("td:nth-child(3)").Text(), "\r\n\t "), 64)
			if err == nil {
				zhy.Zxsj = &zxsj
			}
			gpjs, err := strconv.ParseInt(strings.Trim(s.Find("td:nth-child(4)").Text(), "\r\n\t "), 10, 32)
			if err == nil {
				zhy.Gpjs = int(gpjs)
			}
			ksjs, err := strconv.ParseInt(strings.Trim(s.Find("td:nth-child(5)").Text(), "\r\n\t "), 10, 32)
			if err == nil {
				zhy.Ksjs = int(ksjs)
			}
			jygy, err := strconv.ParseFloat(strings.Trim(s.Find("td:nth-child(6)").Text(), "\r\n\t "), 64)
			if err == nil {
				zhy.Jygy = &jygy
			}
			jsgy, err := strconv.ParseFloat(strings.Trim(s.Find("td:nth-child(7)").Text(), "\r\n\t "), 64)
			if err == nil {
				zhy.Jsgy = &jsgy
			}
			jlgy, err := strconv.ParseFloat(strings.Trim(s.Find("td:nth-child(8)").Text(), "\r\n\t "), 64)
			if err == nil {
				zhy.Jlgy = &jlgy
			}
			jyn, err := strconv.ParseFloat(strings.Trim(s.Find("td:nth-child(9)").Text(), "\r\n\t "), 64)
			if err == nil {
				zhy.Jyn = &jyn
			}
			hyShy.Zhy = append(hyShy.Zhy, &zhy)
		})
	})
	c.OnScraped(func(_ *colly.Response) {
		bData, _ := json.MarshalIndent(zjhHyShyl, "", "\t")
		fmt.Println(string(bData))
	})
	err = c.Visit("http://www.csindex.com.cn/zh-CN/downloads/industry-price-earnings-ratio?date=2017-12-27&type=zjh1")
	if err != nil {
		log.Fatal(err)
	}
}

运行后的部分结果:

{
	"行业代码": "D",
	"行业名称": "电力、热力、燃气及水的生产和供应业",
	"最新数据": 20.12,
	"股票家数": 107,
	"亏损家数": 5,
	"近一个月": 19.51,
	"近三个月": 19.7,
	"近六个月": 19.87,
	"近一年": 18.9,
	"细分行业": [{
		"行业代码": "44",
		"行业名称": "电力、热力生产和供应业",
		"最新数据": 18.75,
		"股票家数": 70,
		"亏损家数": 3,
		"近一个月": 18.28,
		"近三个月": 18.43,
		"近六个月": 18.55,
		"近一年": 17.44,
		"细分行业": null
	}, {
		"行业代码": "45",
		"行业名称": "燃气生产和供应业",
		"最新数据": 28.4,
		"股票家数": 22,
		"亏损家数": 2,
		"近一个月": 25.71,
		"近三个月": 25.33,
		"近六个月": 25.38,
		"近一年": 27.24,
		"细分行业": null
	}, {
		"行业代码": "46",
		"行业名称": "水的生产和供应业",
		"最新数据": 27.78,
		"股票家数": 15,
		"亏损家数": 0,
		"近一个月": 27.88,
		"近三个月": 29.33,
		"近六个月": 30.56,
		"近一年": 29.64,
		"细分行业": null
	}]
}