Golang单任务版爬虫

你的善良,必须带点锋芒,否则等于零。

项目结构

源码分析

main.go:程序入口,以珍爱网城市列表页作为种子请求启动引擎。

package main

import (
"test/engine"
"test/parser"
)

// main starts the crawl from the zhenai.com city-list page, which is parsed
// by parser.ParseCityList.
func main() {
	seed := engine.Request{
		Url:        "http://www.zhenai.com/zhenghun",
		ParserFunc: parser.ParseCityList,
	}
	engine.Run(seed)
}

types.go:定义引擎使用的 Request 与 ParseResult 两个类型。

package engine

// ParseResult is the result returned by a parser after processing one page.
type ParseResult struct {
// Requests holds the follow-up requests discovered on the page; the engine
// appends them to its work queue.
Requests []Request
// Items holds the values extracted from the page; the engine logs each one.
Items []interface{}
}


// Request describes one unit of crawl work: a URL together with the parser
// that should be applied to the fetched content.
type Request struct {
Url string // URL extracted by a parser (or supplied as a seed)
ParserFunc func([]byte) ParseResult // function used to process the content of this URL
}


// NilParser is a placeholder parser: it ignores its input and returns an
// empty ParseResult.
func NilParser(_ []byte) ParseResult {
	var empty ParseResult
	return empty
}

engine.go:单任务引擎,用一个队列依次抓取并解析请求。

package engine

import (
"log"
"test/fetcher"
)


// Run drives the single-task crawler. It processes requests in first-in,
// first-out order starting from seeds: each URL is fetched, the body is handed
// to the request's ParserFunc, the requests it returns are appended to the
// queue, and every returned item is logged. Run returns once the queue is
// empty.
//
// A fetch error is logged and the request is skipped. A request without a
// ParserFunc is also logged and skipped, since calling a nil function value
// would panic and abort the whole crawl.
func Run(seeds ...Request) {
	// Copy the seeds so the queue never aliases the caller's slice.
	var requests []Request
	requests = append(requests, seeds...)

	for len(requests) > 0 {
		r := requests[0]
		requests = requests[1:]

		// Check before fetching so no download is wasted on a request that
		// cannot be parsed.
		if r.ParserFunc == nil {
			log.Printf("Engine: no parser for url %s, skipping", r.Url)
			continue
		}

		log.Printf("Fetching %s", r.Url)
		body, err := fetcher.Fetch(r.Url)
		if err != nil {
			log.Printf("Fetcher: error fetching url %s %v", r.Url, err)
			continue
		}

		parseResult := r.ParserFunc(body)
		requests = append(requests, parseResult.Requests...)
		for _, item := range parseResult.Items {
			log.Printf("Got item %v", item)
		}
	}
}

fetcher.go:根据 URL 抓取页面,并把内容统一转换为 UTF-8。

package fetcher

import (
"bufio"
"fmt"
"golang.org/x/net/html/charset"
"golang.org/x/text/encoding"
"golang.org/x/text/encoding/unicode"
"golang.org/x/text/transform"
"io/ioutil"
"log"
"net/http"
"strings"
)


// Fetch downloads the page at url and returns its body converted to UTF-8.
//
// A leading "http://" scheme is upgraded to "https://". The request is sent
// through http.DefaultClient with a mobile Safari User-Agent and a hard-coded
// session cookie. Fetch returns an error if the request cannot be built or
// sent, or if the response status is anything other than 200 OK.
//
// NOTE(review): http.DefaultClient has no timeout configured, so a stalled
// server can block the crawl indefinitely.
func Fetch(url string) ([]byte, error) {
	// Rewrite only the scheme itself; an "http://" appearing later in the URL
	// (for example inside a query parameter) must be left untouched.
	newUrl := url
	if strings.HasPrefix(url, "http://") {
		newUrl = "https://" + strings.TrimPrefix(url, "http://")
	}

	request, err := http.NewRequest(http.MethodGet, newUrl, nil)
	if err != nil {
		// request is nil here; touching request.Header would panic.
		return nil, err
	}
	request.Header.Add("User-Agent", "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1")

	// NOTE(review): the original author notes this cookie is only valid for
	// about one minute, so it has to be refreshed by hand before each run. A
	// session cookie should not live in source code; consider loading it from
	// configuration or the environment.
	cookie := "ec=efexs2yx-1635059763651-4354f6812c7ee-1160473905; FSSBBIl1UgzbN7NO=50VBKV5hc0utxjqurAbThL2oOHItIEhFghdO1gdWyFhazAg7AKjmWBYnVMsPSVDRiSAKC1Xi7Q2JXDaDrAzn2ZA; sid=WoTVURTyXnfFuyrLjyvP; _exid=SOFFj%2B9FuKQsSSQgAHQq1wFmbHHxiumUgMg4agh6CvCmn1kBX5i%2FP3FzOUtJH99IXis2NS1cc7PdW1UAoVTmSA%3D%3D; _efmdata=7d%2FA9aWpDcKzs29MsTO%2BMHTqjG75uCjz%2BbNOD2sXTBvXMYfHCfSN3NSqzEYUGO5tguKli8YS2L9O%2BK0VpRv%2BFuWqAGZg0j5PpOcDcQTauuM%3D; FSSBBIl1UgzbN7NP=53UVQdDmToH3qqqmZ7Uve0qwxpoCJMOOrBMPxhuW8HW8qPf4rKCR6C4GFdQb8y3yKH_bypWGA3JltlGZ0vedtR6ADPk5sn5aPIY7hB6efB0Wb097cXT1oSxo9LEwwzKsCBODguMmpY6uu3ArHiNpf9jssSGUNPokhuA9tR2Yz7SzJj4GdbBbGGjXP0Ud_TqzsxWtedn9P7LbXq_bZhEMMoR3i74qQ8_iTxWxJ0mTxcp1b3_0PrIoM1wqhWiwJ9256Nlqw4AU5x7UvW4f_TW0cBtBBCkP9L3ud_3O1cKMFZEOaAJ0_a2UKSM70P0s87RA8G"
	request.Header.Add("cookie", cookie)

	resp, err := http.DefaultClient.Do(request)
	if err != nil {
		// resp is nil on error; deferring resp.Body.Close() would panic.
		return nil, err
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		fmt.Println("resp:", resp)
		return nil, fmt.Errorf("error:status code:%d", resp.StatusCode)
	}

	// The page may not be UTF-8: sniff its encoding from the buffered prefix
	// and decode the whole stream to UTF-8 while reading.
	bodyReader := bufio.NewReader(resp.Body)
	e := determineEncoding(bodyReader)
	utf8Reader := transform.NewReader(bodyReader, e.NewDecoder())
	return ioutil.ReadAll(utf8Reader)
}



// determineEncoding guesses the character encoding of the stream behind r by
// peeking at up to its first 1024 bytes; the peeked bytes are not consumed.
// It falls back to UTF-8, and logs the error, only when nothing at all could
// be peeked.
func determineEncoding(r *bufio.Reader) encoding.Encoding {
	// Peek returns an error whenever fewer than 1024 bytes are available, but
	// it still returns the bytes it did read. A short page is therefore not a
	// failure: sniff whatever was returned. If the short read was caused by a
	// real I/O error, the caller's subsequent read will surface it.
	peeked, err := r.Peek(1024)
	if err != nil && len(peeked) == 0 {
		log.Printf("Ftcher error:%v", err)
		return unicode.UTF8
	}
	e, _, _ := charset.DetermineEncoding(peeked, "")
	return e
}

citylistparser.go:城市列表解析器,从城市列表页提取各城市的名称和链接。

package parser
import (
"regexp"
"test/engine"
)
const cityListRe = `<a href="(http://www.zhenai.com/zhenghun/[0-9a-z]+)" [^>]*>([^<]+)</a>`

// maxCities caps how many cities ParseCityList follows per page. The original
// code stopped after two matches; the limit is kept and given a name.
const maxCities = 2

// cityListRegexp is compiled once at package initialisation instead of on
// every ParseCityList call.
var cityListRegexp = regexp.MustCompile(cityListRe)

// ParseCityList extracts city links from the city-list page. For each match,
// up to maxCities, it records the city name as an item and queues a request
// for the city page to be parsed by ParseCity.
func ParseCityList(contents []byte) engine.ParseResult {
	result := engine.ParseResult{}

	// Passing the limit to FindAllSubmatch stops the scan after maxCities
	// matches instead of matching the whole page and discarding the rest.
	for _, m := range cityListRegexp.FindAllSubmatch(contents, maxCities) {
		result.Items = append(result.Items, string(m[2])) // city name
		result.Requests = append(result.Requests, engine.Request{
			Url:        string(m[1]),
			ParserFunc: ParseCity,
		})
	}

	return result
}

cityparser.go:城市页解析器,从城市页提取用户名称和用户主页链接。

package parser

import (
"fmt"
"regexp"
"test/engine"
)

const cityRe = `<a href="(http://album.zhenai.com/u/[0-9]+)"[^>]*>([^<]+)</a>`

// cityRegexp is compiled once at package initialisation; ParseCity runs for
// every city page, so compiling inside the function repeated the work.
var cityRegexp = regexp.MustCompile(cityRe)

// ParseCity extracts user links from a city page. For each match it prints
// the user's URL, records "User:<name>" as an item, and queues a request for
// the profile page whose parser forwards the user's name to ParseProfile.
func ParseCity(contents []byte) engine.ParseResult {
	result := engine.ParseResult{}
	for _, m := range cityRegexp.FindAllSubmatch(contents, -1) {
		profileURL := string(m[1])
		// name is declared inside the loop body, so each closure below
		// captures its own copy.
		name := string(m[2])

		fmt.Println("用户url:", profileURL)
		result.Items = append(result.Items, "User:"+name) // user name
		result.Requests = append(result.Requests, engine.Request{
			Url: profileURL,
			ParserFunc: func(body []byte) engine.ParseResult {
				return ParseProfile(body, name)
			},
		})
	}
	return result
}

profileparser.go:用户资料解析器,从页面内嵌的 JSON 数据中提取用户信息。

package parser

import (
"fmt"
"github.com/bitly/go-simplejson"
"log"
"regexp"
"test/engine"
"test/model"
)

// re captures the JSON assigned to window.__INITIAL_STATE__ in a profile
// page's inline script.
var re = regexp.MustCompile(`<script>window.__INITIAL_STATE__=(.+);\(function`)

// ParseProfile extracts a user profile from a profile page. When the embedded
// state JSON is found, it is decoded by parseJson, the given name is set on
// the profile, and the profile is returned as the single item of the result.
// When no JSON is found the result is empty. Progress is printed to stdout.
func ParseProfile(contents []byte, name string) engine.ParseResult {
	fmt.Println("-------------------")

	var result engine.ParseResult
	groups := re.FindSubmatch(contents)
	if len(groups) < 2 {
		// No embedded state on this page: nothing to extract.
		return result
	}

	profile := parseJson(groups[1])
	profile.Name = name
	result.Items = append(result.Items, profile)
	fmt.Println(result)
	return result
}



// parseJson decodes the state JSON embedded in a profile page and maps the
// entries of objectInfo.basicInfo onto a model.Profile by position.
//
// If the JSON cannot be decoded, or basicInfo is not an array, the problem is
// logged and a zero-value Profile is returned. Entries that are not strings
// are skipped.
func parseJson(json []byte) model.Profile {
	var profile model.Profile

	res, err := simplejson.NewJson(json)
	if err != nil {
		// res must not be used after a failed decode; the original carried on
		// and dereferenced it.
		log.Println("解析json失败。。", err)
		return profile
	}

	infos, err := res.Get("objectInfo").Get("basicInfo").Array()
	if err != nil {
		log.Printf("parseJson: objectInfo.basicInfo is not an array: %v", err)
		return profile
	}

	// infos is a []interface{}, so each entry is type-asserted to string.
	// Sample payload:
	//
	//	"basicInfo":[
	//	    "未婚",                 // 0: marital status
	//	    "25岁",                 // 1: age
	//	    "魔羯座(12.22-01.19)",  // 2: zodiac sign
	//	    "152cm",                // 3: height
	//	    "42kg",                 // 4: weight
	//	    "工作地:阿坝茂县",      // 5: work location (not mapped)
	//	    "月收入:3-5千",         // 6: monthly income
	//	    "医生",                 // 7: occupation
	//	    "大专"                  // 8: education
	//	],
	for k, v := range infos {
		e, ok := v.(string)
		if !ok {
			continue
		}
		switch k {
		case 0:
			profile.Marriage = e
		case 1:
			// Age arrives as text such as "47岁". An int field would have to
			// be filled from a different JSON field.
			profile.Age = e
		case 2:
			profile.Xingzuo = e
		case 3:
			profile.Height = e
		case 4:
			profile.Weight = e
		// Index 5 (work location) has no assignment in the original code and
		// is left unmapped here as well.
		case 6:
			profile.Income = e
		case 7:
			profile.Occupation = e
		case 8:
			profile.Education = e
		}
	}
	return profile
}

user.go:珍爱网用户资料的数据模型。

package model

// Profile is the user model for zhenai.com. Every field is stored as the raw
// text scraped from the page.
type Profile struct {
Name string // name
Marriage string // marital status
Age string // age
Gender string // gender
Height string // height
Weight string // weight
Income string // income
Education string // education
Occupation string // occupation
Hukou string // registered hometown (hukou)
Xingzuo string // zodiac sign
House string // housing
Car string // car
}