Scrapy爬取西刺IP列表

零落成泥碾作尘,只有香如故。

创建工程

执行命令:

scrapy startproject xici

新建爬虫

进入到xici目录执行命令

scrapy genspider aixici www.xicidaili.com

编写items

import scrapy

class XiciItem(scrapy.Item):
    """Item holding one proxy entry scraped from xicidaili.com.

    Each field corresponds to one column of the site's #ip_list table.
    """
    # define the fields for your item here like:
    # name = scrapy.Field()
    ip = scrapy.Field()         # proxy IP address
    port = scrapy.Field()       # proxy port
    possion = scrapy.Field()    # location column (NOTE: typo of "position"; kept — the spider uses this key)
    type = scrapy.Field()       # proxy type (HTTP/HTTPS column)
    speed = scrapy.Field()      # connection speed (parsed from the speed bar's title)
    last_time = scrapy.Field()  # last-verified timestamp column

编写爬虫

# -*- coding: utf-8 -*-
import scrapy
from xici.items import XiciItem

class AixiciSpider(scrapy.Spider):
    """Spider that crawls the high-anonymity (/nn/) proxy listing on xicidaili.com."""

    name = "aixici"
    allowed_domains = ["www.xicidaili.com"]
    start_urls = ['http://www.xicidaili.com/']

    def start_requests(self):
        """Return requests for listing pages /nn/1 through /nn/9."""
        reqs = []
        for i in range(1, 10):
            req = scrapy.Request('http://www.xicidaili.com/nn/%s' % i)
            reqs.append(req)

        return reqs

    def parse(self, response):
        """Parse one listing page; return a list of XiciItem, one per table row.

        Column positions (td indexes) follow the site's #ip_list table layout.
        """
        ip_list = response.xpath('//*[@id="ip_list"]')
        trs = ip_list[0].xpath('tr')
        items = []
        # trs[0] is the header row — skip it.
        for ip in trs[1:]:
            pre_item = XiciItem()
            pre_item['ip'] = ip.xpath('td[2]/text()')[0].extract()
            pre_item['port'] = ip.xpath('td[3]/text()')[0].extract()
            pre_item['possion'] = ip.xpath('td[4]')[0].extract().strip()
            pre_item['type'] = ip.xpath('td[6]/text()')[0].extract()
            # Raw string for the regex: avoids the invalid-escape warning for \d and \.
            pre_item['speed'] = ip.xpath('td[7]/div[@class="bar"]/@title').re(r'\d{0,2}\.\d{0,}')[0]
            pre_item['last_time'] = ip.xpath('td[10]/text()')[0].extract()
            items.append(pre_item)
        return items

运行爬虫

scrapy crawl aixici -o item.json