【scrapy框架】王者荣耀英雄信息爬取 python爬虫-聊城职业技术学院网站建设工作室

首页> 学习资料

【scrapy框架】王者荣耀英雄信息爬取 python爬虫

作者：裴静博时间：2020-06-18 点击数：

【scrapy框架】王者荣耀英雄信息爬取 python爬虫

分析

入口页面地址

https://pvp.qq.com/web201605/herolist.shtml

·第一步获取所有英雄的列表

可以看到英雄列表是在源码中可以被找到的

·第二步获取英雄的各种信息

英雄的基本信息放在一个class = "cover"的div中我们主要采集英雄的名称和技能介绍

技能部分都在 class=" zk-con3 zk-con" 中中的 ul中

爬取英雄列表

·创建工程

scrapy startproject wzry

cd wzry

·创建爬虫

scrapy genspider wzry_spider pvp.qq.com

·修改配置

# 不遵循robots协议因为站点没有这个文件爬虫会直接略过

ROBOTSTXT_OBEY = False

# 添加请求头

DEFAULT_REQUEST_HEADERS = {

'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',

'Accept-Language': 'en',

'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'

}

# 下载延迟

DOWNLOAD_DELAY = 1

·修改初始页面

start_urls = ['https://pvp.qq.com/web201605/herolist.shtml']

·创建爬取列表方法(测试)

def parse(self, response):

print("=" * 50)

print(response)

print("=" * 50)

·运行爬虫

scrapy crawl wzry_spider

2020-05-25 11:08:53 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6023

2020-05-25 11:08:54 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://pvp.qq.com/web201605/herolist.shtml> (referer: None)

==================================================

<200 https://pvp.qq.com/web201605/herolist.shtml>

==================================================

2020-05-25 11:08:54 [scrapy.core.engine] INFO: Closing spider (finished)

2020-05-25 11:08:54 [scrapy.statscollectors] INFO: Dumping Scrapy stats:

{'downloader/request_bytes': 315,

·成功返回结果

·爬取所有英雄链接方法

class WzrySpiderSpider(scrapy.Spider):

name = 'wzry_spider'

allowed_domains = ['pvp.qq.com']

start_urls = ['https://pvp.qq.com/web201605/herolist.shtml'] # 起始url

base_url = "https://pvp.qq.com/web201605/" # url 前缀

def parse(self, response):

print("=" * 50)

# print(response.body)

hero_list = response.xpath("//ul[@class='herolist clearfix']//li")

for hero in hero_list:

url = self.base_url + hero.xpath("./a/@href").get()

print(url)

# yield scrapy.Request(url)

print("=" * 50)

https://pvp.qq.com/web201605/herodetail/114.shtml

https://pvp.qq.com/web201605/herodetail/113.shtml

https://pvp.qq.com/web201605/herodetail/112.shtml

https://pvp.qq.com/web201605/herodetail/111.shtml

https://pvp.qq.com/web201605/herodetail/110.shtml

https://pvp.qq.com/web201605/herodetail/109.shtml

https://pvp.qq.com/web201605/herodetail/108.shtml

https://pvp.qq.com/web201605/herodetail/107.shtml

https://pvp.qq.com/web201605/herodetail/106.shtml

https://pvp.qq.com/web201605/herodetail/105.shtml

==================================================

爬取英雄详情页面

·获得基本信息

# 基本信息块

hero_info = response.xpath("//div[@class='cover']")

hero_name = hero_info.xpath(".//h2[@class='cover-name']/text()").get()

print(hero_name)

# 分类

sort_num = hero_info.xpath(".//span[@class='herodetail-sort']/i/@class").get()[-1:]

print(sort_num)

# 生存能力

viability = hero_info.xpath(".//ul/li[1]/span/i/@style").get()[6:]

print("生存能力:" + viability)

# 伤害

aggressivity = hero_info.xpath(".//ul/li[2]/span/i/@style").get()[6:]

print("攻击能力:" + aggressivity)

effect = hero_info.xpath(".//ul/li[3]/span/i/@style").get()[6:]

print("技能影响" + effect)

difficulty = hero_info.xpath(".//ul/li[4]/span/i/@style").get()[6:]

print("上手难度:" + difficulty)

·获取技能信息

skill_list = response.xpath("//div[@class='skill-show']/div[@class='show-list']")

for skill in skill_list:

skill_name = skill.xpath("./p[@class='skill-name']/b/text()").get()

if not skill_name:

continue

# 冷却时间

cooling = skill.xpath("./p[@class='skill-name']/span[1]/text()").get().split("：")[1].strip().split('/')

# 消耗

consume = skill.xpath("./p[@class='skill-name']/span[1]/text()").get().split("：")[1].strip().split('/')

# "".strip()

# 如果这个技能是空的就 continue

# 技能介绍

skill_desc = skill.xpath("./p[@class='skill-desc']/text()").get()

new_skill = {

"name": skill_name,

"cooling": cooling,

"consume": consume,

"desc": skill_desc

}

new_hero = HeroInfo(name=hero_name,

sort_num=sort_num,

viability=viability,

aggressivity=aggressivity,

effect=effect,

difficulty=difficulty,

skills_list=new_skill)

yield new_hero

·英雄信息item类

class HeroInfo(scrapy.Item):

# 存字符串

name = scrapy.Field()

sort_num = scrapy.Field()

viability = scrapy.Field()

aggressivity = scrapy.Field()

effect = scrapy.Field()

difficulty = scrapy.Field()

# 存字典

skills_list = scrapy.Field()

·储存 pipelines.py文件

·需要修改 ITEM_PIPELINES 配置加入这个处理类

# -*- coding: utf-8 -*-

from scrapy.exporters import JsonItemExporter

class WzryPipeline:

def __init__(self):

# 打开文件并实例化JsonItemExporter

self.fp = open('result.json', 'wb')

self.save_json = JsonItemExporter(self.fp, encoding="utf-8", ensure_ascii=False, indent=4)

# 开始写入

self.save_json.start_exporting()

def open_spider(self, spider):

pass

def close_spider(self, spider):

# 结束写入

self.save_json.finish_exporting()

# 关闭文件

self.fp.close()

def process_item(self, item, spider):

# 写入item

self.save_json.export_item(item)

return item

·运行爬虫

scrapy crawl wzry_spider

·结果

全部代码

wzry_spider.py

# -*- coding: utf-8 -*-

import scrapy

from wzry.items import HeroInfo

class WzrySpiderSpider(scrapy.Spider):

name = 'wzry_spider'

allowed_domains = ['pvp.qq.com']

start_urls = ['https://pvp.qq.com/web201605/herolist.shtml'] # 起始url

base_url = "https://pvp.qq.com/web201605/" # url 前缀

def parse(self, response):

# print(response.body)

hero_list = response.xpath("//ul[@class='herolist clearfix']//li")

for hero in hero_list:

url = self.base_url + hero.xpath("./a/@href").get()

yield scrapy.Request(url, callback=self.get_hero_info)

def get_hero_info(self, response):

# 基本信息块

global new_skill

hero_info = response.xpath("//div[@class='cover']")

hero_name = hero_info.xpath(".//h2[@class='cover-name']/text()").get()

# 分类

sort_num = hero_info.xpath(".//span[@class='herodetail-sort']/i/@class").get()[-1:]

# 生存能力

viability = hero_info.xpath(".//ul/li[1]/span/i/@style").get()[6:]

# 伤害

aggressivity = hero_info.xpath(".//ul/li[2]/span/i/@style").get()[6:]

effect = hero_info.xpath(".//ul/li[3]/span/i/@style").get()[6:]

difficulty = hero_info.xpath(".//ul/li[4]/span/i/@style").get()[6:]

# 技能列表

skill_list = response.xpath("//div[@class='skill-show']/div[@class='show-list']")

for skill in skill_list:

skill_name = skill.xpath("./p[@class='skill-name']/b/text()").get()

if not skill_name:

continue

# 冷却时间

cooling = skill.xpath("./p[@class='skill-name']/span[1]/text()").get().split("："

"")[1].strip().split('/')

# 消耗

consume = skill.xpath("./p[@class='skill-name']/span[1]/text()").get().split("："

"")[1].strip().split('/')

# "".strip()

# 如果这个技能是空的就 continue

# 技能介绍

skill_desc = skill.xpath("./p[@class='skill-desc']/text()").get()

new_skill = {

"name": skill_name,

"cooling": cooling,

"consume": consume,

"desc": skill_desc

}

new_hero = HeroInfo(name=hero_name,

sort_num=sort_num,

viability=viability,

aggressivity=aggressivity,

effect=effect,

difficulty=difficulty,

skills_list=new_skill)

yield new_hero

items.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items

# See documentation in:

# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy

class WzryItem(scrapy.Item):

# define the fields for your item here like:

# name = scrapy.Field()

pass

class HeroInfo(scrapy.Item):

# 存字符串

name = scrapy.Field()

sort_num = scrapy.Field()

viability = scrapy.Field()

aggressivity = scrapy.Field()

effect = scrapy.Field()

difficulty = scrapy.Field()

# 存字典

skills_list = scrapy.Field()

pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here

# Don't forget to add your pipeline to the ITEM_PIPELINES setting

# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

from scrapy.exporters import JsonItemExporter

class WzryPipeline:

def __init__(self):

self.fp = open('result.json', 'wb')

self.save_json = JsonItemExporter(self.fp, encoding="utf-8", ensure_ascii=False, indent=4)

self.save_json.start_exporting()

def open_spider(self, spider):

pass

def close_spider(self, spider):

self.save_json.finish_exporting()

self.fp.close()

def process_item(self, item, spider):

self.save_json.export_item(item)

return item

上一篇：初识JAVA
下一篇：【Scrapy框架】爬取糗事段子获取python 爬虫