Requirements
- Crawl the Appannie top-chart (ranking) page
- For demonstration purposes, only scrape each list entry's name and thumbnail fields
Approach
- You must be logged in before the top-chart page is reachable, so we use selenium to simulate the login and capture the post-login cookies
- All subsequent requests carry these cookies
- The top-chart page is rendered client-side, so we use selenium to wait until rendering is finished before extracting the data
Steps
- Edit appannie/appannie/settings.py and apply some initial settings:
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.79 Safari/537.36'
ROBOTSTXT_OBEY = False
DOWNLOAD_DELAY = 1
COOKIES_ENABLED = True
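As an aside (not part of the original setup), the same options could also be scoped to just this spider through Scrapy's custom_settings class attribute instead of the project-wide settings.py:

```python
import scrapy

class annieSpider(scrapy.Spider):
    name = 'annieSpider'
    # Per-spider overrides; these take precedence over settings.py
    custom_settings = {
        'DOWNLOAD_DELAY': 1,
        'COOKIES_ENABLED': True,
    }
```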
- Edit the appannie/appannie/items.py file:
import scrapy


class AppannieItem(scrapy.Item):
    name = scrapy.Field()
    thumb = scrapy.Field()
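An AppannieItem behaves like a dict once populated. As an aside (not in the original post), a minimal item pipeline could drop rows whose name failed to extract; the class name AppanniePipeline is made up for illustration and would need to be registered in ITEM_PIPELINES:

```python
# appannie/appannie/pipelines.py (hypothetical)
from scrapy.exceptions import DropItem

class AppanniePipeline:
    def process_item(self, item, spider):
        # Discard rows where the name XPath matched nothing
        if not item.get('name'):
            raise DropItem('missing name field')
        return item

# settings.py registration (hypothetical):
# ITEM_PIPELINES = {'appannie.pipelines.AppanniePipeline': 300}
```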
- Write the spider file appannie/appannie/spiders/annieSpider.py:
import time

import scrapy
from selenium import webdriver

from appannie.items import AppannieItem


class annieSpider(scrapy.Spider):
    name = 'annieSpider'
    allowed_domains = ['appannie.com']
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
    }

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Headless Chrome instance, shared with the downloader middleware via spider.browser
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')
        self.browser = webdriver.Chrome(options=options)
        self.browser.set_page_load_timeout(30)
        self.cookies = {}

    def closed(self, reason):
        # Called when the spider finishes; shut the browser down
        self.browser.quit()
        self.cookies.clear()

    def start_requests(self):
        # Log in with selenium and collect the post-login cookies
        self.browser.get('https://www.appannie.com/account/login')
        time.sleep(2)
        self.browser.find_element_by_name('username').send_keys('xxxxxx')
        self.browser.find_element_by_xpath('//*[@id="__next"]/div[1]/div/div[1]/div/div[3]/form/div/div[2]/input').send_keys('xxxxxx')
        self.browser.find_element_by_css_selector('.Button__ButtonBlank-sc-1wnez5l-2.Button__UCButton-sc-1wnez5l-9.kEtTqS').click()
        time.sleep(1)
        cookies = self.browser.get_cookies()
        for cookie in cookies:
            self.cookies[cookie['name']] = cookie['value']
        urls = [
            'https://www.appannie.com/apps/ios/top-chart/?country=US&category=6014&device=iphone&date=2020-02-23&feed=Paid&rank_sorting_type=rank&page_number=0&page_size=100&table_selections=&metrics=grossing_rank,price,category,all_avg,all_count,last_avg,last_count,first_release_date,last_updated_date,est_download,est_revenue,wau&order_type=desc&order_by=paid_rank'
        ]
        for url in urls:
            yield scrapy.Request(url=url, headers=self.headers, cookies=self.cookies, callback=self.parse, dont_filter=True)

    def parse(self, response):
        gameLists = response.xpath('//*[@id="sub-container"]//*[@class="scroll-table-container"]/table/tbody/tr')
        for gameItem in gameLists:
            item = AppannieItem()
            item['name'] = gameItem.xpath('.//div[@class="main-info"]//div[@class="app-link-container"]/a/span/text()').extract_first()
            item['thumb'] = gameItem.xpath('.//div[@class="icon-info"]/a/img/@src').extract_first()
            yield item
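The request URL above already carries page_number and page_size query parameters. If more pages were needed, the URL could be rebuilt instead of hand-editing the long query string. This is only a sketch (not in the original code), and whether the endpoint honours other page_number values would need checking:

```python
from urllib.parse import parse_qs, urlencode, urlparse, urlunparse

def with_page_number(url: str, page_number: int) -> str:
    # Return the same URL with its page_number query parameter replaced
    parts = urlparse(url)
    query = parse_qs(parts.query, keep_blank_values=True)
    query['page_number'] = [str(page_number)]
    return urlunparse(parts._replace(query=urlencode(query, doseq=True)))

# e.g. urls = [with_page_number(base_url, n) for n in range(3)]
```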
- Edit the appannie/appannie/middlewares.py file and add a middleware that uses selenium to return the rendered page:
from scrapy import signals
from scrapy.http import HtmlResponse
from selenium.common.exceptions import TimeoutException
import time
import urllib.parse


class SeleniumMiddleware(object):
    def process_request(self, request, spider):
        if spider.name == 'annieSpider':
            try:
                # Convert the spider's cookie dict into selenium's cookie format
                cookie_dict = spider.cookies
                cookies = []
                for key in cookie_dict:
                    cookies.append({'name': key, 'value': cookie_dict[key]})
                # Load the page once so add_cookie applies to the right domain, then reload with cookies set
                spider.browser.get(request.url)
                for c in cookies:
                    spider.browser.add_cookie(c)
                spider.browser.get(request.url)
                spider.browser.execute_script('window.scrollTo(0, document.body.scrollHeight)')
            except TimeoutException:
                print('timeout')
                spider.browser.execute_script('window.stop()')
            # Crude wait for the client-side rendering to finish
            time.sleep(5)
            return HtmlResponse(url=spider.browser.current_url, body=spider.browser.page_source, encoding="utf-8", request=request)
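The fixed time.sleep(5) works, but it wastes time on fast loads and can still be too short on slow ones. A more targeted option (a sketch, not the original code; the CSS selector is derived from the table the spider's XPath already relies on) is selenium's explicit wait:

```python
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

def wait_for_table(browser, timeout=15):
    # Block until at least one rendered table row is present (raises TimeoutException otherwise)
    WebDriverWait(browser, timeout).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, '.scroll-table-container table tbody tr'))
    )
```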
- Edit appannie/appannie/settings.py to enable the SeleniumMiddleware:
DOWNLOADER_MIDDLEWARES = {
    'appannie.middlewares.SeleniumMiddleware': 543,
}
- Run the spider:
scrapy crawl annieSpider -o res.json
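The -o res.json option uses Scrapy's feed export to write the yielded items as a JSON array. A quick way to sanity-check the output (just a verification snippet, not part of the crawl):

```python
import json

with open('res.json') as f:
    items = json.load(f)

print(len(items))
print(items[0])  # expected shape: {"name": "...", "thumb": "..."}
```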
Changes
- Logging into the account on every run makes it easy to get the account banned.
- Normally you would log in once and store the cookies in Redis (a rough sketch of that approach follows after the settings snippet below).
- To keep things simple here, we manually write the cookies into the settings file.
- Edit appannie/appannie/settings.py and put the cookies there:
MY_COOKIES = 'csrftoken=xxxxx; aa_language=cn;.....'
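The Redis variant mentioned above could look roughly like this (a sketch only, assuming the redis-py package and a local Redis instance; the key name appannie:cookies is made up for illustration):

```python
import json
import redis  # assumes the redis-py package is installed

r = redis.Redis(host='localhost', port=6379, db=0)

def save_cookies(cookies: dict) -> None:
    # Store the post-login cookie dict after a successful selenium login
    r.set('appannie:cookies', json.dumps(cookies))

def load_cookies() -> dict:
    # Later runs read the cached cookies instead of logging in again
    raw = r.get('appannie:cookies')
    return json.loads(raw) if raw else {}
```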
- Edit the spider file appannie/appannie/spiders/annieSpider.py:
import time

import scrapy
from scrapy.utils.project import get_project_settings
from selenium import webdriver

from appannie.items import AppannieItem


class annieSpider(scrapy.Spider):
    name = 'annieSpider'
    allowed_domains = ['appannie.com']
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
    }

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')
        self.browser = webdriver.Chrome(options=options)
        self.browser.set_page_load_timeout(30)
        self.cookies = {}

    def closed(self, reason):
        self.browser.quit()
        self.cookies.clear()

    def start_requests(self):
        # Parse the cookie string from settings instead of logging in on every run
        settings = get_project_settings()
        cookieStr = settings['MY_COOKIES']
        for line in cookieStr.split(';'):
            if not line.strip():
                continue  # skip empty segments such as a trailing ';'
            key, value = line.strip().split('=', 1)
            self.cookies[key] = value
        urls = [
            'https://www.appannie.com/apps/ios/top-chart/?country=US&category=6014&device=iphone&date=2020-02-23&feed=Paid&rank_sorting_type=rank&page_number=0&page_size=100&table_selections=&metrics=grossing_rank,price,category,all_avg,all_count,last_avg,last_count,first_release_date,last_updated_date,est_download,est_revenue,wau&order_type=desc&order_by=paid_rank'
        ]
        for url in urls:
            yield scrapy.Request(url=url, headers=self.headers, cookies=self.cookies, callback=self.parse, dont_filter=True)

    def parse(self, response):
        gameLists = response.xpath('//*[@id="sub-container"]//*[@class="scroll-table-container"]/table/tbody/tr')
        for gameItem in gameLists:
            item = AppannieItem()
            item['name'] = gameItem.xpath('.//div[@class="main-info"]//div[@class="app-link-container"]/a/span/text()').extract_first()
            item['thumb'] = gameItem.xpath('.//div[@class="icon-info"]/a/img/@src').extract_first()
            yield item
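As an aside (not in the original post), the standard library's http.cookies.SimpleCookie can parse the same "name=value; name2=value2" string and also handles quoted values, which would replace the manual split above:

```python
from http.cookies import SimpleCookie

def parse_cookie_string(cookie_str: str) -> dict:
    jar = SimpleCookie()
    jar.load(cookie_str)  # accepts the "name=value; name2=value2" format
    return {name: morsel.value for name, morsel in jar.items()}
```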
- Edit the appannie/appannie/middlewares.py file:
from scrapy import signals
from scrapy.http import HtmlResponse
from scrapy.utils.project import get_project_settings
from selenium.common.exceptions import TimeoutException
import time
import urllib.parse


class SeleniumMiddleware(object):
    def process_request(self, request, spider):
        if spider.name == 'annieSpider':
            try:
                # Build selenium-style cookies from the MY_COOKIES string in settings
                settings = get_project_settings()
                cookieStr = settings['MY_COOKIES']
                cookies = []
                for line in cookieStr.split(';'):
                    if not line.strip():
                        continue
                    key, value = line.strip().split('=', 1)
                    cookies.append({'name': key, 'value': urllib.parse.unquote(value)})
                # Load the page once so add_cookie applies to the right domain, then reload with cookies set
                spider.browser.get(request.url)
                for c in cookies:
                    spider.browser.add_cookie(c)
                spider.browser.get(request.url)
                spider.browser.execute_script('window.scrollTo(0, document.body.scrollHeight)')
            except TimeoutException:
                print('timeout')
                spider.browser.execute_script('window.stop()')
            # Crude wait for the client-side rendering to finish
            time.sleep(5)
            return HtmlResponse(url=spider.browser.current_url, body=spider.browser.page_source, encoding="utf-8", request=request)
- Run the spider:
scrapy crawl annieSpider -o res.json