大家好,我是安果!
有时候,我们想知道生活在这座城市的人每天交流的事情,然后对数据进行一些分析,方便我们更好地了解城市的特征及居民的需求
以重庆为例,最火爆的论坛是购物狂,每天都有大量的帖子内容产生,基本上囊括了重庆人的衣食住行
本篇文章将介绍使用 Scrapy 爬取该论坛数据的完整教程
1. 爬虫
目标对象:
aHR0cHM6Ly9nby5jcW1tZ28uY29tL3RocmVhZC9sYXN0ZXN0P3BhZ2U9MQ==
我们需要爬取所有页面帖子中所有的回帖信息
1-1 安装依赖
# Install the crawler dependency
pip3 install Scrapy
# Packages used to build the word cloud
pip3 install jieba
pip3 install stylecloud
1-2 创建项目及爬虫
# Create the project
scrapy startproject cq_all
# Create a CrawlSpider-based spider (run from inside the project directory)
cd cq_all
scrapy genspider -t crawl talk "HOST"
1-3 定义 Item 实体对象
在 items.py 文件中,将需要爬取的数据定义为 Item
这里只需要爬取帖子的标题、URL 及所有回帖内容
# items.py
import scrapy
class CqAllItem(scrapy.Item):
    """Scraped data for one forum thread page."""

    title = scrapy.Field()  # thread title
    url = scrapy.Field()  # URL of the page the data came from
    content = scrapy.Field()  # text content of the page's posts/replies
1-4 编写爬虫
在 spiders 文件夹下的爬虫文件中编写具体的爬虫逻辑
首先通过定义 rules,指定要爬取的地址,然后使用 Xpath 语法解析出所有文本内容
PS:通过设定 follow 为 True,可以爬取某一个帖子所有的回帖数据
# talk.py
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from cq_all.items import CqAllItem
class TalkSpider(CrawlSpider):
    """Crawl the forum's latest-thread listing and emit one item per thread page.

    The 20 listing pages in ``start_urls`` seed the crawl. ``rules`` extracts
    every thread link from them and, because ``follow=True``, also from the
    thread pages themselves; each matching response goes to ``parse_forum``.
    """

    name = 'talk_all'
    allowed_domains = ['HOST']
    # Listing pages 1..20; the path spelling "lastest" is kept exactly as in the target URL.
    start_urls = ['https://HOST/thread/lastest?page={}'.format(i + 1) for i in range(20)]
    rules = (
        Rule(
            LinkExtractor(
                # The dot before "html" is escaped so it matches a literal "." only.
                allow=r'https://HOST/forum-\d+-thread-\S+\.html',
                deny=r'safeRedirect',
            ),
            callback='parse_forum',
            follow=True,
        ),
    )

    def parse_forum(self, response):
        """Parse one thread page and yield a ``CqAllItem`` if it has usable text.

        Pages without a title, pages under an excluded navigation tab
        (see ``in_exclude_tabs``) and pages with empty content are skipped.

        :param response: response of a page matched by ``rules``
        :return: generator yielding at most one ``CqAllItem``
        """
        # Thread title; check for an empty result instead of indexing blindly.
        titles = response.xpath('//*[@id="view-hd"]//span/text()').extract()
        if not titles:
            self.logger.warning("No title found, skipping page: %s", response.url)
            return

        # Navigation-bar link texts, used to filter special boards such as announcements.
        tab_titles = response.xpath('//*[@id="nav"]//a/text()').extract()
        if in_exclude_tabs(tab_titles):
            return

        # All text nodes of the post bodies, stripped and concatenated.
        content = get_content(
            response.xpath('//*[@id="view-bd"]//table[@class="view-data"]//div//text()').extract())
        if not content:
            return

        item = CqAllItem()
        item['title'] = titles[0]
        item['url'] = response.url
        item['content'] = content
        yield item
为了过滤无效信息,这里对回帖内容及 Tab 进行一次初步过滤
# Navigation tabs whose threads are filtered out as noise.
exclude_tabs = ['公告','网友中心']


def get_content(contents):
    """Concatenate text fragments after stripping whitespace around each one.

    :param contents: iterable of strings (e.g. extracted text nodes)
    :return: the stripped fragments joined without a separator; ``''`` when
        ``contents`` is empty
    """
    # str.join builds the result in one pass instead of repeated ``+=``.
    return ''.join(fragment.strip() for fragment in contents)


def in_exclude_tabs(tab_titles):
    """Tell whether any navigation title belongs to ``exclude_tabs``.

    :param tab_titles: iterable of navigation-bar titles
    :return: True if at least one title is listed in ``exclude_tabs``
    """
    return any(tab_title in exclude_tabs for tab_title in tab_titles)
1-5 反反爬
为了应对网站的反爬,这里需要进行如下处理
# settings.py
# robots.txt rules are deliberately NOT obeyed for this crawl
ROBOTSTXT_OBEY = False
# Seconds to wait between requests
DOWNLOAD_DELAY = 1
# Disable redirects (Scrapy's setting name is REDIRECT_ENABLED, singular)
REDIRECT_ENABLED = False
# talk.py
# Per-spider settings with custom request headers (class attribute of the spider).
custom_settings = {
    # NOTE(review): cookie handling is switched off while a literal Cookie header
    # is sent below — presumably so the header is passed through as written; confirm.
    "COOKIES_ENABLED": False,
    "DOWNLOAD_DELAY": 3,
    'DEFAULT_REQUEST_HEADERS': {
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Connection': 'keep-alive',
        'Content-type': 'application/x-www-form-urlencoded;',
        # Same HOST placeholder as every other URL/header in this tutorial.
        'Host': 'HOST',
        'Origin': 'https://HOST',
        'sec-ch-ua': '"Not?A_Brand";v="8", "Chromium";v="108", "Google Chrome";v="108"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': "Windows",
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-origin',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36',
        'Referer': 'https://HOST/',
        # Placeholder value — presumably to be replaced with a real cookie string.
        'Cookie': '*',
    }
}
1-6 自定义数据管道 Pipeline
在 pipelines.py 文件中,自定义 3 个数据管道:第一个用于去除重复的帖子,另外 2 个分别将数据写入到本地 CSV 文件和文本文件中
from scrapy.exceptions import DropItem
from scrapy.exporters import CsvItemExporter
from cq_all.items import CqAllItem
class DuplicatesPipeline(object):
    """Item pipeline that drops items whose title was already seen."""

    def __init__(self):
        # Titles of all items that have passed through so far.
        self.talk_set = set()

    def process_item(self, item, spider):
        """Pass the item on unless its title is a duplicate.

        :param item: scraped item; must provide a ``'title'`` key
        :param spider: the running spider (unused)
        :return: the same item when its title is new
        :raises DropItem: when an item with the same title was seen before
        """
        name = item['title']
        if name in self.talk_set:
            raise DropItem("Duplicate post found:%s" % item)
        self.talk_set.add(name)
        return item
class CqAllPipeline(object):
    """Item pipeline that exports items to ``./cq_all.csv``."""

    def __init__(self):
        # CsvItemExporter is given a file opened in binary mode.
        self.file = open("./cq_all.csv", 'wb')
        self.exporter = CsvItemExporter(self.file, fields_to_export=[
            'title', 'url', 'content'
        ])
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        """Export ``CqAllItem`` instances to the CSV file; pass every item on.

        :param item: scraped item
        :param spider: the running spider (unused)
        :return: the unchanged item
        """
        if isinstance(item, CqAllItem):
            self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        """Finish the export and release the file when the spider closes."""
        self.exporter.finish_exporting()
        self.file.close()
class CqAllTxtPipline(object):
    """Item pipeline that writes each item's content to ``result.txt``, one item per line."""

    def __init__(self):
        # Mode 'w' truncates any previous result file.
        self.file = open('result.txt', 'w', encoding='utf-8')

    def open_spider(self, spider):
        """Do nothing: the output file is already opened in ``__init__``."""

    def close_spider(self, spider):
        """Close the output file when the spider closes."""
        self.file.close()

    def process_item(self, item, spider):
        """Write the content of a ``CqAllItem`` as one line; pass every item on.

        Write problems are logged and do not stop the crawl.

        :param item: scraped item
        :param spider: the running spider, used for logging
        :return: the unchanged item, so that later pipelines still receive it
        """
        if isinstance(item, CqAllItem):
            try:
                self.file.write(item['content'] + '\n')
            except (KeyError, TypeError, ValueError, OSError) as exc:
                spider.logger.warning("Could not write item to result.txt: %s", exc)
        return item
1-7 配置爬虫配置文件
打开 settings.py 文件,配置默认请求头及数据管道
# settings.py
# Override the default request headers:
# Browser-like headers sent with every request unless a request overrides them.
DEFAULT_REQUEST_HEADERS = {
    # Content negotiation
    'Accept': '*/*',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Connection': 'keep-alive',
    'Content-type': 'application/x-www-form-urlencoded;',
    # Target site (HOST is the tutorial's placeholder)
    'Host': 'HOST',
    'Origin': 'https://HOST',
    # Client hints and fetch metadata
    'sec-ch-ua': '"Not?A_Brand";v="8", "Chromium";v="108", "Google Chrome";v="108"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': 'Windows',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Site': 'same-origin',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36',
    'Referer': 'https://HOST/',
}
# Enabled item pipelines mapped to their order numbers.
ITEM_PIPELINES = {
    'cq_all.pipelines.DuplicatesPipeline': 1,  # de-duplicate
    'cq_all.pipelines.CqAllPipeline': 2,  # CSV export
    'cq_all.pipelines.CqAllTxtPipline': 3,  # plain-text export
}
1-8 爬虫主入口
在爬虫项目根目录下创建一个文件,通过下面的方式运行单个爬虫
# main.py
from scrapy.cmdline import execute
import sys, os
def start_scrapy():
    """Run the ``talk_all`` spider through Scrapy's command-line entry point."""
    # abspath() guards against __file__ being a bare relative name such as
    # "main.py", whose dirname would be the empty string.
    sys.path.append(os.path.dirname(os.path.abspath(__file__)))
    # Run a single spider
    execute(["scrapy", "crawl", "talk_all"])
def pre():
    """Delete the output files left over from a previous run, if present.

    :return: None
    """
    for leftover in ('cq.png', 'result.txt'):
        # Remove directly and ignore a missing file, instead of checking first.
        try:
            os.remove(leftover)
        except FileNotFoundError:
            pass
if __name__ == '__main__':
    # Clean up old output first, then start the crawl.
    pre()
    start_scrapy()
2. 词云可视化
在爬虫类重写 close 方法,根据本地文件绘制成词云图片保存到本地
# talk.py
class TalkSpider(CrawlSpider):
    # Declared static to match the base ``Spider.close(spider, reason)``,
    # which Scrapy connects to the spider_closed signal.
    @staticmethod
    def close(spider, reason):
        """Draw the word cloud once the spider has closed.

        :param spider: the spider instance being closed
        :param reason: string describing why the spider was closed
        :return: None
        """
        gene_word_cloud()
        return None
2-1 内容预处理
从爬取文本中读取数据后进行预处理,过滤一些无用的字符
def gene_word_cloud():
    """Generate a word cloud from the scraped text file.

    This part of the function reads ``result.txt`` and pre-processes the text.
    """
    with open('result.txt', 'r', encoding='utf-8') as f:
        data = f.read()
    # Pre-processing: keep only runs of characters in U+4E00..U+9FA5 and
    # join the runs with "/" so that unrelated fragments stay separated.
    new_data = re.findall('[\u4e00-\u9fa5]+', data, re.S)
    new_data = "/".join(new_data)
2-2 分词后去除单字
接着使用 jieba 进行分词,然后过滤掉长度为 1 的单字
import jieba, stylecloud
# Segment the text into words (jieba full mode)
word_list_pre = jieba.cut(new_data, cut_all=True)
# Drop single-character tokens
word_list = [item for item in word_list_pre if len(item) > 1]
result = " ".join(word_list)  # separate the tokens with spaces
2-3 停用词汇
为了保证生成词云的有效性,我们一般需要自定义停用词汇
这里推荐下面 2 个停用词表,大家可以在此基础上进行二次编辑
https://github.com/goto456/stopwords/blob/master/cn_stopwords.txt
https://github.com/fwwdn/sensitive-stop-words/blob/master/stopword.dic
def get_cn_stop_words(path='cn_stopwords.txt'):
    """Read Chinese stop words from a text file holding one word per line.

    :param path: stop-word file to read; defaults to ``cn_stopwords.txt``
    :return: list of the distinct non-empty lines (order is not guaranteed)
    """
    # Candidate stop-word lists:
    # https://github.com/fwwdn/sensitive-stop-words/blob/master/stopword.dic
    # https://github.com/goto456/stopwords/blob/master/cn_stopwords.txt
    with open(path, encoding='utf-8') as f:
        # Only the trailing newline of each line is removed.
        stop_words = {line.rstrip("\n") for line in f}
    # Blank lines are not stop words.
    stop_words.discard("")
    return list(stop_words)
2-4 绘制词云
使用 stylecloud 绘制词云的时候,我们需要指定图片大小、字体、调色方案、方向、蒙版、停用词汇等参数
其中
调色方案参考:
https://jiffyclub.github.io/palettable/cartocolors/qualitative/
词云蒙版参考:
https://fa5.dashgame.com/#/
import jieba, stylecloud
image_path = 'cq.png'
# Chinese stop words
stop_words = get_cn_stop_words()
stylecloud.gen_stylecloud(
    text=result,  # the segmentation result, passed as the text
    size=1024,
    font_path='msyh.ttc',  # font file
    palette='cartocolors.qualitative.Pastel_10',  # colour palette, chosen from palettable
    gradient='horizontal',  # gradient direction: horizontal
    icon_name='fas fa-dove',  # mask icon, chosen from Font Awesome
    stopwords=True,  # boolean: filter out common stop words
    max_words=200,
    collocations=False,
    custom_stopwords=stop_words,
    output_name=image_path)
2-5 消息推送
我们读取本地图片,使用企业微信机器人发送出去
import re, base64, hashlib
import requests
def send_wx():
    """Send the word-cloud image to a WeCom (Enterprise WeChat) robot webhook.

    The image at ``image_path`` is posted as an ``image`` message that carries
    its base64 encoding and its MD5 digest.

    :return: the ``requests.Response`` of the webhook call
    """
    # Read the file once; the base64 payload and the MD5 digest use the same bytes.
    with open(image_path, 'rb') as file:
        raw = file.read()
    image_data = base64.b64encode(raw).decode('utf-8')
    image_md5 = hashlib.md5(raw).hexdigest()

    # "key=xx" is a placeholder — presumably to be replaced with the robot's real key.
    url = 'https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=xx'
    headers = {"Content-Type": "application/json"}
    payload = {
        "msgtype": "image",
        "image": {
            "base64": image_data,
            "md5": image_md5
        }
    }
    # Without a timeout, requests waits indefinitely on a stalled connection.
    result = requests.post(url, headers=headers, json=payload, timeout=10)
    return result
3. 最后
在本机运行成功之后,我们就可以将程序部署到服务器了
需要注意的是,服务器一般没有中文字体,我们需要额外安装字体
以 CentOS 为例,我们首先需要安装 fontconfig 和 ttmkfdir,然后下载中文字体放置到中文字体目录下,接着修改配置文件 fonts.conf,最后刷新字体缓存即可
关于文中出现的任何疑惑,欢迎大家在文末留言交流!