这里只贴下代码,简单介绍下
请先看下方代码最底部的注释,那里列出了几个比较重要的接口链接。其中最重要的是弹幕接口:访问任意一个视频的详情页,打开浏览器的开发者工具并刷新页面,就能在网络请求中找到它;不过通过这个接口只能看到 1000 多条弹幕。
抓取了华农兄弟的全部视频弹幕,共计26万条
import pymongo
import requests
from scrapy.selector import Selector
# Template for the player page-list endpoint; {0} is filled with a video's aid.
page_list_url = (
    "https://api.bilibili.com/x/player/pagelist"
    "?aid={0}&jsonp=jsonp"
)

# Headers sent with every request; the User-Agent is a mobile Chrome string.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Linux; Android 8.0.0; Pixel 2 XL Build/OPD1.170816.004) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/78.0.3904.108 Mobile Safari/537.36"
    ),
}
def save_to_mongo():
    """Return a shared MongoClient for the MongoDB server at 127.0.0.1:27017.

    Despite its name, this function writes nothing; it only hands out the
    client that callers use to reach a collection.

    The client is created on the first call and cached on the function
    object, so repeated calls (one per crawled video) reuse a single
    client instead of constructing a new one each time.

    Returns:
        pymongo.MongoClient: the cached client instance.
    """
    client = getattr(save_to_mongo, "_client", None)
    if client is None:
        client = pymongo.MongoClient(host='127.0.0.1', port=27017)
        save_to_mongo._client = client
    return client
def crawl_list_page(url):
    """Fetch one page of a video list and crawl the danmaku of each video.

    For every entry in ``data.list.vlist`` of the JSON response, a metadata
    dict is assembled and passed, together with the video's page-list URL,
    to ``get_bullet_comments``.

    Args:
        url: Fully formatted URL of one result page of the list API.

    Raises:
        requests.RequestException: if the request fails, times out, or
            returns an HTTP error status.
    """
    # A timeout keeps one stalled connection from hanging the whole crawl.
    response = requests.get(url, headers=headers, timeout=10)
    response.raise_for_status()

    # Any of data / list / vlist may be missing or null in the payload;
    # treat that as an empty page instead of failing on None.get(...).
    data = response.json().get('data') or {}
    vlist = (data.get('list') or {}).get('vlist') or []

    for dic in vlist:
        aid = dic.get('aid')
        di = {
            "comment": dic.get('comment'),  # comment count field from the API
            "desc": dic.get('description'),  # video description
            "title": dic.get('title'),
            "play": dic.get('play'),  # play count
            "created": dic.get('created'),  # publish time (timestamp)
            "length": dic.get('length'),
            "aid": aid,
            "detail_url": "https://www.bilibili.com/video/av{0}".format(aid),
        }
        # The page-list endpoint is queried next to obtain the video's cid.
        get_uid_url = page_list_url.format(aid)
        get_bullet_comments(get_uid_url, di)
def get_bullet_comments(url, di):
    """Download a video's danmaku and store it with its metadata in MongoDB.

    The page-list endpoint at ``url`` is queried for the video's ``cid``
    (only the first entry of the returned list is used), the danmaku XML
    for that cid is fetched, and the text of every ``<d>`` element is
    collected. The result is added to ``di`` and the dict is inserted into
    the ``blibli`` database.

    Args:
        url: Page-list API URL for the video.
        di: Metadata dict for the video. It is modified in place: the keys
            ``danmu`` and ``danmu_api`` are added before it is stored.

    Raises:
        requests.RequestException: if either request fails, times out, or
            returns an HTTP error status.
    """
    res = requests.get(url, headers=headers, timeout=10)
    res.raise_for_status()

    # Without a page list there is no cid to look up, so skip this video
    # rather than crash on indexing an empty or null ``data``.
    pages = res.json().get('data') or []
    if not pages:
        return
    cid = pages[0].get('cid')

    dm_url = "https://api.bilibili.com/x/v1/dm/list.so?oid={0}".format(cid)
    response = requests.get(dm_url, headers=headers, timeout=10)
    response.raise_for_status()
    # Force UTF-8 so the danmaku text is decoded correctly.
    response.encoding = 'utf-8'

    selector = Selector(text=response.text)
    danmu = selector.xpath('//d//text()').getall()

    di['danmu'] = danmu
    # NOTE(review): this stores the page-list URL, not dm_url, under
    # 'danmu_api' -- kept as in the original; confirm which was intended.
    di['danmu_api'] = url

    client = save_to_mongo()
    # Collection.insert() was deprecated in PyMongo 3 and removed in
    # PyMongo 4; insert_one() is the supported single-document call.
    client['blibli']['华农兄弟'].insert_one(di)
if __name__ == '__main__':
    # Search-API URL template for mid=250858633; {0} is the page number (pn).
    search_url = (
        "https://api.bilibili.com/x/space/arc/search?mid=250858633&ps=30&tid=0&pn={0}"
        "&keyword=&order=pubdate&jsonp=jsonp"
    )
    # Crawl result pages 1 through 9.
    for page in range(1, 10):
        crawl_list_page(search_url.format(page))

# Reference endpoints found while inspecting a video page:
# Danmaku (bullet comments): https://api.bilibili.com/x/v1/dm/list.so?oid=109798064
# detail_url: https://www.bilibili.com/video/av63221412
# title: https://api.bilibili.com/x/web-interface/archive/desc?callback=jqueryCallback_bili_8691248159053766&aid=63221412&page=&jsonp=jsonp&_=1577432452871
# Comments: https://api.bilibili.com/x/v2/reply?callback=jQuery172022627270899916274_1577432450295&jsonp=jsonp&pn=1&type=1&oid=63221412&sort=2&_=1577432452892
# Video details: https://api.bilibili.com/x/web-interface/view?aid=63221412&cid=109798064
生成词云:
本来打算找一张竹鼠的图来做词云的形状,但是用来生成词云的图片好像必须是白色背景、深色图案,所以就随便在网上找了一张。
import pymongo
from wordcloud import WordCloud
import PIL.Image as image
import numpy as np
import jieba
def client_mongo():
    """Create and return a MongoClient pointed at 127.0.0.1:27017."""
    return pymongo.MongoClient(host='127.0.0.1', port=27017)
def trans_cn(text):
    """Segment ``text`` with jieba and return the tokens joined by spaces."""
    return " ".join(jieba.cut(text))
def get_data(mask_path=r".\1.png", font_path=r"C:\Windows\Fonts\simhei.ttf"):
    """Build a word cloud from all stored danmaku and display it.

    Every document in the ``blibli`` database's collection is read, the
    ``danmu`` lists are concatenated and joined with spaces, and the
    resulting text is rendered as a word cloud that is opened in an image
    viewer.

    Args:
        mask_path: Path of the image used as the word-cloud shape mask.
            Defaults to the original hard-coded ``.\\1.png``.
        font_path: Path of the font used to render Chinese characters.
            Defaults to the original hard-coded SimHei font on Windows.
    """
    coll = client_mongo()['blibli']['华农兄弟']

    texts = []
    for doc in coll.find():
        # Documents stored without a 'danmu' list contribute nothing
        # instead of raising on extend(None).
        texts.extend(doc.get('danmu') or [])

    # Each danmaku is used as-is; jieba segmentation (trans_cn) is not
    # applied here, matching the original behavior.
    word_text = " ".join(texts)

    mask = np.array(image.open(mask_path))  # word-cloud shape
    wordcloud = WordCloud(
        mask=mask,
        background_color="white",
        # Font capable of rendering Chinese text.
        font_path=font_path,
    ).generate(word_text)

    image_produce = wordcloud.to_image()
    image_produce.show()
# Script entry point: render the word cloud when this file is run directly.
if __name__ == "__main__":
    get_data()
效果:
评论列表
已有0条评论