Python爬虫:bilibili 华农兄弟弹幕生成词云

网络爬虫 2019-12-28 1265

这里只贴下代码,简单介绍下
代码最底部的注释列出了几个比较重要的接口链接,其中最重要的是弹幕接口。访问某个视频的详情页,打开浏览器调试工具后刷新页面,就可以看到这个请求;不过通过它只能看到1000+条弹幕。
抓取了华农兄弟的全部视频弹幕,共计26万条

import pymongo  
import requests  
from scrapy.selector import Selector

# Request headers sent with every HTTP call in this script: a mobile Chrome
# (Android) User-Agent string.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Linux; Android 8.0.0; Pixel 2 XL Build/OPD1.170816.004) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/78.0.3904.108 Mobile Safari/537.36"
    ),
}

# URL template for the player "pagelist" API; `{0}` is filled with a video's aid.
page_list_url = "https://api.bilibili.com/x/player/pagelist?aid={0}&jsonp=jsonp"


def save_to_mongo():
    """Create and return a MongoDB client for the local server.

    Despite its name, this function does not write anything: it only builds a
    ``pymongo.MongoClient`` pointed at 127.0.0.1:27017 and returns it to the
    caller, who performs the actual insert.
    """
    return pymongo.MongoClient(host='127.0.0.1', port=27017)


def crawl_list_page(url):
    """Fetch one page of a video list and crawl the danmu of every video on it.

    The JSON response is expected to carry the videos under
    ``data.list.vlist``. For each entry a metadata dict is built and handed,
    together with that video's pagelist API URL, to ``get_bullet_comments``.

    Args:
        url: URL of the list API page to request.

    Raises:
        requests.HTTPError: if the list API answers with an error status.
        requests.RequestException: on connection problems or timeout.
    """
    # A timeout keeps one stalled connection from hanging the whole crawl.
    response = requests.get(url, headers=headers, timeout=10)
    response.raise_for_status()
    payload = response.json()
    # Any level of data/list/vlist may be missing or null (e.g. in an error
    # response); treat that as "no videos on this page" rather than failing
    # with AttributeError on None.
    data = payload.get('data') or {}
    vlist = (data.get('list') or {}).get('vlist') or []
    for dic in vlist:
        aid = dic.get('aid')
        di = {
            "comment": dic.get('comment'),  # comments (presumably the count -- confirm)
            "desc": dic.get('description'),  # description
            "title": dic.get('title'),
            "play": dic.get('play'),  # play count
            "created": dic.get('created'),  # publish time, as a timestamp
            "length": dic.get('length'),
            "aid": aid,
            "detail_url": "https://www.bilibili.com/video/av{0}".format(aid)
        }
        get_uid_url = page_list_url.format(aid)
        get_bullet_comments(get_uid_url, di)


def get_bullet_comments(url, di):
    """Download a video's danmu (bullet comments) and store them in MongoDB.

    The pagelist API at ``url`` is queried for the ``cid`` of the first entry,
    the danmu XML for that cid is downloaded, and the text of every ``<d>``
    element is collected. The list is added to ``di`` under ``'danmu'`` and
    the document is inserted into the ``blibli`` database, collection
    ``华农兄弟``.

    Args:
        url: Pagelist API URL for the video.
        di: Metadata dict for the video; mutated in place (``'danmu'`` and
            ``'danmu_api'`` are added, and the insert adds ``'_id'``).

    Raises:
        requests.HTTPError: if either API answers with an error status.
        requests.RequestException: on connection problems or timeout.
    """
    res = requests.get(url, headers=headers, timeout=10)
    res.raise_for_status()
    pages = res.json().get('data') or []
    if not pages:
        # Without a page entry there is no cid to fetch danmu for; skip this
        # video instead of aborting the whole crawl on a TypeError/IndexError.
        print("no page list returned, skipping: {0}".format(url))
        return
    cid = pages[0].get('cid')
    dm_url = "https://api.bilibili.com/x/v1/dm/list.so?oid={0}".format(cid)
    response = requests.get(dm_url, headers=headers, timeout=10)
    response.raise_for_status()
    # Force UTF-8 so the danmu text is decoded correctly regardless of the
    # charset requests would otherwise guess.
    response.encoding = 'utf-8'
    danmu = Selector(text=response.text).xpath('//d//text()').getall()
    di['danmu'] = danmu
    # NOTE(review): this stores the pagelist URL, not dm_url, under
    # 'danmu_api' -- kept as-is for compatibility with existing documents;
    # confirm which URL was intended.
    di['danmu_api'] = url
    client = save_to_mongo()
    try:
        # Collection.insert() was deprecated in PyMongo 3 and removed in
        # PyMongo 4; insert_one() is the supported single-document call.
        client['blibli']['华农兄弟'].insert_one(di)
    finally:
        # A new client is opened per video, so release its connections.
        client.close()


if __name__ == '__main__':
    # Space search API, 30 videos per page, ordered by publish date; `{0}` is
    # the 1-based page number. Pages 1 through 9 are crawled.
    list_url_template = (
        "https://api.bilibili.com/x/space/arc/search?mid=250858633&ps=30&tid=0&pn={0}"
        "&keyword=&order=pubdate&jsonp=jsonp"
    )
    for page in range(1, 10):
        crawl_list_page(list_url_template.format(page))

# 弹幕:https://api.bilibili.com/x/v1/dm/list.so?oid=109798064  
# detail_url: https://www.bilibili.com/video/av63221412  
# title: https://api.bilibili.com/x/web-interface/archive/desc?callback=jqueryCallback_bili_8691248159053766&aid=63221412&page=&jsonp=jsonp&_=1577432452871  
# 评论 : https://api.bilibili.com/x/v2/reply?callback=jQuery172022627270899916274_1577432450295&jsonp=jsonp&pn=1&type=1&oid=63221412&sort=2&_=1577432452892  
# 视频详细信息:https://api.bilibili.com/x/web-interface/view?aid=63221412&cid=109798064  

生成词云:
本来是打算找个竹鼠的图的,但是,生成词云的图片,好像必须要白底黑图,就随便网上找了张

import pymongo  
from wordcloud import WordCloud  
import PIL.Image as image  
import numpy as np  
import jieba

def client_mongo():
    """Return a ``pymongo.MongoClient`` connected to 127.0.0.1:27017."""
    return pymongo.MongoClient(host='127.0.0.1', port=27017)


def trans_cn(text):
    """Segment ``text`` with jieba and return the tokens joined by single spaces."""
    return " ".join(jieba.cut(text))


def get_data():
    """Render and display a word cloud built from all stored danmu.

    Every document in the ``blibli`` database, collection ``华农兄弟``, is read
    and its ``'danmu'`` list collected. The danmu strings are joined with
    spaces (no jieba segmentation is applied), rendered into a word cloud
    shaped by the mask image ``.\\1.png``, and shown in the default image
    viewer.
    """
    client = client_mongo()
    try:
        coll = client['blibli']['华农兄弟']
        texts = []
        for doc in coll.find():
            # Documents stored without a 'danmu' list contribute nothing
            # instead of raising TypeError in extend(None).
            texts.extend(doc.get('danmu') or [])
    finally:
        client.close()

    # Each danmu is kept whole; trans_cn() segmentation is intentionally not
    # applied here.
    word_text = " ".join(texts)

    # Shape of the word cloud; close the image file once it is copied into
    # the array.
    with image.open(r".\1.png") as mask_image:
        mask = np.array(mask_image)

    wordcloud = WordCloud(
        mask=mask,
        background_color="white",
        # Font able to render Chinese glyphs. The path must be plain ASCII: a
        # stray invisible directional mark (U+202A, commonly picked up when
        # copying a path from a Windows dialog) in front of "C:" makes the
        # font file impossible to open.
        font_path=r"C:\Windows\Fonts\simhei.ttf"
    ).generate(word_text)
    image_produce = wordcloud.to_image()
    image_produce.show()


# Script entry point: build and display the word cloud only when this file is
# run directly, not when it is imported.
if __name__ == '__main__':  
    get_data()  

效果:

 

标签:网络爬虫

文章评论

评论列表

已有0条评论