python 数据可视化matplotlib、pyecharts和词云常用操作

Python 2022-07-13 910

1.matplotlib的条形图、折线图、饼图

1.1条形图

# 绘制条形统计图
# 全国各城市男女人数
from utils.tools import readExcel
import matplotlib
import matplotlib.pyplot as plt
import numpy as np


def showBarCharts():
    """Show a grouped bar chart of male and female population per region.

    Loads ``./../census/census.xls`` through the project helper ``readExcel``
    and displays the figure with ``plt.show()``. Takes no arguments and
    returns nothing.
    """
    # Only the column-wise view of the sheet is needed here.
    _, col_datas = readExcel('./../census/census.xls')
    # The first two entries of every column are skipped
    # (presumably header/total rows -- confirm against the spreadsheet).
    x_data = col_datas[0][2:]
    male = np.array([int(i) for i in col_datas[5][2:]])  # first bar series (male)
    female = np.array([int(i) for i in col_datas[6][2:]])  # second bar series (female)

    # Use a font that can render the Chinese labels.
    matplotlib.rc("font", family='MicroSoft YaHei')

    x = np.arange(len(x_data))  # one slot per region on the x axis
    width = 0.4  # width of a single bar

    plt.title("2010年全国各省市男/女性总人口")
    # The original set the y label twice; only this final value was ever shown.
    plt.ylabel('人口(人)')
    plt.ylim(500000, 80000000)

    plt.bar(x, male, width=width, label="全国男性总人口")
    # Shift the second series by one bar width so the bars sit side by side.
    plt.bar(x + width, female, width=width, label="全国女性总人口")
    # Centre each tick between the two bars and rotate the labels.
    plt.xticks(ticks=x + width / 2, labels=x_data, rotation=70, fontsize=8)
    plt.legend()
    plt.show()

1.2 折线图

# 折线图
# 各省市人口对比
from utils.tools import readExcel
import matplotlib
import matplotlib.pyplot as plt


def showLineChart():
    """Show a line chart of total population per region, with value labels.

    Loads ``./../census/census.xls`` through the project helper ``readExcel``
    and displays the figure with ``plt.show()``. Takes no arguments and
    returns nothing.
    """
    _rows, columns = readExcel('./../census/census.xls')
    # The first two entries of each column are skipped.
    regions = columns[0][2:]
    totals = list(map(int, columns[4][2:]))

    # Use a font that can render the Chinese labels.
    matplotlib.rc("font", family='MicroSoft YaHei')

    # Annotate every data point with its value, anchored just above the point.
    for region, total in zip(regions, totals):
        plt.text(region, total, str(total), ha='center', va='bottom', fontsize=8)

    plt.plot(regions, totals, color="pink", marker=".", alpha=0.5, linewidth=1)
    plt.ylabel("人口/亿")
    plt.xticks(rotation=70, fontsize=8)
    plt.title("城市总人口")
    plt.show()

1.3 饼图

# 饼图
# 统计全国男女比例
from utils.tools import readExcel
import matplotlib
import matplotlib.pyplot as plt


def draw_pie_charts():
    """Show a pie chart of the nationwide male/female population share.

    Loads ``./../census/census.xls`` through the project helper ``readExcel``,
    prints the two values that are plotted, and displays the figure with
    ``plt.show()``. Takes no arguments and returns nothing.
    """
    rows, _columns = readExcel('./../census/census.xls')
    # Row 1, columns 5 and 6 feed the two slices labelled male and female.
    datas = rows[1][5:7]
    print(datas)  # e.g. ['682329104', '650481765']

    # Use a font that can render the Chinese labels.
    matplotlib.rc("font", family='MicroSoft YaHei')

    # matplotlib converts the raw values into percentages by itself.
    pie_options = dict(
        labels=["男", "女"],
        autopct='%1.1f%%',
        counterclock=False,
        startangle=90,
        explode=[0.1, 0],
    )
    plt.pie(datas, **pie_options)

    plt.title('2010年全国男/女占比')
    plt.tight_layout()  # adjust layout parameters so the figure area is filled
    plt.show()

2.pyecharts 绘制折线、条形、地图、箱型图

2.1 条形图绘制

"""
规格分析-比对不同参数电脑的销量情况
-条形图
"""
from collections import Counter

from pyecharts import options as opts
from pyecharts.charts import Bar, Grid

from utils.tools import read_csv

def count_product_sizes(items):
    """Count comments per product model, back-filling missing models by ID.

    Some records have no value under ``'商品型号'``; for those, the model of
    another record sharing the same ``'商品ID'`` is used instead. Records
    whose model cannot be determined at all are left out.

    :param items: iterable of dict-like records that provide ``.get`` for the
        keys ``'商品型号'`` and ``'商品ID'``
    :return: ``collections.Counter`` mapping model name to number of records
    """
    # The records are walked twice, so materialise one-shot iterables first.
    items = list(items)

    # Pass 1: remember the first non-empty model seen for every product ID.
    model_by_id = {}
    for item in items:
        model = item.get('商品型号')
        if model:
            model_by_id.setdefault(str(item.get('商品ID')).strip(), model)

    # Pass 2: resolve every record to a model. Any falsy value ('' or None)
    # triggers the ID lookup, matching the truthiness test used in pass 1.
    models = []
    for item in items:
        model = item.get('商品型号')
        if not model:
            model = model_by_id.get(str(item.get('商品ID')).strip())
        if model:
            models.append(model)
    return Counter(models)


if __name__ == '__main__':
    counts = count_product_sizes(read_csv('jd_all_comment.csv'))

    # Chart data
    xaxis = list(counts.keys())  # x axis: model names
    yaxis = list(counts.values())  # y axis: number of comments per model

    bar = (
        Bar()
            .add_xaxis(xaxis)
            .add_yaxis("型号", yaxis)
            .set_global_opts(title_opts=opts.TitleOpts(title="华硕(ASUS)灵耀13s 大明宫版"),
                             xaxis_opts=opts.AxisOpts(name_rotate=60, axislabel_opts={"rotate": 45}))
    )
    # Wrap the bar chart in a Grid so the x-axis labels are not cut off.
    grid = Grid(init_opts=opts.InitOpts(
        width='900px',
        height='800px',
    ))
    grid.add(bar, grid_opts=opts.GridOpts(pos_bottom='30%', is_contain_label=True))
    grid.render("product_size.html")
    print('生成完成')

2.2 箱型图

"""
    箱线图,通过箱型的大小就能直观展示出数据的分布
"""
from utils.tools import read_csv
from pyecharts import options as opts
from pyecharts.charts import Boxplot
import datetime
import math

def comment_delay_days(listed_at, commented_at):
    """Return the number of days, rounded up, between listing and comment.

    :param listed_at: listing timestamp formatted as ``%Y-%m-%d %H:%M:%S``
    :param commented_at: comment timestamp in the same format
    :return: ``math.ceil`` of the elapsed time expressed in days
    :raises ValueError: if either string does not match the format
    """
    time_format = "%Y-%m-%d %H:%M:%S"
    start = datetime.datetime.strptime(listed_at, time_format)
    end = datetime.datetime.strptime(commented_at, time_format)
    return math.ceil((end - start).total_seconds() / 60 / 60 / 24)


def bucket_delays(days):
    """Split day counts into the three groups shown on the box plot.

    :param days: iterable of day counts
    :return: ``[within 2 days, within 7 days, more than 12 days]``; the first
        group is a subset of the second
    """
    days = list(days)
    return [
        [d for d in days if d <= 2],
        [d for d in days if d <= 7],
        # Strictly greater than 12, to match the "大于12天" axis label.
        [d for d in days if d > 12],
    ]


if __name__ == '__main__':
    items = read_csv('jd_all_comment.csv')
    # Days elapsed between the product listing and each comment.
    times = [
        comment_delay_days(i.get('上架时间'), i.get('评论时间'))
        for i in items
    ]
    xaxis = ['2天内', '7天内', '大于12天']  # x-axis labels of the box plot
    c = Boxplot()
    c.add_xaxis(xaxis)
    c.add_yaxis("A", c.prepare_data(bucket_delays(times)))
    c.set_global_opts(title_opts=opts.TitleOpts(title="评论时间"))
    c.render("comment_time_boxplot.html")

2.3 地图

from pyecharts import options as opts
from pyecharts.charts import Map
from utils.tools import readExcel
from pyecharts.render import make_snapshot  # 保存图片需要的
from snapshot_selenium import snapshot as driver  # 保存图片需要的
"""
 保存图片时需要 安装 chrome driver
 https://blog.csdn.net/weixin_46660141/article/details/116498316
 且需要 去掉 .render("geo.html")
"""

if __name__ == '__main__':
    rows, _columns = readExcel('./../spider/2010年水资源情况.xls')

    # Shorten region names by removing these fragments, in this exact order.
    removable = ('省', '市', '自治区', '维吾尔', '回族', '壮族')
    region_values = []
    for record in rows[1:]:  # the first row is skipped
        region = record[0]
        for fragment in removable:
            region = region.replace(fragment, '')
        region_values.append([region, float(record[1])])

    # Colour bands used by the piecewise visual map.
    pieces = [
        {"min": 0, "max": 500, "label": "1~500", "color": "cyan"},
        {"min": 500, "max": 1000, "label": "501~1000", "color": "yellow"},
        {"min": 1000, "max": 2000, "label": "1001~2000", "color": "orange"},
        {"min": 2000, "max": 3000, "label": "2001~3000", "color": "coral"},
        {"min": 3000, "max": 5000, "label": "3001~5000", "color": "red"},
    ]

    chart = Map()
    chart.add("水资源总量", region_values, maptype='china')
    chart.set_series_opts(label_opts=opts.LabelOpts(is_show=False))
    chart.set_global_opts(
        visualmap_opts=opts.VisualMapOpts(is_piecewise=True, pieces=pieces),
        title_opts=opts.TitleOpts(title="2010年水资源总量(亿立方米)分布"),
    )
    c = chart.render("geo.html")
    # To save a picture instead, keep `c` as the Map (drop the render call above) and run:
    # make_snapshot(driver, c.render(), "2010年水资源总量分布图.png")

3.词云

在制作词云时,为了让分词结果更有意义,需要提供停用词表,过滤掉一些无意义的词

"""
词云图
"""
import os

import jieba
from utils.tools import read_csv

from wordcloud import WordCloud


def get_abs_path():
    """
    Return the absolute path of the parent of the directory holding this file.
    :return: absolute directory path as a string
    """
    module_dir = os.path.abspath(os.path.dirname(__file__))
    parent_dir = os.path.dirname(module_dir)
    return os.path.abspath(parent_dir)


def stop_words_list(filepath):
    """
    Load a stop-word list from a UTF-8 text file, one entry per line.

    Each line is stripped of surrounding whitespace; blank lines are kept as
    empty strings, so the result has one element per line of the file.
    :param filepath: path of the stop-word file
    :return: list of stripped lines, in file order
    """
    # The context manager closes the file; the original left the handle open.
    with open(filepath, 'r', encoding='utf-8') as stop_word_file:
        return [line.strip() for line in stop_word_file]


def parse_word(items, word_cloud_name):
    """
    Build a word-cloud image from review comments and save it as a JPEG.

    :param items: iterable of dict-like records that provide ``.get`` for the
        keys ``'评论'`` (comment) and ``'追评'`` (follow-up comment)
    :param word_cloud_name: output file name without extension; the image is
        written to ``<word_cloud_name>.jpg``
    :return: None
    """
    comments = []
    for item in items:
        # A missing comment is treated as empty instead of crashing on None.
        comment = (item.get('评论') or '').strip()
        # Comments containing this marker are replaced by the follow-up
        # comment (presumably it marks an auto-generated placeholder -- confirm).
        if '此用户未及时' in comment:
            comment = item.get('追评')
        if comment:
            comments.append(comment)
    content = "".join(comments)  # merge all comments into one text

    stop_word_path = os.path.join(
        get_abs_path(), r'./comment_word_cloud/stopword.txt'
    ).replace('\\', '/')
    # A set makes the per-token membership test below constant time.
    stopwords = set(stop_words_list(stop_word_path))

    # Segment the text with jieba (full mode) and drop the stop words.
    kept_words = [
        word for word in jieba.cut(content, cut_all=True)
        if word not in stopwords
    ]
    data_end = " ".join(kept_words)

    mask = None
    # max_words: maximum number of words drawn in the cloud.
    # font_path: font file used for rendering (relative to the working directory).
    # WordCloud also accepts its own `stopwords` option; it is not used here.
    # collocations=False turns off two-word pairs, which avoids repeated
    # combinations of the same word showing up in the cloud.
    wd = WordCloud(width=600, height=400, max_words=200, font_path='./simfang.ttf',
                   collocations=False, mask=mask)
    wd.generate(data_end).to_file('{}.jpg'.format(word_cloud_name))  # render and save

标签:Python

文章评论

评论列表

已有0条评论