1.matplotlib的条形图、折线图、饼图
1.1条形图
# 绘制条形统计图
# 全国各城市男女人数
from utils.tools import readExcel
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
def showBarCharts():
    """Show a grouped bar chart of male vs. female population per region.

    Loads the census workbook through ``readExcel``, takes the x-axis
    labels from column 0 and the male / female counts from columns 5 / 6,
    and displays the figure with ``plt.show()``.

    :return: None
    """
    _, col_datas = readExcel('./../census/census.xls')

    # The first two entries of every column are skipped
    # (presumably header / nationwide-total rows -- TODO confirm against the workbook).
    x_data = col_datas[0][2:]
    male_counts = np.array([int(v) for v in col_datas[5][2:]])    # first bar series
    female_counts = np.array([int(v) for v in col_datas[6][2:]])  # second bar series

    # The title and labels are Chinese, so select a font family for them.
    matplotlib.rc("font", family='MicroSoft YaHei')

    x = np.arange(len(x_data))  # one slot per region
    width = 0.4                 # width of a single bar

    plt.title("2010年全国各省市男/女性总人口")
    # Only one y-axis label is set: a second plt.ylabel() call would simply
    # overwrite the first one.
    plt.ylabel('人口(人)')
    plt.ylim(500000, 80000000)

    # Draw the two series side by side: the second one is shifted by one bar width.
    plt.bar(x, male_counts, width=width, label="全国男性总人口")
    plt.bar(x + width, female_counts, width=width, label="全国女性总人口")

    # Put each tick between its pair of bars and rotate the labels.
    plt.xticks(ticks=x + width / 2, labels=x_data, rotation=70, fontsize=8)
    plt.legend()
    plt.show()
1.2 折线图
# 折线图
# 各省市人口对比
from utils.tools import readExcel
import matplotlib
import matplotlib.pyplot as plt
def showLineChart():
    """Show a line chart of the total population of every region.

    Column 0 of the census workbook supplies the x-axis labels and column 4
    the values; each point is annotated with its value and the figure is
    displayed with ``plt.show()``.

    :return: None
    """
    _, columns = readExcel('./../census/census.xls')
    regions = columns[0][2:]
    totals = list(map(int, columns[4][2:]))

    matplotlib.rc("font", family='MicroSoft YaHei')

    # Write every value next to its point (horizontally centred, text bottom on the point).
    for region, total in zip(regions, totals):
        plt.text(region, total, str(total), ha='center', va='bottom', fontsize=8)

    plt.plot(regions, totals, color="pink", marker=".", alpha=0.5, linewidth=1)
    # NOTE(review): the label reads "population / 100 million" but the values
    # plotted are the raw integers from the sheet -- confirm the intended unit.
    plt.ylabel("人口/亿")
    plt.xticks(rotation=70, fontsize=8)
    plt.title("城市总人口")
    plt.show()
1.3 饼图
# 饼图
# 统计全国男女比例
from utils.tools import readExcel
import matplotlib
import matplotlib.pyplot as plt
def draw_pie_charts():
    """Show a pie chart of the nationwide male / female share.

    Reads cells 5 and 6 of row 1 of the census workbook (male and female
    totals), prints the raw cells, and displays the pie with ``plt.show()``.

    :return: None
    :raises ValueError: if one of the two cells cannot be parsed as a number
    """
    row_datas, _ = readExcel('./../census/census.xls')
    datas = row_datas[1][5:7]
    print(datas)  # e.g. ['682329104', '650481765']

    # Convert the cells to numbers explicitly, like the other chart functions
    # do, instead of relying on plt.pie to coerce strings internally.
    sizes = [float(value) for value in datas]
    labels = ["男", "女"]

    matplotlib.rc("font", family='MicroSoft YaHei')

    # matplotlib turns the absolute values into percentages itself.
    plt.pie(sizes, labels=labels, autopct='%1.1f%%', counterclock=False, startangle=90, explode=[0.1, 0])
    plt.title('2010年全国男/女占比')
    plt.tight_layout()  # adjust the layout so the chart fills the figure area
    plt.show()
2.pyecharts 绘制折线、条形、地图、箱型图
2.1 条形图绘制
"""
规格分析-比对不同参数电脑的销量情况
-条形图
"""
from collections import Counter
from pyecharts import options as opts
from pyecharts.charts import Bar, Grid
from utils.tools import read_csv
def count_product_models(items):
    """Count how many rows belong to each product model.

    Rows whose '商品型号' (model) is blank are attributed to the first model
    seen for the same '商品ID' (product id); rows that still have no model
    are ignored.

    :param items: iterable of dict-like rows with '商品型号' and '商品ID' keys
    :return: ``collections.Counter`` mapping model -> number of rows, keyed
             in order of first appearance
    """
    items = list(items)  # iterated twice below, so a one-shot iterator must be materialised

    # First pass: remember the first non-blank model seen for each product id.
    model_by_id = {}
    for item in items:
        model = item.get('商品型号')
        if model:
            model_by_id.setdefault(str(item.get('商品ID')).strip(), model)

    # Second pass: back-fill blank models from the map. Any falsy value
    # (empty string or None) counts as blank, not just ''.
    models = []
    for item in items:
        model = item.get('商品型号')
        if not model:
            model = model_by_id.get(str(item.get('商品ID')).strip())
        if model:
            models.append(model)
    return Counter(models)


if __name__ == '__main__':
    items = read_csv('jd_all_comment.csv')
    counts = count_product_models(items)

    # Chart data
    xaxis = list(counts.keys())    # x axis: model names
    yaxis = list(counts.values())  # y axis: number of rows per model
    bar = (
        Bar()
        .add_xaxis(xaxis)
        .add_yaxis("型号", yaxis)
        .set_global_opts(title_opts=opts.TitleOpts(title="华硕(ASUS)灵耀13s 大明宫版"),
                         xaxis_opts=opts.AxisOpts(name_rotate=60, axislabel_opts={"rotate": 45}))
    )
    # Wrap the bar chart in a Grid so the rotated x-axis labels are not cut off.
    grid = Grid(init_opts=opts.InitOpts(
        width='900px',
        height='800px',
    ))
    grid.add(bar, grid_opts=opts.GridOpts(pos_bottom='30%', is_contain_label=True))
    grid.render("product_size.html")
    print('生成完成')
2.2 箱型图
"""
箱线图,通过箱型的大小就能直观展示出数据的分布
"""
from utils.tools import read_csv
from pyecharts import options as opts
from pyecharts.charts import Boxplot
import datetime
import math
def days_between(start_text, end_text, time_format="%Y-%m-%d %H:%M:%S"):
    """Return the number of days from one timestamp to another, rounded up.

    :param start_text: earlier timestamp as text
    :param end_text: later timestamp as text
    :param time_format: ``strptime`` format shared by both timestamps
    :return: ``math.ceil`` of the elapsed time expressed in days (int)
    :raises ValueError: if a timestamp does not match ``time_format``
    """
    start = datetime.datetime.strptime(start_text, time_format)
    end = datetime.datetime.strptime(end_text, time_format)
    hours = (end - start).total_seconds() / 60 / 60
    return math.ceil(hours / 24)


if __name__ == '__main__':
    items = read_csv('jd_all_comment.csv')

    # Days elapsed between the listing time and each comment.
    times = [days_between(i.get('上架时间'), i.get('评论时间')) for i in items]

    xaxis = ['2天内', '7天内', '大于12天']  # box-plot category labels
    yaxis1 = [d for d in times if d <= 2]
    yaxis2 = [d for d in times if d <= 7]  # also contains the "<= 2" values
    # NOTE(review): the label says "more than 12 days" but day 12 itself is
    # included here, and days 8-11 fall into no group -- confirm the intended buckets.
    yaxis3 = [d for d in times if d >= 12]

    c = Boxplot()
    c.add_xaxis(xaxis)
    c.add_yaxis("A", c.prepare_data([yaxis1, yaxis2, yaxis3]))
    c.set_global_opts(title_opts=opts.TitleOpts(title="评论时间"))
    c.render("comment_time_boxplot.html")
2.3 地图
from pyecharts import options as opts
from pyecharts.charts import Map
from utils.tools import readExcel
from pyecharts.render import make_snapshot # 保存图片需要的
from snapshot_selenium import snapshot as driver # 保存图片需要的
"""
保存图片时需要 安装 chrome driver
https://blog.csdn.net/weixin_46660141/article/details/116498316
且需要 去掉 .render("geo.html")
"""
if __name__ == '__main__':
    row_datas, _ = readExcel('./../spider/2010年水资源情况.xls')

    # Fragments stripped from each region name, applied one after another in
    # exactly this order.
    name_fragments = ('省', '市', '自治区', '维吾尔', '回族', '壮族')

    region_values = []
    for record in row_datas[1:]:  # row 0 is skipped
        region = record[0]
        for fragment in name_fragments:
            region = region.replace(fragment, '')
        region_values.append([region, float(record[1])])

    # Colour bands of the piecewise visual map.
    pieces = [
        {"min": 0, "max": 500, "label": "1~500", "color": "cyan"},
        {"min": 500, "max": 1000, "label": "501~1000", "color": "yellow"},
        {"min": 1000, "max": 2000, "label": "1001~2000", "color": "orange"},
        {"min": 2000, "max": 3000, "label": "2001~3000", "color": "coral"},
        {"min": 3000, "max": 5000, "label": "3001~5000", "color": "red"},
    ]
    visual_map = opts.VisualMapOpts(is_piecewise=True, pieces=pieces)
    title = opts.TitleOpts(title="2010年水资源总量(亿立方米)分布")

    c = (
        Map()
        .add("水资源总量", region_values, maptype='china')
        .set_series_opts(label_opts=opts.LabelOpts(is_show=False))
        .set_global_opts(visualmap_opts=visual_map, title_opts=title)
        .render("geo.html")
    )
    # To save a PNG instead, drop the .render("geo.html") call above and use:
    # make_snapshot(driver, c.render(), "2010年水资源总量分布图.png")
3.词云
在制作词云时,为了让分词更有效,需要提供停用词表,过滤掉一些无意义的词
"""
词云图
"""
import os
import jieba
from utils.tools import read_csv
from wordcloud import WordCloud
def get_abs_path():
    """
    Return the absolute path of the parent of the directory holding this file.

    :return: absolute directory path as a string
    """
    module_dir = os.path.abspath(os.path.dirname(__file__))
    parent_dir = os.path.dirname(module_dir)
    return os.path.abspath(parent_dir)
def stop_words_list(filepath):
    """
    Load the stop words from a UTF-8 text file, one word per line.

    Each line is stripped of surrounding whitespace; blank lines are kept as
    empty strings.

    :param filepath: path of the stop-word file
    :return: list of stripped lines, in file order
    :raises OSError: if the file cannot be opened
    """
    # The with-block closes the file handle even if reading fails.
    with open(filepath, 'r', encoding='utf-8') as stop_file:
        return [line.strip() for line in stop_file]
def parse_word(items, word_cloud_name):
    """
    Build a word-cloud image from review comments.

    The comments are concatenated, segmented with jieba, filtered against the
    stop-word file and rendered to ``<word_cloud_name>.jpg``.

    :param items: iterable of dict-like rows; the '评论' (comment) and
                  '追评' (follow-up comment) keys are read
    :param word_cloud_name: output file name without the '.jpg' extension
    :return: None
    """
    words = []
    for i in items:
        # A missing comment is treated as empty instead of crashing on None.strip().
        comment = (i.get('评论') or '').strip()
        # Presumably the site's placeholder for "user did not write a review";
        # in that case the follow-up comment is used instead.
        if '此用户未及时' in comment:
            comment = i.get('追评')
        if comment:
            words.append(comment)
    content = "".join(words)  # merge all comments into one text before segmenting

    stop_word_path = os.path.join(get_abs_path(), r'./comment_word_cloud/stopword.txt').replace('\\', '/')
    # A set gives constant-time membership tests for the per-token filter below.
    stopwords = set(stop_words_list(stop_word_path))

    # Segment in full mode and drop the stop words.
    kept_words = [word for word in jieba.cut(content, cut_all=True) if word not in stopwords]
    data_end = " ".join(kept_words)

    # max_words: maximum number of words shown in the cloud
    # font_path: font file used for rendering
    # collocations=False: do not include two-word collocations
    # (WordCloud also has its own `stopwords` option, which is not used here.)
    wd = WordCloud(width=600, height=400, max_words=200, font_path='./simfang.ttf', collocations=False, mask=None)
    wd.generate(data_end).to_file('{}.jpg'.format(word_cloud_name))
评论列表
已有0条评论