【基于机器学习的餐馆评论文本分类分析】( 五 )


对预处理完成后的数据进行统计：筛选出评论量大于50的餐馆，过滤掉评论量较少的餐馆，集中分析评论量较多的餐馆；再按照各餐馆好评/差评数量占其评论总数的比例降序排列，分别查看好评比例和差评比例的top10餐馆。统计情况如下图。
import pandas as pd
import matplotlib.pyplot as plt

# Work on a copy so the frame produced by the earlier preprocessing step
# (`batch_data`, defined before this snippet) is not modified.
data = batch_data.copy()

# Use a font that can render Chinese labels, and keep the minus sign
# drawable with that font.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

# Keep only restaurants that have more than 50 reviews.
filtered_data = data.groupby('restId').filter(lambda x: len(x) > 50)

# Per-restaurant review counts, split by the `evaluation` label
# (1 is treated as a good review, 0 as a bad review).
grouped = (
    filtered_data.groupby(['restId', 'evaluation'])
    .size()
    .reset_index(name='count')
)
good_counts = grouped[grouped['evaluation'] == 1]
bad_counts = grouped[grouped['evaluation'] == 0]
total_counts = (
    grouped.groupby('restId')['count'].sum().reset_index(name='total_count')
)

# Share of good / bad reviews among all reviews of each restaurant.
good_ratios = good_counts.merge(total_counts, on='restId')
good_ratios['good_ratio'] = good_ratios['count'] / good_ratios['total_count']
bad_ratios = bad_counts.merge(total_counts, on='restId')
bad_ratios['bad_ratio'] = bad_ratios['count'] / bad_ratios['total_count']

# Outer merge: a restaurant with no bad (or no good) reviews has no row on
# one side. An inner merge would silently drop it, so `ratios` would
# disagree with `top10_good` / `top10_bad` below. A missing side means a
# ratio of 0.
ratios = (
    good_ratios.merge(bad_ratios, on='restId', how='outer')
    .merge(data[['name', 'restId']].drop_duplicates(), on='restId')
)
ratios[['good_ratio', 'bad_ratio']] = (
    ratios[['good_ratio', 'bad_ratio']].fillna(0.0)
)

# Top 10 restaurants by good-review ratio and by bad-review ratio.
top10_good = good_ratios.sort_values(by='good_ratio', ascending=False).head(10)
top10_bad = bad_ratios.sort_values(by='bad_ratio', ascending=False).head(10)

# Horizontal bar charts of both rankings, side by side. Both axes are
# created in one call instead of layering plt.subplot() on top of the axes
# that plt.subplots() already made.
best_by_good = ratios.nlargest(10, 'good_ratio')
best_by_bad = ratios.nlargest(10, 'bad_ratio')

fig, (ax_good, ax_bad) = plt.subplots(1, 2, figsize=(10, 7))
fig.suptitle(
    'Restaurants Rating Ratio - Top 10 Good Rating vs. Top 10 Bad Rating',
    fontsize=16,
    y=1,
)

ax_good.barh(y=best_by_good['name'], width=best_by_good['good_ratio'], color='green')
ax_good.tick_params(axis='x', labelrotation=0)
ax_good.set_xlabel('Good Rating Ratio')
ax_good.set_ylabel('Restaurant Name')
ax_good.set_title('Top 10 Restaurants by Good Rating Ratio')

ax_bad.barh(y=best_by_bad['name'], width=best_by_bad['bad_ratio'], color='red')
ax_bad.tick_params(axis='x', labelrotation=0)
ax_bad.set_xlabel('Bad Rating Ratio')
ax_bad.set_ylabel('Restaurant Name')
ax_bad.set_title('Top 10 Restaurants by Bad Rating Ratio')

# Wide gap so the long restaurant names on the right panel do not overlap
# the left panel.
fig.subplots_adjust(wspace=1)
plt.show()
2.3 Top10 餐馆随时间变化的好评/差评[按照总体评分]线图
随着时间推移，餐馆的服务水平、菜品质量、环境卫生等方面可能发生变化，对应的好评/差评率也会有所反映。整个数据的时间跨度为2009-07-04 00:16:00到2012-02-08 14:04:00，共涉及32个月、137个周、950天，数据量比较大。因此以周为时间粒度，绘制餐馆好评/差评（按总体评分）随时间变化的线图如下。
def plot_evaluation_by_week(top10, filtered_data, ratios, is_good_evaluation=True):
    """Plot the weekly mean of ``rating_2`` for each restaurant in ``top10``.

    One figure is drawn and shown per restaurant.

    Args:
        top10: DataFrame with a ``restId`` column listing the restaurants
            to plot.
        filtered_data: Review-level DataFrame with ``restId``,
            ``timestamp`` and ``rating_2`` columns.
        ratios: DataFrame with ``restId`` and ``name`` columns, used to
            look up the restaurant name for the legend.
        is_good_evaluation: True when ``top10`` is the good-review ranking,
            False when it is the bad-review ranking. Only affects the
            figure title.
    """
    # The original expression was inverted: the good-review ranking was
    # titled as the bad-review one and vice versa.
    title = '好评排名top10的餐厅' if is_good_evaluation else '差评排名top10的餐厅'

    for restId in top10['restId']:
        # Select only the needed columns (this also yields a fresh frame, so
        # nothing is written onto a slice of `filtered_data`).
        rest_rows = filtered_data.loc[
            filtered_data['restId'] == restId, ['timestamp', 'rating_2']
        ]
        # Average only the rating column: taking the mean of the whole frame
        # fails on non-numeric columns in recent pandas versions.
        weekly_rating = (
            rest_rows.assign(date=pd.to_datetime(rest_rows['timestamp']))
            .set_index('date')['rating_2']
            .resample('W')
            .mean()
        )

        # `ratios` may not contain every restaurant in `top10`; fall back to
        # the id rather than raising IndexError.
        names = ratios.loc[ratios['restId'] == restId, 'name']
        rest_name = names.iloc[0] if not names.empty else str(restId)

        fig, ax = plt.subplots()
        ax.plot(weekly_rating.index, weekly_rating, label=rest_name, color='blue')
        ax.set_xlabel('时间')
        ax.set_ylabel('评分')
        ax.set_title(title)
        ax.tick_params(axis='x', labelrotation=45)
        ax.legend(loc='best')
        plt.show()