微信聊天记录词云

前端之家收集整理的这篇文章主要介绍了微信聊天记录词云,前端之家小编觉得挺不错的,现在分享给大家,也给大家做个参考。

<table class="python"><tr class="li1">
<td class="ln"><pre class="de1">1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
</pre></td>
<td class="de1"><pre class="de1"><span class="co1"># -*- coding: utf-8 -*-
<span class="st0">"""
<span class="st0">Created on Tue Jan 17 17:00:15 2017
<span class="st0"> https://zhuanlan.zhihu.com/p/25188725
<span class="st0">@author: hhl
<span class="st0">"""
<span class="co1">#首先加载所需的各种库
<span class="kw1">import <span class="kw3">re
<span class="kw1">import requests
<span class="kw1">import <span class="kw3">time
<span class="kw1">import numpy <span class="kw1">as np
<span class="kw1">import <span class="kw3">codecs
<span class="kw1">import pandas
<span class="kw1">from lxml <span class="kw1">import etree
<span class="co1">#import seaborn as sns
<span class="kw1">import jieba
<span class="kw1">import <span class="kw3">os
<span class="kw1">from wordcloud <span class="kw1">import WordCloud
<span class="kw1">import matplotlib.<span class="me1">pyplot <span class="kw1">as plt
%matplotlib inline
 
<span class="co1">#遍历文件中的数据
<span class="kw2">file<span class="sy0">=<span class="kw3">codecs.<span class="kw2">open<span class="br0">(u<span class="st0">"Msg_new666_20170211.html"<span class="sy0">,<span class="st0">'r'<span class="br0">)
html<span class="sy0">=<span class="kw2">file.<span class="me1">read<span class="br0">(<span class="br0">)
<span class="kw2">file.<span class="me1">close<span class="br0">(<span class="br0">)
 
<span class="co1">#通过相应规则整理数据
item_pattern <span class="sy0">= <span class="kw3">re.<span class="kw2">compile<span class="br0">(
    r<span class="st0">'<SPAN class="MsgHistory">(.*?)</SPAN>'<span class="sy0">,
    <span class="kw3">re.<span class="me1">S<span class="br0">)
 
 
<span class="kw1">def parse_askitem<span class="br0">(page<span class="br0">):
    info <span class="sy0">= <span class="kw3">re.<span class="me1">findall<span class="br0">(item_pattern<span class="sy0">, page<span class="br0">)
    <span class="kw1">return info
 
items_list <span class="sy0">= parse_askitem<span class="br0">(html<span class="br0">)
 
<span class="co1">#对list进行处理,取出文字类型的数据并汇总到一个content中</span>
content_list <span class="sy0">= <span class="br0">[<span class="br0">]
<span class="kw1">for item <span class="kw1">in items_list:
    <span class="co1">#print(item)
    <span class="kw1">if <span class="br0">(<span class="st0">'.' <span class="kw1">not <span class="kw1">in item<span class="br0">)&<span class="br0">(<span class="st0">';' <span class="kw1">not <span class="kw1">in item<span class="br0">):
        content_list.<span class="me1">append<span class="br0">(item<span class="br0">)
 
content <span class="sy0">=<span class="st0">""
<span class="kw1">for con <span class="kw1">in content_list:
    content <span class="sy0">= content + con
 
<span class="co1">#print(len(content))
 
<span class="co1">#分词  
segment <span class="sy0">= <span class="br0">[<span class="br0">]
segs <span class="sy0">= jieba.<span class="me1">cut<span class="br0">(content<span class="br0">)
<span class="kw1">for seg <span class="kw1">in segs:
    <span class="kw1">if <span class="kw2">len<span class="br0">(seg<span class="br0">) <span class="sy0">> <span class="nu0">1 <span class="kw1">and seg<span class="sy0">!=<span class="st0">'<span class="es0">\r<span class="es0">\n':
        segment.<span class="me1">append<span class="br0">(seg<span class="br0">)
 
<span class="co1">#去停用词
words_df<span class="sy0">=pandas.<span class="me1">DataFrame<span class="br0">(<span class="br0">{<span class="st0">'segment':segment<span class="br0">}<span class="br0">)
words_df.<span class="me1">head<span class="br0">(<span class="br0">)
<span class="co1">#stopwords=pandas.read_csv("stopwords.txt",index_col=False,quoting=3,sep="\t",names=['stopword'])#quoting=3全不引用
<span class="co1">#stopwords.head()
<span class="co1">#words_df=words_df[~words_df.segment.isin(stopwords.stopword)]
ancient_chinese_stopwords<span class="sy0">=pandas.<span class="me1">Series<span class="br0">(<span class="br0">[<span class="co1">#'的',
                                         <span class="co1">#'其','或','亦','方','于','即','皆',
                                         <span class="co1">#'因','仍','故','尚','呢','了','的','着',
                                         <span class="st0">'" "'<span class="br0">]<span class="br0">)
words_df<span class="sy0">=words_df<span class="br0">[<span class="sy0">~words_df.<span class="me1">segment.<span class="me1">isin<span class="br0">(ancient_chinese_stopwords<span class="br0">)<span class="br0">]
 
<span class="co1">#统计词频
words_stat<span class="sy0">=words_df.<span class="me1">groupby<span class="br0">(by<span class="sy0">=<span class="br0">[<span class="st0">'segment'<span class="br0">]<span class="br0">)<span class="br0">[<span class="st0">'segment'<span class="br0">].<span class="me1">agg<span class="br0">(<span class="br0">{<span class="st0">"number":np.<span class="me1">size<span class="br0">}<span class="br0">)
words_stat<span class="sy0">=words_stat.<span class="me1">reset_index<span class="br0">(<span class="br0">).<span class="me1">sort_values<span class="br0">(by<span class="sy0">=<span class="st0">"number"<span class="sy0">,ascending<span class="sy0">=<span class="kw2">False<span class="br0">)
 
<span class="co1">#照片做词云
<span class="kw1">from scipy.<span class="me1">misc <span class="kw1">import imread
<span class="kw1">import matplotlib.<span class="me1">pyplot <span class="kw1">as plt
<span class="kw1">from wordcloud <span class="kw1">import WordCloud<span class="sy0">,ImageColorGenerator
bimg<span class="sy0">=imread<span class="br0">(<span class="st0">'timefriends_lcz.jpg'<span class="br0">)
wordcloud<span class="sy0">=WordCloud<span class="br0">(background_color<span class="sy0">=<span class="st0">"white"<span class="sy0">,mask<span class="sy0">=bimg<span class="sy0">,font_path<span class="sy0">=<span class="st0">'msyh.ttf'<span class="br0">)
wordcloud<span class="sy0">=wordcloud.<span class="me1">fit_words<span class="br0">(words_stat.<span class="me1">head<span class="br0">(<span class="nu0">39769<span class="br0">).<span class="me1">itertuples<span class="br0">(index<span class="sy0">=<span class="kw2">False<span class="br0">)<span class="br0">)
bimgColors<span class="sy0">=ImageColorGenerator<span class="br0">(bimg<span class="br0">)
plt.<span class="me1">figure<span class="br0">(figsize<span class="sy0">=<span class="br0">(<span class="nu0">20<span class="sy0">,<span class="nu0">15<span class="br0">)<span class="br0">)
plt.<span class="me1">axis<span class="br0">(<span class="st0">"off"<span class="br0">)
plt.<span class="me1">imshow<span class="br0">(wordcloud.<span class="me1">recolor<span class="br0">(color_func<span class="sy0">=bimgColors<span class="br0">)<span class="br0">)
plt.<span class="me1">show<span class="br0">(<span class="br0">)
 
<span class="co1">#==========================中文显示乱码问题===========================================
 
<span class="kw1">import matplotlib
zhfont1 <span class="sy0">= matplotlib.<span class="me1">font_manager.<span class="me1">FontProperties<span class="br0">(fname<span class="sy0">=<span class="st0">'msyh.ttf'<span class="br0">)
<span class="co1"># 设置显示中文
matplotlib.<span class="me1">rcParams<span class="br0">[<span class="st0">'font.sans-serif'<span class="br0">] <span class="sy0">= <span class="br0">[<span class="st0">'msyh'<span class="br0">] <span class="co1">#指定默认字体
matplotlib.<span class="me1">rcParams<span class="br0">[<span class="st0">'axes.unicode_minus'<span class="br0">] <span class="sy0">= <span class="kw2">False <span class="co1">#解决保存图像时负号'-'显示为方块的问题
 
<span class="kw1">from matplotlib.<span class="me1">font_manager <span class="kw1">import FontProperties
font <span class="sy0">= FontProperties<span class="br0">(fname<span class="sy0">=r<span class="st0">"msyh.ttc"<span class="sy0">, size<span class="sy0">=<span class="nu0">14<span class="br0">)
 
words_stat<span class="br0">[:<span class="nu0">20<span class="br0">].<span class="me1">plot<span class="br0">(y<span class="sy0">=<span class="st0">'number'<span class="sy0">, kind<span class="sy0">=<span class="st0">'bar'<span class="br0">)<span class="co1">#x='segment',中文未能正常显示
 
words_stat<span class="br0">[:<span class="nu0">20<span class="br0">].<span class="me1">plot<span class="br0">(x<span class="sy0">=<span class="st0">'segment'<span class="sy0">, y<span class="sy0">=<span class="st0">'number'<span class="sy0">, kind<span class="sy0">=<span class="st0">'bar'<span class="br0">)<span class="co1">#中文未能正常显示</pre></td>
</tr></table>

猜你在找的微信小程序相关文章