-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlda.py
323 lines (299 loc) · 12.4 KB
/
lda.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
from collections import OrderedDict
import numpy as np
import random
import os
import configparser
import codecs
from stanfordcorenlp import StanfordCoreNLP
import log.logger as logger
import prettytable as pt
class Document:
    """One document of the corpus.

    Attributes:
        words: word ids of the document, in order of appearance
            (duplicates are kept).
        length: number of entries in ``words``.
        docuName: name of the document, ``None`` until it is assigned.
    """

    def __init__(self):
        self.words: list = []
        self.length: int = 0
        self.docuName = None
class DataPreProcessing:
    """Container for the whole corpus: its documents and the vocabulary maps.

    Attributes:
        docs_count: number of documents.
        words_count: number of distinct words.
        docs: the document objects.
        word2id: ordered mapping word -> id.
        id2word: mapping id -> word, ``None`` until it is built.
    """

    def __init__(self):
        self.docs_count: int = 0
        self.words_count: int = 0
        self.docs: list = []
        # word -> id
        self.word2id: OrderedDict = OrderedDict()
        # id -> word
        self.id2word = None
class LDA:
    """Latent Dirichlet Allocation trained with collapsed Gibbs sampling.

    The model keeps four count tables (word/topic, topic totals, doc/topic,
    doc totals) plus the current topic assignment of every word occurrence,
    and derives ``theta`` (document -> topic) and ``phi`` (topic -> word)
    from those counts.
    """

    def __init__(self, dpre, topic_num, alpha, beta, iters_time, top_words_num, log):
        """Allocate the count tables and assign a random topic to every word.

        Parameters
        ----------
        dpre : corpus object exposing ``docs`` (each with ``words``, ``length``
            and ``docuName``), ``docs_count``, ``words_count``, ``word2id``
            and ``id2word``.
        topic_num : number of topics K.
        alpha : hyper-parameter added to the document/topic counts.
        beta : hyper-parameter added to the word/topic counts.
        iters_time : number of Gibbs sweeps run by ``_est``.
        top_words_num : number of top words reported per topic.
        log : logger-like object; only ``log.info`` is used.
        """
        self.dpre = dpre
        self.K = topic_num
        self.alpha = alpha
        self.beta = beta
        self.iter_times = iters_time
        self.top_words_num = top_words_num
        self.log = log
        # Scratch vector holding the unnormalised sampling weights.
        self.p = np.zeros(self.K)
        # wordTopic[w][k]: occurrences of word w currently assigned to topic k.
        self.wordTopic = np.zeros((self.dpre.words_count, self.K), dtype=int)
        # TopicWordSum[k]: total number of word occurrences assigned to topic k.
        self.TopicWordSum = np.zeros(self.K, dtype=int)
        # docsTopic[d][k]: occurrences in document d assigned to topic k.
        self.docsTopic = np.zeros((self.dpre.docs_count, self.K), dtype=int)
        # docsWordSum[d]: number of word occurrences in document d.
        self.docsWordSum = np.zeros(self.dpre.docs_count, dtype=int)
        # TopicWord[d][j]: topic of the j-th word of document d. Documents
        # differ in length, so this is a list of 1-D arrays rather than one
        # rectangular array (np.array() rejects ragged nested lists).
        self.TopicWord = [
            np.zeros(self.dpre.docs[x].length, dtype=int)
            for x in range(self.dpre.docs_count)
        ]
        # Random initial topic for every word occurrence.
        for x in range(self.dpre.docs_count):
            doc = self.dpre.docs[x]
            self.docsWordSum[x] = doc.length
            for y in range(doc.length):
                topic = random.randint(0, self.K - 1)
                self.TopicWord[x][y] = topic
                self.wordTopic[doc.words[y]][topic] += 1
                self.TopicWordSum[topic] += 1
                self.docsTopic[x][topic] += 1
        # Document -> topic distribution, filled in by _theta().
        self.theta = np.zeros((self.dpre.docs_count, self.K))
        # Topic -> word distribution, filled in by _phi().
        self.phi = np.zeros((self.K, self.dpre.words_count))

    def _topic_weights(self, i, word):
        """Return the unnormalised weight of every topic for one word slot.

        Must be called after the slot's current assignment has been removed
        from the count tables. For topic k the weight is
        ``(n_wk + beta) / (n_k + V*beta) * (n_dk + alpha) / (n_d + K*alpha)``.
        """
        Vbeta = self.dpre.words_count * self.beta
        Kalpha = self.K * self.alpha
        # The topic totals must be the whole vector: every candidate topic is
        # normalised by its own total, not by the total of the old topic.
        return (self.wordTopic[word] + self.beta) / (self.TopicWordSum + Vbeta) * \
            (self.docsTopic[i] + self.alpha) / (self.docsWordSum[i] + Kalpha)

    def _top_word_ids(self, topic, count):
        """Return the ids of the ``count`` most probable words of ``topic``.

        Ties keep ascending id order (stable sort), like a stable descending
        sort of ``(id, probability)`` pairs.
        """
        order = np.argsort(-self.phi[topic], kind="stable")
        return order[:max(count, 0)].tolist()

    def _sampling(self, i, j):
        '''
        Resample the topic of one word occurrence.

        Parameters
        ----------
        i index of the document
        j index of the word inside the document
        Returns the newly drawn topic
        -------
        '''
        topic = self.TopicWord[i][j]
        word = self.dpre.docs[i].words[j]  # word id
        # Take the current assignment out of the counts.
        self.wordTopic[word][topic] -= 1
        self.TopicWordSum[topic] -= 1
        self.docsWordSum[i] -= 1
        self.docsTopic[i][topic] -= 1
        self.p = self._topic_weights(i, word)
        # Draw the new topic from the normalised weights.
        p = np.squeeze(np.asarray(self.p / np.sum(self.p)))
        topic = int(np.argmax(np.random.multinomial(1, p)))
        # Put the new assignment back into the counts.
        self.wordTopic[word][topic] += 1
        self.TopicWordSum[topic] += 1
        self.docsWordSum[i] += 1
        self.docsTopic[i][topic] += 1
        return topic

    def _est(self):
        '''
        Run ``iter_times`` Gibbs sweeps over every word of every document,
        then compute theta and phi and save the model with ``_save``.
        -------
        '''
        self.log.info("Gabbs Samlping ....")
        for _ in range(self.iter_times):
            for i in range(self.dpre.docs_count):
                for j in range(self.dpre.docs[i].length):
                    self.TopicWord[i][j] = self._sampling(i, j)
        self.log.info("计算文档-主题分布")
        self._theta()
        self.log.info("计算词分布")
        self._phi()
        self.log.info("保存模型")
        self._save()

    def _theta(self):
        """Fill ``theta`` (document -> topic) from the current counts."""
        self.theta[:] = (self.docsTopic + self.alpha) / \
            (self.docsWordSum[:, np.newaxis] + self.K * self.alpha)

    def _phi(self):
        """Fill ``phi`` (topic -> word) from the current counts."""
        self.phi[:] = (self.wordTopic.T + self.beta) / \
            (self.TopicWordSum[:, np.newaxis] + self.dpre.words_count * self.beta)

    def _save(self):
        '''
        Write the model to the files named in ``settings.conf``.

        Reads the keys ``phifile``, ``thetafile``, ``paramfile``, ``topNfile``
        and ``tassginfile`` from section ``Setting`` and writes theta, phi,
        the hyper-parameters, the top words per topic and the final
        word -> topic assignments. ``top_words_num`` is clamped to the
        vocabulary size as a side effect.
        -------
        '''
        conf = configparser.ConfigParser()
        conf.read("settings.conf")
        phifile = conf.get('Setting', 'phifile')
        thetafile = conf.get('Setting', 'thetafile')
        paramfile = conf.get('Setting', 'paramfile')
        topNfile = conf.get('Setting', 'topNfile')
        tassginfile = conf.get('Setting', 'tassginfile')
        # Document -> topic distribution, one document per line.
        self.log.info(u"文章-主题分布已保存到%s" % thetafile)
        with codecs.open(thetafile, 'w') as f:
            for x in range(self.dpre.docs_count):
                for y in range(self.K):
                    f.write(str(self.theta[x][y]) + '\t')
                f.write('\n')
        # Topic -> word distribution, one topic per line.
        self.log.info(u"词-主题分布已保存到%s" % phifile)
        with codecs.open(phifile, 'w') as f:
            for x in range(self.K):
                for y in range(self.dpre.words_count):
                    f.write(str(self.phi[x][y]) + '\t')
                f.write('\n')
        # Hyper-parameters.
        self.log.info(u"参数设置已保存到%s" % paramfile)
        with codecs.open(paramfile, 'w', 'utf-8') as f:
            f.write('K=' + str(self.K) + '\n')
            f.write('alpha=' + str(self.alpha) + '\n')
            f.write('beta=' + str(self.beta) + '\n')
            f.write(u'迭代次数 iter_times=' + str(self.iter_times) + '\n')
            f.write(u'每个类的高频词显示个数 top_words_num=' + str(self.top_words_num) + '\n')
        # Top words of every topic. The id -> word map is built once here
        # instead of once per written word.
        id2word = {idx: word for word, idx in self.dpre.word2id.items()}
        self.log.info(u"主题topN词已保存到%s" % topNfile)
        with codecs.open(topNfile, 'w', 'utf-8') as f:
            self.top_words_num = min(self.top_words_num, self.dpre.words_count)
            for x in range(self.K):
                f.write(u'第' + str(x) + u'类:' + '\n')
                for word_id in self._top_word_ids(x, self.top_words_num):
                    f.write('\t' * 2 + id2word[word_id] + '\t' + str(self.phi[x][word_id]) + '\n')
        # Final topic assignment of every word occurrence.
        self.log.info(u"文章-词-主题分派结果已保存到%s" % tassginfile)
        with codecs.open(tassginfile, 'w') as f:
            for x in range(self.dpre.docs_count):
                doc = self.dpre.docs[x]
                for y in range(doc.length):
                    f.write(str(doc.words[y]) + ':' + str(self.TopicWord[x][y]) + '\t')
                f.write('\n')
        self.log.info(u"模型训练完成.")

    def perplexity(self, docs=None):
        '''
        Mean negative log-likelihood per document under theta and phi.

        Parameters
        ----------
        docs documents to score; defaults to the training documents. The
             m-th document is scored with ``theta[m]``, so the sequence must
             be index-aligned with the training documents.
        Returns the summed ``-log p(word)`` divided by the number of
                documents (0.0 for an empty sequence).
        -------
        NOTE(review): the value is not exponentiated and not normalised by
        word count, so it is not the textbook perplexity.
        '''
        if docs is None:
            docs = self.dpre.docs
        if len(docs) == 0:
            return 0.0
        log_per = 0.0
        for m, doc in enumerate(docs):
            word_ids = np.asarray(doc.words, dtype=int)
            # p(word) = sum_k theta[m][k] * phi[k][word], for every word at once.
            log_per -= np.log(np.dot(self.theta[m], self.phi[:, word_ids])).sum()
        return log_per / len(docs)

    def _showTopicWord(self):
        """Print a table with the top words of every topic, then ``TopicWord``."""
        self.top_words_num = min(self.top_words_num, self.dpre.words_count)
        # Start from an empty table: add_column() appends its own header, so
        # pre-seeding the field names would duplicate every column header.
        table = pt.PrettyTable()
        for x in range(self.K):
            topicwords = [
                self.dpre.id2word[word_id]
                for word_id in self._top_word_ids(x, self.top_words_num)
            ]
            table.add_column("第"+str(x)+"类主题", topicwords)
        print(table)
        print(self.TopicWord)

    def _showDocsTopic(self, num):
        """Print, per document, its ``num`` most probable topics.

        Each topic is shown as ``<most probable word of the topic>*<theta>``.
        ``num`` is clamped to the number of topics.
        """
        shown = max(0, min(num, self.K))
        table = pt.PrettyTable()
        for x in range(self.dpre.docs_count):
            ranked = np.argsort(-self.theta[x], kind="stable")[:shown].tolist()
            topicwords = []
            for topic in ranked:
                words = self.dpre.id2word[int(np.argmax(self.phi[topic]))]
                topicwords.append(words+"*"+str(self.theta[x][topic]))
            table.add_column(self.dpre.docs[x].docuName, topicwords)
        print(table)

    def _get_topic_term(self, topicId, top_n=100):
        '''
        Parameters
        ----------
        topicId index of the topic
        top_n   maximum number of words returned (default 100)
        Returns two parallel lists: the word ids of the most probable words
                of the topic, and their probabilities, in descending order
        -------
        '''
        word_ids = self._top_word_ids(topicId, top_n)
        return word_ids, [float(self.phi[topicId][word_id]) for word_id in word_ids]
def preprocessing():
    """Read the corpus directory and build the document/vocabulary container.

    Reads ``filename`` (corpus directory), ``stanfordpath``, ``stopwordpath``
    and ``logpath`` from section ``Setting`` of ``settings.conf``. Every
    regular file in the corpus directory becomes one ``Document``; each
    non-empty line is POS-tagged and only words tagged ``NN``/``NR`` that are
    at least two characters long and not stop words are kept. Word ids are
    assigned in order of first appearance.

    Returns
    -------
    (dpre, log): the filled ``DataPreProcessing`` object and the logger.
    """
    # Configuration.
    conf = configparser.ConfigParser()
    conf.read("settings.conf")
    filename = conf.get('Setting', 'filename')
    stanfordpath = conf.get('Setting', 'stanfordpath')
    stopwordpath = conf.get('Setting', 'stopwordpath')
    logpath = conf.get('Setting', 'logpath')
    log = logger.getlog(logpath)
    # Stop words: a set gives O(1) membership tests, and the context manager
    # closes the file handle.
    with codecs.open(stopwordpath, 'r', 'utf-8') as f:
        stopword = {line.strip() for line in f.readlines()}
    nlp = StanfordCoreNLP(stanfordpath, lang='zh')
    item_idx = 0
    dpre = DataPreProcessing()
    log.info("读取文件夹内文件并处理中...")
    try:
        for path in os.listdir(filename):
            # Stop reading further files once the vocabulary exceeds 10000 words.
            if item_idx > 10000:
                break
            obspath = os.path.join(filename, path)
            # Sub-directories cannot be opened as text; skip them.
            if not os.path.isfile(obspath):
                continue
            # One document = the (possibly repeating) sequence of word ids.
            doc = Document()
            with codecs.open(obspath, 'r', 'utf-8') as f:
                lines = f.readlines()
            for item in lines:
                item = item.strip()
                if not item:
                    continue
                for word, tag in nlp.pos_tag(item):
                    if tag not in ('NN', 'NR'):
                        continue
                    if word in stopword or len(word) < 2:
                        continue
                    if word not in dpre.word2id:
                        # First occurrence: register the word in the vocabulary.
                        dpre.word2id[word] = item_idx
                        item_idx += 1
                    doc.words.append(dpre.word2id[word])
            doc.docuName = path
            doc.length = len(doc.words)
            dpre.docs.append(doc)
    finally:
        # Shut the CoreNLP backend down even if reading or tagging failed.
        nlp.close()
    dpre.docs_count = len(dpre.docs)
    dpre.words_count = len(dpre.word2id)
    dpre.id2word = OrderedDict({value: key for key, value in dpre.word2id.items()})
    log.info("处理文件夹内文件数据成功!")
    log.info("文件夹内文本数量为:{}, 单词数量为:{}".format(dpre.docs_count, dpre.words_count))
    print(dpre.word2id)
    return dpre, log
def main():
    """Preprocess the corpus, train the LDA model and print its results."""
    corpus, run_log = preprocessing()
    settings = {
        "alpha": 10,
        "beta": 0.01,
        "topic_num": 5,
        "iters_time": 50,
        "top_words_num": 10,
        "log": run_log,
        "dpre": corpus,
    }
    model = LDA(**settings)
    # Train (this also saves the model), then report topics and documents.
    model._est()
    model._showTopicWord()
    model._showDocsTopic(3)
    score = model.perplexity()
    print(score)


# Run the pipeline only when the file is executed as a script.
if __name__ == "__main__":
    main()