-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcutWord.py
47 lines (42 loc) · 1.3 KB
/
cutWord.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
# -*- coding: utf-8 -*-
import jieba
import logging
def get_stopword(*filepaths):
    """Load stop words from one or more text files into a single set.

    Each file is expected to hold one stop word per line. Files are decoded
    as UTF-8; a leading byte-order mark (commonly added by Windows editors)
    is discarded so that it does not become part of the first stop word.

    Args:
        *filepaths: Paths of the stop-word files to read.

    Returns:
        set: Every distinct line found in the files, with newline characters
        removed from both ends. Other whitespace is preserved, and a blank
        line contributes the empty string.

    Raises:
        OSError: If one of the files cannot be opened.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    # Side effect kept for callers that rely on it: configure root logging.
    # basicConfig does nothing when the root logger already has handlers.
    logging.basicConfig(format="%(asctime)s:%(levelname)s:%(message)s", level=logging.INFO)
    stopWordList = set()
    for filepath in filepaths:
        # "utf-8-sig" reads plain UTF-8 unchanged and strips a BOM if present.
        with open(filepath, "r", encoding="utf-8-sig") as f:
            stopWordList.update(line.strip("\n") for line in f)
    return stopWordList
def cutword(stopWordList, filepath, savepath):
    """Segment a text file with jieba and append the filtered tokens to a file.

    Each line of ``filepath`` is segmented with ``jieba.cut(..., cut_all=False)``.
    Tokens found in ``stopWordList`` are dropped; the remaining tokens are
    written to ``savepath`` as one output line per input line, every token
    followed by a single space.

    Args:
        stopWordList: Collection of tokens to discard (tested with ``in``).
        filepath: Path of the UTF-8 text file to segment.
        savepath: Path of the UTF-8 file the result is appended to; it is
            created if it does not exist.

    Raises:
        OSError: If either file cannot be opened.
    """
    # Context managers guarantee both handles are closed even when
    # segmentation or writing raises part-way through the file.
    with open(filepath, "r", encoding="utf-8") as source, \
            open(savepath, "a", encoding="utf-8") as output:
        # Stream the input line by line rather than loading it all at once.
        for line in source:
            tokens = jieba.cut(line.strip("\n"), cut_all=False)
            kept = [token + " " for token in tokens if token not in stopWordList]
            output.write("".join(kept) + "\n")
def main():
    """Segment every corpus file that needs word segmentation.

    Loads the stop words from ``stopwords.txt`` and ``stopwords01.txt`` in
    ``wordTovec/data``, then segments ``wiki00``, ``wiki01`` and ``wiki02``
    in ``wordTovec/AA``, appending all results to ``output.txt`` in that
    same directory.
    """
    # Imported locally so this function brings its own dependency into scope.
    import os

    # Build paths with os.path.join instead of backslash literals: sequences
    # such as "\d" and "\w" are invalid string escapes, and a hard-coded
    # backslash separator only works on Windows.
    data_dir = os.path.join("wordTovec", "data")
    corpus_dir = os.path.join("wordTovec", "AA")

    stopWordList = get_stopword(
        os.path.join(data_dir, "stopwords.txt"),
        os.path.join(data_dir, "stopwords01.txt"),
    )
    savepath = os.path.join(corpus_dir, "output.txt")
    for i in range(3):
        cutword(stopWordList, os.path.join(corpus_dir, "wiki0" + str(i)), savepath)


if __name__ == "__main__":
    main()