# coding=utf-8
import re
import html
import jieba
import jieba.analyse
from sklearn.metrics.pairwise import cosine_similarity
class CosineSimilarity(object):
    """Cosine similarity between two (possibly HTML) text documents.

    Each document is stripped of markup, segmented with jieba, and reduced
    to a keyword list. The two keyword lists are encoded as count vectors
    over their shared vocabulary and compared with cosine similarity.
    """

    # Matches whole <style>/<script> elements (including their contents) or
    # any single tag. re.S lets ``.`` span newlines inside those elements.
    _MARKUP_RE = re.compile(
        r'<(style|script)\b[^>]*>.*?</\1\s*>|<[^>]+>', re.S | re.I
    )

    def __init__(self, content_x1, content_y2):
        """Store the two raw documents to compare.

        Args:
            content_x1: Text (optionally containing HTML) of the first document.
            content_y2: Text (optionally containing HTML) of the second document.
        """
        self.s1 = content_x1
        self.s2 = content_y2

    @classmethod
    def _strip_html(cls, content):
        """Replace markup in *content* with spaces and decode HTML entities."""
        # Tags are removed before entities are decoded, so an escaped
        # "&lt;p&gt;" survives as the literal text "<p>".
        return html.unescape(cls._MARKUP_RE.sub(' ', content))

    @staticmethod
    def extract_keyword(content):
        """Segment *content* and extract its keywords.

        Args:
            content: Raw document text, optionally containing HTML.

        Returns:
            A ``(seg, keywords)`` tuple: ``seg`` is the list of non-empty
            tokens from jieba's full-mode segmentation, ``keywords`` is the
            list of up to 200 tags returned by ``jieba.analyse.extract_tags``.
        """
        text = CosineSimilarity._strip_html(content)
        # Full-mode segmentation can yield empty strings; drop them.
        seg = [token for token in jieba.cut(text, cut_all=True) if token != '']
        # Tokens are re-joined with "|" so the extractor sees them as
        # separate words; allowPOS restricts results to the n/nr/ns tags.
        keywords = jieba.analyse.extract_tags(
            "|".join(seg),
            topK=200,
            withWeight=False,
            allowPOS=('n', 'nr', 'ns'),
        )
        return seg, keywords

    @staticmethod
    def one_hot(word_dict, keywords):
        """Encode *keywords* as a count vector indexed by *word_dict*.

        Args:
            word_dict: Mapping from word to its position in the vector.
            keywords: Iterable of words; every word must be a key of
                ``word_dict``.

        Returns:
            A list of length ``len(word_dict)`` whose entry at
            ``word_dict[w]`` is the number of times ``w`` occurs in
            ``keywords``.

        Raises:
            KeyError: If a keyword is missing from ``word_dict``.
        """
        cut_code = [0] * len(word_dict)
        for word in keywords:
            cut_code[word_dict[word]] += 1
        return cut_code

    def main(self):
        """Compute the cosine similarity of the two stored documents.

        Returns:
            A ``(similarity, keywords1, keywords2)`` tuple. ``similarity`` is
            ``0.0`` when neither document yields any keyword or when the
            similarity cannot be computed.
        """
        _, keywords1 = self.extract_keyword(self.s1)
        _, keywords2 = self.extract_keyword(self.s2)

        # Shared vocabulary; sorting makes the index assignment deterministic.
        union = set(keywords1).union(keywords2)
        if not union:
            # cosine_similarity rejects zero-feature input, so answer directly.
            return 0.0, keywords1, keywords2
        word_dict = {word: index for index, word in enumerate(sorted(union))}

        sample = [
            self.one_hot(word_dict, keywords1),
            self.one_hot(word_dict, keywords2),
        ]
        try:
            sim = cosine_similarity(sample)
        except ValueError as e:
            print(e)
            return 0.0, keywords1, keywords2
        # sim is the 2x2 pairwise matrix; [1][0] is doc2 versus doc1.
        return sim[1][0], keywords1, keywords2
# Manual test: compare two text files and print their similarity.
if __name__ == '__main__':
    with open(r'D:\pythonDM\Ndkj\live111\result\1.txt', encoding='UTF-8') as x, \
            open(r'D:\pythonDM\Ndkj\live111\result\2.txt', encoding='UTF-8') as y:
        content_x = x.read()
        content_y = y.read()
    comparer = CosineSimilarity(content_x, content_y)
    # main() returns (score, keywords1, keywords2); only the score is numeric.
    result = comparer.main()
    print(result)
    score = result[0]
    # Cosine similarity of count vectors lies in [0, 1]; scale to a percentage.
    print('相似度: %.2f%%' % (score * 100))