# coding=utf-8
"""Keyword-based cosine similarity between two (possibly HTML) texts."""
import re
import html

import jieba
import jieba.analyse
from sklearn.metrics.pairwise import cosine_similarity

# Matches whole <style>/<script> elements (including their content) and any
# other single HTML tag.  The previous pattern started with an empty group,
# which matched at every position and caused a space to be inserted between
# all characters of the input.
# NOTE(review): the style/script alternative is a reconstruction of what the
# empty group presumably used to be -- confirm against the intended inputs.
_HTML_TAG_RE = re.compile(
    r'<(style|script)\b[^>]*>.*?</\1\s*>|<[^>]+>',
    re.S | re.I,
)


class CosineSimilarity(object):
    """Cosine similarity of two texts, computed over their jieba keywords."""

    def __init__(self, content_x1, content_y2):
        """Store the two texts to compare.

        Args:
            content_x1: First text (plain text or HTML).
            content_y2: Second text (plain text or HTML).
        """
        self.s1 = content_x1
        self.s2 = content_y2

    @staticmethod
    def extract_keyword(content):
        """Strip HTML from *content*, segment it and extract keywords.

        Args:
            content: Text to analyse; HTML tags and entities are allowed.

        Returns:
            A ``(seg, keywords)`` tuple: ``seg`` is the list of non-empty
            tokens from jieba's full-mode cut, ``keywords`` is the list
            returned by ``jieba.analyse.extract_tags`` (at most 200 entries,
            restricted to the POS tags 'n', 'nr' and 'ns').
        """
        # Replace tags with a space so that words on both sides of a tag
        # do not get glued together.
        content = _HTML_TAG_RE.sub(' ', content)
        # Turn HTML entities (&amp;, &nbsp;, ...) back into characters.
        content = html.unescape(content)
        # Full-mode segmentation; drop empty tokens.
        seg = [token for token in jieba.cut(content, cut_all=True) if token != '']
        keywords = jieba.analyse.extract_tags(
            "|".join(seg),
            topK=200,
            withWeight=False,
            allowPOS=('n', 'nr', 'ns'),
        )
        return seg, keywords

    @staticmethod
    def one_hot(word_dict, keywords):
        """Encode *keywords* as a count vector over *word_dict*.

        Args:
            word_dict: Mapping of word -> column index (0 .. len - 1).
            keywords: Iterable of words; each must be a key of *word_dict*.

        Returns:
            A list of ``len(word_dict)`` ints where position
            ``word_dict[word]`` is incremented once per occurrence of
            ``word`` in *keywords*.

        Raises:
            KeyError: If a keyword is missing from *word_dict*.
        """
        cut_code = [0] * len(word_dict)
        for word in keywords:
            cut_code[word_dict[word]] += 1
        return cut_code

    def main(self):
        """Compute the similarity of the two stored texts.

        Returns:
            A ``(similarity, keywords1, keywords2)`` tuple.  ``similarity``
            is the cosine similarity of the two keyword vectors, or ``0.0``
            when neither text yields any keyword or scikit-learn rejects
            the input.
        """
        _, keywords1 = self.extract_keyword(self.s1)
        _, keywords2 = self.extract_keyword(self.s2)

        # Shared vocabulary; sorted so the column order is deterministic.
        vocabulary = sorted(set(keywords1) | set(keywords2))
        if not vocabulary:
            # Zero-width vectors cannot be compared (scikit-learn raises),
            # so report "no similarity" explicitly.
            return 0.0, keywords1, keywords2

        word_dict = {word: index for index, word in enumerate(vocabulary)}
        sample = [
            self.one_hot(word_dict, keywords1),
            self.one_hot(word_dict, keywords2),
        ]

        try:
            sim = cosine_similarity(sample)
        except ValueError as e:
            # Best effort: report the problem and fall back to 0.0.
            print(e)
            return 0.0, keywords1, keywords2
        # sim is the 2x2 pairwise matrix; [1][0] is text 2 versus text 1.
        return sim[1][0], keywords1, keywords2


# Manual test
if __name__ == '__main__':
    with open(r'D:\pythonDM\Ndkj\live111\result\1.txt', encoding='UTF-8') as x, \
            open(r'D:\pythonDM\Ndkj\live111\result\2.txt', encoding='UTF-8') as y:
        content_x = x.read()
        content_y = y.read()

    result = CosineSimilarity(content_x, content_y).main()
    print(result)
    # main() returns (similarity, keywords1, keywords2); only the first item
    # is the score, and a percentage needs a factor of 100.
    score = result[0]
    print('相似度: %.2f%%' % (score * 100))