You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

85 lines
2.7KB

  1. # coding=utf-8
  2. import re
  3. import html
  4. import jieba
  5. import jieba.analyse
  6. from sklearn.metrics.pairwise import cosine_similarity
  7. class CosineSimilarity(object):
  8. """
  9. 余弦相似度
  10. """
  11. def __init__(self, content_x1, content_y2):
  12. self.s1 = content_x1
  13. self.s2 = content_y2
  14. @staticmethod
  15. def extract_keyword(content): # 提取关键词
  16. # 正则过滤 html 标签
  17. re_exp = re.compile(r'(<style>.*?</style>)|(<[^>]+>)', re.S)
  18. content = re_exp.sub(' ', content)
  19. # html 转义符实体化
  20. content = html.unescape(content)
  21. # 切割
  22. seg = [i for i in jieba.cut(content, cut_all=True) if i != '']
  23. # 提取关键词
  24. keywords = jieba.analyse.extract_tags("|".join(seg), topK=200, withWeight=False, allowPOS=('n', 'nr', 'ns'))
  25. # print(keywords)
  26. # return keywords
  27. return seg,keywords
  28. @staticmethod
  29. def one_hot(word_dict, keywords): # oneHot编码
  30. # cut_code = [word_dict[word] for word in keywords]
  31. cut_code = [0]*len(word_dict)
  32. for word in keywords:
  33. cut_code[word_dict[word]] += 1
  34. return cut_code
  35. def main(self):
  36. # 去除停用词
  37. # jieba.analyse.set_stop_words('stopword1.txt')
  38. # 提取关键词
  39. # keywords1 = self.extract_keyword(self.s1)
  40. # keywords2 = self.extract_keyword(self.s2)
  41. seg1,keywords1 = self.extract_keyword(self.s1)
  42. seg2,keywords2 = self.extract_keyword(self.s2)
  43. # 词的并集
  44. union = set(keywords1).union(set(keywords2))
  45. # union = set(seg1).union(set(seg2))
  46. # 编码
  47. word_dict = {}
  48. i = 0
  49. for word in union:
  50. word_dict[word] = i
  51. i += 1
  52. # oneHot编码
  53. s1_cut_code = self.one_hot(word_dict, keywords1)
  54. s2_cut_code = self.one_hot(word_dict, keywords2)
  55. # s1_cut_code = self.one_hot(word_dict, seg1)
  56. # s2_cut_code = self.one_hot(word_dict, seg2)
  57. # 余弦相似度计算
  58. sample = [s1_cut_code, s2_cut_code]
  59. # 除零处理
  60. try:
  61. sim = cosine_similarity(sample)
  62. return sim[1][0],keywords1,keywords2
  63. except Exception as e:
  64. print(e)
  65. return 0.0,keywords1,keywords2
  66. # 测试
  67. if __name__ == '__main__':
  68. with open(r'D:\pythonDM\Ndkj\live111\result\1.txt', encoding='UTF-8') as x, open(r'D:\pythonDM\Ndkj\live111\result\2.txt', encoding='UTF-8') as y:
  69. content_x = x.read()
  70. content_y = y.read()
  71. similarity = CosineSimilarity(content_x, content_y)
  72. # similarity = CosineSimilarity(file, file2)
  73. similarity = similarity.main()
  74. print(similarity)
  75. print('相似度: %.2f%%' % (similarity*32))