Lishui Duplicate-Check Code

# coding=utf-8
import re
import html

import jieba
import jieba.analyse
from sklearn.feature_extraction.text import TfidfVectorizer  # used by the commented-out TF-IDF variant below
from sklearn.metrics.pairwise import cosine_similarity

# Load the stop-word list once at module import.
with open('stop_words.utf8', encoding='utf8') as stopwords:
    stopword_list = [k.strip() for k in stopwords if k.strip() != '']
def replace_tongyici(keywords):
    # tongyici_tihuan.txt is the synonym table: each line is one group of
    # synonyms separated by whitespace, and every word in the group gets
    # replaced by the first word on its line.
    # 1. Read the synonym table into a replacement dictionary.
    combine_dict = {}
    with open("tongyici_tihuan.txt", "r", encoding='utf8') as f:
        for line in f:
            seperate_word = line.strip().split()
            num = len(seperate_word)
            for i in range(1, num):
                combine_dict[seperate_word[i]] = seperate_word[0]
    # 2. Map each keyword to its canonical synonym, if one exists.
    kws = []
    for word in keywords:
        if word in combine_dict:
            kws.append(combine_dict[word])
        else:
            kws.append(word)
    return kws
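
# Illustrative example of the expected tongyici_tihuan.txt layout (these
# sample entries are made up, not from the original data file); the first
# word on each line is the canonical form the others are mapped to:
#
#     电脑 计算机 微机
#     西红柿 番茄
#
# With that table, replace_tongyici(["计算机", "番茄"]) returns ["电脑", "西红柿"].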

class CosineSimilarity(object):
    """
    Cosine similarity between two texts
    """
    def __init__(self, content_x1, content_y2):
        self.s1 = content_x1
        self.s2 = content_y2

    @staticmethod
    def extract_keyword(seq_str):  # extract keywords
        # Filter out HTML tags with a regex
        re_exp = re.compile(r'(<style>.*?</style>)|(<[^>]+>)', re.S)
        content = re_exp.sub(' ', seq_str)
        # Unescape HTML entities
        content = html.unescape(content)
        # Segment with jieba (full mode), using a custom user dictionary
        jieba.load_userdict("user_dict.txt")
        seg = [i for i in jieba.cut(content, cut_all=True) if i != '']
        # keywords = [k for k in jieba.cut(content, cut_all=True) if k != ' ' and k != '' and k not in stopword_list]
        # Keep the top-500 TF-IDF keywords that are not stop words
        keywords = [k for k in jieba.analyse.extract_tags("|".join(seg), topK=500, withWeight=False)
                    if k != ' ' and k != '' and k not in stopword_list]
        # keywords = replace_tongyici(keywords)
        # Alternative keyword-extraction strategies that were tried:
        # keywords = jieba.analyse.extract_tags("|".join(seg), topK=500, withWeight=False, allowPOS=('n', 'nr', 'ns'))
        # keywords = jieba.analyse.extract_tags(content, topK=2000, withWeight=False)
        return seg, keywords
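
    # For instance (hypothetical output, dependent on the dictionaries in use),
    # extract_keyword('<p>智慧文旅平台</p>') could return roughly
    # (['智慧', '文旅', '平台'], ['智慧', '文旅', '平台']): the raw segments
    # plus the stop-word-filtered keywords.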

    @staticmethod
    def one_hot(word_dict, keywords):  # "one-hot" encoding (repeats are counted, so really term frequency)
        # cut_code = [word_dict[word] for word in keywords]
        cut_code = [0] * len(word_dict)
        for word in keywords:
            cut_code[word_dict[word]] += 1
        return cut_code
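
    # Worked example with hypothetical values: given word_dict = {'甲': 0, '乙': 1}
    # and keywords = ['甲', '甲', '乙'], one_hot returns [2, 1]; repeated words
    # are counted, so this is a term-frequency vector rather than strict one-hot.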

    def main(self):
        # Stop words could instead be handled by jieba itself:
        # jieba.analyse.set_stop_words('stop_words.utf8')
        # Extract keywords from both texts
        seg1, keywords1 = self.extract_keyword(self.s1)
        seg2, keywords2 = self.extract_keyword(self.s2)
        # The union of the two keyword sets defines the vector space
        union = set(keywords1).union(set(keywords2))
        # union = set(seg1).union(set(seg2))
        # Assign each word an index in the vector
        word_dict = {word: i for i, word in enumerate(union)}
        # Encode both keyword lists as term-frequency vectors
        s1_cut_code = self.one_hot(word_dict, keywords1)
        s2_cut_code = self.one_hot(word_dict, keywords2)
        # s1_cut_code = self.one_hot(word_dict, seg1)
        # s2_cut_code = self.one_hot(word_dict, seg2)
        # TF-IDF alternative that was tried:
        # vector = TfidfVectorizer(max_df=10, min_df=1)
        # tfidf = vector.fit_transform([" ".join(keywords1), " ".join(keywords2)])
        # Cosine similarity, cos(x, y) = x.y / (||x|| * ||y||);
        # the try/except guards against division by zero (e.g. empty vectors)
        sample = [s1_cut_code, s2_cut_code]
        try:
            sim = cosine_similarity(sample)
            # sim = cosine_similarity(tfidf).tolist()
            return sim[1][0], keywords1, keywords2
        except Exception as e:
            print(e)
            return 0.0, keywords1, keywords2

# Test
if __name__ == '__main__':
    # with open(r'D:\pythonDM\Ndkj\live111\result\1.txt', encoding='UTF-8') as x, open(r'D:\pythonDM\Ndkj\live111\result\2.txt', encoding='UTF-8') as y:
    #     content_x = x.read()
    #     content_y = y.read()
    # Chinese sample texts, kept verbatim (the pipeline segments Chinese text):
    content_x = """中英文双语版本开发建设,为平台提供国际化能力,对平台APP端所有功能菜单以及所有官方维护内容进行中英翻译,实现中英双语的APP版本,同时提供版本一键切换功能,提升一机游丽水平台服务的全面性,将一机游丽水打造成全国智慧文旅平台领域专业、专注、领先的范本。"""
    content_y = """(1)诉求受理、分流功能: 用户可以对进入统一受理中心的诉求信息进行识别,对有效且需要分流的诉求进行受理、分派操作。操作后,诉求自动进入下一个流程环节,操作后信息状态变为无效信息。对应的诉求状态变化会同步通知诉求来源系统。 (2)诉求结案回复、设为无效功能 用户对进入统一受理中心的诉求信息进行识别,对可以直接答复的信息进行回复并结案的操作,操作后诉求会自动结案。如诉求信息无效,则可以对其信息不受理操作,操作后信息状态变为无效信息。对应的诉求状态变化会同步通知诉求来源系统。 诉求流转跟踪视图用户可在统一受理中心的工作台上看到已分派的系统列表,信息详情中会展示该诉求的处理流程,内部和外部系统的处理过程都可以看到,方便用户掌握诉求的进展以便对诉求流转进行跟踪。 (3)自动分类、分流: 统一受理中心通过大数据分析,对诉求内容的语义解析算法,提取出该诉求的事件分类自动填充到分流信息中,再通过事项清单配置,将负责该类型事件的处理对象系统自动填充到分流信息中。用户只需核对系统填充信息即可实现一键分派。 (4)自动区分无效信息: 统一受理中心通过大数据分析,对诉求内容的语义解析算法,将疑似无效内容的诉求信息标记出来,提供用户判断的依据,提高用户处理业务的效率。"""
    similarity = CosineSimilarity(content_x, content_y)
    # similarity = CosineSimilarity(file, file2)
    similarity = similarity.main()
    print(similarity)
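
A minimal usage sketch for comparing two documents read from disk, assuming the listing above is saved as cosine_check.py and that stop_words.utf8, user_dict.txt, and tongyici_tihuan.txt sit in the working directory (the module name and file paths here are illustrative, not from the original):

# compare_two_files.py -- illustrative driver; module name and paths are assumptions
from cosine_check import CosineSimilarity

with open('doc_a.txt', encoding='utf-8') as x, open('doc_b.txt', encoding='utf-8') as y:
    sim, kw1, kw2 = CosineSimilarity(x.read(), y.read()).main()

print('similarity: %.4f' % sim)  # in [0, 1]; higher means more keyword overlap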