丽水查重代码
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

88 lines
4.4KB

  1. import jieba
  2. import pandas as pd
  3. from sklearn.feature_extraction.text import TfidfVectorizer
  4. from sklearn.feature_extraction.text import TfidfTransformer
  5. from sklearn.feature_extraction.text import CountVectorizer
  6. from sklearn.metrics.pairwise import cosine_similarity, pairwise_distances
  7. import math
  8. jieba.suggest_freq('以太坊', True)
  9. jieba.suggest_freq('区块链', True)
  10. jieba.suggest_freq('数字货币', True)
  11. jieba.suggest_freq('将于', True)
  12. jieba.suggest_freq('人人网', True)
  13. jieba.suggest_freq('比特币', True)
  14. jieba.suggest_freq('北上广', True)
  15. jieba.suggest_freq('大数据', True)
  16. jieba.suggest_freq('云计算', True)
  17. jieba.suggest_freq('公有链', True)
  18. # 引用停用词
  19. stpwrdpath = "./stop_words.utf8"
  20. stpwrd_dic = open(stpwrdpath, 'rb')
  21. stpwrd_content = stpwrd_dic.read()
  22. # 将停用词表转换为list
  23. stpwrdlst = stpwrd_content.splitlines()
  24. stpwrd_dic.close()
  25. # vector = TfidfVectorizer(stop_words=stpwrdlst)
  26. def get_xls_data():
  27. # 获取数据
  28. # data = pd.read_excel("./0825-丽水系统查重维度.xlsx", names=["项目名称", "数字百山祖(一期)—“云值守”建设方案"], sheet_name='Sheet1')
  29. # content_ls_1 = [(x, y) for x, y in enumerate(data[0]["项目名称"]) if y]
  30. # #print(content_ls_1)
  31. # content_ls_2 = [(x, y) for x, y in enumerate(data[0]["数字百山祖(一期)—“云值守”建设方案"]) if y]
  32. content_ls_1 = [("content", """通过本项目的实施,可以真实贯彻以人民为中心的发展思想,按 照政府办事“一件事”全流程“最多跑一次”的理念和目标,深化“四 张清单一张网”改革,从与群众和卫生健康监管对象关系最紧密的领 域和事项做起,充分运用“互联网+智慧监管”和大数据,促进卫生 健康领域风险监管创新,使群众和企业对综合行政执法改革的获得感 明显增强、综合行政执法效率明显提升、卫生健康领域环境进一步改 善,着力防范化解卫生健康领域重大风险,维护人民群众健康权益""")]
  33. content_ls_2 = [("content", """建成政府侧应用和企业侧应用,实现政府、工商联、商会、企业一体化协同应用,助力工商联全面摸排“浙江人经济”的底数,精准掌握省外浙商重点企业、产业、产业链以及省外浙江商会的情况,加强对在外浙商企业的日常联系和服务覆盖,以乡情为纽带,有效发挥在外浙商的产业优势、技术优势、市场优势、资源优势,抢抓国内大循环的制高点,推动产业链招商、精准靶向招商,开展政策实施情况第三方评估,促进浙江人经济与浙江经济融合发展,助力我省高质量发展建设共同富裕示范区。""")]
  34. content_ls = []
  35. for x in content_ls_1:
  36. for y in content_ls_2:
  37. if x[0] == y[0]:
  38. content_ls.append((x[1], y[1]))
  39. # 数据分词
  40. print("语料长度:" + str(len(content_ls)))
  41. similarity_length = 0
  42. for x in content_ls:
  43. # print([get_jieba_doc(x[0]), get_jieba_doc(x[1])])
  44. vectorizer = CountVectorizer()
  45. transformer = TfidfTransformer()
  46. # tfidf = transformer.fit_transform(vectorizer.fit_transform([get_jieba_doc(x[0]), get_jieba_doc(x[1])]))
  47. # print(cosine_similarity(tfidf))
  48. # print("======================================")
  49. vector = TfidfVectorizer(max_df=10, min_df=1)
  50. tfidf = vector.fit_transform([get_jieba_doc(x[0]), get_jieba_doc(x[1])])
  51. new_cosine_similarity = cosine_similarity(tfidf).tolist()
  52. if new_cosine_similarity[0][1] > 0.7:
  53. print(cosine_similarity(tfidf))
  54. print("相似文本为:" + x[0]+" ||||| " + x[1])
  55. print("==================")
  56. similarity_length = similarity_length + 1
  57. print("相似语料长度:" + str(similarity_length))
  58. print("相似度识别成功率:%s" % (similarity_length/len(content_ls))*100 + "%")
  59. def get_jieba_doc(document):
  60. document_cut = jieba.cut(document)
  61. try:
  62. return " ".join(document_cut)
  63. except Exception as e:
  64. print(e.message)
  65. # 计算向量夹角余弦
  66. def VectorCosine(x, y):
  67. vc = []
  68. for i in range(1, len(x)-2):
  69. xc1 = x[i] - x[i-1]
  70. xc2 = x[i+1] - x[i]
  71. yc1 = y[i] - y[i-1]
  72. yc2 = y[i+1] - y[i]
  73. vc.append((xc1*xc2+yc1*yc2)/(math.sqrt(xc1**2+yc1**2)*math.sqrt(xc2**2+yc2**2)))
  74. return vc
  75. if __name__ == '__main__':
  76. get_xls_data()