|
- import jieba
- import pandas as pd
- from sklearn.feature_extraction.text import TfidfVectorizer
- from sklearn.feature_extraction.text import TfidfTransformer
- from sklearn.feature_extraction.text import CountVectorizer
- from sklearn.metrics.pairwise import cosine_similarity, pairwise_distances
- import math
-
# Teach jieba's tokenizer domain-specific terms so they are kept as single
# tokens instead of being split into individual characters.
for _term in ('以太坊', '区块链', '数字货币', '将于', '人人网',
              '比特币', '北上广', '大数据', '云计算', '公有链'):
    jieba.suggest_freq(_term, True)

# Load the stop-word list (one word per line, UTF-8 per the file name).
stpwrdpath = "./stop_words.utf8"
# Use a context manager so the handle is closed even on error, and decode to
# str: the original 'rb' read produced bytes lines, which could never compare
# equal to jieba's str tokens if passed to TfidfVectorizer(stop_words=...).
with open(stpwrdpath, encoding='utf-8') as stpwrd_dic:
    stpwrd_content = stpwrd_dic.read()
# Split the raw text into a list of individual stop words.
stpwrdlst = stpwrd_content.splitlines()
# vector = TfidfVectorizer(stop_words=stpwrdlst)
-
-
def get_xls_data():
    """Compare paired texts with TF-IDF cosine similarity and report matches.

    Builds (text_a, text_b) pairs whose keys match, tokenizes each text with
    jieba, vectorizes the pair with TF-IDF, and prints every pair whose
    cosine similarity exceeds 0.7, followed by summary statistics.

    Returns:
        None. All output goes to stdout.
    """
    # NOTE(review): the pairs were originally loaded from an Excel sheet via
    # pd.read_excel("./0825-丽水系统查重维度.xlsx", ...); two hard-coded sample
    # texts are used here instead.
    content_ls_1 = [("content", """通过本项目的实施,可以真实贯彻以人民为中心的发展思想,按 照政府办事“一件事”全流程“最多跑一次”的理念和目标,深化“四 张清单一张网”改革,从与群众和卫生健康监管对象关系最紧密的领 域和事项做起,充分运用“互联网+智慧监管”和大数据,促进卫生 健康领域风险监管创新,使群众和企业对综合行政执法改革的获得感 明显增强、综合行政执法效率明显提升、卫生健康领域环境进一步改 善,着力防范化解卫生健康领域重大风险,维护人民群众健康权益""")]
    content_ls_2 = [("content", """建成政府侧应用和企业侧应用,实现政府、工商联、商会、企业一体化协同应用,助力工商联全面摸排“浙江人经济”的底数,精准掌握省外浙商重点企业、产业、产业链以及省外浙江商会的情况,加强对在外浙商企业的日常联系和服务覆盖,以乡情为纽带,有效发挥在外浙商的产业优势、技术优势、市场优势、资源优势,抢抓国内大循环的制高点,推动产业链招商、精准靶向招商,开展政策实施情况第三方评估,促进浙江人经济与浙江经济融合发展,助力我省高质量发展建设共同富裕示范区。""")]
    # Pair up entries whose keys (first tuple element) match.
    content_ls = [(a[1], b[1])
                  for a in content_ls_1
                  for b in content_ls_2
                  if a[0] == b[0]]

    print("语料长度:" + str(len(content_ls)))
    similarity_length = 0
    for left, right in content_ls:
        # Vectorize just this pair of documents with TF-IDF.  The unused
        # CountVectorizer/TfidfTransformer objects the original allocated
        # every iteration have been removed.
        vector = TfidfVectorizer(max_df=10, min_df=1)
        tfidf = vector.fit_transform([get_jieba_doc(left), get_jieba_doc(right)])
        sim_matrix = cosine_similarity(tfidf).tolist()
        # sim_matrix[0][1] is the similarity between the two documents.
        if sim_matrix[0][1] > 0.7:
            print(cosine_similarity(tfidf))
            print("相似文本为:" + left + " ||||| " + right)
            print("==================")
            similarity_length = similarity_length + 1

    print("相似语料长度:" + str(similarity_length))
    # Guard against an empty corpus, and parenthesize the percentage
    # correctly: the original `("…%s" % ratio) * 100 + "%"` repeated the
    # formatted *string* 100 times instead of multiplying the ratio.
    if content_ls:
        print("相似度识别成功率:%s" % (similarity_length / len(content_ls) * 100) + "%")
-
-
def get_jieba_doc(document):
    """Tokenize *document* with jieba and return the tokens joined by spaces.

    Args:
        document: the raw text to segment.

    Returns:
        A space-separated token string, or None if segmentation fails
        (the error is printed, matching the original best-effort behavior).
    """
    try:
        return " ".join(jieba.cut(document))
    except Exception as e:
        # Fix: `e.message` does not exist in Python 3 and raised an
        # AttributeError inside the handler; print the exception itself.
        print(e)
-
-
- # 计算向量夹角余弦
def VectorCosine(x, y):
    """Return the cosine of the turning angle at each interior polyline point.

    Treats (x[j], y[j]) as a 2-D polyline and, for every interior index i,
    computes the cosine of the angle between the incoming segment
    (i-1 -> i) and the outgoing segment (i -> i+1).

    Args:
        x: sequence of x coordinates.
        y: sequence of y coordinates (same length as x).

    Returns:
        A list of len(x) - 2 cosines (empty for fewer than 3 points).

    Raises:
        ZeroDivisionError: if two consecutive points coincide (a segment
        of zero length has no direction).
    """
    vc = []
    # Fix: iterate every interior point.  x[i+1] is valid through
    # i == len(x) - 2, so the bound is len(x) - 1; the original
    # range(1, len(x) - 2) silently skipped the last segment pair.
    for i in range(1, len(x) - 1):
        xc1, yc1 = x[i] - x[i - 1], y[i] - y[i - 1]      # incoming segment
        xc2, yc2 = x[i + 1] - x[i], y[i + 1] - y[i]      # outgoing segment
        dot = xc1 * xc2 + yc1 * yc2
        # math.hypot(a, b) == sqrt(a**2 + b**2)
        norm = math.hypot(xc1, yc1) * math.hypot(xc2, yc2)
        vc.append(dot / norm)
    return vc
-
-
if __name__ == '__main__':
    # Script entry point: run the TF-IDF similarity comparison demo.
    get_xls_data()
|