import math

import jieba
import pandas as pd
from sklearn.feature_extraction.text import (
    CountVectorizer,
    TfidfTransformer,
    TfidfVectorizer,
)
from sklearn.metrics.pairwise import cosine_similarity, pairwise_distances

# Teach jieba domain-specific terms so the segmenter keeps them whole.
for _term in ('以太坊', '区块链', '数字货币', '将于', '人人网', '比特币',
              '北上广', '大数据', '云计算', '公有链'):
    jieba.suggest_freq(_term, True)

# Load the stop-word list.
# NOTE: opened in text mode with UTF-8 (the filename suggests UTF-8 content);
# the original opened in 'rb', which produced a list of bytes objects that
# TfidfVectorizer(stop_words=...) cannot use. A context manager guarantees
# the file handle is closed.
stpwrdpath = "./stop_words.utf8"
with open(stpwrdpath, encoding='utf-8') as stpwrd_dic:
    # One stop word per line.
    stpwrdlst = stpwrd_dic.read().splitlines()
# vector = TfidfVectorizer(stop_words=stpwrdlst)


def get_xls_data():
    """Pair up corpus entries by key, compute pairwise TF-IDF cosine
    similarity for each pair, and print the pairs whose similarity
    exceeds 0.7 plus an overall hit-rate summary.

    The Excel-loading code is kept commented out; two hard-coded sample
    documents are used instead.
    """
    # Original spreadsheet source, kept for reference:
    # data = pd.read_excel("./0825-丽水系统查重维度.xlsx", names=["项目名称", "数字百山祖(一期)—“云值守”建设方案"], sheet_name='Sheet1')
    # content_ls_1 = [(x, y) for x, y in enumerate(data[0]["项目名称"]) if y]
    # content_ls_2 = [(x, y) for x, y in enumerate(data[0]["数字百山祖(一期)—“云值守”建设方案"]) if y]
    content_ls_1 = [("content", """通过本项目的实施,可以真实贯彻以人民为中心的发展思想,按 照政府办事“一件事”全流程“最多跑一次”的理念和目标,深化“四 张清单一张网”改革,从与群众和卫生健康监管对象关系最紧密的领 域和事项做起,充分运用“互联网+智慧监管”和大数据,促进卫生 健康领域风险监管创新,使群众和企业对综合行政执法改革的获得感 明显增强、综合行政执法效率明显提升、卫生健康领域环境进一步改 善,着力防范化解卫生健康领域重大风险,维护人民群众健康权益""")]
    content_ls_2 = [("content", """建成政府侧应用和企业侧应用,实现政府、工商联、商会、企业一体化协同应用,助力工商联全面摸排“浙江人经济”的底数,精准掌握省外浙商重点企业、产业、产业链以及省外浙江商会的情况,加强对在外浙商企业的日常联系和服务覆盖,以乡情为纽带,有效发挥在外浙商的产业优势、技术优势、市场优势、资源优势,抢抓国内大循环的制高点,推动产业链招商、精准靶向招商,开展政策实施情况第三方评估,促进浙江人经济与浙江经济融合发展,助力我省高质量发展建设共同富裕示范区。""")]

    # Join the two lists on their shared key (first tuple element),
    # producing (text_a, text_b) pairs to compare.
    content_ls = []
    for x in content_ls_1:
        for y in content_ls_2:
            if x[0] == y[0]:
                content_ls.append((x[1], y[1]))

    print("语料长度:" + str(len(content_ls)))
    similarity_length = 0
    for x in content_ls:
        # Earlier CountVectorizer + TfidfTransformer pipeline, kept for
        # reference (it is equivalent to TfidfVectorizer):
        # vectorizer = CountVectorizer()
        # transformer = TfidfTransformer()
        # tfidf = transformer.fit_transform(vectorizer.fit_transform([get_jieba_doc(x[0]), get_jieba_doc(x[1])]))
        # print(cosine_similarity(tfidf))
        # print("======================================")
        vector = TfidfVectorizer(max_df=10, min_df=1)
        tfidf = vector.fit_transform([get_jieba_doc(x[0]), get_jieba_doc(x[1])])
        # Compute the 2x2 similarity matrix once; [0][1] is the
        # cross-document similarity.
        similarity_matrix = cosine_similarity(tfidf)
        if similarity_matrix.tolist()[0][1] > 0.7:
            print(similarity_matrix)
            print("相似文本为:" + x[0] + " ||||| " + x[1])
            print("==================")
            similarity_length = similarity_length + 1

    print("相似语料长度:" + str(similarity_length))
    # BUG FIX: the original wrote
    #   "相似度识别成功率:%s" % (ratio)*100 + "%"
    # which, by operator precedence, repeats the formatted STRING 100
    # times instead of multiplying the ratio by 100.
    print("相似度识别成功率:%s" % (similarity_length / len(content_ls) * 100) + "%")


def get_jieba_doc(document):
    """Segment *document* with jieba and return the tokens joined by
    single spaces, as expected by scikit-learn's text vectorizers.
    Returns None (after printing the error) if joining fails.
    """
    try:
        return " ".join(jieba.cut(document))
    except Exception as e:
        # Python 3 exceptions have no `.message` attribute (the original
        # used e.message, a Python-2-ism that would itself raise).
        print(e)


def VectorCosine(x, y):
    """Return the cosine of the turning angle between consecutive
    difference vectors of the sequences *x* and *y*.

    For each interior index i, forms the segment vectors
    (x[i]-x[i-1], y[i]-y[i-1]) and (x[i+1]-x[i], y[i+1]-y[i]) and
    computes the cosine of the angle between them.

    NOTE(review): the loop stops at len(x)-3 (range(1, len(x)-2)), which
    skips the last valid segment pair; this may be intentional — confirm
    with the author before changing.
    """
    vc = []
    for i in range(1, len(x) - 2):
        xc1 = x[i] - x[i - 1]
        xc2 = x[i + 1] - x[i]
        yc1 = y[i] - y[i - 1]
        yc2 = y[i + 1] - y[i]
        vc.append((xc1 * xc2 + yc1 * yc2)
                  / (math.sqrt(xc1 ** 2 + yc1 ** 2) * math.sqrt(xc2 ** 2 + yc2 ** 2)))
    return vc


if __name__ == '__main__':
    get_xls_data()