# coding=utf-8
"""Duplicate check of project sheets against stored history rows.

The script reads project sheets with pandas, scores every dimension against
the rows returned from the history tables with
``cosin_similarity.CosineSimilarity`` and writes the results through the
connection object created by ``mysql_pool.ConnMysql``.

NOTE(review): this module was reconstructed from a copy whose line breaks and
indentation had been lost; statement nesting was inferred from syntax and data
flow and should be compared with the original file.
"""
import sys
import re
import mysql_pool
from pymysql.converters import escape_string
import cosin_similarity
import pandas as pd
import datetime
import requests

# Dimension title as written in the sheet -> column name.
wdys1 = {
    "项目名称": "xmmc",
    "现状问题": "xzwt",
    "系统基础": "xtjc",
    "项目目标": "xmmb",
    "预期绩效": "yqjx",
    "建设需求": "jsxq",
    "数据需求": "sjxq",
    "安全需求": "aqxq",
    "业务领域": "ywly",
    "核心业务": "hxyw",
    "业务需求": "ywxq",
    "业务协同": "ywxt",
    "建设层级": "jscj",
    "用户范围": "yhfw",
    "目标群体": "mbqt",
    "建设内容": "jsnr",
    "功能模块": "gnmk",
    "数据共享": "sjgx",
    "智能要素": "znys",
}

# Column name -> dimension title; exact inverse of wdys1, same order.
wdys2 = {column: title for title, column in wdys1.items()}

# Module column name -> display title.
gnmkys = {
    "gnmc": "功能名称",
    "gnms": "功能描述",
}

# Column order shared by the ``user_data`` and ``user_keyword`` inserts.
_DIMENSION_COLUMNS = tuple(wdys2)

# (column in the compared dicts, score multiplier, label stored as dimension)
_MODULE_DIMENSIONS = (
    ("gnmc", 1, "功能名称"),
    ("gnms", 99, "功能模块描述"),
)


def _now_str():
    """Return the current local time as ``YYYY-MM-DD HH:MM:SS``.

    ``str(datetime.now())[:-7]`` drops the wrong characters whenever the
    microsecond field is 0, so the value is formatted explicitly instead.
    """
    return datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")


def _sql_text(value):
    """Return *value* rendered with ``str`` and escaped for a quoted literal.

    ``None`` is rendered as the text ``None``, which is what ``"%s" % None``
    produced in the unescaped statements.
    """
    return escape_string(str(value))


def _keyword_text(keywords):
    """Render a keyword list as ``'a', 'b'`` (``None`` text when empty)."""
    return _sql_text(str(keywords)[1:-1] if keywords else None)


def _rewrite_keywords(content, keywords, echo=False):
    """Apply the per-keyword replacement pass to *content* and return it.

    For every non-empty keyword, double quotes in *content* become single
    quotes and the keyword is replaced by its stripped form.

    NOTE(review): replacing a keyword with itself is nearly a no-op; markup
    around the keyword was presumably intended — confirm with the original.
    """
    for word in keywords:
        word = word.strip().strip("'").strip('"')
        if echo:
            print(f'word_y = {word}')
        if word != '':
            content = str(content.replace("\"", "'")).replace(word, word.strip())
    return content


def _history_blank(value):
    """True when a history field is None, the text 'None' or whitespace."""
    return value is None or value == 'None' or str(value).strip() == ''


def _current_blank(value):
    """True when a field of the checked project is None or whitespace."""
    return value is None or str(value).strip() == ''


def getFlag():
    """Read ``0825.xlsx`` and return ``{column name: weight text}``.

    Cells of the second sheet column are parsed; cells that are not text or
    do not match the pattern are skipped.
    """
    data_dict = {}
    df = pd.read_excel("0825.xlsx")
    cells = list(pd.Series(df.values[:, 1]).dropna())
    for d in cells:
        try:
            # NOTE(review): as written the leading lazy group matches the
            # empty string whenever the search succeeds, so ``wd`` is always
            # '' and nothing is recorded. The pattern looks like it lost
            # literal (possibly full-width) parentheses — confirm.
            wd = re.search("(.*?)(.*?%)", d).group(1).strip()
            wdc = wdys1.get(wd)
            if wdc:
                qz = re.search(".*?((.*?%))", d).group(1)
                data_dict[wdc] = qz
        except (AttributeError, TypeError):
            # AttributeError: no match; TypeError: the cell is not text.
            continue
    return data_dict


def gong_neng_mo_kuai(mysql, dl, data, er_title):
    """Store the project's modules and compare them with history modules.

    :param mysql: connection object exposing ``sql_change_msg``,
        ``sql_select_many`` and ``cur.lastrowid``.
    :param dl: ``(project_id, file_path, project_name)``.
    :param data: sheet rows (``DataFrame.values``).
    :param er_title: module names found in the second sheet column.
    """
    # Concatenate the fourth-column text of all rows of each module.
    str_dict = {}
    for et in er_title:
        for d in data:
            if d[1] == et:
                if str_dict.get(et):
                    str_dict[et] = str_dict.get(et) + d[3]
                else:
                    str_dict[et] = d[3]

    for k, v in str_dict.items():
        now = _now_str()
        mysql.sql_change_msg(
            """insert into idc_project_module (project_id, check_duplicate_count, module_name, module_content, create_time, update_time, tag) value(%d, 1, "%s", "%s", "%s", "%s", "模块")""" % (
                int(dl[0]), _sql_text(k), _sql_text(v), now, now))

    module_id_list = mysql.sql_select_many(
        """select project_module_id, module_name, module_content from idc_project_module where project_id=%d""" % dl[0])
    data_list = [
        {
            "project_module_id": mil.get("project_module_id"),
            "gnmc": mil.get("module_name"),
            "gnms": mil.get("module_content"),
        }
        for mil in module_id_list
    ]

    for i in data_list:
        gnmk_copy1 = mysql.sql_select_many("""select * from user_history_module_data""")
        if gnmk_copy1:
            for gc in gnmk_copy1:
                now = _now_str()
                insert_sql = """insert into idc_project_module_check (project_module_id, module_name, project_name, company_name, create_time, update_time) value(%d, "%s", "%s", "%s", "%s", "%s")""" % (
                    i.get("project_module_id"), _sql_text(gc.get("gnmc")),
                    _sql_text(gc.get("xmmc")), "", now, now)
                print(insert_sql)
                mysql.sql_change_msg(insert_sql)
                dup_module_id = mysql.cur.lastrowid
                check_module_info(mysql, gc, dl, i, dup_module_id)

        # Keywords of the module itself (each text is compared with itself).
        gnmk_gjc = {}
        for a in ["gnmc", "gnms"]:
            content = i.get(a)
            if content:
                _, _, keywords = cosin_similarity.CosineSimilarity(content, content).main()
                gnmk_gjc[a] = list(set(keywords))
        mysql.sql_change_msg(
            """insert into user_module_keywords (xmmc, gnmc, gnms) value("%s", "%s", "%s")""" % (
                _sql_text(dl[2]),
                _keyword_text(gnmk_gjc.get("gnmc")),
                _keyword_text(gnmk_gjc.get("gnms"))))


def check_module_info(mysql, gc, dl, pro, dup_module_id):
    """Score one history module against one project module.

    A detail row is inserted per compared field; the name score is multiplied
    by 1 and the description score by 99, and their sum is written to
    ``idc_project_module_check``.

    :param gc: history module row (read via ``gnmc`` / ``gnms``).
    :param dl: ``(project_id, file_path, project_name)``.
    :param pro: project module dict (read via ``gnmc`` / ``gnms``).
    :param dup_module_id: id of the ``idc_project_module_check`` row.
    """
    total_similarity = 0
    for key, weight, label in _MODULE_DIMENSIONS:
        content_x = gc.get(key)
        content_y = pro.get(key)
        if not (content_x and content_y):
            continue
        similarity, _, _ = cosin_similarity.CosineSimilarity(content_x, content_y).main()
        similarity = similarity * weight
        total_similarity += similarity
        now = _now_str()
        mysql.sql_change_msg(
            """insert into idc_project_module_check_detail (dup_module_id, project_name, module_content, dup_module_content, similarity, dimension, create_time, update_time) value (%d, "%s", "%s", "%s", %f, "%s", "%s", "%s")""" % (
                dup_module_id, _sql_text(dl[2]), _sql_text(content_y),
                _sql_text(content_x), similarity, label, now, now))
    mysql.sql_change_msg(
        """update idc_project_module_check set similarity=%f where dup_module_id=%d""" % (
            total_similarity, dup_module_id))


def _collect_dimensions(data):
    """Fold sheet rows into ``({column: text}, {module names})``.

    A row with a value in its first cell starts a new dimension; rows without
    one continue the current dimension.
    """
    join_str = ""
    str_dict = {}
    gnmk_str = []
    title = ""
    er_title = set()
    for d in data:
        if pd.notnull(d[0]):
            title = d[0]
            if title == "功能模块":
                er_title.add(d[1])
            join_str = ""
            for i in d[1:]:
                if pd.notnull(i):
                    join_str += str(i)
                    if title == "功能模块" and i != '功能描述':
                        gnmk_str.append(str(i))
            str_dict[wdys1.get(title)] = join_str
        else:
            if title == "功能模块":
                er_title.add(d[1])
            # NOTE(review): join_str is not reset for continuation rows, so
            # text of earlier rows is appended again — verify this is intended.
            for i in d[1:]:
                if pd.notnull(i):
                    join_str += str(i)
            key = wdys1.get(title)
            # Default to "" so a sheet starting with a continuation row does
            # not fail on ``None + str``.
            str_dict[key] = str_dict.get(key, "") + join_str
    str_dict['gnmk'] = ",".join(gnmk_str)
    return str_dict, er_title


def project_check(data_list):
    """Run the duplicate check for every ``(project_id, path, name)`` entry.

    For each entry the sheet is read, its dimensions are stored in
    ``user_data``, compared with every ``user_history_data`` row, its keywords
    are stored in ``user_keyword``, ``idc_project`` is updated and the module
    comparison is run.
    """
    mysql = mysql_pool.ConnMysql()
    xmnr_count = len(mysql.sql_select_many("""select * from user_history_data"""))
    gnmk_count = len(mysql.sql_select_many("""select * from user_history_module_data"""))
    # NOTE(review): the returned weights are not used below; the call is kept
    # because it still reads (and requires) the weight sheet.
    getFlag()

    for dl in data_list:
        print(dl, dl[1])
        data = pd.read_excel(dl[1]).values
        str_dict, er_title = _collect_dimensions(data)

        # The xmmc column receives dl[0]; the others the collected text.
        mysql.sql_change_msg(
            """insert into user_data (xmmc, xzwt, xtjc, xmmb, yqjx, jsxq, sjxq, aqxq, ywly, hxyw, ywxq, ywxt, jscj, yhfw, mbqt, jsnr, gnmk, sjgx, znys) value ("%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s")""" % tuple(
                [_sql_text(dl[0])]
                + [_sql_text(str_dict.get(c) or None) for c in _DIMENSION_COLUMNS[1:]]))

        xmnr_copy1 = mysql.sql_select_many("""select * from user_history_data""")
        if xmnr_copy1:
            for xc in xmnr_copy1:
                check_project_info(mysql, dl, xc, str_dict)

        # Keywords of the project itself (each text is compared with itself).
        project_gjc = {}
        for w in wdys2:
            content = str_dict.get(w)
            if content:
                _, _, keywords = cosin_similarity.CosineSimilarity(content, content).main()
                project_gjc[w] = list(set(keywords))
        mysql.sql_change_msg(
            """insert into user_keyword (xmmc, xzwt, xtjc, xmmb, yqjx, jsxq, sjxq, aqxq, ywly, hxyw, ywxq, ywxt, jscj, yhfw, mbqt, jsnr, gnmk, sjgx, znys) value ("%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s")""" % tuple(
                [_sql_text(dl[0])]
                + [_keyword_text(project_gjc.get(c)) for c in _DIMENSION_COLUMNS[1:]]))

        mysql.sql_change_msg(
            """update idc_project set dup_status=3, one_vote_veto_status=1, self_check_status=1, history_project_count=%d ,module_count=%d where project_id=%d""" % (
                xmnr_count, gnmk_count, dl[0]))
        gong_neng_mo_kuai(mysql, dl, data, er_title)


def check_project_info(mysql, dl, xc, str_dict):
    """Score one history project row against the checked project.

    One ``idc_project_check`` row is inserted, then one detail row per
    dimension present on both sides, and finally the summed score is written
    back. Weights depend on which of ``gnmk`` / ``jsnr`` are available:

    * modules blank on both sides, ``jsnr`` present: gnmk 0, jsnr 40, rest 60
    * ``jsnr`` blank on both sides, modules present: gnmk 50, jsnr 0, rest 50
    * otherwise: gnmk 50, jsnr 40, rest 10

    The "rest" weight is divided evenly among the other matched dimensions.

    :param dl: ``(project_id, file_path, project_name)``.
    :param xc: history row; its first key is skipped.
    :param str_dict: ``{column: text}`` of the checked project.
    """
    print(f'xmmc is {xc.get("xmmc")}')
    now = _now_str()
    mysql.sql_change_msg(
        """insert into idc_project_check (project_id, dup_project_name, file_path, company_name, create_year, project_tag, project_range_tag, project_area, create_time, update_time) value ("%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s")""" % (
            _sql_text(dl[0]), _sql_text(xc.get("xmmc")), _sql_text(dl[1]), "", "",
            "需求相似、业务相似", "历史项目", "", now, now))
    dup_id = mysql.cur.lastrowid

    # Dimensions filled on both sides, in row-key order.
    pairs = []
    for x in list(xc.keys())[1:]:
        content_x = xc.get(x)
        content_y = str_dict.get(x)
        if content_x and content_y:
            pairs.append((x, content_x, content_y))
    dup_count = sum(1 for x, _, _ in pairs if x not in ('gnmk', 'jsnr'))

    # ``.get`` is used so a sheet or row without gnmk/jsnr counts as blank
    # instead of raising KeyError.
    xc_gnmk, xc_jsnr = xc.get('gnmk'), xc.get('jsnr')
    my_gnmk, my_jsnr = str_dict.get('gnmk'), str_dict.get('jsnr')
    if (_history_blank(xc_gnmk) and _current_blank(my_gnmk)
            and xc_jsnr is not None and xc_jsnr != 'None'
            and not _current_blank(my_jsnr)):
        gnmk_weight, jsnr_weight, shared_weight, verbose = 0, 40, 60, False
    elif (_history_blank(xc_jsnr) and _current_blank(my_jsnr)
            and xc_gnmk is not None and xc_gnmk != 'None'
            and not _current_blank(my_gnmk)):
        gnmk_weight, jsnr_weight, shared_weight, verbose = 50, 0, 50, False
    else:
        # Only this case prints its intermediate values.
        gnmk_weight, jsnr_weight, shared_weight, verbose = 50, 40, 10, True

    total_similarity = 0
    for x, content_x, content_y in pairs:
        is_special = x in ('gnmk', 'jsnr')
        if x == 'gnmk':
            weight = gnmk_weight
        elif x == 'jsnr':
            weight = jsnr_weight
        else:
            # dup_count >= 1 here because this pair was counted above.
            weight = shared_weight / dup_count

        similarity, keywords_x, keywords_y = cosin_similarity.CosineSimilarity(
            content_x, content_y).main()
        similarity = similarity * weight
        total_similarity += similarity
        keywords_y = list(set(keywords_y))
        keywords_x = list(set(keywords_x))
        if verbose and is_special:
            print("**************相似度: %.2f%%" % similarity, "关键词: %s" % keywords_y)

        function_content = _rewrite_keywords(
            content_y, keywords_y, echo=verbose and not is_special)
        dup_function_content = _rewrite_keywords(content_x, keywords_x)

        now = _now_str()
        mysql.sql_change_msg(
            """insert into idc_project_check_detail (dup_id, dimension, similarity, function_content, dup_function_content, create_time, update_time) value (%d, "%s", %f, "%s", "%s", "%s", "%s")""" % (
                dup_id, wdys2.get(x), similarity, _sql_text(function_content),
                _sql_text(dup_function_content), now, now))

    mysql.sql_change_msg(
        """update idc_project_check set similarity=%f where dup_id=%d""" % (
            total_similarity, dup_id))


def main():
    """Fetch the pending projects, print them and run the check."""
    all_path = requests.get("http://127.0.0.1:19099/check/duplicates/%s" % 599).json()
    data_list = []
    for ap in all_path.get("data") or []:
        data_list.append((ap.get("project_id"), ap.get("file_path"), ap.get("project_name")))
    print(data_list)
    # NOTE(review): the fetched list is replaced by this hard-coded entry, so
    # only the local sheet is checked — confirm this is still wanted.
    data_list = [(11, r"D:\ningda\dup_check2\dup_check\0825-丽水系统查重维度1.xlsx", "水路运输综合监管系统建设项目.xls")]
    project_check(data_list)


if __name__ == "__main__":
    main()