commit d9ffa090a9aee5ff5fea3a5b4541dd3ad402ce9b Author: xlt-evil Date: Fri Apr 21 17:33:56 2023 +0800 init project diff --git a/0825-丽水系统查重维度1.xlsx b/0825-丽水系统查重维度1.xlsx new file mode 100644 index 0000000..d8064be Binary files /dev/null and b/0825-丽水系统查重维度1.xlsx differ diff --git a/__pycache__/cosin_similarity.cpython-36.pyc b/__pycache__/cosin_similarity.cpython-36.pyc new file mode 100644 index 0000000..d0ffc08 Binary files /dev/null and b/__pycache__/cosin_similarity.cpython-36.pyc differ diff --git a/__pycache__/main1.cpython-36.pyc b/__pycache__/main1.cpython-36.pyc new file mode 100644 index 0000000..f370b75 Binary files /dev/null and b/__pycache__/main1.cpython-36.pyc differ diff --git a/__pycache__/mysql_pool.cpython-36.pyc b/__pycache__/mysql_pool.cpython-36.pyc new file mode 100644 index 0000000..621bc66 Binary files /dev/null and b/__pycache__/mysql_pool.cpython-36.pyc differ diff --git a/cosin_similarity.py b/cosin_similarity.py new file mode 100644 index 0000000..d315a2a --- /dev/null +++ b/cosin_similarity.py @@ -0,0 +1,84 @@ +# coding=utf-8 +import re +import html +import jieba +import jieba.analyse +from sklearn.metrics.pairwise import cosine_similarity + + +class CosineSimilarity(object): + """ + 余弦相似度 + """ + def __init__(self, content_x1, content_y2): + self.s1 = content_x1 + self.s2 = content_y2 + + @staticmethod + def extract_keyword(content): # 提取关键词 + # 正则过滤 html 标签 + re_exp = re.compile(r'()|(<[^>]+>)', re.S) + content = re_exp.sub(' ', content) + # html 转义符实体化 + content = html.unescape(content) + # 切割 + seg = [i for i in jieba.cut(content, cut_all=True) if i != ''] + # 提取关键词 + keywords = jieba.analyse.extract_tags("|".join(seg), topK=200, withWeight=False, allowPOS=('n', 'nr', 'ns')) + # print(keywords) + # return keywords + return seg,keywords + + @staticmethod + def one_hot(word_dict, keywords): # oneHot编码 + # cut_code = [word_dict[word] for word in keywords] + cut_code = [0]*len(word_dict) + for word in keywords: + 
cut_code[word_dict[word]] += 1 + return cut_code + + def main(self): + # 去除停用词 + # jieba.analyse.set_stop_words('stopword1.txt') + + # 提取关键词 + # keywords1 = self.extract_keyword(self.s1) + # keywords2 = self.extract_keyword(self.s2) + seg1,keywords1 = self.extract_keyword(self.s1) + seg2,keywords2 = self.extract_keyword(self.s2) + # 词的并集 + union = set(keywords1).union(set(keywords2)) + # union = set(seg1).union(set(seg2)) + + # 编码 + word_dict = {} + i = 0 + for word in union: + word_dict[word] = i + i += 1 + # oneHot编码 + s1_cut_code = self.one_hot(word_dict, keywords1) + s2_cut_code = self.one_hot(word_dict, keywords2) + # s1_cut_code = self.one_hot(word_dict, seg1) + # s2_cut_code = self.one_hot(word_dict, seg2) + # 余弦相似度计算 + sample = [s1_cut_code, s2_cut_code] + # 除零处理 + try: + sim = cosine_similarity(sample) + return sim[1][0],keywords1,keywords2 + except Exception as e: + print(e) + return 0.0,keywords1,keywords2 + + +# 测试 +if __name__ == '__main__': + with open(r'D:\pythonDM\Ndkj\live111\result\1.txt', encoding='UTF-8') as x, open(r'D:\pythonDM\Ndkj\live111\result\2.txt', encoding='UTF-8') as y: + content_x = x.read() + content_y = y.read() + similarity = CosineSimilarity(content_x, content_y) + # similarity = CosineSimilarity(file, file2) + similarity = similarity.main() + print(similarity) + print('相似度: %.2f%%' % (similarity*32)) diff --git a/flask_server.py b/flask_server.py new file mode 100644 index 0000000..a78885a --- /dev/null +++ b/flask_server.py @@ -0,0 +1,42 @@ +# coding=utf-8 +from flask import Flask, redirect, url_for, request +import sys +from flask import jsonify +import mysql_pool +import main1 +import cosin_similarity + +# import xm +# from xm import xsd + +app = Flask(__name__) + + +# mysql = mysql_pool.ConnMysql() + + +# 返回excel的保存地址 +@app.route('/check/duplicates/') +def success(projectId): + mysql=mysql_pool.ConnMysql() + if int(projectId) == 0: + data = mysql.sql_select_many("""select * from idc_project""") + else: + data = 
mysql.sql_select_many("""select * from idc_project where project_id=%s""" % projectId) + print(data) + data_list = [] + + for ap in data: + # if os.path.exists(ap.get("file_path")): + data_list.append((ap.get("project_id"), ap.get("file_path"), ap.get("project_name"))) + mysql.release() + # print(data_list) + main1.project_check(data_list) + + return jsonify({"code": 0, "data": data}) + + +# 去数据库idc_project里面拿数据,获取比如project_id=11,根据file_path地址拿到要开始处理的数据 +if __name__ == '__main__': + # app.run(host="0.0.0.0", port=19099) + app.run(port=19099) diff --git a/insert_history_data_total.py b/insert_history_data_total.py new file mode 100644 index 0000000..fbcb312 --- /dev/null +++ b/insert_history_data_total.py @@ -0,0 +1,283 @@ +# coding=utf-8 + +import re +import mysql_pool +from pymysql.converters import escape_string +import cosin_similarity +import pandas as pd +import datetime +import requests +import os + +wdys1 = { + "项目名称": "xmmc", + "现状问题": "xzwt", + "系统基础": "xtjc", + "项目目标": "xmmb", + "预期绩效": "yqjx", + "建设需求": "jsxq", + "数据需求": "sjxq", + "安全需求": "aqxq", + "业务领域": "ywly", + "核心业务": "hxyw", + "业务需求": "ywxq", + "业务协同": "ywxt", + "建设层级": "jscj", + "用户范围": "yhfw", + "目标群体": "mbqt", + "建设内容": "jsnr", + "功能模块": "gnmk", + "数据共享": "sjgx", + "智能要素": "znys", + "申报单位": "sbdw", + "所属地区": "ssdq", + "预算年度": "ysnd" +} +wdys2 = { + "xmmc": "项目名称", + "xzwt": "现状问题", + "xtjc": "系统基础", + "xmmb": "项目目标", + "yqjx": "预期绩效", + "jsxq": "建设需求", + "sjxq": "数据需求", + "aqxq": "安全需求", + "ywly": "业务领域", + "hxyw": "核心业务", + "ywxq": "业务需求", + "ywxt": "业务协同", + "jscj": "建设层级", + "yhfw": "用户范围", + "mbqt": "目标群体", + "jsnr": "建设内容", + "gnmk": "功能模块", + "sjgx": "数据共享", + "znys": "智能要素", + "sbdw": "申报单位", + "ssdq": "所属地区", + "ysnd": "预算年度" +} +gnmkys = { + "gnmc": "功能名称", + "gnms": "功能描述" +} + + +def getFlag(): + data_dict = {} + df = pd.read_excel("0825-丽水系统查重维度.xlsx") + data = df.values + data = list(pd.Series(data[:, 1]).dropna()) + for d in data: + try: + wd = re.search("(.*?)(.*?%)", 
d).group(1).strip() + wdc = wdys1.get(wd) + if wdc: + qz = re.search(".*?((.*?%))", d).group(1) + data_dict[wdc] = qz + except: + pass + return data_dict + + +def gong_neng_mo_kuai(xmmc, mysql, dl, data, er_title): + # 将excel文件中的所有第三维度内容进行拼接 + str_dict = {} + for et in er_title: + for d in data: + if d[1] == et: + if str_dict.get(et): + str_dict[et] = str_dict.get(et) + d[3] + else: + str_dict[et] = d[3] + for k, v in str_dict.items(): + mysql.sql_change_msg("""insert into user_history_module_data_total(xmmc,gnmc,gnms) value("%s", "%s", "%s")""" % ( + escape_string(xmmc), escape_string(k), escape_string(v))) + # + # similarity = cosin_similarity.CosineSimilarity(v, v) + # similarity, keywords_x, keywords_y = similarity.main() + # mysql.sql_change_msg("""insert into user_history_module_keywords (xmmc,gnmc,gnms) value("%s" ,"%s", "%s")""" % ( + # xmmc, escape_string(k), str(keywords_y)[dup_file_test:-dup_file_test])) + + +def project_check(data_list): + mysql = mysql_pool.ConnMysql() + # 读取维度和权重 + # get_data_dict = getFlag() + # 遍历excel存储路径 + for dl in data_list: + # path = "0825-丽水系统查重维度1.xlsx" + # 读取路径下的excel + print(dl) + df = pd.read_excel(dl[1]) + xmmc = df.keys() + # print(type(xmmc[dup_file_test])) + xmmc=xmmc[1] + # print(type(xmmc)) + # xmmc1='' + + if "可研报告"or "可研性报告"or "可行性研究报告" in xmmc: + xmmc=xmmc.replace('可研报告','') + xmmc=xmmc.replace('可研性报告','') + xmmc=xmmc.replace('可行性研究报告','') + # print(xmmc) + data = df.values + # 将excel文件中的所有维度内容进行拼接 + join_str = "" + str_dict = {} + title = "" + er_title = set() + # for d in data: + # # print(d) + # if pd.notnull(d[0]): + # title = d[0] + # if title == "功能模块": + # er_title.add(d[dup_file_test]) + # join_str = "" + # for i in d[dup_file_test:]: + # if pd.notnull(i): + # join_str += i + # str_dict[wdys1.get(title)] = join_str + # else: + # if title == "功能模块": + # er_title.add(d[dup_file_test]) + # for i in d[dup_file_test:]: + # if pd.notnull(i): + # join_str += i + # str_dict[wdys1.get(title)] = 
str_dict.get(wdys1.get(title)) + join_str + # print(str_dict) + gnmk_str = [] + # print(data) + for d in data: + if pd.notnull(d[0]): + title = d[0] + if title == "功能模块": + er_title.add(d[1]) + join_str = "" + for i in d[1:]: + # print(type(i)) + # i=str(i) + if pd.notnull(i): + join_str += str(i) + if title == "功能模块": + # for j in d[3:]: + if i == '功能描述': + continue + else: + gnmk_str.append(i) + str_dict[wdys1.get(title)] = join_str + # print(str_dict.get(wdys1.get(title))) + else: + if title == "功能模块": + er_title.add(d[1]) + for i in d[3:]: + if pd.notnull(i): + join_str += str(i) + if title == "功能模块": + gnmk_str.append(i) + str_dict[wdys1.get(title)] = str_dict.get(wdys1.get(title)) + join_str + # gnmk="".join(gnmk_str) + # str_dict['gnmk']=gnmk + gnmk = "".join(gnmk_str) + str_dict['gnmk'] = gnmk + # print(str_dict) + # print(str_dict.get("xzwt")if str_dict.get("xzwt") else None) + # print(str_dict.get('gnmk')if str_dict.get('gnmk')else None) + mysql.sql_change_msg( + """insert into user_history_data_total (xmmc, xzwt, xtjc, xmmb, yqjx, jsxq, sjxq, aqxq, ywly, hxyw, ywxq, ywxt, jscj, yhfw, mbqt, jsnr, gnmk, sjgx, znys,sbdw,ssdq,ysnd) value ("%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s","%s","%s","%s")""" + % (escape_string(xmmc), + escape_string(str_dict.get("xzwt")) if str_dict.get("xzwt") else None, + escape_string(str_dict.get("xtjc")) if str_dict.get("xtjc") else None, + escape_string(str_dict.get("xmmb")) if str_dict.get("xmmb") else None, + escape_string(str_dict.get("yqjx")) if str_dict.get("yqjx") else None, + escape_string(str_dict.get("jsxq")) if str_dict.get("jsxq") else None, + escape_string(str_dict.get("sjxq")) if str_dict.get("sjxq") else None, + escape_string(str_dict.get("aqxq")) if str_dict.get("aqxq") else None, + escape_string(str_dict.get("ywly")) if str_dict.get("ywly") else None, + escape_string(str_dict.get("hxyw")) if str_dict.get("hxyw") else None, + 
escape_string(str_dict.get("ywxq")) if str_dict.get("ywxq") else None, + escape_string(str_dict.get("ywxt")) if str_dict.get("ywxt") else None, + escape_string(str_dict.get("jscj")) if str_dict.get("jscj") else None, + escape_string(str_dict.get("yhfw")) if str_dict.get("yhfw") else None, + escape_string(str_dict.get("mbqt")) if str_dict.get("mbqt") else None, + escape_string(str_dict.get("jsnr")) if str_dict.get("jsnr") else None, + escape_string(str_dict.get("gnmk")) if str_dict.get("gnmk") else None, + escape_string(str_dict.get("sjgx")) if str_dict.get("sjgx") else None, + escape_string(str_dict.get("znys")) if str_dict.get("znys") else None, + escape_string(str_dict.get("sbdw")) if str_dict.get("sbdw") else None, + escape_string(str_dict.get("ssdq")) if str_dict.get("ssdq") else None, + escape_string(str_dict.get("ysnd")) if str_dict.get("ysnd") else None + )) + # project_gjc = {} + # for w in wdys2.keys(): + # content_x = str_dict.get(w) + # content_y = str_dict.get(w) + # if content_x and content_y: + # # 循环遍历每一个维度 + # similarity = cosin_similarity.CosineSimilarity(content_x, content_y) + # # 相似度 关键词 + # similarity, keywords_x, keywords_y = similarity.main() + # project_gjc[w] = keywords_y + # mysql.sql_change_msg( + # """insert into user_history_keywords (xmmc, xzwt, xtjc, xmmb, yqjx, jsxq, sjxq, aqxq, ywly, hxyw, ywxq, ywxt, jscj, yhfw, mbqt, jsnr, gnmk, sjgx, znys) value ("%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s")""" + # % (xmmc, str(project_gjc.get("xzwt"))[dup_file_test:-dup_file_test] if project_gjc.get("xzwt") else None, + # str(project_gjc.get("xtjc"))[dup_file_test:-dup_file_test] if project_gjc.get("xtjc") else None, + # str(project_gjc.get("xmmb"))[dup_file_test:-dup_file_test] if project_gjc.get("xmmb") else None, + # str(project_gjc.get("yqjx"))[dup_file_test:-dup_file_test] if project_gjc.get("yqjx") else None, + # str(project_gjc.get("jsxq"))[dup_file_test:-dup_file_test] 
if project_gjc.get("jsxq") else None, + # str(project_gjc.get("sjxq"))[dup_file_test:-dup_file_test] if project_gjc.get("sjxq") else None, + # str(project_gjc.get("aqxq"))[dup_file_test:-dup_file_test] if project_gjc.get("aqxq") else None, + # str(project_gjc.get("ywly"))[dup_file_test:-dup_file_test] if project_gjc.get("ywly") else None, + # str(project_gjc.get("hxyw"))[dup_file_test:-dup_file_test] if project_gjc.get("hxyw") else None, + # str(project_gjc.get("ywxq"))[dup_file_test:-dup_file_test] if project_gjc.get("ywxq") else None, + # str(project_gjc.get("ywxt"))[dup_file_test:-dup_file_test] if project_gjc.get("ywxt") else None, + # str(project_gjc.get("jscj"))[dup_file_test:-dup_file_test] if project_gjc.get("jscj") else None, + # str(project_gjc.get("yhfw"))[dup_file_test:-dup_file_test] if project_gjc.get("yhfw") else None, + # str(project_gjc.get("mbqt"))[dup_file_test:-dup_file_test] if project_gjc.get("mbqt") else None, + # str(project_gjc.get("jsnr"))[dup_file_test:-dup_file_test] if project_gjc.get("jsnr") else None, + # str(project_gjc.get("gnmk"))[dup_file_test:-dup_file_test] if project_gjc.get("gnmk") else None, + # str(project_gjc.get("sjgx"))[dup_file_test:-dup_file_test] if project_gjc.get("sjgx") else None, + # str(project_gjc.get("znys"))[dup_file_test:-dup_file_test] if project_gjc.get("znys") else None)) + + gong_neng_mo_kuai(xmmc, mysql, dl, data, er_title) + + +if __name__ == "__main__": + path = r"D:\dup_file_test" + data_list = os.listdir(path) + print(len(data_list)) + for file in data_list: + # print(path+'\\'+file) + data_list = [(0, path + '\\' + file, "")] + project_check(data_list) + print("已存入************************************* %s" % file) + +""" +建设目标,业务功能 + +gnmk_str = [] +for d in data: + if pd.notnull(d[0]): + title = d[0] + if title == "功能模块": + er_title.add(d[dup_file_test]) + join_str = "" + for i in d[dup_file_test:]: + if pd.notnull(i): + join_str += i + if title == "功能模块": + gnmk_str.append(i) + 
str_dict[wdys1.get(title)] = join_str + else: + if title == "功能模块": + er_title.add(d[dup_file_test]) + for i in d[dup_file_test:]: + if pd.notnull(i): + join_str += i + if title == "功能模块": + gnmk_str.append(i) + str_dict[wdys1.get(title)] = str_dict.get(wdys1.get(title)) + join_str +gnmk = "".join(gnmk_str) + + +""" diff --git a/main1.py b/main1.py new file mode 100644 index 0000000..a23d097 --- /dev/null +++ b/main1.py @@ -0,0 +1,577 @@ +# coding=utf-8 +import sys +import re +import mysql_pool +from pymysql.converters import escape_string +import cosin_similarity +import pandas as pd +import datetime +import requests +import os +import pymysql + +wdys1 = { + "项目名称": "xmmc", + "现状问题": "xzwt", + "系统基础": "xtjc", + "项目目标": "xmmb", + "预期绩效": "yqjx", + "建设需求": "jsxq", + "数据需求": "sjxq", + "安全需求": "aqxq", + "业务领域": "ywly", + "核心业务": "hxyw", + "业务需求": "ywxq", + "业务协同": "ywxt", + "建设层级": "jscj", + "用户范围": "yhfw", + "目标群体": "mbqt", + "建设内容": "jsnr", + "功能模块": "gnmk", + "数据共享": "sjgx", + "智能要素": "znys" +} +wdys2 = { + "xmmc": "项目名称", + "xzwt": "现状问题", + "xtjc": "系统基础", + "xmmb": "项目目标", + "yqjx": "预期绩效", + "jsxq": "建设需求", + "sjxq": "数据需求", + "aqxq": "安全需求", + "ywly": "业务领域", + "hxyw": "核心业务", + "ywxq": "业务需求", + "ywxt": "业务协同", + "jscj": "建设层级", + "yhfw": "用户范围", + "mbqt": "目标群体", + "jsnr": "建设内容", + "gnmk": "功能模块", + "sjgx": "数据共享", + "znys": "智能要素" +} +gnmkys = { + "gnmc": "功能名称", + "gnms": "功能描述" +} + + +def getFlag(): + data_dict = {} + df = pd.read_excel("0825-丽水系统查重维度.xlsx") + data = df.values + data = list(pd.Series(data[:, 1]).dropna()) + for d in data: + try: + wd = re.search("(.*?)(.*?%)", d).group(1).strip() + wdc = wdys1.get(wd) + if wdc: + qz = re.search(".*?((.*?%))", d).group(1) + data_dict[wdc] = qz + except: + pass + return data_dict + + +def gong_neng_mo_kuai(mysql, dl, data, er_title): + # 将excel文件中的所有第三维度内容进行拼接 + str_dict = {} + for et in er_title: + for d in data: + if d[1] == et: + if str_dict.get(et): + str_dict[et] = str_dict.get(et) + d[3] + else: + 
str_dict[et] = d[3] + + for k, v in str_dict.items(): + mysql.sql_change_msg( + """insert into idc_project_module (project_id, check_duplicate_count, module_name, module_content, create_time, update_time, tag) value(%d, dup_file_test, "%s", "%s", "%s", "%s", "模块")""" % ( + int(dl[0]), k, v, str(datetime.datetime.now())[:-7], str(datetime.datetime.now())[:-7])) + + module_id_list = mysql.sql_select_many( + """select project_module_id, module_name, module_content from idc_project_module where project_id=%d""" % dl[ + 0]) + data_list = [] + for mil in module_id_list: + data_dict = {} + data_dict["project_module_id"] = mil.get("project_module_id") + data_dict["gnmc"] = mil.get("module_name") + data_dict["gnms"] = mil.get("module_content") + data_list.append(data_dict) + # print(data_list) + for i in data_list: + gnmk_copy1 = mysql.sql_select_many("""select * from user_history_module_data""") + if gnmk_copy1: + for gc in gnmk_copy1: + total_similarity1 = 0 + total_keywords1 = [] + total_similarity2 = 0 + total_keywords2 = [] + mysql.sql_change_msg( + """insert into idc_project_module_check (project_module_id, module_name, project_name, company_name, create_time, update_time) value(%d, "%s", "%s", "%s", "%s", "%s")""" + % ( + i.get("project_module_id"), gc.get("gnmc"), gc.get("xmmc"), "", + str(datetime.datetime.now())[:-7], + str(datetime.datetime.now())[:-7])) + dup_module_id = mysql.cur.lastrowid + for j in ["gnmc", "gnms"]: + # 循环遍历每一个模块名称 + content_x = gc.get(j) + content_y = i.get(j) + if content_x and content_y: + if j == "gnmc": + similarity = cosin_similarity.CosineSimilarity(content_x, content_y) + # 相似度 关键词 + similarity, keyword_x, keywords = similarity.main() + similarity = similarity * 1 + total_keywords1 += keywords + #print("######################相似度: %.2f%%" % similarity, "关键词: %s" % keywords) + # 相似度相加 + total_similarity1 += similarity + mysql.sql_change_msg( + """insert into idc_project_module_check_detail (dup_module_id, project_name, module_content, 
dup_module_content, similarity, dimension, create_time, update_time) value (%d, "%s", "%s", "%s", %f, "%s", "%s", "%s")""" + % (dup_module_id, dl[2], escape_string(content_y), escape_string(content_x), similarity, + "功能名称", + str(datetime.datetime.now())[:-7], str(datetime.datetime.now())[:-7])) + else: + similarity = cosin_similarity.CosineSimilarity(content_x, content_y) + # 相似度 关键词 + similarity, keyword_x, keywords = similarity.main() + similarity = similarity * 99 + total_keywords2 += keywords + #print("######################相似度: %.2f%%" % similarity, "关键词: %s" % keywords) + # 相似度相加 + total_similarity2 += similarity + mysql.sql_change_msg( + """insert into idc_project_module_check_detail (dup_module_id, project_name, module_content, dup_module_content, similarity, dimension, create_time, update_time) value (%d, "%s", "%s", "%s", %f, "%s", "%s", "%s")""" + % (dup_module_id, dl[2], escape_string(content_y), escape_string(content_x), similarity, + "功能模块描述", + str(datetime.datetime.now())[:-7], str(datetime.datetime.now())[:-7])) + mysql.sql_change_msg("""update idc_project_module_check set similarity=%f where dup_module_id=%d""" % ( + total_similarity1 + total_similarity2, dup_module_id)) + gnmk_gjc = {} + for a in ["gnmc", "gnms"]: + if i.get(a): + content_x = i.get(a) + content_y = i.get(a) + if a == "gnmc": + similarity = cosin_similarity.CosineSimilarity(content_x, content_y) + # 相似度 关键词 + similarity, keyword_x, keywords = similarity.main() + gnmk_gjc[a] = keywords + else: + similarity = cosin_similarity.CosineSimilarity(content_x, content_y) + # 相似度 关键词 + similarity, keyword_x, keywords = similarity.main() + gnmk_gjc[a] = keywords + mysql.sql_change_msg("""insert into user_module_keywords (xmmc, gnmc, gnms) value("%s", "%s", "%s")""" % ( + dl[2], str(gnmk_gjc.get("gnmc"))[1:-1] if gnmk_gjc.get("gnmc") else None, + str(gnmk_gjc.get("gnms"))[1:-1] if gnmk_gjc.get("gnms") else None)) + + +def project_check(data_list): + mysql = mysql_pool.ConnMysql() + # 
mysql.sql_select_many("""select * from mkgjc""") + # 读取维度和权重 + # xmnr_count = len(mysql.sql_select_many("""select * from xmnr_copy1""")) + # gnmk_count = len(mysql.sql_select_many("""select * from gnmk_copy1""")) + xmnr_count = len(mysql.sql_select_many("""select * from user_history_data""")) + gnmk_count = len(mysql.sql_select_many("""select * from user_history_module_data""")) + get_data_dict = getFlag() + # 遍历excel存储路径 + for dl in data_list: + # path = "0825-丽水系统查重维度1.xlsx" + # 读取路径下的excel + print(dl,dl[1]) + df = pd.read_excel(dl[1]) + data = df.values + # 将excel文件中的所有维度内容进行拼接 + join_str = "" + str_dict = {} + title = "" + er_title = set() + for d in data: + if pd.notnull(d[0]): + title = d[0] + if title == "功能模块": + er_title.add(d[1]) + join_str = "" + for i in d[1:]: + if pd.notnull(i): + join_str += i + str_dict[wdys1.get(title)] = join_str + else: + if title == "功能模块": + er_title.add(d[1]) + for i in d[1:]: + if pd.notnull(i): + join_str += i + str_dict[wdys1.get(title)] = str_dict.get(wdys1.get(title)) + join_str + print(str_dict) + mysql.sql_change_msg( + """insert into user_data (xmmc, xzwt, xtjc, xmmb, yqjx, jsxq, sjxq, aqxq, ywly, hxyw, ywxq, ywxt, jscj, yhfw, mbqt, jsnr, gnmk, sjgx, znys) value ("%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s")""" + % (dl[0], str_dict.get("xzwt") if str_dict.get("xzwt") else None, + str_dict.get("xtjc") if str_dict.get("xtjc") else None, + str_dict.get("xmmb") if str_dict.get("xmmb") else None, + str_dict.get("yqjx") if str_dict.get("yqjx") else None, + str_dict.get("jsxq") if str_dict.get("jsxq") else None, + str_dict.get("sjxq") if str_dict.get("sjxq") else None, + str_dict.get("aqxq") if str_dict.get("aqxq") else None, + str_dict.get("ywly") if str_dict.get("ywly") else None, + str_dict.get("hxyw") if str_dict.get("hxyw") else None, + str_dict.get("ywxq") if str_dict.get("ywxq") else None, + str_dict.get("ywxt") if str_dict.get("ywxt") else None, + 
str_dict.get("jscj") if str_dict.get("jscj") else None, + str_dict.get("yhfw") if str_dict.get("yhfw") else None, + str_dict.get("mbqt") if str_dict.get("mbqt") else None, + str_dict.get("jsnr") if str_dict.get("jsnr") else None, + str_dict.get("gnmk") if str_dict.get("gnmk") else None, + str_dict.get("sjgx") if str_dict.get("sjgx") else None, + str_dict.get("znys") if str_dict.get("znys") else None)) + # 或取所有的xmnr_copy1 + xmnr_copy1 = mysql.sql_select_many("""select * from user_history_data""") + # 对比xmnr_copy1和xmnr维度是否都有 + if xmnr_copy1: + for xc in xmnr_copy1: + total_keywords = {} + total_similarity = 0 + dup_count = 0 + # 保存相加后的相似度到idc_project_check + mysql.sql_change_msg( + """insert into idc_project_check (project_id, dup_project_name, file_path, company_name, create_year, project_tag, project_range_tag, project_area, create_time, update_time) value ("%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s")""" + % (dl[0], xc.get("xmmc"), dl[1], "", "", "需求相似、业务相似", "历史项目", "", + str(datetime.datetime.now())[:-7], str(datetime.datetime.now())[:-7])) + dup_id = mysql.cur.lastrowid + for x in list(xc.keys())[1:]: + content_x = xc.get(x) + content_y = str_dict.get(x) + if content_x and content_y: + if x == 'gnmk': + continue + elif x == 'jsnr': + continue + else: + dup_count += 1 + if xc.get('gnmk')==' ' and str_dict.get('gnmk')==' ': + for x in list(xc.keys())[1:]: + content_x = xc.get(x) + content_y = str_dict.get(x) + if content_x and content_y: + if x == 'gnmk': + # 匹配到历史数据,次数加1 + # dup_count += dup_file_test + # 循环遍历每一个维度 + similarity = cosin_similarity.CosineSimilarity(content_x, content_y) + # 相似度 关键词 + similarity, keywords_x, keywords_y = similarity.main() + similarity = similarity * 0 + #print("**************相似度: %.2f%%" % similarity, "关键词: %s" % keywords_y) + # 相似度相加 + total_similarity += similarity + # 关键词收集 + total_keywords[x] = keywords_y + + function_content = content_y + dup_function_content = content_x + for word_y in keywords_y: + word_y = 
word_y.strip().strip("'").strip('"') + function_content = str(function_content.replace("\"", "'")).replace(word_y, + f'{word_y.strip()}') + for word_x in keywords_x: + word_x = word_x.strip().strip("'").strip('"') + dup_function_content = str(dup_function_content.replace("\"", "'")).replace(word_x, + f'{word_x.strip()}') + # 保存每个维度对应的相似度到idc_project_check_detail + mysql.sql_change_msg( + """insert into idc_project_check_detail (dup_id, dimension, similarity, function_content, dup_function_content, create_time, update_time) value (%d, "%s", %f, "%s", "%s", "%s", "%s")""" + % (dup_id, wdys2.get(x), similarity, escape_string(function_content), + escape_string(dup_function_content), str(datetime.datetime.now())[:-7], + str(datetime.datetime.now())[:-7])) + # content = content.replace(gjcs, f'{gjcs.strip()}') + elif x == 'jsnr': + similarity = cosin_similarity.CosineSimilarity(content_x, content_y) + # 相似度 关键词 + similarity, keywords_x, keywords_y = similarity.main() + similarity = similarity * 40 + #print("**************相似度: %.2f%%" % similarity, "关键词: %s" % keywords_y) + # 相似度相加 + total_similarity += similarity + # 关键词收集 + total_keywords[x] = keywords_y + function_content = content_y + dup_function_content = content_x + for word_y in keywords_y: + word_y = word_y.strip().strip("'").strip('"') + function_content = str(function_content.replace("\"", "'")).replace(word_y, + f'{word_y.strip()}') + for word_x in keywords_x: + word_x = word_x.strip().strip("'").strip('"') + dup_function_content = str(dup_function_content.replace("\"", "'")).replace(word_x, + f'{word_x.strip()}') + # 保存每个维度对应的相似度到idc_project_check_detail + mysql.sql_change_msg( + """insert into idc_project_check_detail (dup_id, dimension, similarity, function_content, dup_function_content, create_time, update_time) value (%d, "%s", %f, "%s", "%s", "%s", "%s")""" + % (dup_id, wdys2.get(x), similarity, escape_string(function_content), + escape_string(dup_function_content), str(datetime.datetime.now())[:-7], + 
str(datetime.datetime.now())[:-7])) + else: + similarity = cosin_similarity.CosineSimilarity(content_x, content_y) + # 相似度 关键词 + similarity, keywords_x, keywords_y = similarity.main() + similarity = similarity * (60 / dup_count) + #print("**************相似度: %.2f%%" % similarity, "关键词: %s" % keywords_y) + # 相似度相加 + total_similarity += similarity + # 关键词收集 + total_keywords[x] = keywords_y + function_content = content_y + dup_function_content = content_x + + for word_y in keywords_y: + word_y = word_y.strip().strip("'").strip('"') + function_content = str(function_content.replace("\"", "'")).replace(word_y, + f'{word_y.strip()}') + for word_x in keywords_x: + word_x = word_x.strip().strip("'").strip('"') + dup_function_content = str(dup_function_content.replace("\"", "'")).replace(word_x, + f'{word_x.strip()}') + # 保存每个维度对应的相似度到idc_project_check_detail + mysql.sql_change_msg( + """insert into idc_project_check_detail (dup_id, dimension, similarity, function_content, dup_function_content, create_time, update_time) value (%d, "%s", %f, "%s", "%s", "%s", "%s")""" + % (dup_id, wdys2.get(x), similarity, escape_string(function_content), + escape_string(dup_function_content), str(datetime.datetime.now())[:-7], + str(datetime.datetime.now())[:-7])) + elif xc['jsnr'] == ' ' and str_dict['jsnr'] == ' ': + for x in list(xc.keys())[1:]: + content_x = xc.get(x) + content_y = str_dict.get(x) + if content_x and content_y: + if x == 'gnmk': + # 匹配到历史数据,次数加1 + # dup_count += dup_file_test + # 循环遍历每一个维度 + similarity = cosin_similarity.CosineSimilarity(content_x, content_y) + # 相似度 关键词 + similarity, keywords_x, keywords_y = similarity.main() + similarity = similarity * 50 + #print("**************相似度: %.2f%%" % similarity, "关键词: %s" % keywords_y) + # 相似度相加 + total_similarity += similarity + # 关键词收集 + total_keywords[x] = keywords_y + + function_content = content_y + dup_function_content = content_x + for word_y in keywords_y: + word_y = word_y.strip().strip("'").strip('"') + 
function_content = str(function_content.replace("\"", "'")).replace(word_y, + f'{word_y.strip()}') + for word_x in keywords_x: + word_x = word_x.strip().strip("'").strip('"') + dup_function_content = str(dup_function_content.replace("\"", "'")).replace(word_x, + f'{word_x.strip()}') + # 保存每个维度对应的相似度到idc_project_check_detail + mysql.sql_change_msg( + """insert into idc_project_check_detail (dup_id, dimension, similarity, function_content, dup_function_content, create_time, update_time) value (%d, "%s", %f, "%s", "%s", "%s", "%s")""" + % (dup_id, wdys2.get(x), similarity, escape_string(function_content), + escape_string(dup_function_content), str(datetime.datetime.now())[:-7], + str(datetime.datetime.now())[:-7])) + # content = content.replace(gjcs, f'{gjcs.strip()}') + elif x == 'jsnr': + similarity = cosin_similarity.CosineSimilarity(content_x, content_y) + # 相似度 关键词 + similarity, keywords_x, keywords_y = similarity.main() + similarity = similarity * 0 + #print("**************相似度: %.2f%%" % similarity, "关键词: %s" % keywords_y) + # 相似度相加 + total_similarity += similarity + # 关键词收集 + total_keywords[x] = keywords_y + function_content = content_y + dup_function_content = content_x + for word_y in keywords_y: + word_y = word_y.strip().strip("'").strip('"') + function_content = str(function_content.replace("\"", "'")).replace(word_y, + f'{word_y.strip()}') + for word_x in keywords_x: + word_x = word_x.strip().strip("'").strip('"') + dup_function_content = str(dup_function_content.replace("\"", "'")).replace(word_x, + f'{word_x.strip()}') + # 保存每个维度对应的相似度到idc_project_check_detail + mysql.sql_change_msg( + """insert into idc_project_check_detail (dup_id, dimension, similarity, function_content, dup_function_content, create_time, update_time) value (%d, "%s", %f, "%s", "%s", "%s", "%s")""" + % (dup_id, wdys2.get(x), similarity, escape_string(function_content), + escape_string(dup_function_content), str(datetime.datetime.now())[:-7], + str(datetime.datetime.now())[:-7])) + 
else: + similarity = cosin_similarity.CosineSimilarity(content_x, content_y) + # 相似度 关键词 + similarity, keywords_x, keywords_y = similarity.main() + similarity = similarity * (50 / dup_count) + #print("**************相似度: %.2f%%" % similarity, "关键词: %s" % keywords_y) + # 相似度相加 + total_similarity += similarity + # 关键词收集 + total_keywords[x] = keywords_y + function_content = content_y + dup_function_content = content_x + + for word_y in keywords_y: + word_y = word_y.strip().strip("'").strip('"') + function_content = str(function_content.replace("\"", "'")).replace(word_y, + f'{word_y.strip()}') + for word_x in keywords_x: + word_x = word_x.strip().strip("'").strip('"') + dup_function_content = str(dup_function_content.replace("\"", "'")).replace(word_x, + f'{word_x.strip()}') + # 保存每个维度对应的相似度到idc_project_check_detail + mysql.sql_change_msg( + """insert into idc_project_check_detail (dup_id, dimension, similarity, function_content, dup_function_content, create_time, update_time) value (%d, "%s", %f, "%s", "%s", "%s", "%s")""" + % (dup_id, wdys2.get(x), similarity, escape_string(function_content), + escape_string(dup_function_content), str(datetime.datetime.now())[:-7], + str(datetime.datetime.now())[:-7])) + else: + for x in list(xc.keys())[1:]: + content_x = xc.get(x) + content_y = str_dict.get(x) + if content_x and content_y: + if x == 'gnmk': + # 匹配到历史数据,次数加1 + # dup_count += dup_file_test + # 循环遍历每一个维度 + similarity = cosin_similarity.CosineSimilarity(content_x, content_y) + # 相似度 关键词 + similarity, keywords_x, keywords_y = similarity.main() + similarity = similarity * 50 + #print("**************相似度: %.2f%%" % similarity, "关键词: %s" % keywords_y) + # 相似度相加 + total_similarity += similarity + # 关键词收集 + total_keywords[x] = keywords_y + + function_content = content_y + dup_function_content = content_x + for word_y in keywords_y: + word_y = word_y.strip().strip("'").strip('"') + function_content = str(function_content.replace("\"", "'")).replace(word_y, + 
f'{word_y.strip()}') + for word_x in keywords_x: + word_x = word_x.strip().strip("'").strip('"') + dup_function_content = str(dup_function_content.replace("\"", "'")).replace(word_x, + f'{word_x.strip()}') + # 保存每个维度对应的相似度到idc_project_check_detail + mysql.sql_change_msg( + """insert into idc_project_check_detail (dup_id, dimension, similarity, function_content, dup_function_content, create_time, update_time) value (%d, "%s", %f, "%s", "%s", "%s", "%s")""" + % (dup_id, wdys2.get(x), similarity, escape_string(function_content), + escape_string(dup_function_content), str(datetime.datetime.now())[:-7], + str(datetime.datetime.now())[:-7])) + # content = content.replace(gjcs, f'{gjcs.strip()}') + elif x == 'jsnr': + similarity = cosin_similarity.CosineSimilarity(content_x, content_y) + # 相似度 关键词 + similarity, keywords_x, keywords_y = similarity.main() + similarity = similarity * 40 + #print("**************相似度: %.2f%%" % similarity, "关键词: %s" % keywords_y) + # 相似度相加 + total_similarity += similarity + # 关键词收集 + total_keywords[x] = keywords_y + function_content = content_y + dup_function_content = content_x + for word_y in keywords_y: + word_y = word_y.strip().strip("'").strip('"') + function_content = str(function_content.replace("\"", "'")).replace(word_y, + f'{word_y.strip()}') + for word_x in keywords_x: + word_x = word_x.strip().strip("'").strip('"') + dup_function_content = str(dup_function_content.replace("\"", "'")).replace(word_x, + f'{word_x.strip()}') + # 保存每个维度对应的相似度到idc_project_check_detail + mysql.sql_change_msg( + """insert into idc_project_check_detail (dup_id, dimension, similarity, function_content, dup_function_content, create_time, update_time) value (%d, "%s", %f, "%s", "%s", "%s", "%s")""" + % (dup_id, wdys2.get(x), similarity, escape_string(function_content), + escape_string(dup_function_content), str(datetime.datetime.now())[:-7], + str(datetime.datetime.now())[:-7])) + else: + similarity = cosin_similarity.CosineSimilarity(content_x, content_y) + # 
相似度 关键词 + similarity, keywords_x, keywords_y = similarity.main() + similarity = similarity * (10 / dup_count) + #print("**************相似度: %.2f%%" % similarity, "关键词: %s" % keywords_y) + # 相似度相加 + total_similarity += similarity + # 关键词收集 + total_keywords[x] = keywords_y + function_content = content_y + dup_function_content = content_x + + for word_y in keywords_y: + word_y = word_y.strip().strip("'").strip('"') + function_content = str(function_content.replace("\"", "'")).replace(word_y, + f'{word_y.strip()}') + for word_x in keywords_x: + word_x = word_x.strip().strip("'").strip('"') + dup_function_content = str(dup_function_content.replace("\"", "'")).replace(word_x, + f'{word_x.strip()}') + # 保存每个维度对应的相似度到idc_project_check_detail + mysql.sql_change_msg( + """insert into idc_project_check_detail (dup_id, dimension, similarity, function_content, dup_function_content, create_time, update_time) value (%d, "%s", %f, "%s", "%s", "%s", "%s")""" + % (dup_id, wdys2.get(x), similarity, escape_string(function_content), + escape_string(dup_function_content), str(datetime.datetime.now())[:-7], + str(datetime.datetime.now())[:-7])) + + mysql.sql_change_msg( + """update idc_project_check set similarity=%f where dup_id=%d""" % (total_similarity, dup_id)) + project_gjc = {} + for w in wdys2.keys(): + content_x = str_dict.get(w) + content_y = str_dict.get(w) + if content_x and content_y: + # 循环遍历每一个维度 + similarity = cosin_similarity.CosineSimilarity(content_x, content_y) + # 相似度 关键词 + similarity, keywords_x, keywords = similarity.main() + project_gjc[w] = keywords + mysql.sql_change_msg( + """insert into user_keyword (xmmc, xzwt, xtjc, xmmb, yqjx, jsxq, sjxq, aqxq, ywly, hxyw, ywxq, ywxt, jscj, yhfw, mbqt, jsnr, gnmk, sjgx, znys) value ("%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s")""" + % (dl[0], str(project_gjc.get("xzwt"))[1:-1] if project_gjc.get("xzwt") else None, + str(project_gjc.get("xtjc"))[1:-1] if 
project_gjc.get("xtjc") else None, + str(project_gjc.get("xmmb"))[1:-1] if project_gjc.get("xmmb") else None, + str(project_gjc.get("yqjx"))[1:-1] if project_gjc.get("yqjx") else None, + str(project_gjc.get("jsxq"))[1:-1] if project_gjc.get("jsxq") else None, + str(project_gjc.get("sjxq"))[1:-1] if project_gjc.get("sjxq") else None, + str(project_gjc.get("aqxq"))[1:-1] if project_gjc.get("aqxq") else None, + str(project_gjc.get("ywly"))[1:-1] if project_gjc.get("ywly") else None, + str(project_gjc.get("hxyw"))[1:-1] if project_gjc.get("hxyw") else None, + str(project_gjc.get("ywxq"))[1:-1] if project_gjc.get("ywxq") else None, + str(project_gjc.get("ywxt"))[1:-1] if project_gjc.get("ywxt") else None, + str(project_gjc.get("jscj"))[1:-1] if project_gjc.get("jscj") else None, + str(project_gjc.get("yhfw"))[1:-1] if project_gjc.get("yhfw") else None, + str(project_gjc.get("mbqt"))[1:-1] if project_gjc.get("mbqt") else None, + str(project_gjc.get("jsnr"))[1:-1] if project_gjc.get("jsnr") else None, + str(project_gjc.get("gnmk"))[1:-1] if project_gjc.get("gnmk") else None, + str(project_gjc.get("sjgx"))[1:-1] if project_gjc.get("sjgx") else None, + str(project_gjc.get("znys"))[1:-1] if project_gjc.get("znys") else None)) + mysql.sql_change_msg( + """update idc_project set dup_status=3, one_vote_veto_status=dup_file_test, self_check_status=dup_file_test, history_project_count=%d ,module_count=%d where project_id=%d""" % ( + xmnr_count, gnmk_count, dl[0])) + gong_neng_mo_kuai(mysql, dl, data, er_title) + + +if __name__ == "__main__": + # all_path = requests.get("http://127.0.0.1:19099/check/duplicates/%s" % 15).json() + # print(all_path) + + # data_list = [] + # for ap in all_path.get("data"): + # # if os.path.exists(ap.get("file_path")): + # data_list.append((ap.get("project_id"), ap.get("file_path"), ap.get("project_name"))) + # print(data_list) + data_list = [(11, r"C:\Users\HUAWEI\PycharmProjects\nlp\dup_check\0825-丽水系统查重维度1.xlsx", "水路运输综合监管系统建设项目.xls")] + 
def getFlag(path="0825-丽水系统查重维度.xlsx"):
    """Read the dimension/weight sheet and map dimension codes to weights.

    The second column of the workbook is scanned; every non-null cell is
    matched against two regular expressions, the first yielding a dimension
    name (translated to its code through ``wdys1``) and the second the
    percentage text stored as that dimension's weight.

    Args:
        path: Workbook to read. Defaults to the file name the original
            implementation hard-coded, so existing callers are unaffected.

    Returns:
        dict mapping dimension code -> captured percentage text. Cells that
        do not match, or that are not text, are skipped.
    """
    weights = {}
    df = pd.read_excel(path)
    # Column 1 holds the "<dimension>(<weight>%)"-style cells; drop blanks.
    cells = list(pd.Series(df.values[:, 1]).dropna())
    for cell in cells:
        try:
            # NOTE(review): as written the lazy first group always matches the
            # empty string, so ``wd`` is "" and nothing is ever stored --
            # confirm the intended pattern (possibly full-width parentheses
            # around the percentage).
            wd = re.search("(.*?)(.*?%)", cell).group(1).strip()
            wdc = wdys1.get(wd)
            if wdc:
                weights[wdc] = re.search(".*?((.*?%))", cell).group(1)
        except (AttributeError, TypeError):
            # AttributeError: no match (re.search returned None).
            # TypeError: the cell is not a string (e.g. a numeric cell).
            continue
    return weights
# (history/module key, weight applied to the cosine similarity, dimension label)
_GNMK_DIMENSIONS = (
    ("gnmc", 1, "功能名称"),
    ("gnms", 99, "功能模块描述"),
)


def _gnmk_now():
    """Return the current local time as ``YYYY-MM-DD HH:MM:SS``."""
    # strftime instead of str(now)[:-7]: slicing removes real digits whenever
    # the microsecond field happens to be 0 (str() then has no ".ffffff").
    return datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")


def _gnmk_sql_text(value):
    """Format *value* like ``"%s"`` would, escaped for a quoted SQL literal."""
    return escape_string(str(value))


def gong_neng_mo_kuai(mysql, dl, data, er_title):
    """Store a project's function modules and compare them with history.

    Steps:
      1. For every module name in ``er_title`` concatenate column 3 of all
         rows of ``data`` whose column 1 equals that name, and insert one
         ``idc_project_module`` row per module.
      2. Read the project's modules back and, for each one, compare its name
         (weight 1) and description (weight 99) against every row of
         ``user_history_module_data``; write one ``idc_project_module_check``
         row per pair, one ``idc_project_module_check_detail`` row per
         compared dimension, then store the summed similarity.
      3. Insert the module's extracted keywords into ``user_module_keywords``.

    Args:
        mysql: Object exposing ``sql_change_msg``, ``sql_select_many`` and
            ``cur.lastrowid``.
        dl: Indexable project descriptor; ``dl[0]`` is the project id and
            ``dl[2]`` the project name.
        data: Rows of the project sheet (indexable by column).
        er_title: Iterable of module names found in the sheet.
    """
    project_id = int(dl[0])
    project_name = _gnmk_sql_text(dl[2])

    # 1. Concatenate the description cells belonging to each module.
    module_text = {}
    for module_name in er_title:
        for row in data:
            if row[1] != module_name:
                continue
            if module_text.get(module_name):
                module_text[module_name] = module_text[module_name] + row[3]
            else:
                module_text[module_name] = row[3]

    for module_name, content in module_text.items():
        now = _gnmk_now()
        # Names/descriptions come straight from the spreadsheet, so they are
        # escaped before being placed inside the quoted literals.
        mysql.sql_change_msg(
            """insert into idc_project_module (project_id, check_duplicate_count, module_name, module_content, create_time, update_time, tag) value(%d, 1, "%s", "%s", "%s", "%s", "模块")"""
            % (project_id, _gnmk_sql_text(module_name), _gnmk_sql_text(content), now, now))

    stored_modules = mysql.sql_select_many(
        """select project_module_id, module_name, module_content from idc_project_module where project_id=%d"""
        % project_id)
    modules = [
        {
            "project_module_id": row.get("project_module_id"),
            "gnmc": row.get("module_name"),
            "gnms": row.get("module_content"),
        }
        for row in stored_modules
    ]

    # Nothing below writes to the history table, so it is read once rather
    # than once per module.
    history_modules = mysql.sql_select_many("""select * from user_history_module_data""")

    for module in modules:
        # 2. Pairwise comparison with every historical module.
        for history in history_modules or ():
            now = _gnmk_now()
            mysql.sql_change_msg(
                """insert into idc_project_module_check (project_module_id, module_name, project_name, company_name, create_time, update_time) value(%d, "%s", "%s", "%s", "%s", "%s")"""
                % (module.get("project_module_id"), _gnmk_sql_text(history.get("gnmc")),
                   _gnmk_sql_text(history.get("xmmc")), "", now, now))
            dup_module_id = mysql.cur.lastrowid

            total_similarity = 0
            for key, weight, label in _GNMK_DIMENSIONS:
                content_x = history.get(key)
                content_y = module.get(key)
                if not (content_x and content_y):
                    continue
                similarity, _, keywords = cosin_similarity.CosineSimilarity(content_x, content_y).main()
                similarity = similarity * weight
                print("######################相似度: %.2f%%" % similarity, "关键词: %s" % keywords)
                total_similarity += similarity
                now = _gnmk_now()
                mysql.sql_change_msg(
                    """insert into idc_project_module_check_detail (dup_module_id, project_name, module_content, dup_module_content, similarity, dimension, create_time, update_time) value (%d, "%s", "%s", "%s", %f, "%s", "%s", "%s")"""
                    % (dup_module_id, project_name, escape_string(content_y), escape_string(content_x),
                       similarity, label, now, now))

            mysql.sql_change_msg(
                """update idc_project_module_check set similarity=%f where dup_module_id=%d"""
                % (total_similarity, dup_module_id))

        # 3. Keywords of the module itself. extract_keyword() is what
        # CosineSimilarity.main() uses internally, so calling it directly
        # yields the same keyword list without a self-comparison.
        module_keywords = {}
        for key, _, _ in _GNMK_DIMENSIONS:
            text = module.get(key)
            if text:
                module_keywords[key] = cosin_similarity.CosineSimilarity.extract_keyword(text)[1]

        keyword_values = []
        for key, _, _ in _GNMK_DIMENSIONS:
            found = module_keywords.get(key)
            # str(list)[1:-1] keeps the historical "'a', 'b'" storage format;
            # a missing dimension is stored as the text "None", as before.
            keyword_values.append(_gnmk_sql_text(str(found)[1:-1] if found else None))
        mysql.sql_change_msg(
            """insert into user_module_keywords (xmmc, gnmc, gnms) value("%s", "%s", "%s")"""
            % (project_name, keyword_values[0], keyword_values[1]))
# Text columns of user_data / user_keyword in INSERT order (after xmmc).
_PC_TEXT_COLUMNS = (
    "xzwt", "xtjc", "xmmb", "yqjx", "jsxq", "sjxq", "aqxq", "ywly", "hxyw",
    "ywxq", "ywxt", "jscj", "yhfw", "mbqt", "jsnr", "gnmk", "sjgx", "znys",
)

# Dimensions with a fixed weight; every other matched dimension shares 10
# points equally (10 / number of such dimensions).
_PC_FIXED_WEIGHTS = {"gnmk": 50, "jsnr": 40}


def _pc_now():
    """Return the current local time as ``YYYY-MM-DD HH:MM:SS``."""
    # strftime instead of str(now)[:-7]: slicing removes real digits whenever
    # the microsecond field happens to be 0.
    return datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")


def _pc_sql_text(value):
    """Format *value* like ``"%s"`` would, escaped for a quoted SQL literal."""
    return escape_string(str(value))


def _pc_mark_keywords(text, keywords):
    """Normalise double quotes to single quotes and rewrite each keyword."""
    for word in keywords:
        word = word.strip().strip("'").strip('"')
        # NOTE(review): the replacement text is just the stripped keyword, so
        # apart from whitespace trimming this leaves the keyword unchanged --
        # confirm whether highlight markup was meant to wrap it.
        text = str(text.replace("\"", "'")).replace(word, f'{word.strip()}')
    return text


def _pc_read_dimensions(data):
    """Collect the text of each dimension and the set of module names.

    Returns:
        (str_dict, er_title): dimension code -> concatenated text, and the
        module names seen while the current title is "功能模块".
    """
    join_str = ""
    str_dict = {}
    title = ""
    er_title = set()
    for row in data:
        starts_dimension = pd.notnull(row[0])
        if starts_dimension:
            title = row[0]
            join_str = ""
        if title == "功能模块":
            er_title.add(row[1])
        for cell in row[1:]:
            if pd.notnull(cell):
                join_str += cell
        code = wdys1.get(title)
        if starts_dimension:
            str_dict[code] = join_str
        else:
            # NOTE(review): join_str already contains the earlier rows of this
            # dimension, so continuation rows repeat previous text -- kept
            # as-is because changing it alters every similarity score; confirm
            # whether plain concatenation was intended.
            str_dict[code] = str_dict.get(code) + join_str
    return str_dict, er_title


def _pc_compare_with_history(mysql, dl, xc, str_dict):
    """Compare one project with one historical row and persist the scores."""
    now = _pc_now()
    # dl[1] is a file path: escaping keeps its backslashes intact in MySQL.
    mysql.sql_change_msg(
        """insert into idc_project_check (project_id, dup_project_name, file_path, company_name, create_year, project_tag, project_range_tag, project_area, create_time, update_time) value ("%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s")"""
        % (_pc_sql_text(dl[0]), _pc_sql_text(xc.get("xmmc")), _pc_sql_text(dl[1]), "", "",
           "需求相似、业务相似", "历史项目", "", now, now))
    dup_id = mysql.cur.lastrowid

    # Skip the first column, then keep the dimensions present on both sides.
    matched = [x for x in list(xc.keys())[1:] if xc.get(x) and str_dict.get(x)]
    dup_count = sum(1 for x in matched if x not in _PC_FIXED_WEIGHTS)

    total_similarity = 0
    for x in matched:
        content_x = xc.get(x)
        content_y = str_dict.get(x)
        weight = _PC_FIXED_WEIGHTS.get(x)
        if weight is None:
            # x itself was counted, so dup_count >= 1 here.
            weight = 10 / dup_count
        similarity, keywords_x, keywords_y = cosin_similarity.CosineSimilarity(content_x, content_y).main()
        similarity = similarity * weight
        print("**************相似度: %.2f%%" % similarity, "关键词: %s" % keywords_y)
        total_similarity += similarity

        function_content = _pc_mark_keywords(content_y, keywords_y)
        dup_function_content = _pc_mark_keywords(content_x, keywords_x)
        now = _pc_now()
        mysql.sql_change_msg(
            """insert into idc_project_check_detail (dup_id, dimension, similarity, function_content, dup_function_content, create_time, update_time) value (%d, "%s", %f, "%s", "%s", "%s", "%s")"""
            % (dup_id, wdys2.get(x), similarity, escape_string(function_content),
               escape_string(dup_function_content), now, now))

    mysql.sql_change_msg(
        """update idc_project_check set similarity=%f where dup_id=%d""" % (total_similarity, dup_id))


def project_check(data_list):
    """Run the duplicate check for every project in ``data_list``.

    For each project the Excel sheet is read, its dimension texts are stored
    in ``user_data``, compared against every row of ``user_history_data``
    (per-dimension details plus a summed score), its keywords are stored in
    ``user_keyword``, the ``idc_project`` row is updated, and finally the
    function modules are processed by ``gong_neng_mo_kuai``.

    Args:
        data_list: Iterable of indexable project descriptors where ``[0]`` is
            the project id, ``[1]`` the Excel path and ``[2]`` the project
            name.
    """
    mysql = mysql_pool.ConnMysql()
    try:
        xmnr_count = len(mysql.sql_select_many("""select * from user_history_data"""))
        gnmk_count = len(mysql.sql_select_many("""select * from user_history_module_data"""))
        # NOTE(review): the returned weights are never used below; the call is
        # kept only so the function still fails the same way when the weight
        # workbook is missing -- consider removing it.
        getFlag()

        for dl in data_list:
            data = pd.read_excel(dl[1]).values
            str_dict, er_title = _pc_read_dimensions(data)

            # Missing dimensions are stored as the text "None", as before;
            # present ones are escaped because they are raw spreadsheet text.
            user_data_values = [_pc_sql_text(dl[0])]
            user_data_values.extend(
                _pc_sql_text(str_dict.get(column) or None) for column in _PC_TEXT_COLUMNS)
            mysql.sql_change_msg(
                """insert into user_data (xmmc, xzwt, xtjc, xmmb, yqjx, jsxq, sjxq, aqxq, ywly, hxyw, ywxq, ywxt, jscj, yhfw, mbqt, jsnr, gnmk, sjgx, znys) value ("%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s")"""
                % tuple(user_data_values))

            history_rows = mysql.sql_select_many("""select * from user_history_data""")
            for xc in history_rows or ():
                _pc_compare_with_history(mysql, dl, xc, str_dict)

            # Keywords of the project itself. extract_keyword() is what
            # CosineSimilarity.main() uses internally, so this yields the same
            # list without comparing a text with itself.
            project_gjc = {}
            for w in wdys2.keys():
                content = str_dict.get(w)
                if content:
                    project_gjc[w] = cosin_similarity.CosineSimilarity.extract_keyword(content)[1]

            keyword_values = [_pc_sql_text(dl[0])]
            for column in _PC_TEXT_COLUMNS:
                found = project_gjc.get(column)
                # str(list)[1:-1] keeps the historical "'a', 'b'" format.
                keyword_values.append(_pc_sql_text(str(found)[1:-1] if found else None))
            mysql.sql_change_msg(
                """insert into user_keyword (xmmc, xzwt, xtjc, xmmb, yqjx, jsxq, sjxq, aqxq, ywly, hxyw, ywxq, ywxt, jscj, yhfw, mbqt, jsnr, gnmk, sjgx, znys) value ("%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s")"""
                % tuple(keyword_values))

            mysql.sql_change_msg(
                """update idc_project set dup_status=3, one_vote_veto_status=1, self_check_status=1, history_project_count=%d ,module_count=%d where project_id=%d""" % (
                    xmnr_count, gnmk_count, dl[0]))
            gong_neng_mo_kuai(mysql, dl, data, er_title)
    finally:
        # Close the cursor and connection even when a step fails.
        mysql.release()
class ConnMysql(object):
    """One pooled MySQL connection plus a dictionary cursor.

    Every instance borrows a connection from a single ``PooledDB`` shared by
    the whole process and opens a ``DictCursor`` on it. Call ``release()``
    when done.
    """

    # Shared PooledDB instance, created on first use by _get_mysql_conn().
    __pool = None

    def __init__(self):
        # Borrow a connection and open a cursor that returns rows as dicts.
        self.coon = ConnMysql._get_mysql_conn()
        self.cur = self.coon.cursor(cursor=pymysql.cursors.DictCursor)

    @staticmethod
    def _get_mysql_conn():
        """Return a connection from the shared pool, building the pool once.

        The pool is stored on the class attribute that is also tested here;
        previously it was assigned to a separate module global, so the test
        was always true and a new pool was created for every instance.
        """
        if ConnMysql.__pool is None:
            ConnMysql.__pool = PooledDB(
                creator=pymysql,
                mincached=1,
                maxcached=5,
                maxconnections=6,
                maxshared=3,
                blocking=True,
                maxusage=None,
                setsession=[],
                ping=2,
                host=mysqlInfo['host'],
                user=mysqlInfo['user'],
                passwd=mysqlInfo['passwd'],
                db=mysqlInfo['db'],
                port=mysqlInfo['port'],
                charset=mysqlInfo['charset'])
        return ConnMysql.__pool.connection()

    def sql_change_msg(self, sql, args=None):
        """Execute an INSERT/UPDATE/DELETE statement and commit.

        Args:
            sql: Statement text.
            args: Optional parameters passed to ``cursor.execute`` so values
                can be bound by the driver instead of being formatted into
                ``sql`` by the caller.

        Returns:
            Whatever ``cursor.execute`` returns.
        """
        affected = self.cur.execute(sql, args)
        self.coon.commit()
        return affected

    def sql_select_one(self, sql, args=None):
        """Execute a query and return the first row via ``fetchone()``."""
        self.cur.execute(sql, args)
        return self.cur.fetchone()

    def sql_select_many(self, sql, count=None, args=None):
        """Execute a query and return its rows.

        Args:
            sql: Query text.
            count: When given, at most this many rows are fetched with
                ``fetchmany``; otherwise all rows are fetched.
            args: Optional parameters passed to ``cursor.execute``.
        """
        self.cur.execute(sql, args)
        if count is None:
            return self.cur.fetchall()
        return self.cur.fetchmany(count)

    def release(self):
        """Close the cursor, then the connection."""
        # Cursor first: it belongs to the connection. ``finally`` guarantees
        # the connection is closed even if closing the cursor fails.
        try:
            self.cur.close()
        finally:
            self.coon.close()
idc_project (project_name,file_path) value ("%s", "%s")""" % ("森林火险", "/opt/idc/file/20220924/79a53829-8965-4aof-a342-c532f6c9c2a3森林火险.xlsx")) + # print(mysql.sql_select_many("""select * from gjc""")) + # print(mysql.sql_select_many("""select * from gjc2 where id=dup_file_test""")) + # print(mysql.sql_select_many("""select * from xmnr""")) + # print(mysql.sql_select_many("""select * from gjc_copy1""")) + # print(mysql.sql_select_one("""select * from idc_project_check""")) + # print(mysql.sql_select_one("""select * from idc_project_check_detail""")) + # print(mysql.sql_select_many("""select * from idc_project_module""")) + # print(mysql.sql_select_many("""select * from idc_project_module where project_id=%d""" % int(7))) + # print( mysql.sql_select_one("""select dup_id from idc_project_check where project_id=%d"""% int(7))) + # print(len(mysql.sql_select_many("""select * from xmnr_copy1"""))) + # print(len(mysql.sql_select_many("""select * from user_history_data"""))) + print(len(mysql.sql_select_many("""select * from user_history_data"""))) + + """查重复select * from user_history_module_data where gnms in (select gnms from user_history_module_data group by gnms having count(gnms)>1); +""" + # print() + # str_dict={} + # cmnr_count=551 + # gnmkcount=1192 + # + # print(mysql.sql_change_msg( + # """update idc_project set company_name=%s, dup_status=3, one_vote_veto_status=dup_file_test, self_check_status=dup_file_test, history_project_count=%d ,module_count=%d where project_id=%d""" % ( + # str_dict.get('sbdw'), xmnr_count=551, gnmk_count=1192, 104))) + # print(mysql.sql_change_msg( + # """update idc_project set dup_status=3, one_vote_veto_status=dup_file_test, self_check_status=dup_file_test, history_project_count=%d ,module_count=%d where project_id=%d""" % ( + # ) + # for k, v in mysql.sql_select_one("""select * from idc_project_check_detail""").items(): + # print(k, v) \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 
index 0000000..bea2beb --- /dev/null +++ b/requirements.txt @@ -0,0 +1,41 @@ +certifi==2022.6.15 +cffi==1.15.1 +chardet==5.0.0 +charset-normalizer==2.0.12 +click==8.0.4 +colorama==0.4.5 +cryptography==3.4.7 +dataclasses==0.8 +DBUtils==3.0.2 +et-xmlfile==1.1.0 +Flask==1.0.2 +idna==3.3 +importlib-metadata==4.8.3 +itsdangerous==2.0.1 +jieba==0.42.1 +Jinja2==3.0.3 +joblib==1.1.0 +MarkupSafe==2.0.1 +numpy==1.19.5 +openpyxl==3.0.10 +pandas==1.1.5 +pdfminer.six==20211012 +pdfplumber==0.6.0 +Pillow==8.4.0 +pycparser==2.21 +PyMySQL==0.10.1 +pypiwin32==223 +python-dateutil==2.8.2 +pytz==2022.2.1 +pywin32==304 +requests==2.27.1 +scikit-learn==0.24.2 +scipy==1.5.4 +six==1.16.0 +threadpoolctl==3.1.0 +typing_extensions==4.1.1 +urllib3==1.26.12 +Wand==0.6.10 +Werkzeug==2.0.3 +xlrd==1.2.0 +zipp==3.6.0 diff --git a/水路运输综合监管系统建设项目.xls b/水路运输综合监管系统建设项目.xls new file mode 100644 index 0000000..c3481ca Binary files /dev/null and b/水路运输综合监管系统建设项目.xls differ