Browse Source

init project

master
xlt-evil 1 year ago
commit
d9ffa090a9
12 changed files with 1531 additions and 0 deletions
  1. BIN
      0825-丽水系统查重维度1.xlsx
  2. BIN
      __pycache__/cosin_similarity.cpython-36.pyc
  3. BIN
      __pycache__/main1.cpython-36.pyc
  4. BIN
      __pycache__/mysql_pool.cpython-36.pyc
  5. +84
    -0
      cosin_similarity.py
  6. +42
    -0
      flask_server.py
  7. +283
    -0
      insert_history_data_total.py
  8. +577
    -0
      main1.py
  9. +391
    -0
      main10.py
  10. +113
    -0
      mysql_pool.py
  11. +41
    -0
      requirements.txt
  12. BIN
      水路运输综合监管系统建设项目.xls

BIN
0825-丽水系统查重维度1.xlsx View File


BIN
__pycache__/cosin_similarity.cpython-36.pyc View File


BIN
__pycache__/main1.cpython-36.pyc View File


BIN
__pycache__/mysql_pool.cpython-36.pyc View File


+ 84
- 0
cosin_similarity.py View File

@@ -0,0 +1,84 @@
# coding=utf-8
import re
import html
import jieba
import jieba.analyse
from sklearn.metrics.pairwise import cosine_similarity


class CosineSimilarity(object):
    """Cosine similarity between the keyword vectors of two texts."""

    def __init__(self, content_x1, content_y2):
        """Store the two raw texts that ``main`` will compare."""
        self.s1 = content_x1
        self.s2 = content_y2

    @staticmethod
    def extract_keyword(content):
        """Tokenise ``content`` and extract its keywords.

        HTML tags are replaced by spaces and HTML entities are unescaped
        before the text is segmented with jieba in full mode.

        Returns:
            A ``(seg, keywords)`` tuple: the non-empty tokens, and up to 200
            keywords returned by ``jieba.analyse.extract_tags`` restricted to
            the POS tags ``n``, ``nr`` and ``ns``.
        """
        # Drop <style>...</style> blocks and every remaining HTML tag.
        tag_pattern = re.compile(r'(<style>.*?</style>)|(<[^>]+>)', re.S)
        plain_text = html.unescape(tag_pattern.sub(' ', content))

        seg = []
        for token in jieba.cut(plain_text, cut_all=True):
            if token != '':
                seg.append(token)

        keywords = jieba.analyse.extract_tags(
            "|".join(seg), topK=200, withWeight=False, allowPOS=('n', 'nr', 'ns'))
        return seg, keywords

    @staticmethod
    def one_hot(word_dict, keywords):
        """Return a count vector for ``keywords``.

        ``word_dict`` maps each word to its position in the vector; every
        occurrence of a word in ``keywords`` increments that position.
        """
        counts = [0] * len(word_dict)
        for position in (word_dict[word] for word in keywords):
            counts[position] += 1
        return counts

    def main(self):
        """Compare the two stored texts.

        Returns:
            ``(similarity, keywords1, keywords2)``. ``similarity`` is the
            cosine similarity of the two keyword count vectors, or ``0.0``
            when the computation raises (the error is printed).
        """
        _, keywords1 = self.extract_keyword(self.s1)
        _, keywords2 = self.extract_keyword(self.s2)

        # Shared vocabulary: every keyword of either text gets one index.
        union = set(keywords1).union(set(keywords2))
        word_dict = {word: index for index, word in enumerate(union)}

        sample = [
            self.one_hot(word_dict, keywords1),
            self.one_hot(word_dict, keywords2),
        ]
        # The computation may raise (the original comment mentions division
        # by zero); fall back to a similarity of 0.0 in that case.
        try:
            sim = cosine_similarity(sample)
            return sim[1][0], keywords1, keywords2
        except Exception as e:
            print(e)
            return 0.0, keywords1, keywords2


# Manual smoke test: compare two local text files.
if __name__ == '__main__':
    with open(r'D:\pythonDM\Ndkj\live111\result\1.txt', encoding='UTF-8') as x, open(r'D:\pythonDM\Ndkj\live111\result\2.txt', encoding='UTF-8') as y:
        content_x = x.read()
        content_y = y.read()
    result = CosineSimilarity(content_x, content_y).main()
    print(result)
    # main() returns (similarity, keywords1, keywords2); only the score can be
    # formatted with %.2f, so take it out of the tuple first.
    similarity = result[0]
    # NOTE(review): the factor 32 is kept from the original; a plain
    # percentage would use 100 - confirm the intended scale.
    print('相似度: %.2f%%' % (similarity*32))

+ 42
- 0
flask_server.py View File

@@ -0,0 +1,42 @@
# coding=utf-8
from flask import Flask, redirect, url_for, request
import sys
from flask import jsonify
import mysql_pool
import main1
import cosin_similarity

# import xm
# from xm import xsd

# Flask application object; the route handler in this file is registered on it.
app = Flask(__name__)


# mysql = mysql_pool.ConnMysql()


# Original note: "returns the save location of the excel file".
@app.route('/check/duplicates/<projectId>')
def success(projectId):
    """Run the duplicate check for one project, or for all projects.

    Args:
        projectId: URL path segment. ``0`` selects every row of
            ``idc_project``; any other integer selects the row(s) with that
            ``project_id``.

    Returns:
        JSON ``{"code": 0, "data": <selected rows>}`` after
        ``main1.project_check`` has processed the selected projects, or
        ``{"code": 1, "msg": ...}`` with HTTP 400 when ``projectId`` is not
        an integer.
    """
    # Validate once up front: the value comes straight from the URL, so it
    # must never be interpolated into SQL as raw text.
    try:
        project_id = int(projectId)
    except ValueError:
        return jsonify({"code": 1, "msg": "projectId must be an integer"}), 400

    mysql = mysql_pool.ConnMysql()
    try:
        if project_id == 0:
            data = mysql.sql_select_many("""select * from idc_project""")
        else:
            data = mysql.sql_select_many(
                """select * from idc_project where project_id=%d""" % project_id)
        print(data)
        # (project_id, file_path, project_name) tuples consumed by project_check.
        data_list = [
            (ap.get("project_id"), ap.get("file_path"), ap.get("project_name"))
            for ap in data
        ]
    finally:
        # Release the connection even when the query or the loop fails.
        mysql.release()

    main1.project_check(data_list)

    return jsonify({"code": 0, "data": data})


# Script entry point. The service takes rows from the idc_project table
# (for example project_id=11) and uses each row's file_path to find the data
# it has to process.
if __name__ == '__main__':
    # Alternative that listens on every interface:
    # app.run(host="0.0.0.0", port=19099)
    listen_port = 19099
    app.run(port=listen_port)

+ 283
- 0
insert_history_data_total.py View File

@@ -0,0 +1,283 @@
# coding=utf-8

import re
import mysql_pool
from pymysql.converters import escape_string
import cosin_similarity
import pandas as pd
import datetime
import requests
import os

# Dimension title as written in the spreadsheet -> short code used as the
# dictionary key / table column for that dimension.
wdys1 = dict([
    ("项目名称", "xmmc"),
    ("现状问题", "xzwt"),
    ("系统基础", "xtjc"),
    ("项目目标", "xmmb"),
    ("预期绩效", "yqjx"),
    ("建设需求", "jsxq"),
    ("数据需求", "sjxq"),
    ("安全需求", "aqxq"),
    ("业务领域", "ywly"),
    ("核心业务", "hxyw"),
    ("业务需求", "ywxq"),
    ("业务协同", "ywxt"),
    ("建设层级", "jscj"),
    ("用户范围", "yhfw"),
    ("目标群体", "mbqt"),
    ("建设内容", "jsnr"),
    ("功能模块", "gnmk"),
    ("数据共享", "sjgx"),
    ("智能要素", "znys"),
    ("申报单位", "sbdw"),
    ("所属地区", "ssdq"),
    ("预算年度", "ysnd"),
])
# Reverse lookup (short code -> dimension title), in the same order as wdys1.
wdys2 = {code: title for title, code in wdys1.items()}
# Short codes of the two per-module fields and their titles.
gnmkys = dict(gnmc="功能名称", gnms="功能描述")


def getFlag():
    """Read the dimension weights from the dimension spreadsheet.

    The second column of ``0825-丽水系统查重维度.xlsx`` is scanned for cells of
    the form ``<dimension title>(<weight>%)``. Cells that are not strings, do
    not match, or name a title missing from ``wdys1`` are ignored.

    Returns:
        dict mapping the dimension's short code (from ``wdys1``) to the
        weight text, e.g. ``"5%"``.
    """
    # One pattern captures both parts. Either ASCII or full-width parentheses
    # are accepted around the weight. With the previous unanchored lazy
    # groups the title group was always empty, so no weight was ever stored.
    weight_pattern = re.compile(r"(.*?)[(（]\s*(.*?%)\s*[)）]")

    data_dict = {}
    df = pd.read_excel("0825-丽水系统查重维度.xlsx")
    cells = pd.Series(df.values[:, 1]).dropna()
    for cell in cells:
        # Explicit checks replace the former bare ``except: pass``.
        if not isinstance(cell, str):
            continue
        match = weight_pattern.search(cell)
        if match is None:
            continue
        wdc = wdys1.get(match.group(1).strip())
        if wdc:
            data_dict[wdc] = match.group(2)
    return data_dict


def gong_neng_mo_kuai(xmmc, mysql, dl, data, er_title):
    """Store one history row per functional module of a project.

    Args:
        xmmc: project name written to the ``xmmc`` column.
        mysql: connection wrapper exposing ``sql_change_msg(sql)``.
        dl: the data_list entry being processed (not used here; kept for
            interface compatibility).
        data: spreadsheet rows; column 1 holds the module name and column 3
            the description text.
        er_title: set of module names collected by the caller.

    For every module name, the column-3 cells of all rows carrying that name
    are concatenated and inserted into ``user_history_module_data_total``.
    """
    str_dict = {}
    for et in er_title:
        matched_cells = [d[3] for d in data if d[1] == et]
        if matched_cells:
            # Skip empty (NaN) cells and stringify the rest, so a missing or
            # numeric description no longer raises on concatenation.
            str_dict[et] = "".join(
                str(cell) for cell in matched_cells if pd.notnull(cell))

    for k, v in str_dict.items():
        mysql.sql_change_msg("""insert into user_history_module_data_total(xmmc,gnmc,gnms) value("%s", "%s", "%s")""" % (
            escape_string(xmmc), escape_string(str(k)), escape_string(v)))


def project_check(data_list):
    """Import history projects from spreadsheets into the history tables.

    Args:
        data_list: iterable of tuples whose element 1 is the path of an
            Excel file. The whole tuple is also passed on to
            ``gong_neng_mo_kuai``.

    For each file the second column header is taken as the project name
    (with the feasibility-report suffixes removed), the text of every
    dimension is concatenated, one row is inserted into
    ``user_history_data_total``, and the per-module rows are stored through
    ``gong_neng_mo_kuai``.
    """
    # Columns that follow xmmc, in the order of the SQL placeholders below.
    columns = (
        "xzwt", "xtjc", "xmmb", "yqjx", "jsxq", "sjxq", "aqxq",
        "ywly", "hxyw", "ywxq", "ywxt", "jscj", "yhfw", "mbqt",
        "jsnr", "gnmk", "sjgx", "znys", "sbdw", "ssdq", "ysnd",
    )
    mysql = mysql_pool.ConnMysql()
    try:
        for dl in data_list:
            print(dl)
            df = pd.read_excel(dl[1])
            xmmc = df.keys()[1]
            # The former `"a" or "b" in xmmc` test was always true; replacing
            # unconditionally is equivalent and a no-op when a suffix is absent.
            for suffix in ('可研报告', '可研性报告', '可行性研究报告'):
                xmmc = xmmc.replace(suffix, '')
            data = df.values

            join_str = ""
            str_dict = {}
            title = ""
            er_title = set()
            gnmk_str = []
            for d in data:
                starts_dimension = pd.notnull(d[0])
                if starts_dimension:
                    # A non-empty first cell opens a new dimension.
                    title = d[0]
                    join_str = ""
                    cells = d[1:]
                else:
                    # Continuation row: only columns 3+ carry text.
                    cells = d[3:]
                in_module_section = title == "功能模块"
                if in_module_section:
                    er_title.add(d[1])
                for i in cells:
                    if not pd.notnull(i):
                        continue
                    join_str += str(i)
                    if in_module_section:
                        # The header cell '功能描述' of the opening row is not
                        # part of the module text.
                        if starts_dimension and i == '功能描述':
                            continue
                        gnmk_str.append(str(i))
                key = wdys1.get(title)
                if starts_dimension:
                    str_dict[key] = join_str
                else:
                    # NOTE(review): join_str is not reset on continuation
                    # rows, so earlier text is appended again. Kept as-is
                    # because the comparison side builds its text the same
                    # way - confirm before changing.
                    # Defaulting to "" avoids None + str when the sheet
                    # starts with a row that has no title.
                    str_dict[key] = str_dict.get(key, "") + join_str
            # The module dimension is rebuilt from the collected cells, which
            # overrides the accumulated value above.
            str_dict['gnmk'] = "".join(gnmk_str)

            # NOTE(review): missing dimensions are formatted from None and so
            # stored as the text "None" (original behaviour, preserved).
            values = [escape_string(xmmc)]
            for column in columns:
                text = str_dict.get(column)
                values.append(escape_string(text) if text else None)
            mysql.sql_change_msg(
                """insert into user_history_data_total (xmmc, xzwt, xtjc, xmmb, yqjx, jsxq, sjxq, aqxq, ywly, hxyw, ywxq, ywxt, jscj, yhfw, mbqt, jsnr, gnmk, sjgx, znys,sbdw,ssdq,ysnd) value ("%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s","%s","%s","%s")"""
                % tuple(values))

            gong_neng_mo_kuai(xmmc, mysql, dl, data, er_title)
    finally:
        # Hand the connection back even when a file fails to import.
        mysql.release()


# Batch import: feed every file of the source directory to project_check,
# one file per call.
if __name__ == "__main__":
    source_dir = r"D:\dup_file_test"
    file_names = os.listdir(source_dir)
    print(len(file_names))
    for file_name in file_names:
        # project_check expects (project_id, file_path, project_name) tuples.
        project_check([(0, source_dir + '\\' + file_name, "")])
        print("已存入************************************* %s" % file_name)

"""
建设目标,业务功能

gnmk_str = []
for d in data:
if pd.notnull(d[0]):
title = d[0]
if title == "功能模块":
er_title.add(d[dup_file_test])
join_str = ""
for i in d[dup_file_test:]:
if pd.notnull(i):
join_str += i
if title == "功能模块":
gnmk_str.append(i)
str_dict[wdys1.get(title)] = join_str
else:
if title == "功能模块":
er_title.add(d[dup_file_test])
for i in d[dup_file_test:]:
if pd.notnull(i):
join_str += i
if title == "功能模块":
gnmk_str.append(i)
str_dict[wdys1.get(title)] = str_dict.get(wdys1.get(title)) + join_str
gnmk = "".join(gnmk_str)


"""

+ 577
- 0
main1.py View File

@@ -0,0 +1,577 @@
# coding=utf-8
import sys
import re
import mysql_pool
from pymysql.converters import escape_string
import cosin_similarity
import pandas as pd
import datetime
import requests
import os
import pymysql

# Dimension title as written in the spreadsheet -> short code used as the
# dictionary key / table column for that dimension.
wdys1 = dict([
    ("项目名称", "xmmc"),
    ("现状问题", "xzwt"),
    ("系统基础", "xtjc"),
    ("项目目标", "xmmb"),
    ("预期绩效", "yqjx"),
    ("建设需求", "jsxq"),
    ("数据需求", "sjxq"),
    ("安全需求", "aqxq"),
    ("业务领域", "ywly"),
    ("核心业务", "hxyw"),
    ("业务需求", "ywxq"),
    ("业务协同", "ywxt"),
    ("建设层级", "jscj"),
    ("用户范围", "yhfw"),
    ("目标群体", "mbqt"),
    ("建设内容", "jsnr"),
    ("功能模块", "gnmk"),
    ("数据共享", "sjgx"),
    ("智能要素", "znys"),
])
# Reverse lookup (short code -> dimension title), in the same order as wdys1.
wdys2 = {code: title for title, code in wdys1.items()}
# Short codes of the two per-module fields and their titles.
gnmkys = dict(gnmc="功能名称", gnms="功能描述")


def getFlag():
    """Read the dimension weights from the dimension spreadsheet.

    The second column of ``0825-丽水系统查重维度.xlsx`` is scanned for cells of
    the form ``<dimension title>(<weight>%)``. Cells that are not strings, do
    not match, or name a title missing from ``wdys1`` are ignored.

    Returns:
        dict mapping the dimension's short code (from ``wdys1``) to the
        weight text, e.g. ``"5%"``.
    """
    # One pattern captures both parts. Either ASCII or full-width parentheses
    # are accepted around the weight. With the previous unanchored lazy
    # groups the title group was always empty, so no weight was ever stored.
    weight_pattern = re.compile(r"(.*?)[(（]\s*(.*?%)\s*[)）]")

    data_dict = {}
    df = pd.read_excel("0825-丽水系统查重维度.xlsx")
    cells = pd.Series(df.values[:, 1]).dropna()
    for cell in cells:
        # Explicit checks replace the former bare ``except: pass``.
        if not isinstance(cell, str):
            continue
        match = weight_pattern.search(cell)
        if match is None:
            continue
        wdc = wdys1.get(match.group(1).strip())
        if wdc:
            data_dict[wdc] = match.group(2)
    return data_dict


def gong_neng_mo_kuai(mysql, dl, data, er_title):
    """Store a project's functional modules and compare them with history.

    Args:
        mysql: connection wrapper exposing ``sql_change_msg``,
            ``sql_select_many`` and ``cur.lastrowid``.
        dl: ``(project_id, file_path, project_name)`` tuple of the project.
        data: spreadsheet rows; column 1 holds the module name and column 3
            the description text.
        er_title: set of module names collected by the caller.

    Steps:
        1. Concatenate the description cells of each module and insert one
           row per module into ``idc_project_module``.
        2. Re-read all modules of the project and, for each of them, compare
           name and description with every row of
           ``user_history_module_data``. One ``idc_project_module_check`` row
           is written per pair, one ``idc_project_module_check_detail`` row
           per compared field, and the summed weighted similarity is stored
           on the check row.
        3. Insert the module's own keywords into ``user_module_keywords``.
    """
    def timestamp():
        # strftime always yields "YYYY-MM-DD HH:MM:SS"; slicing str(now) by
        # [:-7] cuts into the seconds when the microsecond part is 0.
        return datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    def esc(value):
        # str() keeps the previous "%s" rendering (None -> "None") while
        # making quotes in names safe inside the SQL string literal.
        return escape_string(str(value))

    project_id = int(dl[0])
    # (field, weight, dimension label): name counts 1, description 99.
    dimensions = (("gnmc", 1, "功能名称"), ("gnms", 99, "功能模块描述"))

    # Concatenate all description cells per module name.
    str_dict = {}
    for et in er_title:
        matched_cells = [d[3] for d in data if d[1] == et]
        if matched_cells:
            # Skip empty (NaN) cells and stringify the rest, so a missing or
            # numeric description no longer raises on concatenation.
            str_dict[et] = "".join(
                str(cell) for cell in matched_cells if pd.notnull(cell))

    for k, v in str_dict.items():
        stamp = timestamp()
        # check_duplicate_count is written as the literal 1; the statement
        # previously contained the bare identifier dup_file_test there, which
        # is not valid SQL for this insert.
        mysql.sql_change_msg(
            """insert into idc_project_module (project_id, check_duplicate_count, module_name, module_content, create_time, update_time, tag) value(%d, 1, "%s", "%s", "%s", "%s", "模块")""" % (
                project_id, esc(k), esc(v), stamp, stamp))

    module_id_list = mysql.sql_select_many(
        """select project_module_id, module_name, module_content from idc_project_module where project_id=%d""" % project_id)
    data_list = [
        {
            "project_module_id": mil.get("project_module_id"),
            "gnmc": mil.get("module_name"),
            "gnms": mil.get("module_content"),
        }
        for mil in module_id_list
    ]

    # Nothing below writes to user_history_module_data, so read it once
    # instead of once per module.
    gnmk_copy1 = mysql.sql_select_many("""select * from user_history_module_data""") if data_list else None

    for i in data_list:
        for gc in gnmk_copy1 or ():
            stamp = timestamp()
            mysql.sql_change_msg(
                """insert into idc_project_module_check (project_module_id, module_name, project_name, company_name, create_time, update_time) value(%d, "%s", "%s", "%s", "%s", "%s")"""
                % (i.get("project_module_id"), esc(gc.get("gnmc")), esc(gc.get("xmmc")), "",
                   stamp, stamp))
            dup_module_id = mysql.cur.lastrowid

            total_similarity = 0
            for field, weight, label in dimensions:
                content_x = gc.get(field)
                content_y = i.get(field)
                if not (content_x and content_y):
                    continue
                similarity, _, _ = cosin_similarity.CosineSimilarity(content_x, content_y).main()
                similarity = similarity * weight
                total_similarity += similarity
                stamp = timestamp()
                mysql.sql_change_msg(
                    """insert into idc_project_module_check_detail (dup_module_id, project_name, module_content, dup_module_content, similarity, dimension, create_time, update_time) value (%d, "%s", "%s", "%s", %f, "%s", "%s", "%s")"""
                    % (dup_module_id, esc(dl[2]), escape_string(content_y), escape_string(content_x), similarity,
                       label, stamp, stamp))
            mysql.sql_change_msg("""update idc_project_module_check set similarity=%f where dup_module_id=%d""" % (
                total_similarity, dup_module_id))

        # Keywords of the module itself. extract_keyword is what main()
        # returns for its second text, so no self-comparison is needed.
        gnmk_gjc = {}
        for field in ("gnmc", "gnms"):
            content = i.get(field)
            if content:
                gnmk_gjc[field] = cosin_similarity.CosineSimilarity.extract_keyword(content)[1]
        mysql.sql_change_msg("""insert into user_module_keywords (xmmc, gnmc, gnms) value("%s", "%s", "%s")""" % (
            esc(dl[2]), str(gnmk_gjc.get("gnmc"))[1:-1] if gnmk_gjc.get("gnmc") else None,
            str(gnmk_gjc.get("gnms"))[1:-1] if gnmk_gjc.get("gnms") else None))


def project_check(data_list):
mysql = mysql_pool.ConnMysql()
# mysql.sql_select_many("""select * from mkgjc""")
# 读取维度和权重
# xmnr_count = len(mysql.sql_select_many("""select * from xmnr_copy1"""))
# gnmk_count = len(mysql.sql_select_many("""select * from gnmk_copy1"""))
xmnr_count = len(mysql.sql_select_many("""select * from user_history_data"""))
gnmk_count = len(mysql.sql_select_many("""select * from user_history_module_data"""))
get_data_dict = getFlag()
# 遍历excel存储路径
for dl in data_list:
# path = "0825-丽水系统查重维度1.xlsx"
# 读取路径下的excel
print(dl,dl[1])
df = pd.read_excel(dl[1])
data = df.values
# 将excel文件中的所有维度内容进行拼接
join_str = ""
str_dict = {}
title = ""
er_title = set()
for d in data:
if pd.notnull(d[0]):
title = d[0]
if title == "功能模块":
er_title.add(d[1])
join_str = ""
for i in d[1:]:
if pd.notnull(i):
join_str += i
str_dict[wdys1.get(title)] = join_str
else:
if title == "功能模块":
er_title.add(d[1])
for i in d[1:]:
if pd.notnull(i):
join_str += i
str_dict[wdys1.get(title)] = str_dict.get(wdys1.get(title)) + join_str
print(str_dict)
mysql.sql_change_msg(
"""insert into user_data (xmmc, xzwt, xtjc, xmmb, yqjx, jsxq, sjxq, aqxq, ywly, hxyw, ywxq, ywxt, jscj, yhfw, mbqt, jsnr, gnmk, sjgx, znys) value ("%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s")"""
% (dl[0], str_dict.get("xzwt") if str_dict.get("xzwt") else None,
str_dict.get("xtjc") if str_dict.get("xtjc") else None,
str_dict.get("xmmb") if str_dict.get("xmmb") else None,
str_dict.get("yqjx") if str_dict.get("yqjx") else None,
str_dict.get("jsxq") if str_dict.get("jsxq") else None,
str_dict.get("sjxq") if str_dict.get("sjxq") else None,
str_dict.get("aqxq") if str_dict.get("aqxq") else None,
str_dict.get("ywly") if str_dict.get("ywly") else None,
str_dict.get("hxyw") if str_dict.get("hxyw") else None,
str_dict.get("ywxq") if str_dict.get("ywxq") else None,
str_dict.get("ywxt") if str_dict.get("ywxt") else None,
str_dict.get("jscj") if str_dict.get("jscj") else None,
str_dict.get("yhfw") if str_dict.get("yhfw") else None,
str_dict.get("mbqt") if str_dict.get("mbqt") else None,
str_dict.get("jsnr") if str_dict.get("jsnr") else None,
str_dict.get("gnmk") if str_dict.get("gnmk") else None,
str_dict.get("sjgx") if str_dict.get("sjgx") else None,
str_dict.get("znys") if str_dict.get("znys") else None))
# 或取所有的xmnr_copy1
xmnr_copy1 = mysql.sql_select_many("""select * from user_history_data""")
# 对比xmnr_copy1和xmnr维度是否都有
if xmnr_copy1:
for xc in xmnr_copy1:
total_keywords = {}
total_similarity = 0
dup_count = 0
# 保存相加后的相似度到idc_project_check
mysql.sql_change_msg(
"""insert into idc_project_check (project_id, dup_project_name, file_path, company_name, create_year, project_tag, project_range_tag, project_area, create_time, update_time) value ("%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s")"""
% (dl[0], xc.get("xmmc"), dl[1], "", "", "需求相似、业务相似", "历史项目", "",
str(datetime.datetime.now())[:-7], str(datetime.datetime.now())[:-7]))
dup_id = mysql.cur.lastrowid
for x in list(xc.keys())[1:]:
content_x = xc.get(x)
content_y = str_dict.get(x)
if content_x and content_y:
if x == 'gnmk':
continue
elif x == 'jsnr':
continue
else:
dup_count += 1
if xc.get('gnmk')==' ' and str_dict.get('gnmk')==' ':
for x in list(xc.keys())[1:]:
content_x = xc.get(x)
content_y = str_dict.get(x)
if content_x and content_y:
if x == 'gnmk':
# 匹配到历史数据,次数加1
# dup_count += dup_file_test
# 循环遍历每一个维度
similarity = cosin_similarity.CosineSimilarity(content_x, content_y)
# 相似度 关键词
similarity, keywords_x, keywords_y = similarity.main()
similarity = similarity * 0
#print("**************相似度: %.2f%%" % similarity, "关键词: %s" % keywords_y)
# 相似度相加
total_similarity += similarity
# 关键词收集
total_keywords[x] = keywords_y

function_content = content_y
dup_function_content = content_x
for word_y in keywords_y:
word_y = word_y.strip().strip("'").strip('"')
function_content = str(function_content.replace("\"", "'")).replace(word_y,
f'<span class="similarity">{word_y.strip()}</span>')
for word_x in keywords_x:
word_x = word_x.strip().strip("'").strip('"')
dup_function_content = str(dup_function_content.replace("\"", "'")).replace(word_x,
f'<span class="similarity">{word_x.strip()}</span>')
# 保存每个维度对应的相似度到idc_project_check_detail
mysql.sql_change_msg(
"""insert into idc_project_check_detail (dup_id, dimension, similarity, function_content, dup_function_content, create_time, update_time) value (%d, "%s", %f, "%s", "%s", "%s", "%s")"""
% (dup_id, wdys2.get(x), similarity, escape_string(function_content),
escape_string(dup_function_content), str(datetime.datetime.now())[:-7],
str(datetime.datetime.now())[:-7]))
# content = content.replace(gjcs, f'<span class="similarity">{gjcs.strip()}</span>')
elif x == 'jsnr':
similarity = cosin_similarity.CosineSimilarity(content_x, content_y)
# 相似度 关键词
similarity, keywords_x, keywords_y = similarity.main()
similarity = similarity * 40
#print("**************相似度: %.2f%%" % similarity, "关键词: %s" % keywords_y)
# 相似度相加
total_similarity += similarity
# 关键词收集
total_keywords[x] = keywords_y
function_content = content_y
dup_function_content = content_x
for word_y in keywords_y:
word_y = word_y.strip().strip("'").strip('"')
function_content = str(function_content.replace("\"", "'")).replace(word_y,
f'<span class="similarity">{word_y.strip()}</span>')
for word_x in keywords_x:
word_x = word_x.strip().strip("'").strip('"')
dup_function_content = str(dup_function_content.replace("\"", "'")).replace(word_x,
f'<span class="similarity">{word_x.strip()}</span>')
# 保存每个维度对应的相似度到idc_project_check_detail
mysql.sql_change_msg(
"""insert into idc_project_check_detail (dup_id, dimension, similarity, function_content, dup_function_content, create_time, update_time) value (%d, "%s", %f, "%s", "%s", "%s", "%s")"""
% (dup_id, wdys2.get(x), similarity, escape_string(function_content),
escape_string(dup_function_content), str(datetime.datetime.now())[:-7],
str(datetime.datetime.now())[:-7]))
else:
similarity = cosin_similarity.CosineSimilarity(content_x, content_y)
# 相似度 关键词
similarity, keywords_x, keywords_y = similarity.main()
similarity = similarity * (60 / dup_count)
#print("**************相似度: %.2f%%" % similarity, "关键词: %s" % keywords_y)
# 相似度相加
total_similarity += similarity
# 关键词收集
total_keywords[x] = keywords_y
function_content = content_y
dup_function_content = content_x

for word_y in keywords_y:
word_y = word_y.strip().strip("'").strip('"')
function_content = str(function_content.replace("\"", "'")).replace(word_y,
f'<span class="similarity">{word_y.strip()}</span>')
for word_x in keywords_x:
word_x = word_x.strip().strip("'").strip('"')
dup_function_content = str(dup_function_content.replace("\"", "'")).replace(word_x,
f'<span class="similarity">{word_x.strip()}</span>')
# 保存每个维度对应的相似度到idc_project_check_detail
mysql.sql_change_msg(
"""insert into idc_project_check_detail (dup_id, dimension, similarity, function_content, dup_function_content, create_time, update_time) value (%d, "%s", %f, "%s", "%s", "%s", "%s")"""
% (dup_id, wdys2.get(x), similarity, escape_string(function_content),
escape_string(dup_function_content), str(datetime.datetime.now())[:-7],
str(datetime.datetime.now())[:-7]))
elif xc['jsnr'] == ' ' and str_dict['jsnr'] == ' ':
for x in list(xc.keys())[1:]:
content_x = xc.get(x)
content_y = str_dict.get(x)
if content_x and content_y:
if x == 'gnmk':
# 匹配到历史数据,次数加1
# dup_count += dup_file_test
# 循环遍历每一个维度
similarity = cosin_similarity.CosineSimilarity(content_x, content_y)
# 相似度 关键词
similarity, keywords_x, keywords_y = similarity.main()
similarity = similarity * 50
#print("**************相似度: %.2f%%" % similarity, "关键词: %s" % keywords_y)
# 相似度相加
total_similarity += similarity
# 关键词收集
total_keywords[x] = keywords_y

function_content = content_y
dup_function_content = content_x
for word_y in keywords_y:
word_y = word_y.strip().strip("'").strip('"')
function_content = str(function_content.replace("\"", "'")).replace(word_y,
f'<span class="similarity">{word_y.strip()}</span>')
for word_x in keywords_x:
word_x = word_x.strip().strip("'").strip('"')
dup_function_content = str(dup_function_content.replace("\"", "'")).replace(word_x,
f'<span class="similarity">{word_x.strip()}</span>')
# 保存每个维度对应的相似度到idc_project_check_detail
mysql.sql_change_msg(
"""insert into idc_project_check_detail (dup_id, dimension, similarity, function_content, dup_function_content, create_time, update_time) value (%d, "%s", %f, "%s", "%s", "%s", "%s")"""
% (dup_id, wdys2.get(x), similarity, escape_string(function_content),
escape_string(dup_function_content), str(datetime.datetime.now())[:-7],
str(datetime.datetime.now())[:-7]))
# content = content.replace(gjcs, f'<span class="similarity">{gjcs.strip()}</span>')
elif x == 'jsnr':
similarity = cosin_similarity.CosineSimilarity(content_x, content_y)
# 相似度 关键词
similarity, keywords_x, keywords_y = similarity.main()
similarity = similarity * 0
#print("**************相似度: %.2f%%" % similarity, "关键词: %s" % keywords_y)
# 相似度相加
total_similarity += similarity
# 关键词收集
total_keywords[x] = keywords_y
function_content = content_y
dup_function_content = content_x
for word_y in keywords_y:
word_y = word_y.strip().strip("'").strip('"')
function_content = str(function_content.replace("\"", "'")).replace(word_y,
f'<span class="similarity">{word_y.strip()}</span>')
for word_x in keywords_x:
word_x = word_x.strip().strip("'").strip('"')
dup_function_content = str(dup_function_content.replace("\"", "'")).replace(word_x,
f'<span class="similarity">{word_x.strip()}</span>')
# 保存每个维度对应的相似度到idc_project_check_detail
mysql.sql_change_msg(
"""insert into idc_project_check_detail (dup_id, dimension, similarity, function_content, dup_function_content, create_time, update_time) value (%d, "%s", %f, "%s", "%s", "%s", "%s")"""
% (dup_id, wdys2.get(x), similarity, escape_string(function_content),
escape_string(dup_function_content), str(datetime.datetime.now())[:-7],
str(datetime.datetime.now())[:-7]))
else:
similarity = cosin_similarity.CosineSimilarity(content_x, content_y)
# 相似度 关键词
similarity, keywords_x, keywords_y = similarity.main()
similarity = similarity * (50 / dup_count)
#print("**************相似度: %.2f%%" % similarity, "关键词: %s" % keywords_y)
# 相似度相加
total_similarity += similarity
# 关键词收集
total_keywords[x] = keywords_y
function_content = content_y
dup_function_content = content_x

for word_y in keywords_y:
word_y = word_y.strip().strip("'").strip('"')
function_content = str(function_content.replace("\"", "'")).replace(word_y,
f'<span class="similarity">{word_y.strip()}</span>')
for word_x in keywords_x:
word_x = word_x.strip().strip("'").strip('"')
dup_function_content = str(dup_function_content.replace("\"", "'")).replace(word_x,
f'<span class="similarity">{word_x.strip()}</span>')
# 保存每个维度对应的相似度到idc_project_check_detail
mysql.sql_change_msg(
"""insert into idc_project_check_detail (dup_id, dimension, similarity, function_content, dup_function_content, create_time, update_time) value (%d, "%s", %f, "%s", "%s", "%s", "%s")"""
% (dup_id, wdys2.get(x), similarity, escape_string(function_content),
escape_string(dup_function_content), str(datetime.datetime.now())[:-7],
str(datetime.datetime.now())[:-7]))
else:
for x in list(xc.keys())[1:]:
content_x = xc.get(x)
content_y = str_dict.get(x)
if content_x and content_y:
if x == 'gnmk':
# 匹配到历史数据,次数加1
# dup_count += dup_file_test
# 循环遍历每一个维度
similarity = cosin_similarity.CosineSimilarity(content_x, content_y)
# 相似度 关键词
similarity, keywords_x, keywords_y = similarity.main()
similarity = similarity * 50
#print("**************相似度: %.2f%%" % similarity, "关键词: %s" % keywords_y)
# 相似度相加
total_similarity += similarity
# 关键词收集
total_keywords[x] = keywords_y

function_content = content_y
dup_function_content = content_x
for word_y in keywords_y:
word_y = word_y.strip().strip("'").strip('"')
function_content = str(function_content.replace("\"", "'")).replace(word_y,
f'<span class="similarity">{word_y.strip()}</span>')
for word_x in keywords_x:
word_x = word_x.strip().strip("'").strip('"')
dup_function_content = str(dup_function_content.replace("\"", "'")).replace(word_x,
f'<span class="similarity">{word_x.strip()}</span>')
# 保存每个维度对应的相似度到idc_project_check_detail
mysql.sql_change_msg(
"""insert into idc_project_check_detail (dup_id, dimension, similarity, function_content, dup_function_content, create_time, update_time) value (%d, "%s", %f, "%s", "%s", "%s", "%s")"""
% (dup_id, wdys2.get(x), similarity, escape_string(function_content),
escape_string(dup_function_content), str(datetime.datetime.now())[:-7],
str(datetime.datetime.now())[:-7]))
# content = content.replace(gjcs, f'<span class="similarity">{gjcs.strip()}</span>')
elif x == 'jsnr':
similarity = cosin_similarity.CosineSimilarity(content_x, content_y)
# 相似度 关键词
similarity, keywords_x, keywords_y = similarity.main()
similarity = similarity * 40
#print("**************相似度: %.2f%%" % similarity, "关键词: %s" % keywords_y)
# 相似度相加
total_similarity += similarity
# 关键词收集
total_keywords[x] = keywords_y
function_content = content_y
dup_function_content = content_x
for word_y in keywords_y:
word_y = word_y.strip().strip("'").strip('"')
function_content = str(function_content.replace("\"", "'")).replace(word_y,
f'<span class="similarity">{word_y.strip()}</span>')
for word_x in keywords_x:
word_x = word_x.strip().strip("'").strip('"')
dup_function_content = str(dup_function_content.replace("\"", "'")).replace(word_x,
f'<span class="similarity">{word_x.strip()}</span>')
# 保存每个维度对应的相似度到idc_project_check_detail
mysql.sql_change_msg(
"""insert into idc_project_check_detail (dup_id, dimension, similarity, function_content, dup_function_content, create_time, update_time) value (%d, "%s", %f, "%s", "%s", "%s", "%s")"""
% (dup_id, wdys2.get(x), similarity, escape_string(function_content),
escape_string(dup_function_content), str(datetime.datetime.now())[:-7],
str(datetime.datetime.now())[:-7]))
else:
similarity = cosin_similarity.CosineSimilarity(content_x, content_y)
# 相似度 关键词
similarity, keywords_x, keywords_y = similarity.main()
similarity = similarity * (10 / dup_count)
#print("**************相似度: %.2f%%" % similarity, "关键词: %s" % keywords_y)
# 相似度相加
total_similarity += similarity
# 关键词收集
total_keywords[x] = keywords_y
function_content = content_y
dup_function_content = content_x

for word_y in keywords_y:
word_y = word_y.strip().strip("'").strip('"')
function_content = str(function_content.replace("\"", "'")).replace(word_y,
f'<span class="similarity">{word_y.strip()}</span>')
for word_x in keywords_x:
word_x = word_x.strip().strip("'").strip('"')
dup_function_content = str(dup_function_content.replace("\"", "'")).replace(word_x,
f'<span class="similarity">{word_x.strip()}</span>')
# 保存每个维度对应的相似度到idc_project_check_detail
mysql.sql_change_msg(
"""insert into idc_project_check_detail (dup_id, dimension, similarity, function_content, dup_function_content, create_time, update_time) value (%d, "%s", %f, "%s", "%s", "%s", "%s")"""
% (dup_id, wdys2.get(x), similarity, escape_string(function_content),
escape_string(dup_function_content), str(datetime.datetime.now())[:-7],
str(datetime.datetime.now())[:-7]))

mysql.sql_change_msg(
"""update idc_project_check set similarity=%f where dup_id=%d""" % (total_similarity, dup_id))
project_gjc = {}
for w in wdys2.keys():
content_x = str_dict.get(w)
content_y = str_dict.get(w)
if content_x and content_y:
# 循环遍历每一个维度
similarity = cosin_similarity.CosineSimilarity(content_x, content_y)
# 相似度 关键词
similarity, keywords_x, keywords = similarity.main()
project_gjc[w] = keywords
mysql.sql_change_msg(
"""insert into user_keyword (xmmc, xzwt, xtjc, xmmb, yqjx, jsxq, sjxq, aqxq, ywly, hxyw, ywxq, ywxt, jscj, yhfw, mbqt, jsnr, gnmk, sjgx, znys) value ("%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s")"""
% (dl[0], str(project_gjc.get("xzwt"))[1:-1] if project_gjc.get("xzwt") else None,
str(project_gjc.get("xtjc"))[1:-1] if project_gjc.get("xtjc") else None,
str(project_gjc.get("xmmb"))[1:-1] if project_gjc.get("xmmb") else None,
str(project_gjc.get("yqjx"))[1:-1] if project_gjc.get("yqjx") else None,
str(project_gjc.get("jsxq"))[1:-1] if project_gjc.get("jsxq") else None,
str(project_gjc.get("sjxq"))[1:-1] if project_gjc.get("sjxq") else None,
str(project_gjc.get("aqxq"))[1:-1] if project_gjc.get("aqxq") else None,
str(project_gjc.get("ywly"))[1:-1] if project_gjc.get("ywly") else None,
str(project_gjc.get("hxyw"))[1:-1] if project_gjc.get("hxyw") else None,
str(project_gjc.get("ywxq"))[1:-1] if project_gjc.get("ywxq") else None,
str(project_gjc.get("ywxt"))[1:-1] if project_gjc.get("ywxt") else None,
str(project_gjc.get("jscj"))[1:-1] if project_gjc.get("jscj") else None,
str(project_gjc.get("yhfw"))[1:-1] if project_gjc.get("yhfw") else None,
str(project_gjc.get("mbqt"))[1:-1] if project_gjc.get("mbqt") else None,
str(project_gjc.get("jsnr"))[1:-1] if project_gjc.get("jsnr") else None,
str(project_gjc.get("gnmk"))[1:-1] if project_gjc.get("gnmk") else None,
str(project_gjc.get("sjgx"))[1:-1] if project_gjc.get("sjgx") else None,
str(project_gjc.get("znys"))[1:-1] if project_gjc.get("znys") else None))
mysql.sql_change_msg(
"""update idc_project set dup_status=3, one_vote_veto_status=dup_file_test, self_check_status=dup_file_test, history_project_count=%d ,module_count=%d where project_id=%d""" % (
xmnr_count, gnmk_count, dl[0]))
gong_neng_mo_kuai(mysql, dl, data, er_title)


if __name__ == "__main__":
    # Manual run against one hard-coded sample workbook.
    # Each entry is (project id, path of the Excel workbook, project name).
    # The same tuples can instead be built from the local duplicate-check
    # service (GET http://127.0.0.1:19099/check/duplicates/<id>), using the
    # "project_id", "file_path" and "project_name" fields of its "data" list.
    sample_project = (
        11,
        r"C:\Users\HUAWEI\PycharmProjects\nlp\dup_check\0825-丽水系统查重维度1.xlsx",
        "水路运输综合监管系统建设项目.xls",
    )
    project_check([sample_project])

+ 391
- 0
main10.py View File

@@ -0,0 +1,391 @@
# coding=utf-8
import sys
import re
import mysql_pool
from pymysql.converters import escape_string
import cosin_similarity
import pandas as pd
import datetime
import requests
import os

# Ordered (display name, column code) pairs for every comparison dimension.
# Both lookup tables below are derived from this single list so that they can
# never drift apart; the order is the iteration order of both dictionaries.
_DIMENSION_PAIRS = (
    ("项目名称", "xmmc"),
    ("现状问题", "xzwt"),
    ("系统基础", "xtjc"),
    ("项目目标", "xmmb"),
    ("预期绩效", "yqjx"),
    ("建设需求", "jsxq"),
    ("数据需求", "sjxq"),
    ("安全需求", "aqxq"),
    ("业务领域", "ywly"),
    ("核心业务", "hxyw"),
    ("业务需求", "ywxq"),
    ("业务协同", "ywxt"),
    ("建设层级", "jscj"),
    ("用户范围", "yhfw"),
    ("目标群体", "mbqt"),
    ("建设内容", "jsnr"),
    ("功能模块", "gnmk"),
    ("数据共享", "sjgx"),
    ("智能要素", "znys"),
)

# Display name -> column code.
wdys1 = dict(_DIMENSION_PAIRS)

# Column code -> display name (the inverse of wdys1).
wdys2 = {code: label for label, code in _DIMENSION_PAIRS}

# Column code -> display name for the two per-module fields.
gnmkys = dict(gnmc="功能名称", gnms="功能描述")


def getFlag():
    """Read the dimension sheet and map dimension codes to their weight text.

    Loads ``0825-丽水系统查重维度.xlsx`` (resolved against the current working
    directory), takes the non-empty cells of column index 1 and, for every
    cell whose leading text is a dimension name known to ``wdys1``, stores the
    captured ``...%`` text under that dimension's column code.

    Returns:
        dict: column code -> captured weight text. Cells that are not text or
        that do not match the patterns are skipped.
    """
    data_dict = {}
    df = pd.read_excel("0825-丽水系统查重维度.xlsx")
    cells = pd.Series(df.values[:, 1]).dropna()
    for cell in cells:
        try:
            # NOTE(review): as written, the first group is lazy and directly
            # followed by another lazy group, so it can only capture an empty
            # string. Presumably the pattern originally delimited the name
            # with full-width parentheses - confirm against the real file.
            wd = re.search("(.*?)(.*?%)", cell).group(1).strip()
            wdc = wdys1.get(wd)
            if wdc:
                data_dict[wdc] = re.search(".*?((.*?%))", cell).group(1)
        except (AttributeError, TypeError):
            # AttributeError: re.search found no match and returned None.
            # TypeError: the cell is not text (for example a number).
            # Anything else is unexpected and is allowed to propagate instead
            # of being silently swallowed.
            continue
    return data_dict


# Store the project's functional modules and compare each of them with every
# row of the historical module table, writing all results through `mysql`.
#
# Parameters (as they are used below):
#   mysql    - object providing sql_change_msg / sql_select_many and a `cur`
#              cursor whose `lastrowid` is read right after an insert.
#   dl       - indexable project record: dl[0] is formatted with %d as the
#              project id, dl[2] is stored as the project name.
#   data     - iterable of rows; column index 1 is compared with the titles
#              and column index 3 is the text that is concatenated.
#   er_title - iterable of second-level titles to collect.
# Nothing is returned.
#
# NOTE(review): the indentation of this copy of the file has been lost, so the
# nesting of the statements cannot be read off the text - confirm against the
# original file before restructuring anything.
# NOTE(review): k, v, gc.get(...) and dl[2] are interpolated into SQL text
# without escape_string (only content_x / content_y are escaped). A double
# quote or backslash in those values would break the statement; consider
# escaping them or using parameterised queries.
def gong_neng_mo_kuai(mysql, dl, data, er_title):
# Concatenate, per title, the column-3 text of every Excel row carrying that title
str_dict = {}
for et in er_title:
for d in data:
if d[1] == et:
if str_dict.get(et):
str_dict[et] = str_dict.get(et) + d[3]
else:
str_dict[et] = d[3]
# print(str_dict)
# One idc_project_module row per collected title (k = title, v = joined text).
# str(datetime.datetime.now())[:-7] cuts the last 7 characters, i.e. the
# ".ffffff" microsecond suffix of the timestamp.
for k, v in str_dict.items():
mysql.sql_change_msg(
"""insert into idc_project_module (project_id, check_duplicate_count, module_name, module_content, create_time, update_time, tag) value(%d, 1, "%s", "%s", "%s", "%s", "模块")""" % (
int(dl[0]), k, v, str(datetime.datetime.now())[:-7], str(datetime.datetime.now())[:-7]))
# Read the modules back so that their generated project_module_id is available
module_id_list = mysql.sql_select_many(
"""select project_module_id, module_name, module_content from idc_project_module where project_id=%d""" % dl[
0])
data_list = []
for mil in module_id_list:
data_dict = {}
data_dict["project_module_id"] = mil.get("project_module_id")
# gnmc / gnms are the keys also read from the historical rows below
data_dict["gnmc"] = mil.get("module_name")
data_dict["gnms"] = mil.get("module_content")
data_list.append(data_dict)
# print(data_list)
for i in data_list:
# The historical table is queried again for each module of the project
gnmk_copy1 = mysql.sql_select_many("""select * from user_history_module_data""")
if gnmk_copy1:
for gc in gnmk_copy1:
total_similarity1 = 0
total_keywords1 = []
total_similarity2 = 0
total_keywords2 = []
# Header row for this (module, historical module) pair; its id is read from lastrowid
mysql.sql_change_msg(
"""insert into idc_project_module_check (project_module_id, module_name, project_name, company_name, create_time, update_time) value(%d, "%s", "%s", "%s", "%s", "%s")"""
% (
i.get("project_module_id"), gc.get("gnmc"), gc.get("xmmc"), "",
str(datetime.datetime.now())[:-7],
str(datetime.datetime.now())[:-7]))
dup_module_id = mysql.cur.lastrowid
for j in ["gnmc", "gnms"]:
# Loop over the two module fields (name, description)
content_x = gc.get(j)
content_y = i.get(j)
# Compare only when both the historical and the new value are non-empty
if content_x and content_y:
if j == "gnmc":
similarity = cosin_similarity.CosineSimilarity(content_x, content_y)
# similarity, keywords
similarity, keyword_x, keywords = similarity.main()
# The module name is weighted by 1
similarity = similarity * 1
total_keywords1 += keywords
print("######################相似度: %.2f%%" % similarity, "关键词: %s" % keywords)
# Accumulate the similarity
total_similarity1 += similarity
mysql.sql_change_msg(
"""insert into idc_project_module_check_detail (dup_module_id, project_name, module_content, dup_module_content, similarity, dimension, create_time, update_time) value (%d, "%s", "%s", "%s", %f, "%s", "%s", "%s")"""
% (dup_module_id, dl[2], escape_string(content_y), escape_string(content_x), similarity,
"功能名称",
str(datetime.datetime.now())[:-7], str(datetime.datetime.now())[:-7]))
else:
similarity = cosin_similarity.CosineSimilarity(content_x, content_y)
# similarity, keywords
similarity, keyword_x, keywords = similarity.main()
# The module description is weighted by 99
similarity = similarity * 99
total_keywords2 += keywords
print("######################相似度: %.2f%%" % similarity, "关键词: %s" % keywords)
# Accumulate the similarity
total_similarity2 += similarity
mysql.sql_change_msg(
"""insert into idc_project_module_check_detail (dup_module_id, project_name, module_content, dup_module_content, similarity, dimension, create_time, update_time) value (%d, "%s", "%s", "%s", %f, "%s", "%s", "%s")"""
% (dup_module_id, dl[2], escape_string(content_y), escape_string(content_x), similarity,
"功能模块描述",
str(datetime.datetime.now())[:-7], str(datetime.datetime.now())[:-7]))
# Write the sum of both weighted similarities to the header row
mysql.sql_change_msg("""update idc_project_module_check set similarity=%f where dup_module_id=%d""" % (
total_similarity1 + total_similarity2, dup_module_id))
# Keyword extraction: both sides of the comparison are the module's own text,
# so only the keywords returned by main() are used. The two branches below
# are identical.
gnmk_gjc = {}
for a in ["gnmc", "gnms"]:
if i.get(a):
content_x = i.get(a)
content_y = i.get(a)
if a == "gnmc":
similarity = cosin_similarity.CosineSimilarity(content_x, content_y)
# similarity, keywords
similarity, keyword_x, keywords = similarity.main()
gnmk_gjc[a] = keywords
else:
similarity = cosin_similarity.CosineSimilarity(content_x, content_y)
# similarity, keywords
similarity, keyword_x, keywords = similarity.main()
gnmk_gjc[a] = keywords
# str(list)[1:-1] drops the surrounding brackets of the list repr. A missing
# keyword list is passed as None and therefore formatted by "%s" as the text
# None inside the quotes.
mysql.sql_change_msg("""insert into user_module_keywords (xmmc, gnmc, gnms) value("%s", "%s", "%s")""" % (
dl[2], str(gnmk_gjc.get("gnmc"))[1:-1] if gnmk_gjc.get("gnmc") else None,
str(gnmk_gjc.get("gnms"))[1:-1] if gnmk_gjc.get("gnms") else None))


# Run the duplicate check for every project in data_list: read the project's
# Excel workbook, store its dimension texts, compare them with every row of
# user_history_data, store per-dimension and total similarities, store the
# project's own keywords and finally process its functional modules.
#
# Parameters (as they are used below):
#   data_list - iterable of indexable records; dl[0] is the project id,
#               dl[1] the path handed to pd.read_excel, and the whole record
#               is passed on to gong_neng_mo_kuai.
# Nothing is returned; all results are written through mysql_pool.ConnMysql.
#
# Weighting visible below: "gnmk" is multiplied by 50, "jsnr" by 40, and every
# other matched dimension by 10 / dup_count, where dup_count is the number of
# matched dimensions other than those two.
#
# NOTE(review): the indentation of this copy of the file has been lost, so the
# nesting of the statements cannot be read off the text - confirm against the
# original file before restructuring anything.
# NOTE(review): several values (xc.get("xmmc"), dl[1], the str_dict texts, the
# keyword lists) are interpolated into SQL text without escape_string; quotes
# or backslashes in them (for example in a Windows path) would alter or break
# the statement.
def project_check(data_list):
mysql = mysql_pool.ConnMysql()
# mysql.sql_select_many("""select * from mkgjc""")
# Read dimensions and weights
# xmnr_count = len(mysql.sql_select_many("""select * from xmnr_copy1"""))
# gnmk_count = len(mysql.sql_select_many("""select * from gnmk_copy1"""))
# Row counts of both history tables; written to idc_project at the end of each project
xmnr_count = len(mysql.sql_select_many("""select * from user_history_data"""))
gnmk_count = len(mysql.sql_select_many("""select * from user_history_module_data"""))
# NOTE(review): get_data_dict is not referenced again in this function
get_data_dict = getFlag()
# Iterate over the stored Excel paths
for dl in data_list:
# path = "0825-丽水系统查重维度1.xlsx"
# Read the Excel file at that path
df = pd.read_excel(dl[1])
data = df.values
# Concatenate the content of every dimension in the Excel file
join_str = ""
str_dict = {}
title = ""
er_title = set()
for d in data:
# A non-null first column starts a new dimension; rows with an empty first
# column fall into the else branch and extend the current dimension
if pd.notnull(d[0]):
title = d[0]
if title == "功能模块":
er_title.add(d[1])
join_str = ""
for i in d[1:]:
if pd.notnull(i):
join_str +=i
str_dict[wdys1.get(title)] = join_str
else:
if title == "功能模块":
er_title.add(d[1])
# NOTE(review): join_str is not reset in this branch, so it still holds the
# text collected so far when it is appended below - confirm this is intended
for i in d[1:]:
if pd.notnull(i):
join_str +=i
str_dict[wdys1.get(title)] = str_dict.get(wdys1.get(title)) + join_str
# print(str_dict)
# Store the raw dimension texts. A missing dimension is passed as None and is
# therefore formatted by "%s" as the text None inside the quotes.
# NOTE(review): the xmmc column receives dl[0] (the project id) here -
# presumably a name was intended; confirm.
mysql.sql_change_msg(
"""insert into user_data (xmmc, xzwt, xtjc, xmmb, yqjx, jsxq, sjxq, aqxq, ywly, hxyw, ywxq, ywxt, jscj, yhfw, mbqt, jsnr, gnmk, sjgx, znys) value ("%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s")"""
% (dl[0], str_dict.get("xzwt") if str_dict.get("xzwt") else None,
str_dict.get("xtjc") if str_dict.get("xtjc") else None,
str_dict.get("xmmb") if str_dict.get("xmmb") else None,
str_dict.get("yqjx") if str_dict.get("yqjx") else None,
str_dict.get("jsxq") if str_dict.get("jsxq") else None,
str_dict.get("sjxq") if str_dict.get("sjxq") else None,
str_dict.get("aqxq") if str_dict.get("aqxq") else None,
str_dict.get("ywly") if str_dict.get("ywly") else None,
str_dict.get("hxyw") if str_dict.get("hxyw") else None,
str_dict.get("ywxq") if str_dict.get("ywxq") else None,
str_dict.get("ywxt") if str_dict.get("ywxt") else None,
str_dict.get("jscj") if str_dict.get("jscj") else None,
str_dict.get("yhfw") if str_dict.get("yhfw") else None,
str_dict.get("mbqt") if str_dict.get("mbqt") else None,
str_dict.get("jsnr") if str_dict.get("jsnr") else None,
str_dict.get("gnmk") if str_dict.get("gnmk") else None,
str_dict.get("sjgx") if str_dict.get("sjgx") else None,
str_dict.get("znys") if str_dict.get("znys") else None))
# Fetch all historical project rows (the table formerly named xmnr_copy1)
xmnr_copy1 = mysql.sql_select_many("""select * from user_history_data""")
# Compare only the dimensions present in both the historical row and the new project
if xmnr_copy1:
for xc in xmnr_copy1:
total_keywords = {}
total_similarity = 0
dup_count = 0
# Create the idc_project_check row that later receives the summed similarity
mysql.sql_change_msg(
"""insert into idc_project_check (project_id, dup_project_name, file_path, company_name, create_year, project_tag, project_range_tag, project_area, create_time, update_time) value ("%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s")"""
% (dl[0], xc.get("xmmc"), dl[1], "", "", "需求相似、业务相似", "历史项目", "",
str(datetime.datetime.now())[:-7], str(datetime.datetime.now())[:-7]))
dup_id = mysql.cur.lastrowid
# First pass: count the matched dimensions other than gnmk / jsnr, so that
# their shared weight of 10 can be split evenly in the second pass.
# [1:] skips the first column of the historical row.
for x in list(xc.keys())[1:]:
content_x = xc.get(x)
content_y = str_dict.get(x)
if content_x and content_y:
if x == 'gnmk':
continue
elif x == 'jsnr':
continue
else:
dup_count += 1
# Second pass: score every matched dimension. The three branches differ only
# in the weight applied to the similarity.
for x in list(xc.keys())[1:]:
content_x = xc.get(x)
content_y = str_dict.get(x)
if content_x and content_y:
if x == 'gnmk':
# Matched historical data, increment the count
# dup_count += 1
# Loop over each dimension
similarity = cosin_similarity.CosineSimilarity(content_x, content_y)
# similarity, keywords
similarity, keywords_x, keywords_y = similarity.main()
similarity = similarity * 50
print("**************相似度: %.2f%%" % similarity, "关键词: %s" % keywords_y)
# Accumulate the similarity
total_similarity += similarity
# Collect the keywords
total_keywords[x] = keywords_y

function_content = content_y
dup_function_content = content_x
# Wrap every keyword occurrence in a highlighting span; double quotes are
# first turned into single quotes
for word_y in keywords_y:
word_y = word_y.strip().strip("'").strip('"')
function_content = str(function_content.replace("\"", "'")).replace(word_y,
f'<span class="similarity">{word_y.strip()}</span>')
for word_x in keywords_x:
word_x = word_x.strip().strip("'").strip('"')
dup_function_content = str(dup_function_content.replace("\"", "'")).replace(word_x,
f'<span class="similarity">{word_x.strip()}</span>')
# Save this dimension's similarity to idc_project_check_detail
mysql.sql_change_msg(
"""insert into idc_project_check_detail (dup_id, dimension, similarity, function_content, dup_function_content, create_time, update_time) value (%d, "%s", %f, "%s", "%s", "%s", "%s")"""
% (dup_id, wdys2.get(x), similarity, escape_string(function_content),
escape_string(dup_function_content), str(datetime.datetime.now())[:-7],
str(datetime.datetime.now())[:-7]))
# content = content.replace(gjcs, f'<span class="similarity">{gjcs.strip()}</span>')
elif x == 'jsnr':
similarity = cosin_similarity.CosineSimilarity(content_x, content_y)
# similarity, keywords
similarity, keywords_x, keywords_y = similarity.main()
similarity = similarity * 40
print("**************相似度: %.2f%%" % similarity, "关键词: %s" % keywords_y)
# Accumulate the similarity
total_similarity += similarity
# Collect the keywords
total_keywords[x] = keywords_y
function_content = content_y
dup_function_content = content_x
for word_y in keywords_y:
word_y = word_y.strip().strip("'").strip('"')
function_content = str(function_content.replace("\"", "'")).replace(word_y,
f'<span class="similarity">{word_y.strip()}</span>')
for word_x in keywords_x:
word_x = word_x.strip().strip("'").strip('"')
dup_function_content = str(dup_function_content.replace("\"", "'")).replace(word_x,
f'<span class="similarity">{word_x.strip()}</span>')
# Save this dimension's similarity to idc_project_check_detail
mysql.sql_change_msg(
"""insert into idc_project_check_detail (dup_id, dimension, similarity, function_content, dup_function_content, create_time, update_time) value (%d, "%s", %f, "%s", "%s", "%s", "%s")"""
% (dup_id, wdys2.get(x), similarity, escape_string(function_content),
escape_string(dup_function_content), str(datetime.datetime.now())[:-7],
str(datetime.datetime.now())[:-7]))
else:
similarity = cosin_similarity.CosineSimilarity(content_x, content_y)
# similarity, keywords
similarity, keywords_x, keywords_y = similarity.main()
# dup_count is at least 1 here: the first pass counted exactly the
# dimensions that reach this branch
similarity = similarity * (10 / dup_count)
print("**************相似度: %.2f%%" % similarity, "关键词: %s" % keywords_y)
# Accumulate the similarity
total_similarity += similarity
# Collect the keywords
total_keywords[x] = keywords_y
function_content = content_y
dup_function_content = content_x
for word_y in keywords_y:
word_y = word_y.strip().strip("'").strip('"')
function_content = str(function_content.replace("\"", "'")).replace(word_y,
f'<span class="similarity">{word_y.strip()}</span>')
for word_x in keywords_x:
word_x = word_x.strip().strip("'").strip('"')
dup_function_content = str(dup_function_content.replace("\"", "'")).replace(word_x,
f'<span class="similarity">{word_x.strip()}</span>')
# Save this dimension's similarity to idc_project_check_detail
mysql.sql_change_msg(
"""insert into idc_project_check_detail (dup_id, dimension, similarity, function_content, dup_function_content, create_time, update_time) value (%d, "%s", %f, "%s", "%s", "%s", "%s")"""
% (dup_id, wdys2.get(x), similarity, escape_string(function_content),
escape_string(dup_function_content), str(datetime.datetime.now())[:-7],
str(datetime.datetime.now())[:-7]))

# Write the summed similarity to the idc_project_check row created above
mysql.sql_change_msg(
"""update idc_project_check set similarity=%f where dup_id=%d""" % (total_similarity, dup_id))
# Keyword extraction for the project itself: both sides of the comparison are
# the project's own text, so only the keywords returned by main() are used
project_gjc = {}
for w in wdys2.keys():
content_x = str_dict.get(w)
content_y = str_dict.get(w)
if content_x and content_y:
# Loop over each dimension
similarity = cosin_similarity.CosineSimilarity(content_x, content_y)
# similarity, keywords
similarity, keywords_x, keywords = similarity.main()
project_gjc[w] = keywords
# str(list)[1:-1] drops the surrounding brackets of the list repr; a missing
# keyword list is passed as None and formatted by "%s" as the text None
mysql.sql_change_msg(
"""insert into user_keyword (xmmc, xzwt, xtjc, xmmb, yqjx, jsxq, sjxq, aqxq, ywly, hxyw, ywxq, ywxt, jscj, yhfw, mbqt, jsnr, gnmk, sjgx, znys) value ("%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s")"""
% (dl[0], str(project_gjc.get("xzwt"))[1:-1] if project_gjc.get("xzwt") else None,
str(project_gjc.get("xtjc"))[1:-1] if project_gjc.get("xtjc") else None,
str(project_gjc.get("xmmb"))[1:-1] if project_gjc.get("xmmb") else None,
str(project_gjc.get("yqjx"))[1:-1] if project_gjc.get("yqjx") else None,
str(project_gjc.get("jsxq"))[1:-1] if project_gjc.get("jsxq") else None,
str(project_gjc.get("sjxq"))[1:-1] if project_gjc.get("sjxq") else None,
str(project_gjc.get("aqxq"))[1:-1] if project_gjc.get("aqxq") else None,
str(project_gjc.get("ywly"))[1:-1] if project_gjc.get("ywly") else None,
str(project_gjc.get("hxyw"))[1:-1] if project_gjc.get("hxyw") else None,
str(project_gjc.get("ywxq"))[1:-1] if project_gjc.get("ywxq") else None,
str(project_gjc.get("ywxt"))[1:-1] if project_gjc.get("ywxt") else None,
str(project_gjc.get("jscj"))[1:-1] if project_gjc.get("jscj") else None,
str(project_gjc.get("yhfw"))[1:-1] if project_gjc.get("yhfw") else None,
str(project_gjc.get("mbqt"))[1:-1] if project_gjc.get("mbqt") else None,
str(project_gjc.get("jsnr"))[1:-1] if project_gjc.get("jsnr") else None,
str(project_gjc.get("gnmk"))[1:-1] if project_gjc.get("gnmk") else None,
str(project_gjc.get("sjgx"))[1:-1] if project_gjc.get("sjgx") else None,
str(project_gjc.get("znys"))[1:-1] if project_gjc.get("znys") else None))
# Set the project's status columns and record the sizes of both history tables
mysql.sql_change_msg(
"""update idc_project set dup_status=3, one_vote_veto_status=1, self_check_status=1, history_project_count=%d ,module_count=%d where project_id=%d""" % (
xmnr_count, gnmk_count, dl[0]))
# Module-level comparison for the same project
gong_neng_mo_kuai(mysql, dl, data, er_title)


if __name__ == "__main__":
    # Ask the local duplicate-check service for the files of check id 15.
    # The decoded JSON is only fetched here. Turning its "data" entries
    # ("project_id", "file_path", "project_name") into the tuples expected by
    # project_check, and calling project_check, is currently disabled.
    endpoint = "http://127.0.0.1:19099/check/duplicates/%s" % 15
    response = requests.get(endpoint)
    all_path = response.json()

+ 113
- 0
mysql_pool.py View File

@@ -0,0 +1,113 @@
# coding=utf-8
import os

import pymysql
from dbutils.pooled_db import PooledDB
# from dbutils.persistent_db import PersistentDB

# Connection settings for the MySQL database used by ConnMysql.
# Every value can be overridden through an IDC_MYSQL_* environment variable;
# the literal fallbacks keep existing deployments working unchanged.
# SECURITY: the fallback credentials are committed to source control. Rotate
# them and remove the defaults once the environment variables are in place.
mysqlInfo = {
    "host": os.environ.get("IDC_MYSQL_HOST", '47.98.125.47'),
    "user": os.environ.get("IDC_MYSQL_USER", 'root'),
    "passwd": os.environ.get("IDC_MYSQL_PASSWD", 'NingdaKeji123!'),
    "db": os.environ.get("IDC_MYSQL_DB", 'idc'),
    # The port must stay an integer for the driver.
    "port": int(os.environ.get("IDC_MYSQL_PORT", 3306)),
    "charset": os.environ.get("IDC_MYSQL_CHARSET", "utf8"),
}


class ConnMysql(object):
    """One pooled MySQL connection plus a dict cursor, with small query helpers.

    Every instance takes its connection from a single ``PooledDB`` pool that
    is built lazily, on first use, from the module-level ``mysqlInfo``
    settings. ``coon`` is the connection and ``cur`` the cursor; both are
    public because callers read ``cur.lastrowid`` after inserts.
    """

    # Shared pool. It is stored on the class (not in a module global) so that
    # the ``is None`` test in _get_mysql_conn sees the pool created by an
    # earlier call instead of building a new pool for every instance.
    __pool = None

    def __init__(self):
        """Take a connection from the pool and open a ``DictCursor`` on it."""
        self.coon = ConnMysql._get_mysql_conn()
        self.cur = self.coon.cursor(cursor=pymysql.cursors.DictCursor)

    @staticmethod
    def _get_mysql_conn():
        """Return a connection from the shared pool, creating the pool once."""
        if ConnMysql.__pool is None:
            ConnMysql.__pool = PooledDB(
                creator=pymysql,
                mincached=1,
                maxcached=5,
                maxconnections=6,
                maxshared=3,
                blocking=True,
                maxusage=None,
                setsession=[],
                ping=2,
                host=mysqlInfo['host'],
                user=mysqlInfo['user'],
                passwd=mysqlInfo['passwd'],
                db=mysqlInfo['db'],
                port=mysqlInfo['port'],
                charset=mysqlInfo['charset'])
        return ConnMysql.__pool.connection()

    def sql_change_msg(self, sql, args=None):
        """Execute one insert / update / delete statement and commit it.

        Args:
            sql: statement text; may contain driver placeholders when ``args``
                is given.
            args: optional parameters handed to the cursor, so that values are
                escaped by the driver instead of being formatted into ``sql``.

        Returns:
            Whatever the cursor's ``execute`` returns (the affected row count
            for PyMySQL).
        """
        change_sql = self.cur.execute(sql, args)
        self.coon.commit()
        return change_sql

    def sql_select_one(self, sql, args=None):
        """Execute a query and return its first row (``None`` if there is none).

        Args:
            sql: query text.
            args: optional parameters handed to the cursor.
        """
        self.cur.execute(sql, args)
        return self.cur.fetchone()

    def sql_select_many(self, sql, count=None, args=None):
        """Execute a query and return its rows.

        Args:
            sql: query text.
            count: when given, at most this many rows are fetched; otherwise
                all rows are fetched.
            args: optional parameters handed to the cursor.
        """
        self.cur.execute(sql, args)
        if count is None:
            return self.cur.fetchall()
        return self.cur.fetchmany(count)

    def release(self):
        """Close the cursor first, then close the connection.

        The cursor belongs to the connection, so it is closed while the
        connection is still open; the connection is closed even if closing
        the cursor fails.
        """
        try:
            self.cur.close()
        finally:
            self.coon.close()


if __name__ == '__main__':
    # Manual smoke check: connect and print how many historical project rows
    # exist.
    #
    # Tables recorded for the idc schema (it looks like saved output of
    # "show tables;"): gjc, gjc2, idc_dept, idc_project, idc_project_check,
    # idc_project_check_detail, idc_project_module, idc_project_module_check,
    # idc_project_module_check_detail, idc_user, idc_user_dept, mk2.
    mysql = ConnMysql()
    history_rows = mysql.sql_select_many("""select * from user_history_data""")
    print(len(history_rows))
    # Query for finding duplicated module descriptions:
    #   select * from user_history_module_data where gnms in
    #   (select gnms from user_history_module_data group by gnms having count(gnms)>1);

+ 41
- 0
requirements.txt View File

@@ -0,0 +1,41 @@
certifi==2022.6.15
cffi==1.15.1
chardet==5.0.0
charset-normalizer==2.0.12
click==8.0.4
colorama==0.4.5
cryptography==3.4.7
dataclasses==0.8
DBUtils==3.0.2
et-xmlfile==1.1.0
Flask==1.0.2
idna==3.3
importlib-metadata==4.8.3
itsdangerous==2.0.1
jieba==0.42.1
Jinja2==3.0.3
joblib==1.1.0
MarkupSafe==2.0.1
numpy==1.19.5
openpyxl==3.0.10
pandas==1.1.5
pdfminer.six==20211012
pdfplumber==0.6.0
Pillow==8.4.0
pycparser==2.21
PyMySQL==0.10.1
pypiwin32==223
python-dateutil==2.8.2
pytz==2022.2.1
pywin32==304
requests==2.27.1
scikit-learn==0.24.2
scipy==1.5.4
six==1.16.0
threadpoolctl==3.1.0
typing_extensions==4.1.1
urllib3==1.26.12
Wand==0.6.10
Werkzeug==2.0.3
xlrd==1.2.0
zipp==3.6.0

BIN
水路运输综合监管系统建设项目.xls View File


Loading…
Cancel
Save