瀏覽代碼

fix: 初始化项目

master
kbb 4 週之前
父節點
當前提交
1e1e1780d0
共有 37 個文件被更改,包括 92737 次插入、0 次删除
  1. 二進制
      __pycache__/baidu.cpython-39.pyc
  2. 二進制
      __pycache__/cosin_similarity.cpython-36.pyc
  3. 二進制
      __pycache__/cosin_similarity.cpython-39.pyc
  4. 二進制
      __pycache__/docx_extract.cpython-39.pyc
  5. 二進制
      __pycache__/flask_server.cpython-39.pyc
  6. 二進制
      __pycache__/glm_utils.cpython-39.pyc
  7. 二進制
      __pycache__/insert_history_data_total.cpython-39.pyc
  8. 二進制
      __pycache__/main1.cpython-39.pyc
  9. 二進制
      __pycache__/model_scope.cpython-39.pyc
  10. 二進制
      __pycache__/mysql_pool.cpython-36.pyc
  11. 二進制
      __pycache__/mysql_pool.cpython-39.pyc
  12. +78
    -0
      baidu.py
  13. +38
    -0
      co_rom.py
  14. +88
    -0
      cos_demo.py
  15. +129
    -0
      cosin_similarity.py
  16. +104
    -0
      docx_extract.py
  17. +54
    -0
      flask_server.py
  18. +181
    -0
      glm_utils.py
  19. +410
    -0
      insert_history_data_total.py
  20. +86195
    -0
      log.log
  21. +511
    -0
      main1.py
  22. +437
    -0
      main1.py.cors.bak
  23. +720
    -0
      main1.py.glm.bak
  24. +675
    -0
      main1.py.word.bak
  25. +550
    -0
      main1.py_改造qwen
  26. +391
    -0
      main10.py
  27. +65
    -0
      model_scope.py
  28. +113
    -0
      mysql_pool.py
  29. +1
    -0
      nohup python3 flask_server.py >> nohup.info 2>&1 &
  30. +5
    -0
      nohup.out
  31. +63
    -0
      pro_check_demo.py
  32. +43
    -0
      requirements.txt
  33. +1626
    -0
      stop_words.utf8
  34. 二進制
      temp/丽水市本级信息化项目建设方案模板.doc
  35. 二進制
      temp/丽水市本级信息化项目建设方案模板.docx
  36. +179
    -0
      tongyici_tihuan.txt
  37. +81
    -0
      user_dict.txt

二進制
__pycache__/baidu.cpython-39.pyc 查看文件


二進制
__pycache__/cosin_similarity.cpython-36.pyc 查看文件


二進制
__pycache__/cosin_similarity.cpython-39.pyc 查看文件


二進制
__pycache__/docx_extract.cpython-39.pyc 查看文件


二進制
__pycache__/flask_server.cpython-39.pyc 查看文件


二進制
__pycache__/glm_utils.cpython-39.pyc 查看文件


二進制
__pycache__/insert_history_data_total.cpython-39.pyc 查看文件


二進制
__pycache__/main1.cpython-39.pyc 查看文件


二進制
__pycache__/model_scope.cpython-39.pyc 查看文件


二進制
__pycache__/mysql_pool.cpython-36.pyc 查看文件


二進制
__pycache__/mysql_pool.cpython-39.pyc 查看文件


+ 78
- 0
baidu.py 查看文件

@@ -0,0 +1,78 @@
# 填充API Key与Secret Key
import requests
import json
import pandas as pd
import re


# Fetch a Baidu OAuth access token for subsequent API calls.
def get_access_token():
    """Request an access token from Baidu's OAuth endpoint.

    Returns the ``access_token`` string from the JSON response, or None
    if the response carries no such key.
    NOTE(review): the app key/secret are hard-coded; move them to config.
    """
    app_key = "0nbZsMNAWGCU7rLp6olAXVUG"
    app_secret = "gWgVIEMpf85npswY0XahUncx6aZGa8e3"
    token_url = f"https://aip.baidubce.com/oauth/2.0/token?client_id={app_key}&client_secret={app_secret}&grant_type=client_credentials"

    request_headers = {
        'Content-Type': 'application/json',
        'Accept': 'application/json'
    }
    # The endpoint ignores the body; an empty JSON payload is sent for form.
    reply = requests.post(token_url, headers=request_headers, data=json.dumps(""))
    return reply.json().get("access_token")


# Use Baidu ERNIE (wenxinworkshop) chat to score text similarity.
def CallResult(prompt):
    """Send *prompt* to the ERNIE chat endpoint and extract a numeric score.

    Returns (number, raw_answer) when the first number in the answer can be
    parsed, otherwise (0, "") — or (0, "查询结果错误") on a non-200 response.
    NOTE(review): the access token is hard-coded and carries an expiry
    timestamp; it will stop working and should be refreshed via
    get_access_token().
    """
    token = "24.b7829fa01d73e3a4187c73fe7e27316c.2592000.1696475561.282335-38769936"
    headers = {
        'Content-Type': 'application/json',
    }
    data = json.dumps({
        'temperature': 0.1,
        'top_p': 0.8,
        "penalty_score": 1.0,
        'messages': [
            {
                "role": "user",      # asking role
                "content": prompt    # question content
            }
        ],
    })
    # print("ChatGLM prompt:",prompt)
    # Call the API
    response = requests.post(f"https://aip.baidubce.com/rpc/2.0/ai_custom/v1/wenxinworkshop/chat/completions?access_token={token}", headers=headers, data=data)
    if response.status_code != 200:
        return 0, "查询结果错误"
    resp = response.json()
    print(f"查重结果: {resp['result']}")
    # Pull the first integer/decimal out of the free-text answer.
    pattern = r"\d+\.?\d*"
    nums = re.findall(pattern, resp['result'])
    if len(nums) > 0:
        print("提取到的数字:", nums[0])
        n = float(nums[0])
        # NOTE(review): float() never returns None, so this branch is dead.
        if n is None:
            return 0, ""
        # resp['result'] = str(resp['result']).replace("\n", " ")
        # prompt = prompt.replace("\n", " ")
        # with open('train.json', 'a') as file:
        #     file.write("{" + f""""content": "{prompt}","summary": "{resp['result']}" """ + "}\n")
        return n, resp['result']
    return 0, ""


# Reformat a Baidu answer for training data (currently a stub).
def format_data(prompt, result):
    """Truncate 'train.json' and write nothing.

    *prompt* and *result* are currently unused; the function only resets
    the training file. Returns None.
    """
    with open('train.json', 'w') as handle:
        handle.write("")


if __name__ == '__main__':
    # Manual smoke test: fetch a token, then run one similarity query.
    access_token = get_access_token()
    print(access_token)
    # NOTE: CallResult returns a (score, raw_answer) tuple.
    result = CallResult("告诉我下面两段话的重复率百分比是多少:1. 城市总体态势主要从平安指数、生态环保、实有人口、重点人员、重点场所、防灾防控、宏观经济、城市管理、城市监测、事件统计的角度,展示丽水各项城市指标的运行情况。2. 为实现各模块的数据数量、数据接入、历史分析以及部门工作的情况,需要将各模块情况接入分析形成督办驾驶舱。数字化生成:根据市委市政府领导关心的驾驶舱数据生成和展示相关的运行指标,利用数据可视化技术,通过数据驾驶舱方式集中展示,让领导通过一屏,即可清晰掌握驾驶舱数据指标生成相关的实时情况。数字化生成相关的数据指标主要包括接入驾驶舱预警指标、优质指标、专题页面总数、数据指标总数、涉及部门总数、自动化率、涉及接口总数、采用数据直报的指标总数、数据直报完成率、延迟率、整改率、接口故障率、当前接口故障数场景应用相关统计、接入业务系统相关统计、等30余个统计维度指标。数字化督办:根据城市管理指挥中心工作人员针对每日会商工作推进过程中关心的相关指标,利用数据可视化技术,通过数据驾驶舱方式集中展示,通过数字化督办驾驶舱即可清晰掌握每日会商的工作成效和部门工作情况,方便城市管理指挥中心完善和优化每日会商工作。指标主要包括会商需关注指标数、指标批示数、事项批示数、交办情况、督办情况、部门应急预案相关统计、数字化督办相关指标、带班领导会商次数、议题数等不少于10个统计维度指标。")
    print("回答结果:", result)
    # Extract decimals/integers with a regex (disabled)
    # pattern = r"\d+\.?\d*"
    # nums = re.findall(pattern, result)
    # if len(nums) > 0:
    #     print("提取到的数字:", nums[0])

+ 38
- 0
co_rom.py 查看文件

@@ -0,0 +1,38 @@
# Demo script: rank candidate sentences by similarity to a source sentence
# using a ModelScope CoROM sentence-embedding pipeline.
import os
# Work around the "OMP: Error #15" crash when two OpenMP runtimes are loaded.
os.environ['KMP_DUPLICATE_LIB_OK']='True'


# Approach 1
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
# Candidate models:
# damo/nlp_corom_sentence-embedding_chinese-base-ecom
# damo/nlp_corom_sentence-embedding_chinese-base
# damo/nlp_corom_sentence-embedding_chinese-tiny
model_id = "damo/nlp_corom_sentence-embedding_chinese-base-ecom"
# model_id = "damo/nlp_gpt3_text-generation_chinese-base"
pipeline_se = pipeline(Tasks.sentence_embedding, model=model_id, max_length=1024)
#
# When the input contains "source_sentence" and "sentences_to_compare", the
# pipeline outputs the embedding of the first source sentence and of every
# comparison sentence, plus the similarity between the source sentence and
# each comparison sentence.
sentences_to_compare = [
    '''根据不同的季度、网红热点信息主动给用户推送热门景区和景点的游玩信息''',
    '''开发一个在线商品销售商城,提供旅游路线上的相关特产和旅游商品的查阅及下单功能''',
    '''推送相关旅游产品折扣信息以及景区景点举办的篝火晚会、烟花盛宴等活动内容,可通过线下报名参与。'''
]
inputs = {
    "source_sentence": [
        # '''模块功能:提供商家信息查询和种植区域查询等功能。功能描述:商家信息包括名称、地址、法人、联系方式、经营范围、投资规模等,种植区域主要是指丽水香茶种植区域的面积、位置、高程等信息,政府工作人员可根据多个查询条件完成上述信息的查询。'''
        '''展示乡村游中相关乡村举办的庆典活动,包含庆典举办时间、内容等'''
    ],
    "sentences_to_compare": sentences_to_compare
}

result = pipeline_se(input=inputs)
print(result["scores"])

# Report the best-matching comparison sentence.
arr = result["scores"]
max_value = max(arr)
max_index = arr.index(max_value)
print("最大值:", max_value)
print("最相识内容:", sentences_to_compare[max_index])
# #
#

+ 88
- 0
cos_demo.py 查看文件

@@ -0,0 +1,88 @@
import jieba
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity, pairwise_distances
import math

# Register domain terms so jieba keeps each as a single token.
jieba.suggest_freq('以太坊', True)
jieba.suggest_freq('区块链', True)
jieba.suggest_freq('数字货币', True)
jieba.suggest_freq('将于', True)
jieba.suggest_freq('人人网', True)
jieba.suggest_freq('比特币', True)
jieba.suggest_freq('北上广', True)
jieba.suggest_freq('大数据', True)
jieba.suggest_freq('云计算', True)
jieba.suggest_freq('公有链', True)
# Load the stop-word list.
stpwrdpath = "./stop_words.utf8"
stpwrd_dic = open(stpwrdpath, 'rb')
stpwrd_content = stpwrd_dic.read()
# Convert the stop-word table into a list.
# NOTE(review): the file is opened in binary mode, so these elements are
# bytes, not str — confirm downstream consumers expect that.
stpwrdlst = stpwrd_content.splitlines()
stpwrd_dic.close()
# vector = TfidfVectorizer(stop_words=stpwrdlst)


def get_xls_data():
    """Pairwise TF-IDF cosine-similarity check between two text corpora.

    Builds (text_a, text_b) pairs from two hard-coded corpora keyed by
    "content", segments each side with jieba, vectorizes with TF-IDF and
    reports every pair whose cosine similarity exceeds 0.7, then prints a
    recognition-rate summary.
    """
    # Source data (previously loaded from an Excel audit sheet).
    content_ls_1 = [("content", """通过本项目的实施,可以真实贯彻以人民为中心的发展思想,按 照政府办事“一件事”全流程“最多跑一次”的理念和目标,深化“四 张清单一张网”改革,从与群众和卫生健康监管对象关系最紧密的领 域和事项做起,充分运用“互联网+智慧监管”和大数据,促进卫生 健康领域风险监管创新,使群众和企业对综合行政执法改革的获得感 明显增强、综合行政执法效率明显提升、卫生健康领域环境进一步改 善,着力防范化解卫生健康领域重大风险,维护人民群众健康权益""")]
    content_ls_2 = [("content", """建成政府侧应用和企业侧应用,实现政府、工商联、商会、企业一体化协同应用,助力工商联全面摸排“浙江人经济”的底数,精准掌握省外浙商重点企业、产业、产业链以及省外浙江商会的情况,加强对在外浙商企业的日常联系和服务覆盖,以乡情为纽带,有效发挥在外浙商的产业优势、技术优势、市场优势、资源优势,抢抓国内大循环的制高点,推动产业链招商、精准靶向招商,开展政策实施情况第三方评估,促进浙江人经济与浙江经济融合发展,助力我省高质量发展建设共同富裕示范区。""")]
    # Pair up entries that share the same key.
    content_ls = []
    for x in content_ls_1:
        for y in content_ls_2:
            if x[0] == y[0]:
                content_ls.append((x[1], y[1]))

    print("语料长度:" + str(len(content_ls)))
    similarity_length = 0
    for x in content_ls:
        # Vectorize both segmented texts together so they share a vocabulary.
        vector = TfidfVectorizer(max_df=10, min_df=1)
        tfidf = vector.fit_transform([get_jieba_doc(x[0]), get_jieba_doc(x[1])])
        new_cosine_similarity = cosine_similarity(tfidf).tolist()
        # Off-diagonal entry [0][1] is the similarity between the two texts.
        if new_cosine_similarity[0][1] > 0.7:
            print(cosine_similarity(tfidf))
            print("相似文本为:" + x[0] + " ||||| " + x[1])
            print("==================")
            similarity_length = similarity_length + 1

    print("相似语料长度:" + str(similarity_length))
    # Bug fix: the original `("...%s" % ratio)*100 + "%"` repeated the whole
    # message 100 times due to operator precedence; also guard empty corpora.
    if content_ls:
        print("相似度识别成功率:%s%%" % (similarity_length / len(content_ls) * 100))


def get_jieba_doc(document):
    """Segment *document* with jieba and join the tokens with spaces.

    Returns the space-joined token string, or None if joining fails.
    """
    document_cut = jieba.cut(document)
    try:
        return " ".join(document_cut)
    except Exception as e:
        # Bug fix: Exception has no `.message` attribute in Python 3;
        # printing it raised AttributeError and masked the real error.
        print(e)


# Cosine of the angle between consecutive difference vectors of two series.
def VectorCosine(x, y):
    """For each interior index i in range(1, len(x) - 2), compute the cosine
    between the 2-D vectors (x[i]-x[i-1], y[i]-y[i-1]) and
    (x[i+1]-x[i], y[i+1]-y[i]). Returns the list of cosines
    ([] when the inputs are too short).
    """
    cosines = []
    for idx in range(1, len(x) - 2):
        dx_prev = x[idx] - x[idx - 1]
        dx_next = x[idx + 1] - x[idx]
        dy_prev = y[idx] - y[idx - 1]
        dy_next = y[idx + 1] - y[idx]
        dot = dx_prev * dx_next + dy_prev * dy_next
        norm = math.sqrt(dx_prev ** 2 + dy_prev ** 2) * math.sqrt(dx_next ** 2 + dy_next ** 2)
        cosines.append(dot / norm)

    return cosines


if __name__ == '__main__':
    # Run the similarity demo against the hard-coded corpora.
    get_xls_data()

+ 129
- 0
cosin_similarity.py 查看文件

@@ -0,0 +1,129 @@
# coding=utf-8
import re
import html
import jieba
import jieba.analyse
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


# Load the stop-word list once at import time.
# NOTE(review): this file handle is never closed; consider a `with` block.
stopwords = open('stop_words.utf8', encoding='utf8')
stopword_list = [k.strip() for k in stopwords.readlines() if k.strip() != '']


def replace_tongyici(keywords):
    """Replace every keyword with its canonical synonym.

    tongyici_tihuan.txt holds one synonym group per line, space-separated,
    with the canonical word first. Returns a new list in which every keyword
    found in a group is replaced by that group's canonical word; unknown
    keywords pass through unchanged.
    """
    # Build synonym -> canonical-word mapping.
    combine_dict = {}
    # Bug fix: the original left the file handle open; use a context manager.
    with open("tongyici_tihuan.txt", "r") as f:
        for line in f:
            seperate_word = line.strip().split(" ")
            for synonym in seperate_word[1:]:
                combine_dict[synonym] = seperate_word[0]

    # dict.get collapses the original redundant if/else branches.
    return [combine_dict.get(word, word) for word in keywords]

class CosineSimilarity(object):
    """
    Cosine similarity between two texts.

    Usage: CosineSimilarity(text_a, text_b).main() returns
    (similarity, keywords_a, keywords_b).
    """
    def __init__(self, content_x1, content_y2):
        # The two texts to compare.
        self.s1 = content_x1
        self.s2 = content_y2

    @staticmethod
    def extract_keyword(seq_str):  # extract keywords
        """Strip HTML, segment with jieba (full mode) and return
        ([], keywords) where keywords are the top-500 TF-IDF terms minus
        stop words.
        NOTE(review): the first tuple element is always an empty list; the
        raw segmentation is discarded."""
        # Strip HTML tags with a regex.
        re_exp = re.compile(r'(<style>.*?</style>)|(<[^>]+>)', re.S)
        content = re_exp.sub(' ', seq_str)
        # Decode HTML entities.
        content = html.unescape(content)
        # Segment; the custom dictionary is (re)loaded on every call.
        jieba.load_userdict("user_dict.txt")
        seg = [i for i in jieba.cut(content, cut_all=True) if i != '']
        # keywords = [k for k in jieba.cut(content, cut_all=True) if k != ' ' and k != '' and k not in stopword_list]
        keywords = [k for k in jieba.analyse.extract_tags("|".join(seg), topK=500, withWeight=False) if k != ' ' and k != '' and k not in stopword_list]

        # keywords = replace_tongyici(keywords)
        # Keyword extraction variants (disabled):
        # keywords = jieba.analyse.extract_tags("|".join(seg), topK=500, withWeight=False, allowPOS=('n', 'nr', 'ns'))
        # keywords = jieba.analyse.extract_tags(content, topK=2000, withWeight=False)
        # print(keywords)
        # return keywords
        return [],keywords

    @staticmethod
    def one_hot(word_dict, keywords):  # "one-hot" encoding — actually term counts
        """Encode *keywords* as a count vector positioned by *word_dict*."""
        # cut_code = [word_dict[word] for word in keywords]
        cut_code = [0]*len(word_dict)
        for word in keywords:
            cut_code[word_dict[word]] += 1
        return cut_code


    def main(self):
        """Compute cosine similarity between the two texts.

        Returns (similarity, keywords1, keywords2); similarity is 0.0 when
        sklearn raises (e.g. both keyword vectors are empty).
        """
        # Remove stop words (disabled; filtering happens in extract_keyword).
        # jieba.analyse.set_stop_words('stop_words.utf8')

        # Extract keywords for both texts.
        seg1,keywords1 = self.extract_keyword(self.s1)
        seg2,keywords2 = self.extract_keyword(self.s2)

        # Union of the two keyword sets defines the vector space.
        union = set(keywords1).union(set(keywords2))
        # union = set(seg1).union(set(seg2))

        # Assign an index to every word of the union.
        word_dict = {}
        i = 0
        for word in union:
            word_dict[word] = i
            i += 1
        # # Count-vector encoding.
        s1_cut_code = self.one_hot(word_dict, keywords1)
        s2_cut_code = self.one_hot(word_dict, keywords2)
        # s1_cut_code = self.one_hot(word_dict, seg1)
        # s2_cut_code = self.one_hot(word_dict, seg2)

        # stopwords = open('stop_words.utf8', encoding='utf8')
        # stopword_list = [k.strip() for k in stopwords.readlines() if k.strip() != '']
        # stopwords.close()

        # vector = TfidfVectorizer(max_df=10, min_df=1)
        # tfidf = vector.fit_transform([" ".join(keywords1), " ".join(keywords2)])

        # Cosine similarity; guard against zero vectors / sklearn errors.
        sample = [s1_cut_code, s2_cut_code]
        try:
            sim = cosine_similarity(sample)
            # sim = cosine_similarity(tfidf).tolist()
            return sim[1][0],keywords1,keywords2
        except Exception as e:
            print(e)
            return 0.0,keywords1,keywords2


# Manual test
if __name__ == '__main__':
    # with open(r'D:\pythonDM\Ndkj\live111\result\1.txt', encoding='UTF-8') as x, open(r'D:\pythonDM\Ndkj\live111\result\2.txt', encoding='UTF-8') as y:
    #     content_x = x.read()
    #     content_y = y.read()
    content_x = """中英文双语版本开发建设,为平台提供国际化能力,对平台APP端所有功能菜单以及所有官方维护内容进行中英翻译,实现中英双语的APP版本,同时提供版本一键切换功能,提升一机游丽水平台服务的全面性,将一机游丽水打造成全国智慧文旅平台领域专业、专注、领先的范本。"""
    content_y = """(1)诉求受理、分流功能: 用户可以对进入统一受理中心的诉求信息进行识别,对有效且需要分流的诉求进行受理、分派操作。操作后,诉求自动进入下一个流程环节,操作后信息状态变为无效信息。对应的诉求状态变化会同步通知诉求来源系统。 (2)诉求结案回复、设为无效功能 用户对进入统一受理中心的诉求信息进行识别,对可以直接答复的信息进行回复并结案的操作,操作后诉求会自动结案。如诉求信息无效,则可以对其信息不受理操作,操作后信息状态变为无效信息。对应的诉求状态变化会同步通知诉求来源系统。 诉求流转跟踪视图用户可在统一受理中心的工作台上看到已分派的系统列表,信息详情中会展示该诉求的处理流程,内部和外部系统的处理过程都可以看到,方便用户掌握诉求的进展以便对诉求流转进行跟踪。 (3)自动分类、分流: 统一受理中心通过大数据分析,对诉求内容的语义解析算法,提取出该诉求的事件分类自动填充到分流信息中,再通过事项清单配置,将负责该类型事件的处理对象系统自动填充到分流信息中。用户只需核对系统填充信息即可实现一键分派。 (4)自动区分无效信息: 统一受理中心通过大数据分析,对诉求内容的语义解析算法,将疑似无效内容的诉求信息标记出来,提供用户判断的依据,提高用户处理业务的效率。"""
    similarity = CosineSimilarity(content_x, content_y)
    # similarity = CosineSimilarity(file, file2)
    # main() returns a (score, keywords_x, keywords_y) tuple.
    similarity = similarity.main()
    print(similarity)

+ 104
- 0
docx_extract.py 查看文件

@@ -0,0 +1,104 @@
import os
import docx
import requests
import mysql_pool
from pymysql.converters import escape_string


def read_docx(file_path):
    """Parse a project-proposal .docx and persist its function modules.

    Captures the project name from the '项目名称' section, collects the
    paragraphs between '3.1.2 建设内容' and '3.2 整体架构设计' (Heading
    paragraphs become module names, body paragraphs their descriptions),
    inserts each module into user_history_module_data, and returns the
    collected body paragraphs joined with newlines.
    """
    mysql = mysql_pool.ConnMysql()
    # print(os.path.abspath('丽水市本级信息化项目建设方案模板.docx'))
    # # 通过url获取文件 http://jobapi.ningdatech.com/prometheus-yw_file_service/files/20240116/5a75cb43d17d4f1589d455d21547ab0c.doc
    # url = "http://jobapi.ningdatech.com/prometheus-yw_file_service/files/20240919/669f323c5c824f89a34bf04a66105902.doc"
    # file_name = "丽水市本级信息化项目建设方案模板.docx"
    # file_path = os.path.join("temp", file_name)
    try:
        # r = requests.get(url)
        # with open(file_path, "wb") as code:
        #     code.write(r.content)

        # # Convert the file format (disabled)
        # convert_doc_to_docx(file_path, file_path.replace('.doc', '.docx'))
        # file_path = file_path.replace('.doc', '.docx')
        # Read the document.
        # doc = docx.Document(os.path.abspath(file_path))
        doc = docx.Document(file_path)
        # Flag: currently inside the '建设内容' section.
        is_acquire = 0
        # Flag: currently inside the '项目名称' section.
        is_project_name = 0
        content = []
        # Module name -> description text.
        feature_map = {}
        # Name of the module currently being filled.
        feature_name = ""
        # Project name.
        xmmc = ""
        for para in doc.paragraphs:
            style = para.style.name
            print(f"style: {para.style.name}, value: {para.text}")
            # NOTE(review): `== 1` only skips styles where 'toc' appears at
            # index 1; a table-of-contents filter usually wants `!= -1`.
            if str(style).find('toc') == 1:
                continue
            # Track the project-name section boundaries.
            if para.text.find('项目名称') != -1:
                is_project_name = 1
            elif para.text.find('项目类型') != -1:
                is_project_name = 0
            if is_project_name == 1:
                if str(style).find('Heading') == -1 and str(style).find('toc') == -1:
                    xmmc = para.text

            # Track the '建设内容' section boundaries.
            if para.text == '3.1.2 建设内容':
                is_acquire = 1
            elif para.text == '3.2 整体架构设计':
                is_acquire = 0
            if is_acquire == 1:
                if str(style).find('Heading') == -1:
                    # Body paragraph: store it as the current module's text.
                    # NOTE(review): a multi-paragraph description keeps only
                    # its first paragraph under the module name; subsequent
                    # paragraphs are stored under the "" key.
                    # print(f"content: {para.text}, style: {para.style.name}")
                    feature_map[feature_name] = para.text
                    # Reset the current module name.
                    feature_name = ""
                    content.append(para.text)
                else:
                    # Heading paragraph: start a new module entry.
                    feature_map[para.text] = ""
                    feature_name = para.text

        # Persist every real module (skip the section title and the "" key).
        for key, value in feature_map.items():
            if key != "3.1.2 建设内容" and key != "":
                print(f"Key: {key}, Value: {value}")
                # Store the module description.
                # NOTE(review): SQL built via %-interpolation (mitigated by
                # escape_string); prefer parameterized queries.
                mysql.sql_change_msg(
                    """insert into user_history_module_data(xmmc,gnmc,gnms,line, remark) value("%s", "%s", "%s", "%s", "%s")""" % (
                        escape_string(xmmc), escape_string(key), escape_string(value), "", "自动拆解导入"))


    finally:
        # File deletion itself is disabled; only the log line remains.
        # os.remove(file_path)
        print("删除文件")

    return "\n".join(content)


def convert_doc_to_docx(doc_file, docx_file):
    """Naive .doc -> .docx "conversion".

    NOTE(review): this reads the raw binary .doc content and writes those
    bytes into a single paragraph of a fresh .docx — it does not perform a
    real format conversion. The source file is deleted in ``finally`` even
    when conversion fails, which loses the original on error.
    """
    try:
        if doc_file.endswith('.doc'):
            # Create a new .docx document.
            docx_document = docx.Document()

            # Read the raw .doc bytes.
            with open(doc_file, 'rb') as doc:
                content = doc.read()

            # Write the bytes into the .docx as one paragraph.
            docx_document.add_paragraph(content)

            # Save the .docx file.
            docx_document.save(docx_file)
    finally:
        os.remove(doc_file)


# file_path = "丽水市本级信息化项目建设方案模板.docx"
# doc_content = read_docx()
# print(doc_content)


+ 54
- 0
flask_server.py 查看文件

@@ -0,0 +1,54 @@
# coding=utf-8
from flask import Flask, redirect, url_for, request
from flask import jsonify

import docx_extract
import mysql_pool
import main1

# import xm
# from xm import xsd

app = Flask(__name__)

# mysql = mysql_pool.ConnMysql()


# Trigger a duplicate check; project file paths are stored per project row.
@app.route('/check/duplicates/<projectId>')
def success(projectId):
    """Run duplicate checking for one project, or all when projectId == 0.

    Loads (project_id, file_path, project_name) tuples from idc_project and
    hands them to main1.project_check; returns the raw rows as JSON.
    NOTE(review): projectId is interpolated into the SQL with %-formatting —
    use a parameterized query to avoid SQL injection. int(projectId) raises
    ValueError (HTTP 500) for non-numeric input.
    """
    # file_type = request.args.get('fileType', 'excel')
    mysql = mysql_pool.ConnMysql()
    if int(projectId) == 0:
        data = mysql.sql_select_many("""select * from idc_project""")
    else:
        data = mysql.sql_select_many("""select * from idc_project where project_id=%s""" % projectId)
    print(data)
    data_list = []

    for ap in data:
        # if os.path.exists(ap.get("file_path")):
        data_list.append((ap.get("project_id"), ap.get("file_path"), ap.get("project_name")))

        # data_list.append((ap.get("project_id"), "C:/Users/PoffyZhang/Desktop/花园云(城市大脑)数字驾驶舱20230202.xls", ap.get("project_name")))
    mysql.release()
    # print(data_list)
    main1.project_check(data_list)
    return jsonify({"code": 0, "data": data})


# Automatically extract docx content.
@app.route('/check/docx/save', methods=['POST'])
def docx_save():
    """POST {"file_path": ...}: parse the .docx at file_path and store its
    function modules via docx_extract.read_docx; echoes the payload back.
    NOTE(review): a payload without "file_path" raises KeyError (HTTP 500).
    """
    data = request.get_json()
    if data["file_path"] != "":
        docx_extract.read_docx(data["file_path"])
    return jsonify({"code": 0, "data": data})


# Serve the API: rows come from idc_project (e.g. project_id=11), and the
# file at each row's file_path is the data to process.
if __name__ == '__main__':
    app.run(host="0.0.0.0", port=19099)
    # insert_history_data_total.update_desc()
    # NOTE(review): app.run() blocks — this print only executes after the
    # server shuts down.
    print("run server ...")
    # app.run(port=19097)

+ 181
- 0
glm_utils.py 查看文件

@@ -0,0 +1,181 @@
import requests
import json
import re


def CallResult(prompt):
    """Call the private GLM "fpExtract" channel with *prompt*.

    Returns the service's ``data`` payload on HTTP 200, otherwise the error
    string "查询结果错误".
    NOTE(review): the auth cookie is hard-coded; move it to configuration.
    """
    headers = {
        'Content-Type': 'application/json',
        'Cookie': 'MODEL_USER_AUTH=4879b93405ebb89cad144590f0a4873f#3',
    }
    data = json.dumps({
        'reqSource': "private",
        'reqParams': {
            'busCode': "fpExtract",
            'degree': "low",
            'fpRequire': prompt,
        },
    })
    # print("ChatGLM prompt:",prompt)
    # Call the API.
    response = requests.post("http://81.70.174.229:9000/api/serveChannel",headers=headers,data=data)
    if response.status_code != 200:
        return "查询结果错误"
    resp = response.json()
    return resp['data']


# Call the GLM name-extraction endpoint.
def CallResultNew(prompt):
    """Extract feature names for *prompt* via the GLM HTTP service.

    Returns the service's ``data`` payload, "查询结果错误" on a non-200
    response, or ``None`` implicitly when *prompt* is "".
    NOTE(review): *prompt* is interpolated into the URL unencoded — '&', '#'
    or spaces in the text will corrupt the query string; consider
    urllib.parse.quote.
    """
    headers = {
        'Content-Type': 'application/json',
        'Cookie': 'MODEL_USER_AUTH=92962ed4181f5221b20faaad1c42b3b8#3',
    }
    # Call the API.
    if prompt != "":
        url = f'http://81.70.174.229:8090/smarty/fpNameExtract?fpRequire={prompt}&modelType=llm'
        response = requests.get(url, headers=headers)
        if response.status_code != 200:
            return "查询结果错误"
        resp = response.json()
        return resp['data']


def CallContentResult(prompt):
    """Sentence-by-sentence action/object extraction via the GLM channel.

    Splits *prompt* on '。', sends each non-empty sentence to the fpExtract
    service, and accumulates "actList|objList -> " fragments. Returns the
    accumulated string, or "查询结果错误" if any request fails.
    """
    content = ""
    seqs = re.split("。", prompt)
    for seq_ele in seqs:
        if seq_ele != '':
            headers = {
                'Content-Type': 'application/json',
                'Cookie': 'MODEL_USER_AUTH=4879b93405ebb89cad144590f0a4873f#3',
            }
            data = json.dumps({
                'reqSource': "private",
                'reqParams': {
                    'busCode': "fpExtract",
                    'degree': "low",
                    # Strip newlines and spaces before sending.
                    'fpRequire': seq_ele.replace("\n", "").replace(" ", ""),
                },
            })
            # Call the API.
            response = requests.post("http://81.70.174.229:9000/api/serveChannel", headers=headers, data=data)
            if response.status_code != 200:
                return "查询结果错误"
            resp = response.json()
            glm_data = resp['data']
            print(f'glm_data = {glm_data}')
            act_list_str = ",".join(glm_data["actList"])
            obj_list_str = ",".join(glm_data["objList"])
            content = content + act_list_str + "|" + obj_list_str + " -> "

    print(content)
    return content


def CallContentResultNew(prompt):
    """Sentence-by-sentence name extraction via the GLM HTTP service.

    Splits *prompt* on '。', queries the fpNameExtract endpoint for each
    non-empty sentence and concatenates the extracted names. Returns the
    concatenated names with "<br/>" replaced by ",", or the error string
    "查询结果错误" if any request fails.
    """
    content = ""
    seqs = re.split("。", prompt)
    for seq_ele in seqs:
        if seq_ele != '':
            headers = {
                'Content-Type': 'application/json',
                'Cookie': 'MODEL_USER_AUTH=92962ed4181f5221b20faaad1c42b3b8#3',
            }
            # Bug fix: query the current sentence (seq_ele) rather than the
            # whole prompt — the original re-sent the full text once per
            # sentence, duplicating every extracted name N times.
            url = f'http://81.70.174.229:8090/smarty/fpNameExtract?fpRequire={seq_ele}&modelType=llm'
            response = requests.post(url, headers=headers)
            if response.status_code != 200:
                return "查询结果错误"
            resp = response.json()
            glm_datas = resp['data']
            for glm_data in glm_datas:
                name = glm_data["name"]
                content += name
    content = content.replace("<br/>", ",")
    return content


def AutoDLResult(prompt):
    """Similarity scoring via a locally hosted AutoDL model — disabled.

    The HTTP implementation below is commented out; the stub always returns
    the neutral result (0, "") regardless of *prompt*.
    """
    # Disabled implementation kept for reference:
    # prompt = prompt.replace("\n", " ")
    # url = f"http://10.100.148.24:8000"
    # payload = json.dumps({'prompt': prompt})
    # headers = {'Content-Type': 'application/json', 'Accept': 'application/json'}
    # response = requests.request("POST", url, headers=headers, data=payload)
    # desc = response.json().get("response")
    # nums = re.findall(r"\d+\.?\d*", desc)
    # if len(nums) > 0:
    #     n = float(nums[0])
    #     return n, filter_emoji(desc)
    return 0, ""


def AutoDLResultNoNum(prompt):
    """Free-text answer from the local AutoDL model — disabled.

    The HTTP implementation below is commented out; the stub always returns
    an empty string regardless of *prompt*.
    """
    # Disabled implementation kept for reference:
    # prompt = prompt.replace("\n", " ")
    # url = f"http://10.100.148.24:8000"
    # payload = json.dumps({'prompt': prompt})
    # headers = {'Content-Type': 'application/json', 'Accept': 'application/json'}
    # response = requests.request("POST", url, headers=headers, data=payload)
    # return response.json().get("response")
    return ""


def qwenResult(sentence1, sentence2):
    """Compare two sentences via the local Qwen compare service.

    POSTs both sentences to http://127.0.0.1:5010/api/v1/compare and returns
    the JSON response's ``data`` field. Raises requests.ConnectionError when
    the local service is not running.
    """
    url = f"http://127.0.0.1:5010/api/v1/compare"
    payload = json.dumps({
        'sentence1': sentence1,
        'sentence2': sentence2,
    })
    headers = {
        'Content-Type': 'application/json',
        'Accept': 'application/json'
    }
    response = requests.request("POST", url, headers=headers, data=payload)
    desc = response.json().get("data")
    print(f"desc : {desc}")
    return desc



# Strip special emoji characters from text.
def filter_emoji(desstr, restr=''):
    """Remove astral-plane characters (emoji etc.) from *desstr*.

    Every code point in U+10000..U+10FFFF is replaced with *restr*
    (default: removed). On narrow builds where the wide range is invalid,
    falls back to matching UTF-16 surrogate pairs.
    """
    try:
        pattern = re.compile(u'[\U00010000-\U0010ffff]')
    except re.error:
        # Narrow-build fallback: match surrogate pairs instead.
        pattern = re.compile(u'[\uD800-\uDBFF][\uDC00-\uDFFF]')
    return pattern.sub(restr, desstr)


if __name__ == '__main__':
    # Scratch area for manual experiments with the helpers above.
    # text = "这是一个带有表情符号😊的字符"
    # text = filter_emoji(text)
    # content_x = '''会议管理,支持相关人员建立评审会议,评审会议和项目关联,并支持后续的专家抽取、结果确认等。会议建立完成后,相关人员可选择专家抽取模式,手动抽取是从专家库中手动筛选及选取专家参加评审。会议建立完成后,相关人员可选择专家抽取模式,自动抽取实现专家按标签自动抽取,抽取成功后生成会议通知、会议签到单。'''
    # content_y = '''1、专家库管理概述:实现和省级专家库对接,获取专家信息,并实现用户自行录入专家信息。同时要实现专家自定义抽取、抽取规则模板设置、在线短信与智能语音通知,专家赴会反馈等功能,继而实现专家线上管理。同时需要建立信创专家和信息化专家两个大类别,两个类别可重合。2、专家库基础信息:(1)统一的专家基础信息:构建统一的专家信息,同时与现有专家库进行数据同步,实现信息化项目专家信息的统一共享。支持多维度的专家查询通道。(2)标签管理:实现专家标签管理,为不同的专家贴上擅长领域标签,让专家擅长领域更加直观,让单位建设信息化项目有一个对应标签的专家支撑,为项目单位信息化项目保驾护航,减少单位信息化项目建设的风险。(3)专家卡片:专家卡片涵盖专家基本信息、专业标签信息以及参与信息化项目履职情况,形成一个动态的专家档案信息。(4)自定义设计:专家登录系统后可自行修改完善自身专家信息。信息需数管中心审核后方可生效。3、专家抽取管理:实现已有专家中,可根据不同类型筛选,并随机抽取对应专家,系统通过对接短信平台和智能语音平台来自动发送评审会议通知,专家实施反馈。(1)需在抽取管理中预设抽取规则模板:模板一:1000万预算以下项目,抽取5名评审专家,先抽取1名信创专家,待信创专家确认参会后再抽取4名信息化专家。模板二:1000万预算(含)以上项目抽取7名评审专家,先抽取1名信创专家,待信创专家确认参会后再抽取6名信息化专家。支持多轮抽取,已确保专家随机抽取的有效性。(2)专家评审通知规则:抽中的专家立即通过智能语音电话进行通知,专家选择参会后以短信形式再次发送项目评审会的时间、地址、项目名称信息给参会专家。如抽中专家接了电话未反馈是否参会或拒绝参会,系统自动进行下一轮抽取。如抽中的专家未接电话,10分钟后再拨打一次,如还未接,判断为不参会,系统自动进行下一轮抽取。直至抽满足够人数为止。(3)抽取规避原则:1)信创专家会和信息化专家有重叠,在信创专家中抽中的专家在后续信息化专家抽取中要规避。2)如有专家是在此申报单位中任职的,系统要自动规避抽取。(4)评审会信息修改与取消:需要实现评审会评审时间或地点修改,可以语音电话通知和短信通知相应专家的功能。需要实现评审会取消与通知,可以语音电话通知和短信通知相应专家的功能。(5)自动抽取失败处理:如专家自动抽取中发生反馈参会专家数量少于所需专家数量时,需要提实现手动指定专家参会的功能。(6)专家请假:提供专家评审会请假功能,已抽中并参加评审会的专家,如后续需要请假,可由两种方式进行申请:(1)专家电话联系评审会邀请人进行请假,评审会邀请人在系统中录入并确认该专家请假,邀请人可再次发起专家抽取进行补充专家。(2)专家登录系统进行申请请假,请假申请后系统自动再抽取一轮专家进行补充,并将变更信息通知评审会邀请人。4、专家履职: 可在系统中,查看专家参与评审的所有项目记录信息,并履行相关的项目审查工作。'''
    # str_con = AutoDLResultNoNum(f"告诉我下面两段话的重复率百分比是多少: \n第一段话是:'{content_x}', \n ----------------- \n 第二段话是:'{content_y}'")
    # print(str_con)
    # qwenResult("你好啊", "你真好")
    # string[6:7] picks the single character at index 6 — here the grade "高".
    string = "相似度等级:高 原因:这两段话都涉及到信息"
    print(string[6:7])

+ 410
- 0
insert_history_data_total.py 查看文件

@@ -0,0 +1,410 @@
# coding=utf-8

import re
import mysql_pool
from pymysql.converters import escape_string
import cosin_similarity
import pandas as pd
import glm_utils
import os
import json

# Mapping: Chinese dimension label -> database column code (pinyin initials).
wdys1 = {
    "项目名称": "xmmc",
    "现状问题": "xzwt",
    "系统基础": "xtjc",
    "项目目标": "xmmb",
    "预期绩效": "yqjx",
    "建设需求": "jsxq",
    "数据需求": "sjxq",
    "安全需求": "aqxq",
    "业务领域": "ywly",
    "核心业务": "hxyw",
    "业务需求": "ywxq",
    "业务协同": "ywxt",
    "建设层级": "jscj",
    "用户范围": "yhfw",
    "目标群体": "mbqt",
    "建设内容": "jsnr",
    "功能模块": "gnmk",
    "数据共享": "sjgx",
    "智能要素": "znys",
    "申报单位": "sbdw",
    "所属地区": "ssdq",
    "预算年度": "ysnd"
}
# Reverse mapping: column code -> Chinese dimension label.
wdys2 = {
    "xmmc": "项目名称",
    "xzwt": "现状问题",
    "xtjc": "系统基础",
    "xmmb": "项目目标",
    "yqjx": "预期绩效",
    "jsxq": "建设需求",
    "sjxq": "数据需求",
    "aqxq": "安全需求",
    "ywly": "业务领域",
    "hxyw": "核心业务",
    "ywxq": "业务需求",
    "ywxt": "业务协同",
    "jscj": "建设层级",
    "yhfw": "用户范围",
    "mbqt": "目标群体",
    "jsnr": "建设内容",
    "gnmk": "功能模块",
    "sjgx": "数据共享",
    "znys": "智能要素",
    "sbdw": "申报单位",
    "ssdq": "所属地区",
    "ysnd": "预算年度"
}
# Function-module sub-fields: column code -> Chinese label.
gnmkys = {
    "gnmc": "功能名称",
    "gnms": "功能描述"
}


def getFlag():
    """Read per-dimension weights from the audit spreadsheet.

    Parses column 1 of "0825-丽水系统查重维度.xlsx"; each cell is expected to
    look like "<维度>(<权重>%)". Returns {column_code: "(weight%)"} for
    every dimension label found in wdys1; unparseable rows are skipped.
    """
    data_dict = {}
    df = pd.read_excel("0825-丽水系统查重维度.xlsx")
    data = df.values
    # Column 1 holds the "<label>(<weight>%)" cells; drop the NaN rows.
    data = list(pd.Series(data[:, 1]).dropna())
    for d in data:
        try:
            wd = re.search("(.*?)(.*?%)", d).group(1).strip()
            wdc = wdys1.get(wd)
            if wdc:
                qz = re.search(".*?((.*?%))", d).group(1)
                data_dict[wdc] = qz
        except (AttributeError, TypeError):
            # Bug fix: was a bare `except:` (caught even KeyboardInterrupt).
            # AttributeError: no regex match (.group on None);
            # TypeError: non-string cell passed to re.search. Skip the row.
            pass
    return data_dict

# getFlag()

def gong_neng_mo_kuai(xmmc, mysql, dl, data, er_title, line):
    """Persist per-module descriptions and keywords for one project.

    Concatenates every third-column description (d[3]) belonging to each
    module name in *er_title*, inserts the result into
    user_history_module_data, and stores the module's extracted keywords in
    user_history_module_keywords. *dl* is currently unused.
    """
    # Concatenate all third-level descriptions in the sheet per module name.
    str_dict = {}
    for et in er_title:
        for d in data:
            if d[1] == et:
                if str_dict.get(et):
                    str_dict[et] = str_dict.get(et) + d[3]
                else:
                    str_dict[et] = d[3]
    for k, v in str_dict.items():
        mysql.sql_change_msg("""insert into user_history_module_data(xmmc,gnmc,gnms,line, remark) value("%s", "%s", "%s", "%s", "%s")""" % (
            escape_string(xmmc), escape_string(k), escape_string(v), line, ""))
        # Self-comparison is used purely to obtain the keyword list.
        similarity = cosin_similarity.CosineSimilarity(v, v)
        similarity, keywords_x, keywords_y = similarity.main()
        # str(...)[1:-1] strips the surrounding list brackets.
        mysql.sql_change_msg("""insert into user_history_module_keywords (xmmc,gnmc,gnms,line) value("%s" ,"%s", "%s", "%s")""" % (
            xmmc, escape_string(k), str(keywords_y)[1:-1], line))


def project_check(data_list, line):
    """Import one or more project spreadsheets into the history tables.

    data_list: iterable of (project_id, excel_path, project_name) tuples;
    line: batch marker stored with every inserted row.

    For each spreadsheet: derives the project name from the sheet header,
    concatenates cell text per dimension into str_dict, inserts one row into
    user_history_data, extracts keywords per dimension into
    user_history_keywords, then stores per-module data via gong_neng_mo_kuai.
    NOTE(review): values are interpolated into SQL with %-formatting
    (mitigated only by escape_string); prefer parameterized queries.
    """
    mysql = mysql_pool.ConnMysql()
    # Read dimensions and weights (disabled).
    # get_data_dict = getFlag()
    # Iterate over the spreadsheet paths.
    for dl in data_list:
        # path = "0825-丽水系统查重维度1.xlsx"
        # Read the excel file at this path.
        print(dl)
        df = pd.read_excel(dl[1])
        xmmc = df.keys()
        # print(type(xmmc[dup_file_test]))
        # The project name is the header of column 1.
        xmmc = xmmc[1]
        # print(type(xmmc))
        # xmmc1=''

        # NOTE(review): this condition is always True — the non-empty string
        # literals are truthy, only the last term tests membership. Harmless
        # here because str.replace is a no-op when the substring is absent.
        if "可研报告"or "可研性报告"or "可行性研究报告" in xmmc:
            xmmc = xmmc.replace('可研报告', '')
            xmmc = xmmc.replace('可研性报告', '')
            xmmc = xmmc.replace('可行性研究报告', '')
        # print(xmmc)
        data = df.values
        # Concatenate the content of every dimension in the sheet.
        join_str = ""
        str_dict = {}
        title = ""
        er_title = set()
        # Earlier variant (disabled):
        # for d in data:
        #     # print(d)
        #     if pd.notnull(d[0]):
        #         title = d[0]
        #         if title == "功能模块":
        #             er_title.add(d[dup_file_test])
        #         join_str = ""
        #         for i in d[dup_file_test:]:
        #             if pd.notnull(i):
        #                 join_str += i
        #         str_dict[wdys1.get(title)] = join_str
        #     else:
        #         if title == "功能模块":
        #             er_title.add(d[dup_file_test])
        #         for i in d[dup_file_test:]:
        #             if pd.notnull(i):
        #                 join_str += i
        #         str_dict[wdys1.get(title)] = str_dict.get(wdys1.get(title)) + join_str
        # print(str_dict)
        gnmk_str = []
        # print(data)
        for d in data:
            if pd.notnull(d[0]):
                # New dimension row: column 0 carries the dimension title.
                title = d[0]
                if title == "功能模块":
                    er_title.add(d[1])
                join_str = ""
                for i in d[1:]:
                    # print(type(i))
                    # i=str(i)
                    if pd.notnull(i):
                        join_str += str(i)
                        if title == "功能模块":
                            # for j in d[3:]:
                            # Skip the literal column header "功能描述".
                            if i == '功能描述':
                                continue
                            else:
                                gnmk_str.append(i)
                str_dict[wdys1.get(title)] = join_str
                # print(str_dict.get(wdys1.get(title)))
            else:
                # Continuation row: append to the current dimension.
                if title == "功能模块":
                    er_title.add(d[1])
                for i in d[3:]:
                    if pd.notnull(i):
                        join_str += str(i)
                        if title == "功能模块":
                            gnmk_str.append(i)
                str_dict[wdys1.get(title)] = str_dict.get(wdys1.get(title)) + join_str
        # gnmk="".join(gnmk_str)
        # str_dict['gnmk']=gnmk
        gnmk = ",".join(gnmk_str)
        str_dict['gnmk'] = gnmk
        # print(str_dict)
        # print(str_dict.get("xzwt")if str_dict.get("xzwt") else None)
        # print(str_dict.get('gnmk')if str_dict.get('gnmk')else None)
        # Insert the full dimension record for this project.
        mysql.sql_change_msg(
            """insert into user_history_data (xmmc, xzwt, xtjc, xmmb, yqjx, jsxq, sjxq, aqxq, ywly, hxyw, ywxq, ywxt, jscj, yhfw, mbqt, jsnr, gnmk, sjgx, znys,sbdw,ssdq,ysnd,line,remark) value ("%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s","%s","%s","%s","%s","%s")"""
            % (escape_string(xmmc),
               escape_string(str_dict.get("xzwt")) if str_dict.get("xzwt") else None,
               escape_string(str_dict.get("xtjc")) if str_dict.get("xtjc") else None,
               escape_string(str_dict.get("xmmb")) if str_dict.get("xmmb") else None,
               escape_string(str_dict.get("yqjx")) if str_dict.get("yqjx") else None,
               escape_string(str_dict.get("jsxq")) if str_dict.get("jsxq") else None,
               escape_string(str_dict.get("sjxq")) if str_dict.get("sjxq") else None,
               escape_string(str_dict.get("aqxq")) if str_dict.get("aqxq") else None,
               escape_string(str_dict.get("ywly")) if str_dict.get("ywly") else None,
               escape_string(str_dict.get("hxyw")) if str_dict.get("hxyw") else None,
               escape_string(str_dict.get("ywxq")) if str_dict.get("ywxq") else None,
               escape_string(str_dict.get("ywxt")) if str_dict.get("ywxt") else None,
               escape_string(str_dict.get("jscj")) if str_dict.get("jscj") else None,
               escape_string(str_dict.get("yhfw")) if str_dict.get("yhfw") else None,
               escape_string(str_dict.get("mbqt")) if str_dict.get("mbqt") else None,
               escape_string(str_dict.get("jsnr")) if str_dict.get("jsnr") else None,
               escape_string(str_dict.get("gnmk")) if str_dict.get("gnmk") else None,
               escape_string(str_dict.get("sjgx")) if str_dict.get("sjgx") else None,
               escape_string(str_dict.get("znys")) if str_dict.get("znys") else None,
               escape_string(str_dict.get("sbdw")) if str_dict.get("sbdw") else None,
               escape_string(str_dict.get("ssdq")) if str_dict.get("ssdq") else None,
               escape_string(str_dict.get("ysnd")) if str_dict.get("ysnd") else None,
               line, ""))
        # Extract keywords per dimension (self-comparison is used purely to
        # obtain the keyword list).
        project_gjc = {}
        for w in wdys2.keys():
            content_x = str_dict.get(w)
            content_y = str_dict.get(w)
            if content_x and content_y:
                # Iterate over every dimension.
                similarity = cosin_similarity.CosineSimilarity(content_x, content_y)
                # similarity score and keyword lists
                similarity, keywords_x, keywords_y = similarity.main()
                project_gjc[w] = keywords_y
        # str(...)[1:-1] strips the list brackets before storage.
        mysql.sql_change_msg(
            """insert into user_history_keywords (xmmc, xzwt, xtjc, xmmb, yqjx, jsxq, sjxq, aqxq, ywly, hxyw, ywxq, ywxt, jscj, yhfw, mbqt, jsnr, gnmk, sjgx, znys, line) value ("%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s")"""
            % (xmmc, str(project_gjc.get("xzwt"))[1:-1] if project_gjc.get("xzwt") else None,
               str(project_gjc.get("xtjc"))[1:-1] if project_gjc.get("xtjc") else None,
               str(project_gjc.get("xmmb"))[1:-1] if project_gjc.get("xmmb") else None,
               str(project_gjc.get("yqjx"))[1:-1] if project_gjc.get("yqjx") else None,
               str(project_gjc.get("jsxq"))[1:-1] if project_gjc.get("jsxq") else None,
               str(project_gjc.get("sjxq"))[1:-1] if project_gjc.get("sjxq") else None,
               str(project_gjc.get("aqxq"))[1:-1] if project_gjc.get("aqxq") else None,
               str(project_gjc.get("ywly"))[1:-1] if project_gjc.get("ywly") else None,
               str(project_gjc.get("hxyw"))[1:-1] if project_gjc.get("hxyw") else None,
               str(project_gjc.get("ywxq"))[1:-1] if project_gjc.get("ywxq") else None,
               str(project_gjc.get("ywxt"))[1:-1] if project_gjc.get("ywxt") else None,
               str(project_gjc.get("jscj"))[1:-1] if project_gjc.get("jscj") else None,
               str(project_gjc.get("yhfw"))[1:-1] if project_gjc.get("yhfw") else None,
               str(project_gjc.get("mbqt"))[1:-1] if project_gjc.get("mbqt") else None,
               str(project_gjc.get("jsnr"))[1:-1] if project_gjc.get("jsnr") else None,
               str(project_gjc.get("gnmk"))[1:-1] if project_gjc.get("gnmk") else None,
               str(project_gjc.get("sjgx"))[1:-1] if project_gjc.get("sjgx") else None,
               str(project_gjc.get("znys"))[1:-1] if project_gjc.get("znys") else None,
               line))

        # Store the per-module breakdown.
        gong_neng_mo_kuai(xmmc, mysql, dl, data, er_title, line)



def update_desc():
    """Backfill glm_desc for one project's module rows.

    Runs every module description (gnms) of the hard-coded project through
    the GLM name-extraction service and writes the result back into
    user_history_module_data.glm_desc.
    """
    mysql = mysql_pool.ConnMysql()
    module_list = mysql.sql_select_many("""select id, gnms from user_history_module_data where xmmc = '丽水市城市管理指挥中心信息系统(一期)项目'""")
    for module in module_list:
        # Extract key information via ChatGLM.
        gnms = module.get("gnms")
        content = glm_utils.CallContentResultNew(gnms)

        mysql.sql_change_msg(
            """UPDATE user_history_module_data SET glm_desc = "%s" WHERE id = %d""" % (
                content if content else None,
                module.get("id")))
        print(content)


def update_desc1():
    """Same as update_desc, but targets the gnms_gml table for the
    hard-coded driving-cockpit project."""
    mysql = mysql_pool.ConnMysql()
    module_list = mysql.sql_select_many("""select id, gnms from gnms_gml where xmmc = '丽水花园云(城市大脑)数字驾驶舱项目'""")
    for module in module_list:
        # Extract key information via ChatGLM.
        gnms = module.get("gnms")
        content = glm_utils.CallContentResultNew(gnms)

        mysql.sql_change_msg(
            """UPDATE gnms_gml SET glm_desc = "%s" WHERE id = %d""" % (
                content if content else None,
                module.get("id")))
        print(content)


def info_word_project():
    """Dump A/B construction-content (jsnr) pairs as JSONL training data.

    Splits the ``jsnr`` field of the target project and of each reference
    project on the '-----》' separator, then writes every cross-product
    pair as one instruction-tuning record to 其他-建设内容.json.
    """
    mysql = mysql_pool.ConnMysql()
    module_list1 = mysql.sql_select_many(
        """select jsnr from user_history_data where xmmc = '2023年丽水市云和县数字法治门户建设项目' """)
    module_list2 = mysql.sql_select_many(
        """select jsnr from user_history_data where xmmc IN ('浙江省第二监狱重点罪犯管控模型项目',
        '浙江省农村水电站管理数字化应用',
        '浙江省河湖库保护数字化应用建设项目',
        '浙江省环境遥感监测业务智治',
        '平台项目',
        '浙江林业智媒平台项目',
        '未来e家应用建设方案',
        '浙江省智慧林业云平台升级改造项目建设方案',
        '为侨服务“全球通”平台二期建设项目')""")
    json_objects = []
    for module_info1 in module_list1:
        for jsnr1_ele in module_info1["jsnr"].split('-----》'):
            for module_info2 in module_list2:
                for jsnr2_ele in module_info2["jsnr"].split('-----》'):
                    # FIX: renamed from `str`, which shadowed the builtin.
                    pair_text = "A:%s\nB:%s" % (jsnr1_ele, jsnr2_ele)
                    json_objects.append({
                        "instruction": "现在你是一个政府提案的查重检查人员,给定两段话A和B:让我们一步步思考并判断是否相似。请以相似度从高、中、低三个等级进行评价,并给出理由。",
                        "input": pair_text,
                        "output": ""
                    })
    # FIX: explicit encoding so the JSONL is UTF-8 on every platform.
    with open('其他-建设内容.json', 'w', encoding='utf-8') as f:
        for json_obj in json_objects:
            # One JSON object per line (JSONL).
            f.write(json.dumps(json_obj, ensure_ascii=False) + '\n')



def info_word1():
    """Dump A/B function-description (gnms) pairs as JSONL training data.

    Pairs every module description of the target project with every module
    description of the reference projects and writes one instruction-tuning
    record per pair to 其他-功能模块对比.json.
    """
    mysql = mysql_pool.ConnMysql()
    module_list1 = mysql.sql_select_many("""select gnms from user_history_module_data where xmmc = '丽水市遂昌县政法委数字法治综合应用' """)
    module_list2 = mysql.sql_select_many("""select gnms from user_history_module_data where xmmc IN ('浙江省第二监狱重点罪犯管控模型项目',
    '浙江省农村水电站管理数字化应用',
    '浙江省河湖库保护数字化应用建设项目',
    '浙江省环境遥感监测业务智治',
    '平台项目',
    '浙江林业智媒平台项目',
    '未来e家应用建设方案',
    '浙江省智慧林业云平台升级改造项目建设方案',
    '为侨服务“全球通”平台二期建设项目')""")

    json_objects = []
    for module_info1 in module_list1:
        for module_info2 in module_list2:
            # FIX: renamed from `str`, which shadowed the builtin.
            pair_text = "A:%s\nB:%s" % (module_info1["gnms"], module_info2["gnms"])
            json_objects.append({
                "instruction": "现在你是一个政府提案的查重检查人员,给定两段话A和B:让我们一步步思考并判断是否相似。请以相似度从高、中、低三个等级进行评价,并给出理由。",
                "input": pair_text,
                "output": ""
            })

    # FIX: explicit encoding so the JSONL is UTF-8 on every platform.
    with open('其他-功能模块对比.json', 'w', encoding='utf-8') as f:
        for json_obj in json_objects:
            # One JSON object per line (JSONL).
            f.write(json.dumps(json_obj, ensure_ascii=False) + '\n')


def info_word_project_yw():
    """Dump A/B jsnr pairs against the digital-rule-of-law project set.

    Same shape as :func:`info_word_project` but compares against the
    law-domain reference projects; output file name is shared.
    """
    mysql = mysql_pool.ConnMysql()
    module_list1 = mysql.sql_select_many(
        """select jsnr from user_history_data where xmmc = '2023年丽水市云和县数字法治门户建设项目' """)
    module_list2 = mysql.sql_select_many(
        """select jsnr from user_history_data where xmmc IN ('2023年丽水市云和县数字法治门户建设项目', '浙江省司法厅全域数字法治监督应用系统(一期)', '丽水市遂昌县政法委数字法治综合应用', '丽水市龙泉市政法委法治龙泉门户', '庆元县数字法治综合门户')""")
    json_objects = []
    for module_info1 in module_list1:
        for jsnr1_ele in module_info1["jsnr"].split('-----》'):
            for module_info2 in module_list2:
                for jsnr2_ele in module_info2["jsnr"].split('-----》'):
                    # FIX: renamed from `str`, which shadowed the builtin.
                    pair_text = "A:%s\nB:%s" % (jsnr1_ele, jsnr2_ele)
                    json_objects.append({
                        "instruction": "现在你是一个政府提案的查重检查人员,给定两段话A和B:让我们一步步思考并判断是否相似。请以相似度从高、中、低三个等级进行评价,并给出理由。",
                        "input": pair_text,
                        "output": ""
                    })
    # FIX: explicit encoding so the JSONL is UTF-8 on every platform.
    with open('其他-建设内容.json', 'w', encoding='utf-8') as f:
        for json_obj in json_objects:
            # One JSON object per line (JSONL).
            f.write(json.dumps(json_obj, ensure_ascii=False) + '\n')


if __name__ == "__main__":
    # Build the pairwise-comparison JSONL first (standalone export step).
    info_word1()
    print("ok.......")
    # Walk a local folder of project documents and run the duplicate check
    # on each file individually.
    path = r"/Users/kebobo/Downloads/丽水/未来社区"
    data_list = os.listdir(path)
    for file in data_list:
        if file != '.DS_Store':  # skip macOS metadata files
            # NOTE(review): ``data_list`` is rebound while being iterated;
            # iteration still works (the iterator holds the original list)
            # but the shadowing is confusing — worth renaming.
            data_list = [(0, path + '/' + file, "")]
            project_check(data_list, "2024-07-27-数字法治")
            print("已存入************************************* %s" % file)

"""
建设目标,业务功能

gnmk_str = []
for d in data:
if pd.notnull(d[0]):
title = d[0]
if title == "功能模块":
er_title.add(d[dup_file_test])
join_str = ""
for i in d[dup_file_test:]:
if pd.notnull(i):
join_str += i
if title == "功能模块":
gnmk_str.append(i)
str_dict[wdys1.get(title)] = join_str
else:
if title == "功能模块":
er_title.add(d[dup_file_test])
for i in d[dup_file_test:]:
if pd.notnull(i):
join_str += i
if title == "功能模块":
gnmk_str.append(i)
str_dict[wdys1.get(title)] = str_dict.get(wdys1.get(title)) + join_str
gnmk = "".join(gnmk_str)


"""



+ 86195
- 0
log.log
文件差異過大導致無法顯示
查看文件


+ 511
- 0
main1.py 查看文件

@@ -0,0 +1,511 @@
# coding=utf-8
import sys
import re

import baidu
import model_scope
import mysql_pool
from pymysql.converters import escape_string
import cosin_similarity
import pandas as pd
import datetime
import requests
import glm_utils
from threading import Thread


# Dimension label (Chinese, as it appears in the source Excel sheet) ->
# short column code used across the user_data / user_history_data tables.
wdys1 = {
    "项目名称": "xmmc",
    "现状问题": "xzwt",
    "系统基础": "xtjc",
    "项目目标": "xmmb",
    "预期绩效": "yqjx",
    "建设需求": "jsxq",
    "数据需求": "sjxq",
    "安全需求": "aqxq",
    "业务领域": "ywly",
    "核心业务": "hxyw",
    "业务需求": "ywxq",
    "业务协同": "ywxt",
    "建设层级": "jscj",
    "用户范围": "yhfw",
    "目标群体": "mbqt",
    "建设内容": "jsnr",
    "功能模块": "gnmk",
    "数据共享": "sjgx",
    "智能要素": "znys"
}
# Inverse of wdys1: column code -> human-readable dimension label.
wdys2 = {
    "xmmc": "项目名称",
    "xzwt": "现状问题",
    "xtjc": "系统基础",
    "xmmb": "项目目标",
    "yqjx": "预期绩效",
    "jsxq": "建设需求",
    "sjxq": "数据需求",
    "aqxq": "安全需求",
    "ywly": "业务领域",
    "hxyw": "核心业务",
    "ywxq": "业务需求",
    "ywxt": "业务协同",
    "jscj": "建设层级",
    "yhfw": "用户范围",
    "mbqt": "目标群体",
    "jsnr": "建设内容",
    "gnmk": "功能模块",
    "sjgx": "数据共享",
    "znys": "智能要素"
}
# Function-module field codes -> labels (module name / module description).
gnmkys = {
    "gnmc": "功能名称",
    "gnms": "功能描述"
}


def getFlag():
    """Read per-dimension weight strings from 0825.xlsx.

    Scans column 1 for cells shaped like "<label>(<weight>%)", maps the
    Chinese label to its column code via ``wdys1`` and returns a dict of
    ``{code: "(weight%)"}``.  Cells that do not match are skipped.
    """
    data_dict = {}
    df = pd.read_excel("0825.xlsx")
    data = df.values
    data = list(pd.Series(data[:, 1]).dropna())
    for d in data:
        try:
            wd = re.search("(.*?)(.*?%)", d).group(1).strip()
            wdc = wdys1.get(wd)
            if wdc:
                qz = re.search(".*?((.*?%))", d).group(1)
                data_dict[wdc] = qz
        except (AttributeError, TypeError):
            # FIX: was a bare ``except: pass`` which also swallowed
            # SystemExit/KeyboardInterrupt; a non-matching cell raises
            # AttributeError (search -> None), a non-string cell TypeError.
            pass
    return data_dict


def gong_neng_mo_kuai(mysql, dl, data, er_title, str_dict_new):
    """Persist the new project's modules and score each against history.

    For every second-level module title in ``er_title`` the third-level
    texts (column 3 of the sheet rows) are concatenated, stored into
    idc_project_module, then each stored module is matched against
    ``user_history_module_data`` with a CoROM embedding model.

    NOTE(review): indentation reconstructed from a whitespace-stripped
    source — confirm nesting against the original file.
    """
    nlp = model_scope.Bert_nlp("corom")
    # Concatenate every third-level text under its second-level title.
    str_dict = {}
    for et in er_title:
        for d in data:
            if d[1] == et:
                if str_dict.get(et):
                    str_dict[et] = str_dict.get(et) + d[3]
                else:
                    str_dict[et] = d[3]

    for k, v in str_dict.items():
        mysql.sql_change_msg(
            """insert into idc_project_module (project_id, check_duplicate_count, module_name, module_content, create_time, update_time, tag) value(%d, 1, "%s", "%s", "%s", "%s", "模块")""" % (
                int(dl[0]), k, v, str(datetime.datetime.now())[:-7], str(datetime.datetime.now())[:-7]))

    # Re-read the rows just inserted to obtain their generated ids.
    module_id_list = mysql.sql_select_many(
        """select project_module_id, module_name, module_content from idc_project_module where project_id=%d""" % dl[
            0])
    data_list = []
    for mil in module_id_list:
        data_dict = {}
        data_dict["project_module_id"] = mil.get("project_module_id")
        data_dict["gnmc"] = mil.get("module_name")
        data_dict["gnms"] = mil.get("module_content")
        data_list.append(data_dict)
    for i in data_list:
        # NOTE(review): this query is loop-invariant and could be hoisted.
        gnmk_copy1 = mysql.sql_select_many("""select * from user_history_module_data WHERE gnmc not in ('专项考评管理应用')""")
        if gnmk_copy1:
            # Candidate descriptions, excluding the project under check.
            desc_info_list = []
            for gc in gnmk_copy1:
                if gc.get("xmmc") != dl[2]:
                    desc_info_list.append(gc.get("gnms"))

            # Best match: idx indexes desc_info_list.
            # NOTE(review): idx is then used to index gnmk_copy1 — if any
            # row was filtered out above the two lists are misaligned and
            # the wrong history module is recorded; verify.
            similarity, s1, s2, idx = nlp.main(i.get("gnms"), desc_info_list)
            if idx == -1:
                continue  # no usable match
            mysql.sql_change_msg(
                """insert into idc_project_module_check (project_module_id, module_name, project_name, company_name, create_time, update_time) value(%d, "%s", "%s", "%s", "%s", "%s")"""
                % (
                    i.get("project_module_id"), escape_string(gnmk_copy1[idx].get("gnmc")), escape_string(gnmk_copy1[idx].get("xmmc")), "",
                    str(datetime.datetime.now())[:-7],
                    str(datetime.datetime.now())[:-7]))
            dup_module_id = mysql.cur.lastrowid
            check_module_info(mysql, gnmk_copy1[idx], dl, i, dup_module_id, similarity)


def check_module_info(mysql, gc, dl, pro, dup_module_id, score):
    """Write per-field detail rows for one (new module, history module) pair.

    ``gnmc`` (module name) similarity is asked from the GLM endpoint and
    scaled to [0, 1]; ``gnms`` (description) reuses the embedding ``score``
    computed by the caller.  The summed similarity is written back onto the
    idc_project_module_check header row ``dup_module_id``.
    """
    total_similarity1 = 0  # accumulated name similarity
    total_similarity2 = 0  # accumulated description similarity
    for j in ["gnmc", "gnms"]:
        # Compare the same field on both modules.
        content_x = gc.get(j)
        content_y = pro.get(j)
        if content_x and content_y:
            if j == "gnmc":
                # GLM returns a percentage plus an explanation text.
                similarity, check_desc = glm_utils.AutoDLResult(f"""请帮我分析以下两段重复语句重复的地方: \n第一段话是:'{content_y}', \n ----------------- \n 第二段话是:'{content_x}'""")
                if similarity is None:
                    similarity = 0  # treat a failed GLM call as 0%
                print(f"similarity is {similarity}")
                total_similarity1 += similarity/100
                mysql.sql_change_msg(
                    """insert into idc_project_module_check_detail (dup_module_id, project_name, module_content, dup_module_content, similarity, dimension, create_time, update_time, check_desc) value (%d, "%s", "%s", "%s", %f, "%s", "%s", "%s", "%s")"""
                    % (dup_module_id, dl[2], escape_string(content_y), escape_string(content_x), similarity,
                       "功能名称",
                       str(datetime.datetime.now())[:-7], str(datetime.datetime.now())[:-7], escape_string(check_desc)))
            else:
                # Description: explanation from GLM, similarity from caller.
                check_desc = glm_utils.AutoDLResultNoNum(f"""请帮我分析以下两段重复语句重复的地方: \n第一段话是:'{content_y}', \n ----------------- \n 第二段话是:'{content_x}'""")
                similarity = score
                total_similarity2 += similarity
                module_content = pro.get("gnms")
                dup_module_content = gc.get("gnms")
                mysql.sql_change_msg(
                    """insert into idc_project_module_check_detail (dup_module_id, project_name, module_content, dup_module_content, similarity, dimension, create_time, update_time, check_desc) value (%d, "%s", "%s", "%s", %f, "%s", "%s", "%s", "%s")"""
                    % (dup_module_id, dl[2], escape_string(module_content), escape_string(dup_module_content),
                       similarity,
                       "功能模块描述",
                       str(datetime.datetime.now())[:-7], str(datetime.datetime.now())[:-7],
                       escape_string(check_desc)))

    # Roll the two partial sums up onto the header row.
    mysql.sql_change_msg("""update idc_project_module_check set similarity=%f where dup_module_id=%d""" % (
        total_similarity1 + total_similarity2, dup_module_id))


def project_check(data_list):
    """Run the project-level duplicate check for each (id, path, name) tuple.

    For every Excel file the function
      1. parses the dimension sheet into ``str_dict`` (column code -> text),
      2. archives the parsed record into ``user_data``,
      3. scores it against each row of ``user_history_data`` via
         ``check_project_info``,
      4. marks the project done on idc_project and delegates per-module
         scoring to ``gong_neng_mo_kuai``.

    NOTE(review): indentation reconstructed from a whitespace-stripped
    source — confirm loop nesting against the original file.
    """
    mysql = mysql_pool.ConnMysql()
    # Row totals are only used for bookkeeping columns on idc_project.
    # NOTE(review): len(select *) loads whole tables just to count rows;
    # SELECT COUNT(*) would do.
    xmnr_count = len(mysql.sql_select_many("""select * from user_history_data"""))
    gnmk_count = len(mysql.sql_select_many("""select * from user_history_module_data"""))

    nlp = model_scope.Bert_nlp("corom")

    # Iterate over the Excel files to be checked.
    for dl in data_list:
        print(dl,dl[1])
        df = pd.read_excel(dl[1])
        data = df.values
        # Concatenate all cell texts per first-column dimension title.
        join_str = ""
        str_dict = {}
        gnmk_str = []
        title = ""
        er_title = set()
        for d in data:
            if pd.notnull(d[0]):
                # A new dimension title starts on this row.
                title = d[0]
                if title == "功能模块":
                    er_title.add(d[1])  # collect second-level module names
                join_str = ""
                for i in d[1:]:
                    if pd.notnull(i):
                        join_str += str(i)
                        if title == "功能模块":
                            if i == '功能描述':
                                continue  # skip the header cell
                            else:
                                gnmk_str.append(i)
                str_dict[wdys1.get(title)] = join_str
            else:
                # Continuation row: append to the current title's text.
                if title == "功能模块":
                    er_title.add(d[1])
                for i in d[1:]:
                    if pd.notnull(i):
                        join_str += str(i)
                str_dict[wdys1.get(title)] = str_dict.get(wdys1.get(title)) + join_str
        gnmk = ",".join(gnmk_str)
        str_dict['gnmk'] = gnmk
        # Archive the parsed record.
        # NOTE(review): the first value bound to the xmmc column is the
        # project id dl[0], and absent dimensions are written as the
        # literal string "None" (Python None through "%s") — verify both.
        mysql.sql_change_msg(
            """insert into user_data (xmmc, xzwt, xtjc, xmmb, yqjx, jsxq, sjxq, aqxq, ywly, hxyw, ywxq, ywxt, jscj, yhfw, mbqt, jsnr, gnmk, sjgx, znys) value ("%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s")"""
            % (dl[0], str_dict.get("xzwt") if str_dict.get("xzwt") else None,
               str_dict.get("xtjc") if str_dict.get("xtjc") else None,
               str_dict.get("xmmb") if str_dict.get("xmmb") else None,
               str_dict.get("yqjx") if str_dict.get("yqjx") else None,
               str_dict.get("jsxq") if str_dict.get("jsxq") else None,
               str_dict.get("sjxq") if str_dict.get("sjxq") else None,
               str_dict.get("aqxq") if str_dict.get("aqxq") else None,
               str_dict.get("ywly") if str_dict.get("ywly") else None,
               str_dict.get("hxyw") if str_dict.get("hxyw") else None,
               str_dict.get("ywxq") if str_dict.get("ywxq") else None,
               str_dict.get("ywxt") if str_dict.get("ywxt") else None,
               str_dict.get("jscj") if str_dict.get("jscj") else None,
               str_dict.get("yhfw") if str_dict.get("yhfw") else None,
               str_dict.get("mbqt") if str_dict.get("mbqt") else None,
               str_dict.get("jsnr") if str_dict.get("jsnr") else None,
               str_dict.get("gnmk") if str_dict.get("gnmk") else None,
               str_dict.get("sjgx") if str_dict.get("sjgx") else None,
               str_dict.get("znys") if str_dict.get("znys") else None))
        # Compare against every history project (threaded variants were
        # tried and abandoned — kept sequential here).
        xmnr_copy1 = mysql.sql_select_many("""select * from user_history_data """)
        if xmnr_copy1:
            xmnr_copy1_new = []  # NOTE(review): never used — remove?
            for xc in xmnr_copy1:
                if xc["xmmc"] == str_dict.get("xmmc"):
                    continue  # do not compare a project against itself
                check_project_info(mysql, dl, xc, str_dict, nlp)

        # Mark the project as checked and store bookkeeping counts.
        mysql.sql_change_msg(
            """update idc_project set dup_status=3, one_vote_veto_status=1, self_check_status=1, history_project_count=%d ,module_count=%d where project_id=%d""" % (
                xmnr_count, gnmk_count, dl[0]))
        gong_neng_mo_kuai(mysql, dl, data, er_title, str_dict)


def _blank_hist(value):
    """True when a history-row dimension is absent: None, the literal
    string 'None' (written by earlier string-formatted inserts), or
    whitespace only."""
    return value == 'None' or value is None or str.strip(value) == ''


def _blank_new(value):
    """True when a freshly-parsed dimension is absent (None or whitespace)."""
    return value is None or str.strip(value) == ''


def _save_dimension_score(mysql, dup_id, code, similarity, content_y, content_x):
    """Insert one per-dimension similarity row into idc_project_check_detail."""
    mysql.sql_change_msg(
        """insert into idc_project_check_detail (dup_id, dimension, similarity, function_content, dup_function_content, create_time, update_time) value (%d, "%s", %f, "%s", "%s", "%s", "%s")"""
        % (dup_id, wdys2.get(code), similarity, escape_string(content_y),
           escape_string(content_x), str(datetime.datetime.now())[:-7],
           str(datetime.datetime.now())[:-7]))


def _score_dimensions(mysql, xc, str_dict, nlp, dup_id, w_gnmk, w_jsnr, w_other):
    """Score every dimension present in both records; persist each row.

    ``w_gnmk`` / ``w_jsnr`` / ``w_other`` are the multipliers applied to
    the raw nlp similarity for the function-module dimension, the
    construction-content dimension, and every other dimension.
    Returns the weighted total.
    """
    total = 0
    for code in list(xc.keys())[1:]:  # skip the leading xmmc column
        content_x = xc.get(code)
        content_y = str_dict.get(code)
        if not (content_x and content_y):
            continue  # dimension missing on one side — no score row
        similarity, _c1, _c2, _idx = nlp.main(content_x, [content_y])
        if code == 'gnmk':
            similarity = similarity * w_gnmk
        elif code == 'jsnr':
            similarity = similarity * w_jsnr
        else:
            similarity = similarity * w_other
        total += similarity
        _save_dimension_score(mysql, dup_id, code, similarity, content_y, content_x)
    return total


def check_project_info(mysql, dl, xc, str_dict, nlp):
    """Compare one parsed project (str_dict) against one history row (xc).

    Inserts a header row into idc_project_check, one detail row per shared
    dimension, then writes the weighted total back onto the header.

    Weighting scheme (identical to the original three duplicated branches,
    now expressed as a table):
      * only jsnr usable  -> gnmk 0,  jsnr 40, others 60/dup_count
      * only gnmk usable  -> gnmk 50, jsnr 0,  others 50/dup_count
      * otherwise         -> gnmk 50, jsnr 40, others 10/dup_count
    where dup_count is the number of shared non-gnmk/jsnr dimensions.

    Refactor notes: the original repeated the scoring loop three times with
    different constants; the loop is factored into _score_dimensions, the
    unused ``total_keywords`` local was removed, and ad-hoc debug prints
    were dropped.  Behavior is otherwise preserved.
    """
    # Header row for this (project, history-project) pair.
    mysql.sql_change_msg(
        """insert into idc_project_check (project_id, dup_project_name, file_path, company_name, create_year, project_tag, project_range_tag, project_area, create_time, update_time) value ("%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s")"""
        % (dl[0], escape_string(xc.get("xmmc")), escape_string(dl[1]), "", "", "需求相似、业务相似", "历史项目", "",
           str(datetime.datetime.now())[:-7], str(datetime.datetime.now())[:-7]))
    dup_id = mysql.cur.lastrowid

    # Count ordinary dimensions both records filled in (excluding the two
    # specially-weighted ones); used as the divisor for "other" weights.
    dup_count = 0
    for code in list(xc.keys())[1:]:
        if code in ('gnmk', 'jsnr'):
            continue
        if xc.get(code) and str_dict.get(code):
            dup_count += 1

    gnmk_blank = _blank_hist(xc['gnmk']) and _blank_new(str_dict['gnmk'])
    jsnr_blank = _blank_hist(xc['jsnr']) and _blank_new(str_dict['jsnr'])
    jsnr_usable = (xc['jsnr'] is not None and xc['jsnr'] != 'None'
                   and str_dict['jsnr'] is not None and len(str.strip(str_dict['jsnr'])) > 0)
    gnmk_usable = (xc['gnmk'] is not None and xc['gnmk'] != 'None'
                   and str_dict['gnmk'] is not None and len(str.strip(str_dict['gnmk'])) > 0)

    # The "if dup_count else 0" guard only matters when no other dimension
    # is shared — the original computed the division lazily per row and so
    # never evaluated it in that case either.
    if gnmk_blank and jsnr_usable:
        weights = (0, 40, (60 / dup_count) if dup_count else 0)
    elif jsnr_blank and gnmk_usable:
        weights = (50, 0, (50 / dup_count) if dup_count else 0)
    else:
        weights = (50, 40, (10 / dup_count) if dup_count else 0)

    total_similarity = _score_dimensions(mysql, xc, str_dict, nlp, dup_id, *weights)

    # Roll the total up onto the header row.
    mysql.sql_change_msg(
        """update idc_project_check set similarity=%f where dup_id=%d""" % (total_similarity, dup_id))


if __name__ == "__main__":
    # Ask the local duplicate-check service which files belong to job 599.
    all_path = requests.get("http://127.0.0.1:19099/check/duplicates/%s" % 599).json()

    data_list = []
    for ap in all_path.get("data"):
        # if os.path.exists(ap.get("file_path")):
        data_list.append((ap.get("project_id"), ap.get("file_path"), ap.get("project_name")))
    print(data_list)
    # NOTE(review): the fetched list is immediately discarded by this
    # hard-coded local path — looks like a debugging leftover; confirm
    # before deployment.
    data_list = [(11, r"D:\ningda\dup_check2\dup_check\0825-丽水系统查重维度1.xlsx", "水路运输综合监管系统建设项目.xls")]
    project_check(data_list)

+ 437
- 0
main1.py.cors.bak 查看文件

@@ -0,0 +1,437 @@
# coding=utf-8
import sys
import re
import mysql_pool
from pymysql.converters import escape_string
import model_scope
import pandas as pd
import datetime
import requests

# 通过corom算法进行文本向量化对比相识度

# Dimension label (Chinese, as it appears in the source Excel sheet) ->
# short column code used across the user_data / user_history_data tables.
wdys1 = {
    "项目名称": "xmmc",
    "现状问题": "xzwt",
    "系统基础": "xtjc",
    "项目目标": "xmmb",
    "预期绩效": "yqjx",
    "建设需求": "jsxq",
    "数据需求": "sjxq",
    "安全需求": "aqxq",
    "业务领域": "ywly",
    "核心业务": "hxyw",
    "业务需求": "ywxq",
    "业务协同": "ywxt",
    "建设层级": "jscj",
    "用户范围": "yhfw",
    "目标群体": "mbqt",
    "建设内容": "jsnr",
    "功能模块": "gnmk",
    "数据共享": "sjgx",
    "智能要素": "znys"
}
# Inverse of wdys1: column code -> human-readable dimension label.
wdys2 = {
    "xmmc": "项目名称",
    "xzwt": "现状问题",
    "xtjc": "系统基础",
    "xmmb": "项目目标",
    "yqjx": "预期绩效",
    "jsxq": "建设需求",
    "sjxq": "数据需求",
    "aqxq": "安全需求",
    "ywly": "业务领域",
    "hxyw": "核心业务",
    "ywxq": "业务需求",
    "ywxt": "业务协同",
    "jscj": "建设层级",
    "yhfw": "用户范围",
    "mbqt": "目标群体",
    "jsnr": "建设内容",
    "gnmk": "功能模块",
    "sjgx": "数据共享",
    "znys": "智能要素"
}
# Function-module field codes -> labels (module name / module description).
gnmkys = {
    "gnmc": "功能名称",
    "gnms": "功能描述"
}


def getFlag():
    """Read per-dimension weight strings from 0825.xlsx.

    Scans column 1 for cells shaped like "<label>(<weight>%)", maps the
    Chinese label to its column code via ``wdys1`` and returns a dict of
    ``{code: "(weight%)"}``.  Cells that do not match are skipped.
    """
    data_dict = {}
    df = pd.read_excel("0825.xlsx")
    data = df.values
    data = list(pd.Series(data[:, 1]).dropna())
    for d in data:
        try:
            wd = re.search("(.*?)(.*?%)", d).group(1).strip()
            wdc = wdys1.get(wd)
            if wdc:
                qz = re.search(".*?((.*?%))", d).group(1)
                data_dict[wdc] = qz
        except (AttributeError, TypeError):
            # FIX: was a bare ``except: pass`` which also swallowed
            # SystemExit/KeyboardInterrupt; a non-matching cell raises
            # AttributeError (search -> None), a non-string cell TypeError.
            pass
    return data_dict


def gong_neng_mo_kuai(mysql, dl, data, er_title, similarity_nlp):
    """Persist the new project's modules and score them (StructBERT variant).

    Concatenates third-level texts per second-level module title, stores
    each module into idc_project_module, then compares every stored module
    against each module of one hard-coded historical project.

    NOTE(review): indentation reconstructed from a whitespace-stripped
    source — confirm nesting against the original file.
    """
    # Concatenate every third-level text under its second-level title.
    str_dict = {}
    for et in er_title:
        for d in data:
            if d[1] == et:
                if str_dict.get(et):
                    str_dict[et] = str_dict.get(et) + d[3]
                else:
                    str_dict[et] = d[3]

    for k, v in str_dict.items():
        mysql.sql_change_msg(
            """insert into idc_project_module (project_id, check_duplicate_count, module_name, module_content, create_time, update_time, tag) value(%d, 1, "%s", "%s", "%s", "%s", "模块")""" % (
                int(dl[0]), k, v, str(datetime.datetime.now())[:-7], str(datetime.datetime.now())[:-7]))

    # Re-read the rows just inserted to obtain their generated ids.
    module_id_list = mysql.sql_select_many(
        """select project_module_id, module_name, module_content from idc_project_module where project_id=%d""" % dl[
            0])
    data_list = []
    for mil in module_id_list:
        data_dict = {}
        data_dict["project_module_id"] = mil.get("project_module_id")
        data_dict["gnmc"] = mil.get("module_name")
        data_dict["gnms"] = mil.get("module_content")
        data_list.append(data_dict)
    for i in data_list:
        # NOTE(review): query is loop-invariant and hard-coded to a single
        # historical project — could be hoisted out of the loop.
        gnmk_copy1 = mysql.sql_select_many("""select * from user_history_module_data WHERE xmmc = '丽水市城市管理指挥中心信息系统(一期)项目' """)
        if gnmk_copy1:
            for gc in gnmk_copy1:
                # Debug echo of the statement about to be executed.
                print(
                    """insert into idc_project_module_check (project_module_id, module_name, project_name, company_name, create_time, update_time) value(%d, "%s", "%s", "%s", "%s", "%s")"""
                    % (
                        i.get("project_module_id"), escape_string(gc.get("gnmc")), escape_string(gc.get("xmmc")), "",
                        str(datetime.datetime.now())[:-7],
                        str(datetime.datetime.now())[:-7]))
                mysql.sql_change_msg(
                    """insert into idc_project_module_check (project_module_id, module_name, project_name, company_name, create_time, update_time) value(%d, "%s", "%s", "%s", "%s", "%s")"""
                    % (
                        i.get("project_module_id"), escape_string(gc.get("gnmc")), escape_string(gc.get("xmmc")), "",
                        str(datetime.datetime.now())[:-7],
                        str(datetime.datetime.now())[:-7]))
                dup_module_id = mysql.cur.lastrowid
                check_module_info(mysql, gc, dl, i, dup_module_id, similarity_nlp)


def check_module_info(mysql, gc, dl, pro, dup_module_id, similarity_nlp):
    """Score one (new module, history module) pair and persist the detail.

    Only the functional description ("gnms") currently contributes to the
    total: the module-name ("gnmc") comparison is intentionally disabled
    and merely logged.  The summed similarity is written back onto the
    idc_project_module_check header row ``dup_module_id``.
    """
    total_similarity1 = 0  # reserved for the (disabled) name comparison
    total_similarity2 = 0  # accumulated description similarity
    # FIX: removed the unused total_keywords1/total_keywords2 locals and
    # the large block of commented-out name-scoring code.
    for j in ["gnmc", "gnms"]:
        # Compare the same field on both modules.
        content_x = gc.get(j)
        content_y = pro.get(j)
        if content_x and content_y:
            if j == "gnmc":
                # Name similarity deliberately not scored for now.
                print("功能名称 暂时不计算")
            else:
                # Embedding similarity, weighted at 99%.
                similarity = similarity_nlp.main(content_x, content_y)
                similarity = similarity * 99
                total_similarity2 += similarity
                mysql.sql_change_msg(
                    """insert into idc_project_module_check_detail (dup_module_id, project_name, module_content, dup_module_content, similarity, dimension, create_time, update_time) value (%d, "%s", "%s", "%s", %f, "%s", "%s", "%s")"""
                    % (dup_module_id, dl[2], escape_string(content_y), escape_string(content_x), similarity,
                       "功能模块描述",
                       str(datetime.datetime.now())[:-7], str(datetime.datetime.now())[:-7]))
    # Roll the two partial sums up onto the header row.
    mysql.sql_change_msg("""update idc_project_module_check set similarity=%f where dup_module_id=%d""" % (
        total_similarity1 + total_similarity2, dup_module_id))


def project_check(data_list):
    """Project-level duplicate check (StructBERT variant).

    For every (id, path, name) tuple: parse the Excel dimension sheet into
    ``str_dict``, archive it into ``user_data``, compare it against one
    hard-coded historical project, then score the modules via
    ``gong_neng_mo_kuai``.

    NOTE(review): indentation reconstructed from a whitespace-stripped
    source — confirm loop nesting against the original file.
    """
    similarity_nlp = model_scope.Bert_nlp("structbert")
    mysql = mysql_pool.ConnMysql()
    # Row totals used only for bookkeeping columns on idc_project.
    xmnr_count = len(mysql.sql_select_many("""select * from user_history_data"""))
    gnmk_count = len(mysql.sql_select_many("""select * from user_history_module_data"""))
    # Iterate over the Excel files to be checked.
    for dl in data_list:
        print(dl,dl[1])
        df = pd.read_excel(dl[1])
        data = df.values
        # Concatenate all cell texts per first-column dimension title.
        join_str = ""
        str_dict = {}
        gnmk_str = []
        title = ""
        er_title = set()
        for d in data:
            if pd.notnull(d[0]):
                # A new dimension title starts on this row.
                title = d[0]
                if title == "功能模块":
                    er_title.add(d[1])  # collect second-level module names
                join_str = ""
                for i in d[1:]:
                    if pd.notnull(i):
                        join_str += str(i)
                        if title == "功能模块":
                            if i == '功能描述':
                                continue  # skip the header cell
                            else:
                                gnmk_str.append(i)
                str_dict[wdys1.get(title)] = join_str
            else:
                # Continuation row: append to the current title's text.
                if title == "功能模块":
                    er_title.add(d[1])
                for i in d[1:]:
                    if pd.notnull(i):
                        join_str += str(i)
                str_dict[wdys1.get(title)] = str_dict.get(wdys1.get(title)) + join_str
        gnmk = ",".join(gnmk_str)
        str_dict['gnmk'] = gnmk
        # Archive the parsed record.
        # NOTE(review): absent dimensions are written as the literal string
        # "None" (Python None through "%s"); the xmmc column receives the
        # project id dl[0] — verify both.
        mysql.sql_change_msg(
            """insert into user_data (xmmc, xzwt, xtjc, xmmb, yqjx, jsxq, sjxq, aqxq, ywly, hxyw, ywxq, ywxt, jscj, yhfw, mbqt, jsnr, gnmk, sjgx, znys) value ("%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s")"""
            % (dl[0], str_dict.get("xzwt") if str_dict.get("xzwt") else None,
               str_dict.get("xtjc") if str_dict.get("xtjc") else None,
               str_dict.get("xmmb") if str_dict.get("xmmb") else None,
               str_dict.get("yqjx") if str_dict.get("yqjx") else None,
               str_dict.get("jsxq") if str_dict.get("jsxq") else None,
               str_dict.get("sjxq") if str_dict.get("sjxq") else None,
               str_dict.get("aqxq") if str_dict.get("aqxq") else None,
               str_dict.get("ywly") if str_dict.get("ywly") else None,
               str_dict.get("hxyw") if str_dict.get("hxyw") else None,
               str_dict.get("ywxq") if str_dict.get("ywxq") else None,
               str_dict.get("ywxt") if str_dict.get("ywxt") else None,
               str_dict.get("jscj") if str_dict.get("jscj") else None,
               str_dict.get("yhfw") if str_dict.get("yhfw") else None,
               str_dict.get("mbqt") if str_dict.get("mbqt") else None,
               str_dict.get("jsnr") if str_dict.get("jsnr") else None,
               str_dict.get("gnmk") if str_dict.get("gnmk") else None,
               str_dict.get("sjgx") if str_dict.get("sjgx") else None,
               str_dict.get("znys") if str_dict.get("znys") else None))
        # Compare against one hard-coded historical project (threaded
        # variants were tried and abandoned — kept sequential).
        xmnr_copy1 = mysql.sql_select_many("""select * from user_history_data WHERE xmmc = '丽水市城市管理指挥中心信息系统(一期)项目' """)
        if xmnr_copy1:
            for xc in xmnr_copy1:
                check_project_info(mysql, dl, xc, str_dict, similarity_nlp)
        # Mark the project as checked and store bookkeeping counts.
        mysql.sql_change_msg(
            """update idc_project set dup_status=3, one_vote_veto_status=1, self_check_status=1, history_project_count=%d ,module_count=%d where project_id=%d""" % (
                xmnr_count, gnmk_count, dl[0]))
        gong_neng_mo_kuai(mysql, dl, data, er_title, similarity_nlp)
        # Release the pooled database connection.
        mysql.release()


def check_project_info(mysql, dl, xc, str_dict, similarity_nlp):
    """Compare one history project row *xc* with the current project's
    dimension texts *str_dict* and persist the weighted similarity.

    Inserts one header row into ``idc_project_check``, one detail row per
    comparable dimension into ``idc_project_check_detail``, then writes the
    summed similarity back onto the header row.

    Args:
        mysql: project DB helper exposing ``sql_change_msg`` and ``cur.lastrowid``.
        dl: tuple ``(project_id, file_path, project_name)``.
        xc: one history row (dict) from ``user_history_data``; first key is xmmc.
        str_dict: dimension key -> concatenated text of the current project.
        similarity_nlp: object whose ``main(a, b)`` returns a similarity score.

    The previous implementation duplicated the whole per-dimension loop three
    times; the branches only differed in the weight constants, which are now
    selected once up front.
    """
    total_similarity = 0
    dup_count = 0
    print(f'xmmc is {xc.get("xmmc")}')
    # Header row for this (project, history-project) pair.
    mysql.sql_change_msg(
        """insert into idc_project_check (project_id, dup_project_name, file_path, company_name, create_year, project_tag, project_range_tag, project_area, create_time, update_time) value ("%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s")"""
        % (dl[0], escape_string(xc.get("xmmc")), escape_string(dl[1]), "", "", "需求相似、业务相似", "历史项目", "",
           str(datetime.datetime.now())[:-7], str(datetime.datetime.now())[:-7]))
    dup_id = mysql.cur.lastrowid
    # Count dimensions (other than gnmk/jsnr) present on BOTH sides; the
    # "other" weight below is split evenly across them.
    for dim in list(xc.keys())[1:]:
        if dim in ('gnmk', 'jsnr'):
            continue
        if xc.get(dim) and str_dict.get(dim):
            dup_count += 1
    # Choose the weight split.  The three historic branches differed only in
    # these constants: gnmk weight, jsnr weight, and the total weight shared
    # by every remaining dimension.  Branch conditions are kept verbatim.
    if ((xc['gnmk'] == 'None' or xc['gnmk'] is None or str.strip(xc['gnmk']) == '') and (str_dict['gnmk'] is None or str.strip(str_dict['gnmk']) == '')) and (
            not xc['jsnr'] is None and xc['jsnr'] != 'None' and not str_dict['jsnr'] is None and len(str.strip(str_dict['jsnr'])) > 0):
        # No module list on either side, but both sides have build-content text.
        gnmk_weight, jsnr_weight, other_weight = 0, 40, 60
    elif ((xc['jsnr'] == 'None' or xc['jsnr'] is None or str.strip(xc['jsnr']) == '') and (str_dict['jsnr'] is None or str.strip(str_dict['jsnr']) == '')) and (
            not xc['gnmk'] is None and xc['gnmk'] != 'None' and not str_dict['gnmk'] is None and len(str.strip(str_dict['gnmk'])) > 0):
        # No build-content text on either side, but both sides have module lists.
        gnmk_weight, jsnr_weight, other_weight = 50, 0, 50
    else:
        gnmk_weight, jsnr_weight, other_weight = 50, 40, 10
    # One pass over every dimension present on both sides (xc's first key,
    # xmmc, is skipped exactly as before).
    for dim in list(xc.keys())[1:]:
        content_x = xc.get(dim)
        content_y = str_dict.get(dim)
        if not (content_x and content_y):
            continue
        if dim == 'gnmk':
            weight = gnmk_weight
        elif dim == 'jsnr':
            weight = jsnr_weight
        else:
            # dup_count >= 1 here: this branch only runs for dimensions that
            # were counted above, so the division is safe.
            weight = other_weight / dup_count
        similarity = similarity_nlp.main(content_x, content_y) * weight
        total_similarity += similarity
        # Detail row per dimension — also written for zero-weighted ones,
        # matching the original behaviour.
        mysql.sql_change_msg(
            """insert into idc_project_check_detail (dup_id, dimension, similarity, function_content, dup_function_content, create_time, update_time) value (%d, "%s", %f, "%s", "%s", "%s", "%s")"""
            % (dup_id, wdys2.get(dim), similarity, escape_string(content_y),
               escape_string(content_x), str(datetime.datetime.now())[:-7],
               str(datetime.datetime.now())[:-7]))

    # Write the aggregated similarity back onto the header row.
    mysql.sql_change_msg(
        """update idc_project_check set similarity=%f where dup_id=%d""" % (total_similarity, dup_id))


if __name__ == "__main__":
    # Ad-hoc driver: ask the local service for the list of projects pending
    # duplicate-check (hard-coded project id 599).
    all_path = requests.get("http://127.0.0.1:19099/check/duplicates/%s" % 599).json()
    # print(all_path)

    # dict1 = {k:v for k, v in sorted(dict.items(), key= lambda item : item[1])}
    # print(dict1)

    # Build (project_id, file_path, project_name) tuples from the response.
    data_list = []
    for ap in all_path.get("data"):
        # if os.path.exists(ap.get("file_path")):
        data_list.append((ap.get("project_id"), ap.get("file_path"), ap.get("project_name")))
    print(data_list)
    # data_list = [(11, r"C:\Users\HUAWEI\PycharmProjects\nlp\dup_check\0825-丽水系统查重维度1.xlsx", "水路运输综合监管系统建设项目.xls")]
    # NOTE(review): the list fetched above is discarded and replaced by this
    # hard-coded local fixture — looks like a debug leftover; confirm before
    # running in production.
    data_list = [(11, r"D:\ningda\dup_check2\dup_check\0825-丽水系统查重维度1.xlsx", "水路运输综合监管系统建设项目.xls")]
    project_check(data_list)

+ 720
- 0
main1.py.glm.bak View file

@@ -0,0 +1,720 @@
# coding=utf-8
import sys
import re

import baidu
import mysql_pool
from pymysql.converters import escape_string
import cosin_similarity
import pandas as pd
import datetime
import requests
import glm_utils
from threading import Thread


# Dimension label (as written in the excel sheet) -> internal column key.
wdys1 = {
    "项目名称": "xmmc",
    "现状问题": "xzwt",
    "系统基础": "xtjc",
    "项目目标": "xmmb",
    "预期绩效": "yqjx",
    "建设需求": "jsxq",
    "数据需求": "sjxq",
    "安全需求": "aqxq",
    "业务领域": "ywly",
    "核心业务": "hxyw",
    "业务需求": "ywxq",
    "业务协同": "ywxt",
    "建设层级": "jscj",
    "用户范围": "yhfw",
    "目标群体": "mbqt",
    "建设内容": "jsnr",
    "功能模块": "gnmk",
    "数据共享": "sjgx",
    "智能要素": "znys",
}
# Reverse mapping: internal column key -> dimension label.  Derived from
# wdys1 so the two tables can never drift apart.
wdys2 = {column_key: label for label, column_key in wdys1.items()}
# Sub-dimension keys used when comparing function modules.
gnmkys = {
    "gnmc": "功能名称",
    "gnms": "功能描述",
}


def getFlag():
    """Read the per-dimension weight strings from ``0825.xlsx``.

    Scans the second column of the sheet; each non-empty cell is expected to
    look like ``<dimension label>(<weight>%)``.  The label is mapped to its
    internal column key via ``wdys1`` and the weight part is kept verbatim.

    NOTE(review): verify the two regex literals against the real cell format
    — the source may originally have used full-width parentheses that were
    lost in transcription.

    Returns:
        dict: internal dimension key -> weight string (e.g. ``"40%"``).
    """
    data_dict = {}
    df = pd.read_excel("0825.xlsx")
    data = df.values
    # Second column only, NaN cells dropped.
    data = list(pd.Series(data[:, 1]).dropna())
    for d in data:
        try:
            wd = re.search("(.*?)(.*?%)", d).group(1).strip()
            wdc = wdys1.get(wd)
            if wdc:
                qz = re.search(".*?((.*?%))", d).group(1)
                data_dict[wdc] = qz
        except Exception:
            # Cells that don't match the expected pattern are skipped.
            # (Was a bare ``except:``; narrowed so KeyboardInterrupt and
            # SystemExit still propagate.)
            pass
    return data_dict


def gong_neng_mo_kuai(mysql, dl, data, er_title):
    """Persist and duplicate-check the function modules of one project.

    Args:
        mysql: project DB helper (``sql_change_msg`` / ``sql_select_many`` /
            ``cur.lastrowid``).
        dl: tuple ``(project_id, file_path, project_name)``.
        data: raw excel matrix (``DataFrame.values``) of the project sheet.
        er_title: set of second-level module names found under "功能模块".

    Side effects only: rows are written to ``idc_project_module``,
    ``idc_project_module_check`` (via ``check_module_info``) and
    ``user_module_keywords``.
    """
    # Concatenate every third-column cell belonging to the same module name.
    str_dict = {}
    for et in er_title:
        for d in data:
            if d[1] == et:
                if str_dict.get(et):
                    str_dict[et] = str_dict.get(et) + d[3]
                else:
                    str_dict[et] = d[3]

    # One idc_project_module row per module.
    for k, v in str_dict.items():
        mysql.sql_change_msg(
            """insert into idc_project_module (project_id, check_duplicate_count, module_name, module_content, create_time, update_time, tag) value(%d, 1, "%s", "%s", "%s", "%s", "模块")""" % (
                int(dl[0]), k, v, str(datetime.datetime.now())[:-7], str(datetime.datetime.now())[:-7]))

    # Read the rows back to obtain their auto-generated ids.
    module_id_list = mysql.sql_select_many(
        """select project_module_id, module_name, module_content from idc_project_module where project_id=%d""" % dl[
            0])
    data_list = []
    for mil in module_id_list:
        data_dict = {}
        data_dict["project_module_id"] = mil.get("project_module_id")
        data_dict["gnmc"] = mil.get("module_name")
        # data_dict["glm_desc"] = baidu.CallResult(mil.get("module_content"))
        data_dict["gnms"] = mil.get("module_content")
        # print(f'module_content = ({mil.get("module_content")}), glm_desc = ({data_dict["glm_desc"]})')
        data_list.append(data_dict)
    # print(data_list)
    for i in data_list:
        # NOTE(review): history modules are filtered to one hard-coded project
        # name — presumably a test fixture; confirm before production use.
        gnmk_copy1 = mysql.sql_select_many("""select * from user_history_module_data where xmmc = '南浔区信息化项目全生命周期管理系统'""")
        if gnmk_copy1:
            for gc in gnmk_copy1:
                # Debug echo of the SQL about to be executed.
                print(
                    """insert into idc_project_module_check (project_module_id, module_name, project_name, company_name, create_time, update_time) value(%d, "%s", "%s", "%s", "%s", "%s")"""
                    % (
                        i.get("project_module_id"), escape_string(gc.get("gnmc")), escape_string(gc.get("xmmc")), "",
                        str(datetime.datetime.now())[:-7],
                        str(datetime.datetime.now())[:-7]))
                mysql.sql_change_msg(
                    """insert into idc_project_module_check (project_module_id, module_name, project_name, company_name, create_time, update_time) value(%d, "%s", "%s", "%s", "%s", "%s")"""
                    % (
                        i.get("project_module_id"), escape_string(gc.get("gnmc")), escape_string(gc.get("xmmc")), "",
                        str(datetime.datetime.now())[:-7],
                        str(datetime.datetime.now())[:-7]))
                # lastrowid of the insert above identifies this check pair.
                dup_module_id = mysql.cur.lastrowid
                check_module_info(mysql, gc, dl, i, dup_module_id)

        # Self-comparison to extract keywords per sub-dimension.
        # NOTE(review): the loop iterates over "glm_desc" but the dicts built
        # above only contain "gnmc"/"gnms" (glm_desc is commented out), so the
        # else branch never runs and gnms keywords are always None — possibly
        # a refactor leftover; confirm intended keys.
        gnmk_gjc = {}
        for a in ["gnmc", "glm_desc"]:
            if i.get(a):
                content_x = i.get(a)
                content_y = i.get(a)
                if a == "gnmc":
                    similarity = cosin_similarity.CosineSimilarity(content_x, content_y)
                    # similarity score + keywords
                    similarity, keyword_x, keywords = similarity.main()
                    # de-duplicate
                    keywords = list(set(keywords))
                    gnmk_gjc[a] = keywords
                else:
                    similarity = cosin_similarity.CosineSimilarity(content_x, content_y)
                    # similarity score + keywords
                    similarity, keyword_x, keywords = similarity.main()
                    # de-duplicate
                    keywords = list(set(keywords))
                    gnmk_gjc[a] = keywords
        mysql.sql_change_msg("""insert into user_module_keywords (xmmc, gnmc, gnms) value("%s", "%s", "%s")""" % (
            dl[2], str(gnmk_gjc.get("gnmc"))[1:-1] if gnmk_gjc.get("gnmc") else None,
            str(gnmk_gjc.get("gnms"))[1:-1] if gnmk_gjc.get("gnms") else None))


def check_module_info(mysql, gc, dl, pro, dup_module_id):
    """Compare one history module *gc* with one current module *pro* via the
    GLM service and persist the per-sub-dimension results.

    Args:
        mysql: project DB helper.
        gc: history module row (keys include gnmc, gnms, xmmc).
        dl: tuple ``(project_id, file_path, project_name)``.
        pro: current module dict (project_module_id, gnmc, gnms).
        dup_module_id: id of the idc_project_module_check row to update.

    NOTE(review): the two sub-dimensions are scaled inconsistently — the
    "gnmc" branch accumulates ``similarity/100`` but stores the raw value,
    while the "gnms" branch scales by 0.99 and stores the scaled value.
    Confirm whether this asymmetry is intended.
    """
    total_similarity1 = 0
    total_keywords1 = []
    total_similarity2 = 0
    total_keywords2 = []
    for j in ["gnmc", "gnms"]:
        # Iterate over each sub-dimension (module name, module description).
        content_x = gc.get(j)
        content_y = pro.get(j)
        if content_x and content_y:
            if j == "gnmc":
                # similarity = cosin_similarity.CosineSimilarity(content_x, content_y)
                # similarity, keyword_x, keywords = similarity.main()
                # similarity = similarity * 1
                # total_keywords1 += keywords
                # print("######################相似度: %.2f%%" % similarity, "关键词: %s" % keywords)
                # Ask the GLM endpoint for a duplication percentage of the two texts.
                similarity, check_desc = glm_utils.AutoDLResult(f"""告诉我下面两段话的重复率百分比是多少: \n第一段话是:'{content_y}', \n ----------------- \n 第二段话是:'{content_x}'""")
                # similarity, check_desc = baidu.CallResult(
                #     f"""告诉我下面两段话的重复率百分比是多少: 第一段话是:'{content_x}', ----------------- 第二段话是:'{content_y}'""")
                # The service may return no score; treat that as 0.
                if similarity is None:
                    similarity = 0
                print(f"similarity is {similarity}")
                total_similarity1 += similarity/100
                mysql.sql_change_msg(
                    """insert into idc_project_module_check_detail (dup_module_id, project_name, module_content, dup_module_content, similarity, dimension, create_time, update_time, check_desc) value (%d, "%s", "%s", "%s", %f, "%s", "%s", "%s", "%s")"""
                    % (dup_module_id, dl[2], escape_string(content_y), escape_string(content_x), similarity,
                       "功能名称",
                       str(datetime.datetime.now())[:-7], str(datetime.datetime.now())[:-7], escape_string(check_desc)))
            else:
                # similarity = cosin_similarity.CosineSimilarity(content_x, content_y)
                # # similarity + keywords
                # similarity, keyword_x, keywords = similarity.main()
                # similarity = similarity * 99
                # total_keywords2 += keywords

                similarity, check_desc = glm_utils.AutoDLResult(f"""告诉我下面两段话的重复率百分比是多少: \n第一段话是:'{content_y}', \n ----------------- \n 第二段话是:'{content_x}'""")
                # similarity, check_desc = baidu.CallResult(
                #     f"""告诉我下面两段话的重复率百分比是多少: 第一段话是:'{content_x}', ----------------- 第二段话是:'{content_y}'""")
                # Dump of prompt/answer pairs for fine-tuning (disabled):
                # check_desc = str(check_desc).replace("\n", " ")
                # prompt = f"""告诉我下面两段话的重复率百分比是多少: 第一段话是:'{content_x}', ----------------- 第二段话是:'{content_y}'"""
                # prompt = prompt.replace("\n", " ")
                # with open('train.json', 'a') as file:
                #     file.write("{" + f"""
                #     "content": "{prompt}",
                #     "summary": "{check_desc}"
                #     """ + "}\n")

                # The service may return no score; treat that as 0.
                if similarity is None:
                    similarity = 0
                print(f"similarity is {similarity}")
                similarity = similarity * 0.99

                # Accumulate the description-level similarity.
                total_similarity2 += similarity
                # module_content = pro.get("gnms") + "/n" + content_y
                # dup_module_content = gc.get("gnms") + "/n" + content_x
                module_content = pro.get("gnms")
                dup_module_content = gc.get("gnms")
                mysql.sql_change_msg(
                    """insert into idc_project_module_check_detail (dup_module_id, project_name, module_content, dup_module_content, similarity, dimension, create_time, update_time, check_desc) value (%d, "%s", "%s", "%s", %f, "%s", "%s", "%s", "%s")"""
                    % (dup_module_id, dl[2], escape_string(module_content), escape_string(dup_module_content),
                       similarity,
                       "功能模块描述",
                       str(datetime.datetime.now())[:-7], str(datetime.datetime.now())[:-7],
                       escape_string(check_desc)))

    # Write the combined score back onto the check row.
    mysql.sql_change_msg("""update idc_project_module_check set similarity=%f where dup_module_id=%d""" % (
        total_similarity1 + total_similarity2, dup_module_id))


def project_check(data_list):
    """Run the project-level duplicate check for each entry of *data_list*.

    Args:
        data_list: iterable of tuples ``(project_id, excel_path, project_name)``.

    For every project: the excel sheet is parsed into per-dimension texts,
    stored into ``user_data``, compared against each history row (via
    ``check_project_info``), per-dimension keywords are saved to
    ``user_keyword``, the project status is updated, and finally the
    module-level check (``gong_neng_mo_kuai``) runs.  Side effects only.
    """
    mysql = mysql_pool.ConnMysql()
    # mysql.sql_select_many("""select * from mkgjc""")
    # Totals shown on the project row: history project / module counts.
    # xmnr_count = len(mysql.sql_select_many("""select * from xmnr_copy1"""))
    # gnmk_count = len(mysql.sql_select_many("""select * from gnmk_copy1"""))
    xmnr_count = len(mysql.sql_select_many("""select * from user_history_data"""))
    gnmk_count = len(mysql.sql_select_many("""select * from user_history_module_data"""))
    # NOTE(review): the weight dict is loaded but never used in this version.
    get_data_dict = getFlag()
    # Iterate over each project's excel path.
    for dl in data_list:
        # path = "0825-丽水系统查重维度1.xlsx"
        # Read the excel file for this project.
        print(dl,dl[1])
        df = pd.read_excel(dl[1])
        data = df.values
        # Concatenate every dimension's content cells.
        join_str = ""
        str_dict = {}
        gnmk_str = []
        title = ""
        er_title = set()
        for d in data:
            # if pd.notnull(d[0]):
            #     title = d[0]
            #     if title == "功能模块":
            #         er_title.add(d[1])
            #     join_str = ""
            #     for i in d[1:]:
            #         if pd.notnull(i):
            #             join_str += str(i)
            #     str_dict[wdys1.get(title)] = join_str
            if pd.notnull(d[0]):
                # First column holds the dimension title; start a new buffer.
                title = d[0]
                if title == "功能模块":
                    er_title.add(d[1])
                join_str = ""
                for i in d[1:]:
                    if pd.notnull(i):
                        join_str += str(i)
                        if title == "功能模块":
                            if i == '功能描述':
                                continue
                            else:
                                gnmk_str.append(i)
                str_dict[wdys1.get(title)] = join_str
            else:
                # Continuation row of the previous dimension.
                if title == "功能模块":
                    er_title.add(d[1])
                for i in d[1:]:
                    if pd.notnull(i):
                        join_str += str(i)
                str_dict[wdys1.get(title)] = str_dict.get(wdys1.get(title)) + join_str
        # print(str_dict)
        # Module names are joined into the synthetic "gnmk" dimension.
        gnmk = ",".join(gnmk_str)
        str_dict['gnmk'] = gnmk
        mysql.sql_change_msg(
            """insert into user_data (xmmc, xzwt, xtjc, xmmb, yqjx, jsxq, sjxq, aqxq, ywly, hxyw, ywxq, ywxt, jscj, yhfw, mbqt, jsnr, gnmk, sjgx, znys) value ("%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s")"""
            % (dl[0], str_dict.get("xzwt") if str_dict.get("xzwt") else None,
               str_dict.get("xtjc") if str_dict.get("xtjc") else None,
               str_dict.get("xmmb") if str_dict.get("xmmb") else None,
               str_dict.get("yqjx") if str_dict.get("yqjx") else None,
               str_dict.get("jsxq") if str_dict.get("jsxq") else None,
               str_dict.get("sjxq") if str_dict.get("sjxq") else None,
               str_dict.get("aqxq") if str_dict.get("aqxq") else None,
               str_dict.get("ywly") if str_dict.get("ywly") else None,
               str_dict.get("hxyw") if str_dict.get("hxyw") else None,
               str_dict.get("ywxq") if str_dict.get("ywxq") else None,
               str_dict.get("ywxt") if str_dict.get("ywxt") else None,
               str_dict.get("jscj") if str_dict.get("jscj") else None,
               str_dict.get("yhfw") if str_dict.get("yhfw") else None,
               str_dict.get("mbqt") if str_dict.get("mbqt") else None,
               str_dict.get("jsnr") if str_dict.get("jsnr") else None,
               str_dict.get("gnmk") if str_dict.get("gnmk") else None,
               str_dict.get("sjgx") if str_dict.get("sjgx") else None,
               str_dict.get("znys") if str_dict.get("znys") else None))
        # Fetch history rows to compare against.
        # NOTE(review): filtered to one hard-coded project name — presumably a
        # test fixture; confirm before production use.
        xmnr_copy1 = mysql.sql_select_many("""select * from user_history_data WHERE xmmc = '南浔区信息化项目全生命周期管理系统'""")
        # Compare the current project against every history row.
        if xmnr_copy1:
            # threads = [Thread(target=check_project_info, args=(mysql, dl, xc, str_dict)) for xc in xmnr_copy1]
            # for t in threads:
            #     t.start()
            #
            # for t in threads:
            #     t.join()
            # pro_ths = []
            # for xc in xmnr_copy1:
            #     # check_project_info(mysql, dl, xc, str_dict)
            #     p = Thread(target=check_project_info, args=(mysql, dl, xc, str_dict))
            #     pro_ths.append(p)
            #     p.start()
            # for p in pro_ths:
            #     p.join()
            for xc in xmnr_copy1:
                check_project_info(mysql, dl, xc, str_dict)

        # Self-comparison of each dimension to extract its keyword list.
        project_gjc = {}
        for w in wdys2.keys():
            content_x = str_dict.get(w)
            content_y = str_dict.get(w)
            if content_x and content_y:
                # Iterate over each dimension.
                similarity = cosin_similarity.CosineSimilarity(content_x, content_y)
                # similarity score + keywords
                similarity, keywords_x, keywords = similarity.main()
                # de-duplicate
                keywords = list(set(keywords))
                project_gjc[w] = keywords
        mysql.sql_change_msg(
            """insert into user_keyword (xmmc, xzwt, xtjc, xmmb, yqjx, jsxq, sjxq, aqxq, ywly, hxyw, ywxq, ywxt, jscj, yhfw, mbqt, jsnr, gnmk, sjgx, znys) value ("%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s")"""
            % (dl[0], str(project_gjc.get("xzwt"))[1:-1] if project_gjc.get("xzwt") else None,
               str(project_gjc.get("xtjc"))[1:-1] if project_gjc.get("xtjc") else None,
               str(project_gjc.get("xmmb"))[1:-1] if project_gjc.get("xmmb") else None,
               str(project_gjc.get("yqjx"))[1:-1] if project_gjc.get("yqjx") else None,
               str(project_gjc.get("jsxq"))[1:-1] if project_gjc.get("jsxq") else None,
               str(project_gjc.get("sjxq"))[1:-1] if project_gjc.get("sjxq") else None,
               str(project_gjc.get("aqxq"))[1:-1] if project_gjc.get("aqxq") else None,
               str(project_gjc.get("ywly"))[1:-1] if project_gjc.get("ywly") else None,
               str(project_gjc.get("hxyw"))[1:-1] if project_gjc.get("hxyw") else None,
               str(project_gjc.get("ywxq"))[1:-1] if project_gjc.get("ywxq") else None,
               str(project_gjc.get("ywxt"))[1:-1] if project_gjc.get("ywxt") else None,
               str(project_gjc.get("jscj"))[1:-1] if project_gjc.get("jscj") else None,
               str(project_gjc.get("yhfw"))[1:-1] if project_gjc.get("yhfw") else None,
               str(project_gjc.get("mbqt"))[1:-1] if project_gjc.get("mbqt") else None,
               str(project_gjc.get("jsnr"))[1:-1] if project_gjc.get("jsnr") else None,
               str(project_gjc.get("gnmk"))[1:-1] if project_gjc.get("gnmk") else None,
               str(project_gjc.get("sjgx"))[1:-1] if project_gjc.get("sjgx") else None,
               str(project_gjc.get("znys"))[1:-1] if project_gjc.get("znys") else None))
        # Mark the project as checked and store the history/module counts.
        mysql.sql_change_msg(
            """update idc_project set dup_status=3, one_vote_veto_status=1, self_check_status=1, history_project_count=%d ,module_count=%d where project_id=%d""" % (
                xmnr_count, gnmk_count, dl[0]))
        gong_neng_mo_kuai(mysql, dl, data, er_title)


def check_project_info(mysql, dl, xc, str_dict):
total_keywords = {}
total_similarity = 0
dup_count = 0
# 保存相加后的相似度到idc_project_check
mysql.sql_change_msg(
"""insert into idc_project_check (project_id, dup_project_name, file_path, company_name, create_year, project_tag, project_range_tag, project_area, create_time, update_time) value ("%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s")"""
% (dl[0], escape_string(xc.get("xmmc")), escape_string(dl[1]), "", "", "需求相似、业务相似", "历史项目", "",
str(datetime.datetime.now())[:-7], str(datetime.datetime.now())[:-7]))
dup_id = mysql.cur.lastrowid
for x in list(xc.keys())[1:]:
content_x = xc.get(x)
content_y = str_dict.get(x)
if content_x and content_y:
if x == 'gnmk':
continue
elif x == 'jsnr':
continue
else:
dup_count += 1
if ((xc['gnmk'] == 'None' or xc['gnmk'] is None or str.strip(xc['gnmk']) == '') and (str_dict['gnmk'] is None or str.strip(str_dict['gnmk']) == '')) and (
not xc['jsnr'] is None and xc['jsnr'] != 'None' and not str_dict['jsnr'] is None and len(str.strip(str_dict['jsnr'])) > 0):
for x in list(xc.keys())[1:]:
content_x = xc.get(x)
content_y = str_dict.get(x)
if content_x and content_y:
if x == 'gnmk':
# 匹配到历史数据,次数加1
# dup_count += dup_file_test
# 循环遍历每一个维度
similarity = cosin_similarity.CosineSimilarity(content_x, content_y)
# 相似度 关键词
similarity, keywords_x, keywords_y = similarity.main()
similarity = similarity * 0
# print("**************相似度: %.2f%%" % similarity, "关键词: %s" % keywords_y)
# 相似度相加
total_similarity += similarity
# 去重
keywords_y = list(set(keywords_y))
# 去重
keywords_x = list(set(keywords_x))
# 关键词收集
total_keywords[x] = keywords_y
function_content = content_y
dup_function_content = content_x
for word_y in keywords_y:
word_y = word_y.strip().strip("'").strip('"')
if word_y != '':
function_content = str(function_content.replace("\"", "'")).replace(word_y,
f'<span class="similarity">{word_y.strip()}</span>')
for word_x in keywords_x:
word_x = word_x.strip().strip("'").strip('"')
if word_x != '':
dup_function_content = str(dup_function_content.replace("\"", "'")).replace(word_x,
f'<span class="similarity">{word_x.strip()}</span>')
# 保存每个维度对应的相似度到idc_project_check_detail
mysql.sql_change_msg(
"""insert into idc_project_check_detail (dup_id, dimension, similarity, function_content, dup_function_content, create_time, update_time) value (%d, "%s", %f, "%s", "%s", "%s", "%s")"""
% (dup_id, wdys2.get(x), similarity, escape_string(function_content),
escape_string(dup_function_content), str(datetime.datetime.now())[:-7],
str(datetime.datetime.now())[:-7]))
# content = content.replace(gjcs, f'<span class="similarity">{gjcs.strip()}</span>')
elif x == 'jsnr':
similarity = cosin_similarity.CosineSimilarity(content_x, content_y)
# 相似度 关键词
similarity, keywords_x, keywords_y = similarity.main()
similarity = similarity * 40
# print("**************相似度: %.2f%%" % similarity, "关键词: %s" % keywords_y)
# 相似度相加
total_similarity += similarity
# 去重
keywords_y = list(set(keywords_y))
# 去重
keywords_x = list(set(keywords_x))
# 关键词收集
total_keywords[x] = keywords_y
function_content = content_y
dup_function_content = content_x
for word_y in keywords_y:
word_y = word_y.strip().strip("'").strip('"')
if word_y != '':
function_content = str(function_content.replace("\"", "'")).replace(word_y,
f'<span class="similarity">{word_y.strip()}</span>')
for word_x in keywords_x:
word_x = word_x.strip().strip("'").strip('"')
if word_x != '':
dup_function_content = str(dup_function_content.replace("\"", "'")).replace(word_x,
f'<span class="similarity">{word_x.strip()}</span>')
# 保存每个维度对应的相似度到idc_project_check_detail
mysql.sql_change_msg(
"""insert into idc_project_check_detail (dup_id, dimension, similarity, function_content, dup_function_content, create_time, update_time) value (%d, "%s", %f, "%s", "%s", "%s", "%s")"""
% (dup_id, wdys2.get(x), similarity, escape_string(function_content),
escape_string(dup_function_content), str(datetime.datetime.now())[:-7],
str(datetime.datetime.now())[:-7]))
else:
similarity = cosin_similarity.CosineSimilarity(content_x, content_y)
# 相似度 关键词
similarity, keywords_x, keywords_y = similarity.main()
similarity = similarity * (60 / dup_count)
# print("**************相似度: %.2f%%" % similarity, "关键词: %s" % keywords_y)
# 相似度相加
total_similarity += similarity
# 去重
keywords_y = list(set(keywords_y))
# 去重
keywords_x = list(set(keywords_x))
# 关键词收集
total_keywords[x] = keywords_y
function_content = content_y
dup_function_content = content_x

for word_y in keywords_y:
word_y = word_y.strip().strip("'").strip('"')
if word_y != '':
function_content = str(function_content.replace("\"", "'")).replace(word_y,
f'<span class="similarity">{word_y.strip()}</span>')
for word_x in keywords_x:
word_x = word_x.strip().strip("'").strip('"')
if word_x != '':
dup_function_content = str(dup_function_content.replace("\"", "'")).replace(word_x,
f'<span class="similarity">{word_x.strip()}</span>')
# 保存每个维度对应的相似度到idc_project_check_detail
mysql.sql_change_msg(
"""insert into idc_project_check_detail (dup_id, dimension, similarity, function_content, dup_function_content, create_time, update_time) value (%d, "%s", %f, "%s", "%s", "%s", "%s")"""
% (dup_id, wdys2.get(x), similarity, escape_string(function_content),
escape_string(dup_function_content), str(datetime.datetime.now())[:-7],
str(datetime.datetime.now())[:-7]))
elif ((xc['jsnr'] == 'None' or xc['jsnr'] is None or str.strip(xc['jsnr']) == '') and (str_dict['jsnr'] is None or str.strip(str_dict['jsnr']) == '')) and (
not xc['gnmk'] is None and xc['gnmk'] != 'None' and not str_dict['gnmk'] is None and len(str.strip(str_dict['gnmk'])) > 0):
for x in list(xc.keys())[1:]:
content_x = xc.get(x)
content_y = str_dict.get(x)
if content_x and content_y:
if x == 'gnmk':
# 匹配到历史数据,次数加1
# dup_count += dup_file_test
# 循环遍历每一个维度
similarity = cosin_similarity.CosineSimilarity(content_x, content_y)
# 相似度 关键词
similarity, keywords_x, keywords_y = similarity.main()
similarity = similarity * 50
# print("**************相似度: %.2f%%" % similarity, "关键词: %s" % keywords_y)
# 相似度相加
total_similarity += similarity
# 去重
keywords_y = list(set(keywords_y))
# 去重
keywords_x = list(set(keywords_x))
# 关键词收集
total_keywords[x] = keywords_y

function_content = content_y
dup_function_content = content_x
for word_y in keywords_y:
word_y = word_y.strip().strip("'").strip('"')
if word_y != '':
function_content = str(function_content.replace("\"", "'")).replace(word_y,
f'<span class="similarity">{word_y.strip()}</span>')
for word_x in keywords_x:
word_x = word_x.strip().strip("'").strip('"')
if word_x != '':
dup_function_content = str(dup_function_content.replace("\"", "'")).replace(word_x,
f'<span class="similarity">{word_x.strip()}</span>')
# 保存每个维度对应的相似度到idc_project_check_detail
mysql.sql_change_msg(
"""insert into idc_project_check_detail (dup_id, dimension, similarity, function_content, dup_function_content, create_time, update_time) value (%d, "%s", %f, "%s", "%s", "%s", "%s")"""
% (dup_id, wdys2.get(x), similarity, escape_string(function_content),
escape_string(dup_function_content), str(datetime.datetime.now())[:-7],
str(datetime.datetime.now())[:-7]))
# content = content.replace(gjcs, f'<span class="similarity">{gjcs.strip()}</span>')
elif x == 'jsnr':
similarity = cosin_similarity.CosineSimilarity(content_x, content_y)
# 相似度 关键词
similarity, keywords_x, keywords_y = similarity.main()
similarity = similarity * 0
# print("**************相似度: %.2f%%" % similarity, "关键词: %s" % keywords_y)
# 相似度相加
total_similarity += similarity
# 去重
keywords_y = list(set(keywords_y))
# 去重
keywords_x = list(set(keywords_x))
# 关键词收集
total_keywords[x] = keywords_y
function_content = content_y
dup_function_content = content_x
for word_y in keywords_y:
word_y = word_y.strip().strip("'").strip('"')
if word_y != '':
function_content = str(function_content.replace("\"", "'")).replace(word_y,
f'<span class="similarity">{word_y.strip()}</span>')
for word_x in keywords_x:
word_x = word_x.strip().strip("'").strip('"')
if word_x != '':
dup_function_content = str(dup_function_content.replace("\"", "'")).replace(word_x,
f'<span class="similarity">{word_x.strip()}</span>')
# 保存每个维度对应的相似度到idc_project_check_detail
mysql.sql_change_msg(
"""insert into idc_project_check_detail (dup_id, dimension, similarity, function_content, dup_function_content, create_time, update_time) value (%d, "%s", %f, "%s", "%s", "%s", "%s")"""
% (dup_id, wdys2.get(x), similarity, escape_string(function_content),
escape_string(dup_function_content), str(datetime.datetime.now())[:-7],
str(datetime.datetime.now())[:-7]))
else:
similarity = cosin_similarity.CosineSimilarity(content_x, content_y)
# 相似度 关键词
similarity, keywords_x, keywords_y = similarity.main()
similarity = similarity * (50 / dup_count)
# print("**************相似度: %.2f%%" % similarity, "关键词: %s" % keywords_y)
# 相似度相加
total_similarity += similarity
# 去重
keywords_y = list(set(keywords_y))
# 去重
keywords_x = list(set(keywords_x))
# 关键词收集
total_keywords[x] = keywords_y
function_content = content_y
dup_function_content = content_x
for word_y in keywords_y:
word_y = word_y.strip().strip("'").strip('"')
if word_y != '':
function_content = str(function_content.replace("\"", "'")).replace(word_y,
f'<span class="similarity">{word_y.strip()}</span>')
for word_x in keywords_x:
word_x = word_x.strip().strip("'").strip('"')
if word_x != '':
dup_function_content = str(dup_function_content.replace("\"", "'")).replace(word_x,
f'<span class="similarity">{word_x.strip()}</span>')
# 保存每个维度对应的相似度到idc_project_check_detail
mysql.sql_change_msg(
"""insert into idc_project_check_detail (dup_id, dimension, similarity, function_content, dup_function_content, create_time, update_time) value (%d, "%s", %f, "%s", "%s", "%s", "%s")"""
% (dup_id, wdys2.get(x), similarity, escape_string(function_content),
escape_string(dup_function_content), str(datetime.datetime.now())[:-7],
str(datetime.datetime.now())[:-7]))
else:
for x in list(xc.keys())[1:]:
content_x = xc.get(x)
content_y = str_dict.get(x)
if content_x and content_y:
if x == 'gnmk':
# 匹配到历史数据,次数加1
# dup_count += dup_file_test
# 循环遍历每一个维度
similarity = cosin_similarity.CosineSimilarity(content_x, content_y)
# 相似度 关键词
similarity, keywords_x, keywords_y = similarity.main()
similarity = similarity * 50

# 相似度相加
total_similarity += similarity
# 去重
keywords_y = list(set(keywords_y))
# 去重
keywords_x = list(set(keywords_x))
print("**************相似度: %.2f%%" % similarity, "关键词: %s" % keywords_y)
# 关键词收集
total_keywords[x] = keywords_y

function_content = content_y
dup_function_content = content_x
for word_y in keywords_y:
word_y = word_y.strip().strip("'").strip('"')
if word_y != '':
function_content = str(function_content.replace("\"", "'")).replace(word_y,
f'<span class="similarity">{word_y.strip()}</span>')
for word_x in keywords_x:
word_x = word_x.strip().strip("'").strip('"')
if word_x != '':
dup_function_content = str(dup_function_content.replace("\"", "'")).replace(word_x,
f'<span class="similarity">{word_x.strip()}</span>')
# 保存每个维度对应的相似度到idc_project_check_detail
mysql.sql_change_msg(
"""insert into idc_project_check_detail (dup_id, dimension, similarity, function_content, dup_function_content, create_time, update_time) value (%d, "%s", %f, "%s", "%s", "%s", "%s")"""
% (dup_id, wdys2.get(x), similarity, escape_string(function_content),
escape_string(dup_function_content), str(datetime.datetime.now())[:-7],
str(datetime.datetime.now())[:-7]))
# content = content.replace(gjcs, f'<span class="similarity">{gjcs.strip()}</span>')
elif x == 'jsnr':
similarity = cosin_similarity.CosineSimilarity(content_x, content_y)
# 相似度 关键词
similarity, keywords_x, keywords_y = similarity.main()
similarity = similarity * 40
# 相似度相加
total_similarity += similarity
# 去重
keywords_y = list(set(keywords_y))
# 去重
keywords_x = list(set(keywords_x))
print("**************相似度: %.2f%%" % similarity, "关键词: %s" % keywords_y)
# 关键词收集
total_keywords[x] = keywords_y
function_content = content_y
dup_function_content = content_x
for word_y in keywords_y:
word_y = word_y.strip().strip("'").strip('"')
if word_y != '':
function_content = str(function_content.replace("\"", "'")).replace(word_y,
f'<span class="similarity">{word_y.strip()}</span>')
for word_x in keywords_x:
word_x = word_x.strip().strip("'").strip('"')
if word_x != '':
dup_function_content = str(dup_function_content.replace("\"", "'")).replace(word_x,
f'<span class="similarity">{word_x.strip()}</span>')
# 保存每个维度对应的相似度到idc_project_check_detail
mysql.sql_change_msg(
"""insert into idc_project_check_detail (dup_id, dimension, similarity, function_content, dup_function_content, create_time, update_time) value (%d, "%s", %f, "%s", "%s", "%s", "%s")"""
% (dup_id, wdys2.get(x), similarity, escape_string(function_content),
escape_string(dup_function_content), str(datetime.datetime.now())[:-7],
str(datetime.datetime.now())[:-7]))
else:
similarity = cosin_similarity.CosineSimilarity(content_x, content_y)
# 相似度 关键词
similarity, keywords_x, keywords_y = similarity.main()
similarity = similarity * (10 / dup_count)
# print("**************相似度: %.2f%%" % similarity, "关键词: %s" % keywords_y)
# 相似度相加
total_similarity += similarity
# 去重
keywords_y = list(set(keywords_y))
# 去重
keywords_x = list(set(keywords_x))
# 关键词收集
total_keywords[x] = keywords_y
function_content = content_y
dup_function_content = content_x
for word_y in keywords_y:
word_y = word_y.strip().strip("'").strip('"')
if word_y != '':
function_content = str(function_content.replace("\"", "'")).replace(word_y, f'<span class="similarity">{word_y.strip()}</span>')
for word_x in keywords_x:
word_x = word_x.strip().strip("'").strip('"')
if word_x != '':
dup_function_content = str(dup_function_content.replace("\"", "'")).replace(word_x, f'<span class="similarity">{word_x.strip()}</span>')
# 保存每个维度对应的相似度到idc_project_check_detail
mysql.sql_change_msg(
"""insert into idc_project_check_detail (dup_id, dimension, similarity, function_content, dup_function_content, create_time, update_time) value (%d, "%s", %f, "%s", "%s", "%s", "%s")"""
% (dup_id, wdys2.get(x), similarity, escape_string(function_content),
escape_string(dup_function_content), str(datetime.datetime.now())[:-7],
str(datetime.datetime.now())[:-7]))

mysql.sql_change_msg(
"""update idc_project_check set similarity=%f where dup_id=%d""" % (total_similarity, dup_id))


if __name__ == "__main__":
    # Ask the local duplicate-check service for the candidate projects.
    payload = requests.get("http://127.0.0.1:19099/check/duplicates/%s" % 599).json()
    data_list = [
        (item.get("project_id"), item.get("file_path"), item.get("project_name"))
        for item in payload.get("data")
    ]
    print(data_list)
    # Hard-coded single-project override kept for local debugging runs.
    data_list = [(11, r"D:\ningda\dup_check2\dup_check\0825-丽水系统查重维度1.xlsx", "水路运输综合监管系统建设项目.xls")]
    project_check(data_list)

+ 675
- 0
main1.py.word.bak 查看文件

@@ -0,0 +1,675 @@
# coding=utf-8
import sys
import re
import mysql_pool
from pymysql.converters import escape_string
import cosin_similarity
import pandas as pd
import datetime
import requests


# Chinese dimension label (as it appears in the Excel template) -> short
# pinyin code used as a column name in the database tables.
wdys1 = {
    "项目名称": "xmmc",
    "现状问题": "xzwt",
    "系统基础": "xtjc",
    "项目目标": "xmmb",
    "预期绩效": "yqjx",
    "建设需求": "jsxq",
    "数据需求": "sjxq",
    "安全需求": "aqxq",
    "业务领域": "ywly",
    "核心业务": "hxyw",
    "业务需求": "ywxq",
    "业务协同": "ywxt",
    "建设层级": "jscj",
    "用户范围": "yhfw",
    "目标群体": "mbqt",
    "建设内容": "jsnr",
    "功能模块": "gnmk",
    "数据共享": "sjgx",
    "智能要素": "znys"
}
# Reverse mapping: pinyin dimension code -> Chinese display label (used when
# writing the "dimension" column of idc_project_check_detail).
wdys2 = {
    "xmmc": "项目名称",
    "xzwt": "现状问题",
    "xtjc": "系统基础",
    "xmmb": "项目目标",
    "yqjx": "预期绩效",
    "jsxq": "建设需求",
    "sjxq": "数据需求",
    "aqxq": "安全需求",
    "ywly": "业务领域",
    "hxyw": "核心业务",
    "ywxq": "业务需求",
    "ywxt": "业务协同",
    "jscj": "建设层级",
    "yhfw": "用户范围",
    "mbqt": "目标群体",
    "jsnr": "建设内容",
    "gnmk": "功能模块",
    "sjgx": "数据共享",
    "znys": "智能要素"
}
# Function-module sub-dimension codes -> Chinese labels.
gnmkys = {
    "gnmc": "功能名称",
    "gnms": "功能描述"
}


def getFlag(path="0825.xlsx"):
    """Read dimension weight annotations from an Excel template sheet.

    Scans the second column of *path* for cells shaped like
    ``<dimension label> ... NN%`` and maps each recognised dimension (via
    ``wdys1``) to its weight text.

    Args:
        path: Excel file to read. Defaults to the historical "0825.xlsx"
            so existing zero-argument callers are unaffected.

    Returns:
        dict: pinyin dimension code -> weight text (e.g. ``"50%"``).
    """
    data_dict = {}
    df = pd.read_excel(path)
    data = df.values
    data = list(pd.Series(data[:, 1]).dropna())
    for d in data:
        try:
            wd = re.search("(.*?)(.*?%)", d).group(1).strip()
            wdc = wdys1.get(wd)
            if wdc:
                qz = re.search(".*?((.*?%))", d).group(1)
                data_dict[wdc] = qz
        except Exception:
            # Cell did not match the expected "label ... NN%" pattern; skip it.
            # (Was a bare `except:` which also swallowed KeyboardInterrupt.)
            pass
    return data_dict


def gong_neng_mo_kuai(mysql, dl, data, er_title):
    """Persist the function modules of one project and run per-module checks.

    Args:
        mysql: project MySQL pool wrapper exposing sql_change_msg /
            sql_select_many and a live cursor (``mysql.cur``).
        dl: (project_id, excel_path, project_name) tuple for the project.
        data: raw cell matrix read from the project's Excel sheet.
        er_title: set of second-level module names found in the sheet.
    """
    # Concatenate all third-level description cells (column 3) per module name.
    str_dict = {}
    for et in er_title:
        for d in data:
            if d[1] == et:
                if str_dict.get(et):
                    str_dict[et] = str_dict.get(et) + d[3]
                else:
                    str_dict[et] = d[3]

    # One idc_project_module row per module.
    # NOTE(review): values are interpolated into the SQL string directly;
    # assumes trusted spreadsheet content — confirm, or use parameterized SQL.
    for k, v in str_dict.items():
        mysql.sql_change_msg(
            """insert into idc_project_module (project_id, check_duplicate_count, module_name, module_content, create_time, update_time, tag) value(%d, 1, "%s", "%s", "%s", "%s", "模块")""" % (
                int(dl[0]), k, v, str(datetime.datetime.now())[:-7], str(datetime.datetime.now())[:-7]))

    # Read back the generated module ids for this project.
    module_id_list = mysql.sql_select_many(
        """select project_module_id, module_name, module_content from idc_project_module where project_id=%d""" % dl[
            0])
    data_list = []
    for mil in module_id_list:
        data_dict = {}
        data_dict["project_module_id"] = mil.get("project_module_id")
        data_dict["gnmc"] = mil.get("module_name")
        data_dict["gnms"] = mil.get("module_content")
        data_list.append(data_dict)
    # Compare each new module against every historical module.
    for i in data_list:
        gnmk_copy1 = mysql.sql_select_many("""select * from user_history_module_data""")
        if gnmk_copy1:
            for gc in gnmk_copy1:
                # Debug echo of the SQL that is about to run.
                print(
                    """insert into idc_project_module_check (project_module_id, module_name, project_name, company_name, create_time, update_time) value(%d, "%s", "%s", "%s", "%s", "%s")"""
                    % (
                        i.get("project_module_id"), escape_string(gc.get("gnmc")), escape_string(gc.get("xmmc")), "",
                        str(datetime.datetime.now())[:-7],
                        str(datetime.datetime.now())[:-7]))
                mysql.sql_change_msg(
                    """insert into idc_project_module_check (project_module_id, module_name, project_name, company_name, create_time, update_time) value(%d, "%s", "%s", "%s", "%s", "%s")"""
                    % (
                        i.get("project_module_id"), escape_string(gc.get("gnmc")), escape_string(gc.get("xmmc")), "",
                        str(datetime.datetime.now())[:-7],
                        str(datetime.datetime.now())[:-7]))
                # lastrowid of the insert above keys the detail rows.
                dup_module_id = mysql.cur.lastrowid
                check_module_info(mysql, gc, dl, i, dup_module_id)
            # Harvest keywords for each sub-dimension of this module by
            # comparing the text with itself (the similarity value itself is
            # a by-product; only the extracted keywords are kept).
            gnmk_gjc = {}
            for a in ["gnmc", "gnms"]:
                if i.get(a):
                    content_x = i.get(a)
                    content_y = i.get(a)
                    if a == "gnmc":
                        similarity = cosin_similarity.CosineSimilarity(content_x, content_y)
                        # similarity, keywords of both sides
                        similarity, keyword_x, keywords = similarity.main()
                        # de-duplicate
                        keywords = list(set(keywords))
                        gnmk_gjc[a] = keywords
                    else:
                        similarity = cosin_similarity.CosineSimilarity(content_x, content_y)
                        # similarity, keywords of both sides
                        similarity, keyword_x, keywords = similarity.main()
                        # de-duplicate
                        keywords = list(set(keywords))
                        gnmk_gjc[a] = keywords
            mysql.sql_change_msg("""insert into user_module_keywords (xmmc, gnmc, gnms) value("%s", "%s", "%s")""" % (
                dl[2], str(gnmk_gjc.get("gnmc"))[1:-1] if gnmk_gjc.get("gnmc") else None,
                str(gnmk_gjc.get("gnms"))[1:-1] if gnmk_gjc.get("gnms") else None))


def check_module_info(mysql, gc, dl, pro, dup_module_id, similarity=None):
    """Compare one historical module (*gc*) against a candidate module (*pro*).

    Scores the 'gnmc' (name, weight 1) and 'gnms' (description, weight 99)
    sub-dimensions, writes one idc_project_module_check_detail row per scored
    dimension and stores the combined similarity on *dup_module_id*.
    """
    # sub-dimension code -> (weight, display label)
    dimension_spec = {"gnmc": (1, "功能名称"), "gnms": (99, "功能模块描述")}
    weighted_total = 0
    for code, (weight, label) in dimension_spec.items():
        history_text = gc.get(code)
        candidate_text = pro.get(code)
        if not (history_text and candidate_text):
            continue
        score, _, _ = cosin_similarity.CosineSimilarity(history_text, candidate_text).main()
        score = score * weight
        weighted_total += score
        mysql.sql_change_msg(
            """insert into idc_project_module_check_detail (dup_module_id, project_name, module_content, dup_module_content, similarity, dimension, create_time, update_time) value (%d, "%s", "%s", "%s", %f, "%s", "%s", "%s")"""
            % (dup_module_id, dl[2], escape_string(candidate_text), escape_string(history_text),
               score, label,
               str(datetime.datetime.now())[:-7], str(datetime.datetime.now())[:-7]))
    mysql.sql_change_msg("""update idc_project_module_check set similarity=%f where dup_module_id=%d""" % (
        weighted_total, dup_module_id))


def project_check(data_list):
    """End-to-end duplicate check for a batch of projects.

    For each (project_id, excel_path, project_name) tuple: parse the
    dimension Excel sheet, persist the merged per-dimension texts, compare
    the project against every historical project, harvest per-dimension
    keywords, update the project's status flags, and finally run the
    per-function-module check.
    """
    mysql = mysql_pool.ConnMysql()
    # Sizes of the historical corpora, recorded on idc_project at the end.
    xmnr_count = len(mysql.sql_select_many("""select * from user_history_data"""))
    gnmk_count = len(mysql.sql_select_many("""select * from user_history_module_data"""))
    # Dimension weights from the template sheet (read but not used below).
    get_data_dict = getFlag()
    # Iterate over the Excel paths to score.
    for dl in data_list:
        print(dl, dl[1])
        df = pd.read_excel(dl[1])
        data = df.values
        # Concatenate every dimension's non-empty cells into one string.
        join_str = ""
        str_dict = {}
        gnmk_str = []
        title = ""
        er_title = set()
        for d in data:
            if pd.notnull(d[0]):
                # Row starts a new first-level dimension.
                title = d[0]
                if title == "功能模块":
                    er_title.add(d[1])
                join_str = ""
                for i in d[1:]:
                    if pd.notnull(i):
                        join_str += str(i)
                        if title == "功能模块":
                            # Collect module cells, skipping the header cell.
                            if i == '功能描述':
                                continue
                            else:
                                gnmk_str.append(i)
                str_dict[wdys1.get(title)] = join_str
            else:
                # Continuation row of the current dimension.
                # NOTE(review): join_str is not reset here, so text from the
                # previous row is appended again — looks like a latent bug;
                # confirm against the expected sheet layout.
                if title == "功能模块":
                    er_title.add(d[1])
                for i in d[1:]:
                    if pd.notnull(i):
                        join_str += str(i)
                str_dict[wdys1.get(title)] = str_dict.get(wdys1.get(title)) + join_str
        gnmk = ",".join(gnmk_str)
        str_dict['gnmk'] = gnmk
        # Persist the merged dimension texts for this project.
        # NOTE(review): values are interpolated into the SQL string directly;
        # assumes trusted spreadsheet content — confirm, or parameterize.
        mysql.sql_change_msg(
            """insert into user_data (xmmc, xzwt, xtjc, xmmb, yqjx, jsxq, sjxq, aqxq, ywly, hxyw, ywxq, ywxt, jscj, yhfw, mbqt, jsnr, gnmk, sjgx, znys) value ("%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s")"""
            % (dl[0], str_dict.get("xzwt") if str_dict.get("xzwt") else None,
               str_dict.get("xtjc") if str_dict.get("xtjc") else None,
               str_dict.get("xmmb") if str_dict.get("xmmb") else None,
               str_dict.get("yqjx") if str_dict.get("yqjx") else None,
               str_dict.get("jsxq") if str_dict.get("jsxq") else None,
               str_dict.get("sjxq") if str_dict.get("sjxq") else None,
               str_dict.get("aqxq") if str_dict.get("aqxq") else None,
               str_dict.get("ywly") if str_dict.get("ywly") else None,
               str_dict.get("hxyw") if str_dict.get("hxyw") else None,
               str_dict.get("ywxq") if str_dict.get("ywxq") else None,
               str_dict.get("ywxt") if str_dict.get("ywxt") else None,
               str_dict.get("jscj") if str_dict.get("jscj") else None,
               str_dict.get("yhfw") if str_dict.get("yhfw") else None,
               str_dict.get("mbqt") if str_dict.get("mbqt") else None,
               str_dict.get("jsnr") if str_dict.get("jsnr") else None,
               str_dict.get("gnmk") if str_dict.get("gnmk") else None,
               str_dict.get("sjgx") if str_dict.get("sjgx") else None,
               str_dict.get("znys") if str_dict.get("znys") else None))
        # Compare this project against every historical project.
        xmnr_copy1 = mysql.sql_select_many("""select * from user_history_data""")
        if xmnr_copy1:
            for xc in xmnr_copy1:
                check_project_info(mysql, dl, xc, str_dict)

        # Self-compare each dimension to harvest its keywords (the similarity
        # value of a text with itself is discarded; only keywords are kept).
        project_gjc = {}
        for w in wdys2.keys():
            content_x = str_dict.get(w)
            content_y = str_dict.get(w)
            if content_x and content_y:
                similarity = cosin_similarity.CosineSimilarity(content_x, content_y)
                similarity, keywords_x, keywords = similarity.main()
                keywords = list(set(keywords))
                project_gjc[w] = keywords
        mysql.sql_change_msg(
            """insert into user_keyword (xmmc, xzwt, xtjc, xmmb, yqjx, jsxq, sjxq, aqxq, ywly, hxyw, ywxq, ywxt, jscj, yhfw, mbqt, jsnr, gnmk, sjgx, znys) value ("%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s")"""
            % (dl[0], str(project_gjc.get("xzwt"))[1:-1] if project_gjc.get("xzwt") else None,
               str(project_gjc.get("xtjc"))[1:-1] if project_gjc.get("xtjc") else None,
               str(project_gjc.get("xmmb"))[1:-1] if project_gjc.get("xmmb") else None,
               str(project_gjc.get("yqjx"))[1:-1] if project_gjc.get("yqjx") else None,
               str(project_gjc.get("jsxq"))[1:-1] if project_gjc.get("jsxq") else None,
               str(project_gjc.get("sjxq"))[1:-1] if project_gjc.get("sjxq") else None,
               str(project_gjc.get("aqxq"))[1:-1] if project_gjc.get("aqxq") else None,
               str(project_gjc.get("ywly"))[1:-1] if project_gjc.get("ywly") else None,
               str(project_gjc.get("hxyw"))[1:-1] if project_gjc.get("hxyw") else None,
               str(project_gjc.get("ywxq"))[1:-1] if project_gjc.get("ywxq") else None,
               str(project_gjc.get("ywxt"))[1:-1] if project_gjc.get("ywxt") else None,
               str(project_gjc.get("jscj"))[1:-1] if project_gjc.get("jscj") else None,
               str(project_gjc.get("yhfw"))[1:-1] if project_gjc.get("yhfw") else None,
               str(project_gjc.get("mbqt"))[1:-1] if project_gjc.get("mbqt") else None,
               str(project_gjc.get("jsnr"))[1:-1] if project_gjc.get("jsnr") else None,
               str(project_gjc.get("gnmk"))[1:-1] if project_gjc.get("gnmk") else None,
               str(project_gjc.get("sjgx"))[1:-1] if project_gjc.get("sjgx") else None,
               str(project_gjc.get("znys"))[1:-1] if project_gjc.get("znys") else None))
        # Mark the project as checked and record the corpus sizes.
        mysql.sql_change_msg(
            """update idc_project set dup_status=3, one_vote_veto_status=1, self_check_status=1, history_project_count=%d ,module_count=%d where project_id=%d""" % (
                xmnr_count, gnmk_count, dl[0]))
        gong_neng_mo_kuai(mysql, dl, data, er_title)


def _check_dimension(mysql, dup_id, dimension, weight, content_x, content_y, total_keywords):
    """Score one dimension pair, persist its detail row, return the weighted score.

    Computes cosine similarity between the historical text (*content_x*) and
    the new project's text (*content_y*), wraps every extracted keyword in a
    highlight ``<span>``, stores one idc_project_check_detail row under
    *dup_id*, and records the new-project keywords in *total_keywords*.
    """
    sim, keywords_x, keywords_y = cosin_similarity.CosineSimilarity(content_x, content_y).main()
    sim = sim * weight
    # De-duplicate keywords on both sides.
    keywords_y = list(set(keywords_y))
    keywords_x = list(set(keywords_x))
    total_keywords[dimension] = keywords_y
    function_content = content_y
    dup_function_content = content_x
    for word_y in keywords_y:
        word_y = word_y.strip().strip("'").strip('"')
        if word_y != '':
            function_content = str(function_content.replace("\"", "'")).replace(
                word_y, f'<span class="similarity">{word_y.strip()}</span>')
    for word_x in keywords_x:
        word_x = word_x.strip().strip("'").strip('"')
        if word_x != '':
            dup_function_content = str(dup_function_content.replace("\"", "'")).replace(
                word_x, f'<span class="similarity">{word_x.strip()}</span>')
    # Persist the per-dimension similarity row.
    mysql.sql_change_msg(
        """insert into idc_project_check_detail (dup_id, dimension, similarity, function_content, dup_function_content, create_time, update_time) value (%d, "%s", %f, "%s", "%s", "%s", "%s")"""
        % (dup_id, wdys2.get(dimension), sim, escape_string(function_content),
           escape_string(dup_function_content), str(datetime.datetime.now())[:-7],
           str(datetime.datetime.now())[:-7]))
    return sim


def check_project_info(mysql, dl, xc, str_dict):
    """Compare a new project (*str_dict*) against one historical project (*xc*).

    The previous implementation repeated ~100 identical lines per weighting
    branch; the branches differed only in the weights applied to the 'gnmk'
    (function modules), 'jsnr' (construction content) and remaining
    dimensions:

      * gnmk empty on both sides, jsnr present on both:  0 / 40 / 60/n
      * jsnr empty on both sides, gnmk present on both: 50 /  0 / 50/n
      * otherwise:                                      50 / 40 / 10/n

    where n is the number of non-gnmk/jsnr dimensions present on both sides.
    Rows written and similarity totals are unchanged; only the scattered
    debug prints were dropped, and the residual-weight division is now
    guarded against n == 0 (previously only safe by accident).
    """
    total_keywords = {}
    total_similarity = 0
    print(f'xmmc is {xc.get("xmmc")}')
    # Header row for this historical-project comparison; its autoincrement id
    # keys every detail row below.
    mysql.sql_change_msg(
        """insert into idc_project_check (project_id, dup_project_name, file_path, company_name, create_year, project_tag, project_range_tag, project_area, create_time, update_time) value ("%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s")"""
        % (dl[0], escape_string(xc.get("xmmc")), escape_string(dl[1]), "", "", "需求相似、业务相似", "历史项目", "",
           str(datetime.datetime.now())[:-7], str(datetime.datetime.now())[:-7]))
    dup_id = mysql.cur.lastrowid

    # Count the "other" dimensions available on both sides; they share the
    # residual weight equally.
    dup_count = 0
    for x in list(xc.keys())[1:]:
        if x not in ('gnmk', 'jsnr') and xc.get(x) and str_dict.get(x):
            dup_count += 1

    def _absent(old, new):
        # Dimension treated as absent when the historical value is
        # NULL / literal 'None' / blank AND the new value is NULL / blank.
        return (old is None or old == 'None' or str.strip(old) == '') and \
               (new is None or str.strip(new) == '')

    def _present(old, new):
        # Dimension present on both sides (historical may not be 'None').
        return old is not None and old != 'None' and new is not None and len(str.strip(new)) > 0

    if _absent(xc['gnmk'], str_dict['gnmk']) and _present(xc['jsnr'], str_dict['jsnr']):
        gnmk_weight, jsnr_weight, other_base = 0, 40, 60
    elif _absent(xc['jsnr'], str_dict['jsnr']) and _present(xc['gnmk'], str_dict['gnmk']):
        gnmk_weight, jsnr_weight, other_base = 50, 0, 50
    else:
        gnmk_weight, jsnr_weight, other_base = 50, 40, 10
    # When dup_count is 0 no "other" dimension is ever scored, so the
    # fallback value is never used; the guard only prevents ZeroDivisionError.
    other_weight = other_base / dup_count if dup_count else 0

    # Score every dimension present on both sides and accumulate the total.
    for x in list(xc.keys())[1:]:
        content_x = xc.get(x)
        content_y = str_dict.get(x)
        if content_x and content_y:
            if x == 'gnmk':
                weight = gnmk_weight
            elif x == 'jsnr':
                weight = jsnr_weight
            else:
                weight = other_weight
            total_similarity += _check_dimension(
                mysql, dup_id, x, weight, content_x, content_y, total_keywords)

    # Store the combined similarity on the header row.
    mysql.sql_change_msg(
        """update idc_project_check set similarity=%f where dup_id=%d""" % (total_similarity, dup_id))


if __name__ == "__main__":
    # Ask the local duplicate-check service for the candidate projects.
    payload = requests.get("http://127.0.0.1:19099/check/duplicates/%s" % 599).json()
    data_list = [
        (item.get("project_id"), item.get("file_path"), item.get("project_name"))
        for item in payload.get("data")
    ]
    print(data_list)
    # Hard-coded single-project override kept for local debugging runs.
    data_list = [(11, r"D:\ningda\dup_check2\dup_check\0825-丽水系统查重维度1.xlsx", "水路运输综合监管系统建设项目.xls")]
    project_check(data_list)

+ 550
- 0
main1.py_改造qwen 查看文件

@@ -0,0 +1,550 @@
# coding=utf-8
import sys
import re

import baidu
import model_scope
import mysql_pool
from pymysql.converters import escape_string
import cosin_similarity
import pandas as pd
import datetime
import requests
import glm_utils
from threading import Thread


# Chinese dimension label (as it appears in the Excel template) -> short
# pinyin code used as a column name in the database tables.
wdys1 = {
    "项目名称": "xmmc",
    "现状问题": "xzwt",
    "系统基础": "xtjc",
    "项目目标": "xmmb",
    "预期绩效": "yqjx",
    "建设需求": "jsxq",
    "数据需求": "sjxq",
    "安全需求": "aqxq",
    "业务领域": "ywly",
    "核心业务": "hxyw",
    "业务需求": "ywxq",
    "业务协同": "ywxt",
    "建设层级": "jscj",
    "用户范围": "yhfw",
    "目标群体": "mbqt",
    "建设内容": "jsnr",
    "功能模块": "gnmk",
    "数据共享": "sjgx",
    "智能要素": "znys"
}
# Reverse mapping: pinyin dimension code -> Chinese display label.
wdys2 = {
    "xmmc": "项目名称",
    "xzwt": "现状问题",
    "xtjc": "系统基础",
    "xmmb": "项目目标",
    "yqjx": "预期绩效",
    "jsxq": "建设需求",
    "sjxq": "数据需求",
    "aqxq": "安全需求",
    "ywly": "业务领域",
    "hxyw": "核心业务",
    "ywxq": "业务需求",
    "ywxt": "业务协同",
    "jscj": "建设层级",
    "yhfw": "用户范围",
    "mbqt": "目标群体",
    "jsnr": "建设内容",
    "gnmk": "功能模块",
    "sjgx": "数据共享",
    "znys": "智能要素"
}
# Function-module sub-dimension codes -> Chinese labels.
gnmkys = {
    "gnmc": "功能名称",
    "gnms": "功能描述"
}


def getFlag(path="0825.xlsx"):
    """Read dimension weight annotations from an Excel template sheet.

    Scans the second column of *path* for cells shaped like
    ``<dimension label> ... NN%`` and maps each recognised dimension (via
    ``wdys1``) to its weight text.

    Args:
        path: Excel file to read. Defaults to the historical "0825.xlsx"
            so existing zero-argument callers are unaffected.

    Returns:
        dict: pinyin dimension code -> weight text (e.g. ``"50%"``).
    """
    data_dict = {}
    df = pd.read_excel(path)
    data = df.values
    data = list(pd.Series(data[:, 1]).dropna())
    for d in data:
        try:
            wd = re.search("(.*?)(.*?%)", d).group(1).strip()
            wdc = wdys1.get(wd)
            if wdc:
                qz = re.search(".*?((.*?%))", d).group(1)
                data_dict[wdc] = qz
        except Exception:
            # Cell did not match the expected "label ... NN%" pattern; skip it.
            # (Was a bare `except:` which also swallowed KeyboardInterrupt.)
            pass
    return data_dict


def gong_neng_mo_kuai(mysql, dl, data, er_title, str_dict_new):
    """Store each function module of the new project and score it against all
    historical modules via the Qwen LLM.

    mysql        -- pooled MySQL helper
    dl           -- (project_id, excel_path, project_name) of the new project
    data         -- raw cell rows of the project excel sheet
    er_title     -- set of second-level module names found in the sheet
    str_dict_new -- joined dimension texts of the new project (unused here)
    """
    nlp = model_scope.Bert_nlp("corom")  # NOTE(review): instantiated but never used below
    # Concatenate all third-level descriptions of the excel per module name.
    str_dict = {}
    for et in er_title:
        for d in data:
            if d[1] == et:
                if str_dict.get(et):
                    str_dict[et] = str_dict.get(et) + d[3]
                else:
                    str_dict[et] = d[3]

    # One idc_project_module row per module of this project.
    for k, v in str_dict.items():
        mysql.sql_change_msg(
            """insert into idc_project_module (project_id, check_duplicate_count, module_name, module_content, create_time, update_time, tag) value(%d, 1, "%s", "%s", "%s", "%s", "模块")""" % (
                int(dl[0]), k, v, str(datetime.datetime.now())[:-7], str(datetime.datetime.now())[:-7]))

    # Read the rows back to learn their auto-generated ids.
    module_id_list = mysql.sql_select_many(
        """select project_module_id, module_name, module_content from idc_project_module where project_id=%d""" % dl[
            0])
    data_list = []
    for mil in module_id_list:
        data_dict = {}
        data_dict["project_module_id"] = mil.get("project_module_id")
        data_dict["gnmc"] = mil.get("module_name")
        data_dict["gnms"] = mil.get("module_content")
        data_list.append(data_dict)
    for i in data_list:
        # where xmmc = '南浔区信息化项目全生命周期管理系统'
        gnmk_copy1 = mysql.sql_select_many("""select * from user_history_module_data """)
        if gnmk_copy1:
            # (superseded embedding-based variant kept for reference)
            # desc_info_list = []
            # for gc in gnmk_copy1:
            #     if gc.get("xmmc") != dl[2]:
            #         desc_info_list.append(gc.get("gnms"))
            # similarity, s1, s2, idx = nlp.main(i.get("gnms"), desc_info_list)
            # if idx == -1:
            #     continue

            for gc in gnmk_copy1:
                # Ask the LLM to compare the two module descriptions, map the
                # verdict to a label + 0-90 score.
                desc = glm_utils.qwenResult(i.get("gnms"), gc.get("gnms"))
                similarity_result, count = similarity_result_check(desc)
                similarity = count
                mysql.sql_change_msg(
                    """insert into idc_project_module_check (project_module_id, module_name, project_name, company_name, create_time, update_time, similarity_result) value(%d, "%s", "%s", "%s", "%s", "%s", "%s")"""
                    % (
                        i.get("project_module_id"), escape_string(gc.get("gnmc")), escape_string(gc.get("xmmc")), "",
                        str(datetime.datetime.now())[:-7],
                        str(datetime.datetime.now())[:-7], similarity_result))
                dup_module_id = mysql.cur.lastrowid
                # Per-field (name/description) detail rows for this pair.
                check_module_info(mysql, gc, dl, i, dup_module_id, similarity)


def check_module_info(mysql, gc, dl, pro, dup_module_id, score):
    """Write per-field detail rows for one (historical, new) module pair.

    gc            -- historical module row (keys gnmc/gnms/xmmc)
    pro           -- new module dict (project_module_id/gnmc/gnms)
    dup_module_id -- id of the idc_project_module_check row to update
    score         -- precomputed description similarity, reused for gnms
    """
    total_similarity1 = 0
    total_similarity2 = 0
    for j in ["gnmc", "gnms"]:
        # Loop over the module name and the module description.
        content_x = gc.get(j)
        content_y = pro.get(j)
        if content_x and content_y:
            if j == "gnmc":
                # print("功能名称对比")
                # LLM returns a numeric similarity plus an explanation.
                similarity, check_desc = glm_utils.AutoDLResult(f"""请帮我分析以下两段重复语句重复的地方: \n第一段话是:'{content_y}', \n ----------------- \n 第二段话是:'{content_x}'""")
                # # Accumulate the similarity
                if similarity is None:
                    similarity = 0
                print(f"similarity is {similarity}")
                # LLM score is 0-100; detail rows use the 0-1 scale here.
                total_similarity1 += similarity/100
                mysql.sql_change_msg(
                    """insert into idc_project_module_check_detail (dup_module_id, project_name, module_content, dup_module_content, similarity, dimension, create_time, update_time, check_desc) value (%d, "%s", "%s", "%s", %f, "%s", "%s", "%s", "%s")"""
                    % (dup_module_id, dl[2], escape_string(content_y), escape_string(content_x), similarity,
                       "功能名称",
                       str(datetime.datetime.now())[:-7], str(datetime.datetime.now())[:-7], escape_string(check_desc)))
            else:
                # Description: only the explanation comes from the LLM; the
                # similarity is the caller-supplied score.
                check_desc = glm_utils.AutoDLResultNoNum(f"""请帮我分析以下两段重复语句重复的地方: \n第一段话是:'{content_y}', \n ----------------- \n 第二段话是:'{content_x}'""")
                similarity = score
                # Accumulate the gnms similarity
                total_similarity2 += similarity
                module_content = pro.get("gnms")
                dup_module_content = gc.get("gnms")
                mysql.sql_change_msg(
                    """insert into idc_project_module_check_detail (dup_module_id, project_name, module_content, dup_module_content, similarity, dimension, create_time, update_time, check_desc) value (%d, "%s", "%s", "%s", %f, "%s", "%s", "%s", "%s")"""
                    % (dup_module_id, dl[2], escape_string(module_content), escape_string(dup_module_content),
                       similarity,
                       "功能模块描述",
                       str(datetime.datetime.now())[:-7], str(datetime.datetime.now())[:-7],
                       escape_string(check_desc)))

    # Roll the two partial sums up onto the pair row.
    mysql.sql_change_msg("""update idc_project_module_check set similarity=%f where dup_module_id=%d""" % (
        total_similarity1 + total_similarity2, dup_module_id))


def project_check(data_list):
    """Duplicate-check every (project_id, excel_path, project_name) entry.

    The project excel is flattened into one text blob per dimension, stored
    into user_data, then compared against the historical user_history_data
    rows; finally the project status flags are updated and the per-module
    check runs.
    """
    mysql = mysql_pool.ConnMysql()
    # mysql.sql_select_many("""select * from mkgjc""")
    # Row counts of the history tables, reported back on idc_project.
    xmnr_count = len(mysql.sql_select_many("""select * from user_history_data"""))
    gnmk_count = len(mysql.sql_select_many("""select * from user_history_module_data"""))

    nlp = model_scope.Bert_nlp("corom")

    # Iterate over the excel paths of the projects to check.
    for dl in data_list:
        # path = "0825-丽水系统查重维度1.xlsx"
        # Load the excel under this path.
        print(dl,dl[1])
        df = pd.read_excel(dl[1])
        data = df.values
        # Join all dimension texts of the excel into str_dict.
        join_str = ""
        str_dict = {}
        gnmk_str = []
        title = ""
        er_title = set()
        for d in data:
            # (superseded variant kept for reference)
            # if pd.notnull(d[0]):
            #     title = d[0]
            #     if title == "功能模块":
            #         er_title.add(d[1])
            #     join_str = ""
            #     for i in d[1:]:
            #         if pd.notnull(i):
            #             join_str += str(i)
            #     str_dict[wdys1.get(title)] = join_str
            if pd.notnull(d[0]):
                # First column carries the dimension title: start a new blob.
                title = d[0]
                if title == "功能模块":
                    er_title.add(d[1])
                join_str = ""
                for i in d[1:]:
                    if pd.notnull(i):
                        join_str += str(i)
                        if title == "功能模块":
                            if i == '功能描述':
                                continue
                            else:
                                # Collect module cell texts for the gnmk blob.
                                gnmk_str.append(i)
                str_dict[wdys1.get(title)] = join_str
            else:
                # Continuation row: append to the current dimension blob.
                if title == "功能模块":
                    er_title.add(d[1])
                for i in d[1:]:
                    if pd.notnull(i):
                        join_str += str(i)
                str_dict[wdys1.get(title)] = str_dict.get(wdys1.get(title)) + join_str
        # print(str_dict)
        gnmk = ",".join(gnmk_str)
        str_dict['gnmk'] = gnmk
        # NOTE(review): the xmmc column receives the project id (dl[0]), not
        # the project name — confirm intended.
        mysql.sql_change_msg(
            """insert into user_data (xmmc, xzwt, xtjc, xmmb, yqjx, jsxq, sjxq, aqxq, ywly, hxyw, ywxq, ywxt, jscj, yhfw, mbqt, jsnr, gnmk, sjgx, znys) value ("%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s")"""
            % (dl[0], str_dict.get("xzwt") if str_dict.get("xzwt") else None,
               str_dict.get("xtjc") if str_dict.get("xtjc") else None,
               str_dict.get("xmmb") if str_dict.get("xmmb") else None,
               str_dict.get("yqjx") if str_dict.get("yqjx") else None,
               str_dict.get("jsxq") if str_dict.get("jsxq") else None,
               str_dict.get("sjxq") if str_dict.get("sjxq") else None,
               str_dict.get("aqxq") if str_dict.get("aqxq") else None,
               str_dict.get("ywly") if str_dict.get("ywly") else None,
               str_dict.get("hxyw") if str_dict.get("hxyw") else None,
               str_dict.get("ywxq") if str_dict.get("ywxq") else None,
               str_dict.get("ywxt") if str_dict.get("ywxt") else None,
               str_dict.get("jscj") if str_dict.get("jscj") else None,
               str_dict.get("yhfw") if str_dict.get("yhfw") else None,
               str_dict.get("mbqt") if str_dict.get("mbqt") else None,
               str_dict.get("jsnr") if str_dict.get("jsnr") else None,
               str_dict.get("gnmk") if str_dict.get("gnmk") else None,
               str_dict.get("sjgx") if str_dict.get("sjgx") else None,
               str_dict.get("znys") if str_dict.get("znys") else None))
        # Fetch the historical rows to compare against.
        # NOTE(review): the where-clause pins a single hard-coded project —
        # presumably debug leftovers; confirm before production use.
        xmnr_copy1 = mysql.sql_select_many("""select * from user_history_data where xmmc = '富阳未来社区(乡村)一体化数智平台' """)
        # Compare the new project against every historical row.
        if xmnr_copy1:
            # (threaded variants kept for reference)
            # threads = [Thread(target=check_project_info, args=(mysql, dl, xc, str_dict)) for xc in xmnr_copy1]
            # for t in threads:
            #     t.start()
            #
            # for t in threads:
            #     t.join()
            # pro_ths = []
            # for xc in xmnr_copy1:
            #     # check_project_info(mysql, dl, xc, str_dict)
            #     p = Thread(target=check_project_info, args=(mysql, dl, xc, str_dict))
            #     pro_ths.append(p)
            #     p.start()
            # for p in pro_ths:
            #     p.join()
            xmnr_copy1_new = []  # NOTE(review): assigned but never used
            for xc in xmnr_copy1:
                # Skip self-comparison by project name.
                if xc["xmmc"] == str_dict.get("xmmc"):
                    continue
                check_project_info(mysql, dl, xc, str_dict, nlp)

            # TODO: pick the highest-similarity project and analyse it via GLM


        mysql.sql_change_msg(
            """update idc_project set dup_status=3, one_vote_veto_status=1, self_check_status=1, history_project_count=%d ,module_count=%d where project_id=%d""" % (
                xmnr_count, gnmk_count, dl[0]))
        gong_neng_mo_kuai(mysql, dl, data, er_title, str_dict)


def check_project_info(mysql, dl, xc, str_dict, nlp):
    """Compare one historical project *xc* against the new project *str_dict*
    dimension by dimension and persist the per-dimension similarities.

    mysql    -- pooled MySQL helper
    dl       -- (project_id, excel_path, project_name) of the new project
    xc       -- one row of user_history_data (first key is the project name)
    str_dict -- dimension-code -> joined text of the new project
    nlp      -- kept for interface compatibility; unused here

    Weight profile (identical to the three formerly-triplicated branches):
      * both sides lack 功能模块 but both have 建设内容: gnmk=0, jsnr=40,
        remaining present dimensions share 60;
      * both sides lack 建设内容 but both have 功能模块: gnmk=50, jsnr=0,
        remaining present dimensions share 50;
      * otherwise: gnmk=50, jsnr=40, remaining present dimensions share 10.
    """

    def _hist_blank(v):
        # The historical side treats the literal string 'None', None and
        # whitespace-only text as "missing".
        return v == 'None' or v is None or str.strip(v) == ''

    def _new_blank(v):
        # The new-project side only treats None / whitespace-only as
        # "missing" (the literal 'None' string counts as present, exactly
        # as in the original branch predicates).
        return v is None or str.strip(v) == ''

    total_similarity = 0
    dup_count = 0
    # Create the project-level check row first so detail rows can reference it.
    mysql.sql_change_msg(
        """insert into idc_project_check (project_id, dup_project_name, file_path, company_name, create_year, project_tag, project_range_tag, project_area, create_time, update_time) value ("%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s")"""
        % (dl[0], escape_string(xc.get("xmmc")), escape_string(dl[1]), "", "", "需求相似、业务相似", "历史项目", "",
           str(datetime.datetime.now())[:-7], str(datetime.datetime.now())[:-7]))
    dup_id = mysql.cur.lastrowid

    # Count the "ordinary" dimensions (everything except gnmk/jsnr) present
    # on both sides; they share the remaining weight equally.
    for dim in list(xc.keys())[1:]:
        if dim in ('gnmk', 'jsnr'):
            continue
        if xc.get(dim) and str_dict.get(dim):
            dup_count += 1

    # Select the weight profile with the exact original predicates.
    if (_hist_blank(xc['gnmk']) and _new_blank(str_dict['gnmk'])) and (
            not xc['jsnr'] is None and xc['jsnr'] != 'None' and not str_dict['jsnr'] is None and len(str.strip(str_dict['jsnr'])) > 0):
        fixed_weights, rest_weight = {'gnmk': 0, 'jsnr': 40}, 60
    elif (_hist_blank(xc['jsnr']) and _new_blank(str_dict['jsnr'])) and (
            not xc['gnmk'] is None and xc['gnmk'] != 'None' and not str_dict['gnmk'] is None and len(str.strip(str_dict['gnmk'])) > 0):
        fixed_weights, rest_weight = {'gnmk': 50, 'jsnr': 0}, 50
    else:
        fixed_weights, rest_weight = {'gnmk': 50, 'jsnr': 40}, 10

    for dim in list(xc.keys())[1:]:
        content_x = xc.get(dim)
        content_y = str_dict.get(dim)
        if not (content_x and content_y):
            continue
        # Ask the LLM for a comparison verdict, then map it to a 0-90 score.
        # NOTE: the call is made even when the dimension weight is 0, exactly
        # as before (a detail row is still recorded with similarity 0).
        desc = glm_utils.qwenResult(content_y, content_x)
        similarity_result, count = similarity_result_check(desc)
        if dim in fixed_weights:
            similarity = count * fixed_weights[dim]
        else:
            # dup_count > 0 is guaranteed here: this very dimension was
            # counted above, so the division is safe (kept lazy as before).
            similarity = count * (rest_weight / dup_count)
        total_similarity += similarity
        # One detail row per compared dimension.
        mysql.sql_change_msg(
            """insert into idc_project_check_detail (dup_id, dimension, similarity, function_content, dup_function_content, create_time, update_time, similarity_result) value (%d, "%s", %f, "%s", "%s", "%s", "%s", "%s")"""
            % (dup_id, wdys2.get(dim), similarity, escape_string(content_y),
               escape_string(content_x), str(datetime.datetime.now())[:-7],
               str(datetime.datetime.now())[:-7], similarity_result))

    # Roll the sum up onto the project-level check row.
    mysql.sql_change_msg(
        """update idc_project_check set similarity=%f where dup_id=%d""" % (total_similarity, dup_id))


if __name__ == "__main__":
    # NOTE(review): when run as a script, project_check() executes before the
    # module-level `similarity_result_check` (defined BELOW this guard) has
    # been created, which raises NameError at call time — move this guard to
    # the end of the file.
    all_path = requests.get("http://127.0.0.1:19099/check/duplicates/%s" % 599).json()
    # print(all_path)

    # dict1 = {k:v for k, v in sorted(dict.items(), key= lambda item : item[1])}
    # print(dict1)

    data_list = []
    for ap in all_path.get("data"):
        # if os.path.exists(ap.get("file_path")):
        data_list.append((ap.get("project_id"), ap.get("file_path"), ap.get("project_name")))
    print(data_list)
    # data_list = [(11, r"C:\Users\HUAWEI\PycharmProjects\nlp\dup_check\0825-丽水系统查重维度1.xlsx", "水路运输综合监管系统建设项目.xls")]
    # NOTE(review): the fetched list is discarded by this hard-coded override — confirm intended.
    data_list = [(11, r"D:\ningda\dup_check2\dup_check\0825-丽水系统查重维度1.xlsx", "水路运输综合监管系统建设项目.xls")]
    project_check(data_list)


# Map an LLM verdict string to a similarity label and score.
def similarity_result_check(desc):
    """Translate the verdict embedded in *desc* into (label, score).

    The 7th character of *desc* is expected to carry the verdict marker
    (高/中/低).  Returns ("", 0) when *desc* is too short or the marker is
    not recognised.
    """
    verdicts = {
        "高": ("非常相似", 90),
        "中": ("比较相似", 60),
        "低": ("相似度低", 30),
    }
    if len(desc) > 7:
        return verdicts.get(desc[6], ("", 0))
    return "", 0

+ 391
- 0
main10.py 查看文件

@@ -0,0 +1,391 @@
# coding=utf-8
import sys
import re
import mysql_pool
from pymysql.converters import escape_string
import cosin_similarity
import pandas as pd
import datetime
import requests
import os

# Mapping: Chinese dimension title (as found in the excel) -> short column
# code used by the user_data / user_history_data tables.
wdys1 = {
    "项目名称": "xmmc",
    "现状问题": "xzwt",
    "系统基础": "xtjc",
    "项目目标": "xmmb",
    "预期绩效": "yqjx",
    "建设需求": "jsxq",
    "数据需求": "sjxq",
    "安全需求": "aqxq",
    "业务领域": "ywly",
    "核心业务": "hxyw",
    "业务需求": "ywxq",
    "业务协同": "ywxt",
    "建设层级": "jscj",
    "用户范围": "yhfw",
    "目标群体": "mbqt",
    "建设内容": "jsnr",
    "功能模块": "gnmk",
    "数据共享": "sjgx",
    "智能要素": "znys"
}
# Reverse mapping: column code -> Chinese dimension title (manual inverse of
# wdys1; keep the two in sync when adding dimensions).
wdys2 = {
    "xmmc": "项目名称",
    "xzwt": "现状问题",
    "xtjc": "系统基础",
    "xmmb": "项目目标",
    "yqjx": "预期绩效",
    "jsxq": "建设需求",
    "sjxq": "数据需求",
    "aqxq": "安全需求",
    "ywly": "业务领域",
    "hxyw": "核心业务",
    "ywxq": "业务需求",
    "ywxt": "业务协同",
    "jscj": "建设层级",
    "yhfw": "用户范围",
    "mbqt": "目标群体",
    "jsnr": "建设内容",
    "gnmk": "功能模块",
    "sjgx": "数据共享",
    "znys": "智能要素"
}
# Function-module sub-field codes -> Chinese labels.
gnmkys = {
    "gnmc": "功能名称",
    "gnms": "功能描述"
}


def getFlag():
    """Read per-dimension weights from 0825.xlsx.

    Scans the non-null cells of the second column for text shaped like
    "<dimension title> ... <weight>%" and returns a dict mapping the
    dimension code from ``wdys1`` to the weight fragment.  Rows that do not
    match the pattern are skipped.
    """
    data_dict = {}
    df = pd.read_excel("0825.xlsx")
    data = df.values
    # Keep only the non-null cells of the second column.
    data = list(pd.Series(data[:, 1]).dropna())
    for d in data:
        try:
            wd = re.search("(.*?)(.*?%)", d).group(1).strip()
        except (AttributeError, TypeError):
            # No match, or the cell is not a string — skip it (the original
            # bare `except: pass` hid every error; only parse failures are
            # expected here).
            continue
        wdc = wdys1.get(wd)
        if wdc:
            qz_match = re.search(".*?((.*?%))", d)
            if qz_match:
                data_dict[wdc] = qz_match.group(1)
    return data_dict


def gong_neng_mo_kuai(mysql, dl, data, er_title):
    """Store each function module of the new project, score it against all
    historical modules with cosine similarity and record its keywords.

    mysql    -- pooled MySQL helper
    dl       -- (project_id, excel_path, project_name) of the new project
    data     -- raw cell rows of the project excel sheet
    er_title -- set of second-level module names found in the sheet
    """
    # Concatenate all third-level descriptions of the excel per module name.
    str_dict = {}
    for et in er_title:
        for d in data:
            if d[1] == et:
                if str_dict.get(et):
                    str_dict[et] = str_dict.get(et) + d[3]
                else:
                    str_dict[et] = d[3]
    # print(str_dict)
    for k, v in str_dict.items():
        mysql.sql_change_msg(
            """insert into idc_project_module (project_id, check_duplicate_count, module_name, module_content, create_time, update_time, tag) value(%d, 1, '%s', '%s', '%s', '%s', "模块")""" % (
                int(dl[0]), k, v, str(datetime.datetime.now())[:-7], str(datetime.datetime.now())[:-7]))
    # Read the rows back to learn their auto-generated ids.
    module_id_list = mysql.sql_select_many(
        """select project_module_id, module_name, module_content from idc_project_module where project_id=%d""" % dl[
            0])
    data_list = []
    for mil in module_id_list:
        data_dict = {}
        data_dict["project_module_id"] = mil.get("project_module_id")
        data_dict["gnmc"] = mil.get("module_name")
        data_dict["gnms"] = mil.get("module_content")
        data_list.append(data_dict)
    # print(data_list)
    for i in data_list:
        gnmk_copy1 = mysql.sql_select_many("""select * from user_history_module_data""")
        if gnmk_copy1:
            for gc in gnmk_copy1:
                total_similarity1 = 0
                total_keywords1 = []
                total_similarity2 = 0
                total_keywords2 = []
                mysql.sql_change_msg(
                    """insert into idc_project_module_check (project_module_id, module_name, project_name, company_name, create_time, update_time) value(%d, '%s', '%s', '%s', '%s', '%s')"""
                    % (
                        i.get("project_module_id"), gc.get("gnmc"), gc.get("xmmc"), "",
                        str(datetime.datetime.now())[:-7],
                        str(datetime.datetime.now())[:-7]))
                dup_module_id = mysql.cur.lastrowid
                for j in ["gnmc", "gnms"]:
                    # Loop over the module name and the module description.
                    content_x = gc.get(j)
                    content_y = i.get(j)
                    if content_x and content_y:
                        if j == "gnmc":
                            similarity = cosin_similarity.CosineSimilarity(content_x, content_y)
                            # similarity score + keywords
                            similarity, keyword_x, keywords = similarity.main()
                            # Name similarity weight: 1.
                            similarity = similarity * 1
                            total_keywords1 += keywords
                            print("######################相似度: %.2f%%" % similarity, "关键词: %s" % keywords)
                            # Accumulate the similarity.
                            total_similarity1 += similarity
                            mysql.sql_change_msg(
                                """insert into idc_project_module_check_detail (dup_module_id, project_name, module_content, dup_module_content, similarity, dimension, create_time, update_time) value (%d, '%s', '%s', '%s', %f, '%s', '%s', '%s')"""
                                % (dup_module_id, dl[2], escape_string(content_y), escape_string(content_x), similarity,
                                   "功能名称",
                                   str(datetime.datetime.now())[:-7], str(datetime.datetime.now())[:-7]))
                        else:
                            similarity = cosin_similarity.CosineSimilarity(content_x, content_y)
                            # similarity score + keywords
                            similarity, keyword_x, keywords = similarity.main()
                            # Description similarity weight: 99.
                            similarity = similarity * 99
                            total_keywords2 += keywords
                            print("######################相似度: %.2f%%" % similarity, "关键词: %s" % keywords)
                            # Accumulate the similarity.
                            total_similarity2 += similarity
                            mysql.sql_change_msg(
                                """insert into idc_project_module_check_detail (dup_module_id, project_name, module_content, dup_module_content, similarity, dimension, create_time, update_time) value (%d, '%s', '%s', '%s', %f, '%s', '%s', '%s')"""
                                % (dup_module_id, dl[2], escape_string(content_y), escape_string(content_x), similarity,
                                   "功能模块描述",
                                   str(datetime.datetime.now())[:-7], str(datetime.datetime.now())[:-7]))
                mysql.sql_change_msg("""update idc_project_module_check set similarity=%f where dup_module_id=%d""" % (
                    total_similarity1 + total_similarity2, dup_module_id))
        # Self-comparison is used purely to extract the module's own keywords.
        gnmk_gjc = {}
        for a in ["gnmc", "gnms"]:
            if i.get(a):
                content_x = i.get(a)
                content_y = i.get(a)
                if a == "gnmc":
                    similarity = cosin_similarity.CosineSimilarity(content_x, content_y)
                    # similarity score + keywords
                    similarity, keyword_x, keywords = similarity.main()
                    gnmk_gjc[a] = keywords
                else:
                    similarity = cosin_similarity.CosineSimilarity(content_x, content_y)
                    # similarity score + keywords
                    similarity, keyword_x, keywords = similarity.main()
                    gnmk_gjc[a] = keywords
        mysql.sql_change_msg("""insert into user_module_keywords (xmmc, gnmc, gnms) value('%s', '%s', '%s')""" % (
            dl[2], str(gnmk_gjc.get("gnmc"))[1:-1] if gnmk_gjc.get("gnmc") else None,
            str(gnmk_gjc.get("gnms"))[1:-1] if gnmk_gjc.get("gnms") else None))


def project_check(data_list):
    """Duplicate-check every (project_id, excel_path, project_name) entry.

    Flattens the project excel into one text blob per dimension, stores it in
    user_data, scores it against every user_history_data row with the cosine
    similarity model (weights: 功能模块 50, 建设内容 40, remaining present
    dimensions share 10), records project keywords, updates the project status
    and finally runs the per-module check.
    """
    mysql = mysql_pool.ConnMysql()
    # mysql.sql_select_many("""select * from mkgjc""")
    # Read dimensions and weights
    # xmnr_count = len(mysql.sql_select_many("""select * from xmnr_copy1"""))
    # gnmk_count = len(mysql.sql_select_many("""select * from gnmk_copy1"""))
    xmnr_count = len(mysql.sql_select_many("""select * from user_history_data"""))
    gnmk_count = len(mysql.sql_select_many("""select * from user_history_module_data"""))
    get_data_dict = getFlag()  # side effect: reads 0825.xlsx; result currently unused
    # Iterate over the excel paths of the projects to check.
    for dl in data_list:
        # path = "0825-丽水系统查重维度1.xlsx"
        # Load the excel under this path.
        df = pd.read_excel(dl[1])
        data = df.values
        # Join all dimension texts of the excel into str_dict.
        join_str = ""
        str_dict = {}
        title = ""
        er_title = set()
        for d in data:
            if pd.notnull(d[0]):
                # First column carries the dimension title: start a new blob.
                title = d[0]
                if title == "功能模块":
                    er_title.add(d[1])
                join_str = ""
                for i in d[1:]:
                    if pd.notnull(i):
                        # bug fix: excel cells may be numeric; the original
                        # `join_str += i` raised TypeError on non-str cells
                        # (the sibling main1 variant already uses str(i)).
                        join_str += str(i)
                str_dict[wdys1.get(title)] = join_str
            else:
                # Continuation row: append to the current dimension blob.
                if title == "功能模块":
                    er_title.add(d[1])
                for i in d[1:]:
                    if pd.notnull(i):
                        join_str += str(i)  # bug fix, see above
                str_dict[wdys1.get(title)] = str_dict.get(wdys1.get(title)) + join_str
        # print(str_dict)
        # NOTE(review): the xmmc column receives the project id (dl[0]), not
        # the project name — confirm intended.
        mysql.sql_change_msg(
            """insert into user_data (xmmc, xzwt, xtjc, xmmb, yqjx, jsxq, sjxq, aqxq, ywly, hxyw, ywxq, ywxt, jscj, yhfw, mbqt, jsnr, gnmk, sjgx, znys) value ('%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s')"""
            % (dl[0], str_dict.get("xzwt") if str_dict.get("xzwt") else None,
               str_dict.get("xtjc") if str_dict.get("xtjc") else None,
               str_dict.get("xmmb") if str_dict.get("xmmb") else None,
               str_dict.get("yqjx") if str_dict.get("yqjx") else None,
               str_dict.get("jsxq") if str_dict.get("jsxq") else None,
               str_dict.get("sjxq") if str_dict.get("sjxq") else None,
               str_dict.get("aqxq") if str_dict.get("aqxq") else None,
               str_dict.get("ywly") if str_dict.get("ywly") else None,
               str_dict.get("hxyw") if str_dict.get("hxyw") else None,
               str_dict.get("ywxq") if str_dict.get("ywxq") else None,
               str_dict.get("ywxt") if str_dict.get("ywxt") else None,
               str_dict.get("jscj") if str_dict.get("jscj") else None,
               str_dict.get("yhfw") if str_dict.get("yhfw") else None,
               str_dict.get("mbqt") if str_dict.get("mbqt") else None,
               str_dict.get("jsnr") if str_dict.get("jsnr") else None,
               str_dict.get("gnmk") if str_dict.get("gnmk") else None,
               str_dict.get("sjgx") if str_dict.get("sjgx") else None,
               str_dict.get("znys") if str_dict.get("znys") else None))
        # Fetch all historical rows (xmnr_copy1).
        xmnr_copy1 = mysql.sql_select_many("""select * from user_history_data""")
        # Compare the new project against every historical row.
        if xmnr_copy1:
            for xc in xmnr_copy1:
                total_keywords = {}
                total_similarity = 0
                dup_count = 0
                # Project-level check row; detail rows reference its id.
                mysql.sql_change_msg(
                    """insert into idc_project_check (project_id, dup_project_name, file_path, company_name, create_year, project_tag, project_range_tag, project_area, create_time, update_time) value ('%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s')"""
                    % (dl[0], xc.get("xmmc"), dl[1], "", "", "需求相似、业务相似", "历史项目", "",
                       str(datetime.datetime.now())[:-7], str(datetime.datetime.now())[:-7]))
                dup_id = mysql.cur.lastrowid
                # Count "ordinary" dimensions present on both sides; they
                # share the remaining weight equally.
                for x in list(xc.keys())[1:]:
                    content_x = xc.get(x)
                    content_y = str_dict.get(x)
                    if content_x and content_y:
                        if x == 'gnmk':
                            continue
                        elif x == 'jsnr':
                            continue
                        else:
                            dup_count += 1
                for x in list(xc.keys())[1:]:
                    content_x = xc.get(x)
                    content_y = str_dict.get(x)
                    if content_x and content_y:
                        if x == 'gnmk':
                            # 功能模块: weight 50.
                            similarity = cosin_similarity.CosineSimilarity(content_x, content_y)
                            # similarity score + keywords
                            similarity, keywords_x, keywords_y = similarity.main()
                            similarity = similarity * 50
                            print("**************相似度: %.2f%%" % similarity, "关键词: %s" % keywords_y)
                            # Accumulate similarity and keywords.
                            total_similarity += similarity
                            total_keywords[x] = keywords_y

                            # Highlight the matched keywords in both texts.
                            function_content = content_y
                            dup_function_content = content_x
                            for word_y in keywords_y:
                                word_y = word_y.strip().strip("'").strip('"')
                                function_content = str(function_content.replace("\"", "'")).replace(word_y,
                                                                                                    f'<span class="similarity">{word_y.strip()}</span>')
                            for word_x in keywords_x:
                                word_x = word_x.strip().strip("'").strip('"')
                                dup_function_content = str(dup_function_content.replace("\"", "'")).replace(word_x,
                                                                                                            f'<span class="similarity">{word_x.strip()}</span>')
                            # Persist the per-dimension similarity.
                            mysql.sql_change_msg(
                                """insert into idc_project_check_detail (dup_id, dimension, similarity, function_content, dup_function_content, create_time, update_time) value (%d, '%s', %f, '%s', '%s', '%s', '%s')"""
                                % (dup_id, wdys2.get(x), similarity, escape_string(function_content),
                                   escape_string(dup_function_content), str(datetime.datetime.now())[:-7],
                                   str(datetime.datetime.now())[:-7]))
                            # content = content.replace(gjcs, f'<span class="similarity">{gjcs.strip()}</span>')
                        elif x == 'jsnr':
                            # 建设内容: weight 40.
                            similarity = cosin_similarity.CosineSimilarity(content_x, content_y)
                            # similarity score + keywords
                            similarity, keywords_x, keywords_y = similarity.main()
                            similarity = similarity * 40
                            print("**************相似度: %.2f%%" % similarity, "关键词: %s" % keywords_y)
                            # Accumulate similarity and keywords.
                            total_similarity += similarity
                            total_keywords[x] = keywords_y
                            function_content = content_y
                            dup_function_content = content_x
                            for word_y in keywords_y:
                                word_y = word_y.strip().strip("'").strip('"')
                                function_content = str(function_content.replace("\"", "'")).replace(word_y,
                                                                                                    f'<span class="similarity">{word_y.strip()}</span>')
                            for word_x in keywords_x:
                                word_x = word_x.strip().strip("'").strip('"')
                                dup_function_content = str(dup_function_content.replace("\"", "'")).replace(word_x,
                                                                                                            f'<span class="similarity">{word_x.strip()}</span>')
                            # Persist the per-dimension similarity.
                            mysql.sql_change_msg(
                                """insert into idc_project_check_detail (dup_id, dimension, similarity, function_content, dup_function_content, create_time, update_time) value (%d, '%s', %f, '%s', '%s', '%s', '%s')"""
                                % (dup_id, wdys2.get(x), similarity, escape_string(function_content),
                                   escape_string(dup_function_content), str(datetime.datetime.now())[:-7],
                                   str(datetime.datetime.now())[:-7]))
                        else:
                            # Remaining dimensions share a weight of 10.
                            similarity = cosin_similarity.CosineSimilarity(content_x, content_y)
                            # similarity score + keywords
                            similarity, keywords_x, keywords_y = similarity.main()
                            similarity = similarity * (10 / dup_count)
                            print("**************相似度: %.2f%%" % similarity, "关键词: %s" % keywords_y)
                            # Accumulate similarity and keywords.
                            total_similarity += similarity
                            total_keywords[x] = keywords_y
                            function_content = content_y
                            dup_function_content = content_x
                            for word_y in keywords_y:
                                word_y = word_y.strip().strip("'").strip('"')
                                function_content = str(function_content.replace("\"", "'")).replace(word_y,
                                                                                                    f'<span class="similarity">{word_y.strip()}</span>')
                            for word_x in keywords_x:
                                word_x = word_x.strip().strip("'").strip('"')
                                dup_function_content = str(dup_function_content.replace("\"", "'")).replace(word_x,
                                                                                                            f'<span class="similarity">{word_x.strip()}</span>')
                            # Persist the per-dimension similarity.
                            mysql.sql_change_msg(
                                """insert into idc_project_check_detail (dup_id, dimension, similarity, function_content, dup_function_content, create_time, update_time) value (%d, '%s', %f, '%s', '%s', '%s', '%s')"""
                                % (dup_id, wdys2.get(x), similarity, escape_string(function_content),
                                   escape_string(dup_function_content), str(datetime.datetime.now())[:-7],
                                   str(datetime.datetime.now())[:-7]))

                mysql.sql_change_msg(
                    """update idc_project_check set similarity=%f where dup_id=%d""" % (total_similarity, dup_id))
            # Self-comparison per dimension purely to extract the project's
            # own keywords for user_keyword.
            project_gjc = {}
            for w in wdys2.keys():
                content_x = str_dict.get(w)
                content_y = str_dict.get(w)
                if content_x and content_y:
                    similarity = cosin_similarity.CosineSimilarity(content_x, content_y)
                    # similarity score + keywords
                    similarity, keywords_x, keywords = similarity.main()
                    project_gjc[w] = keywords
            mysql.sql_change_msg(
                """insert into user_keyword (xmmc, xzwt, xtjc, xmmb, yqjx, jsxq, sjxq, aqxq, ywly, hxyw, ywxq, ywxt, jscj, yhfw, mbqt, jsnr, gnmk, sjgx, znys) value ('%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s')"""
                % (dl[0], str(project_gjc.get("xzwt"))[1:-1] if project_gjc.get("xzwt") else None,
                   str(project_gjc.get("xtjc"))[1:-1] if project_gjc.get("xtjc") else None,
                   str(project_gjc.get("xmmb"))[1:-1] if project_gjc.get("xmmb") else None,
                   str(project_gjc.get("yqjx"))[1:-1] if project_gjc.get("yqjx") else None,
                   str(project_gjc.get("jsxq"))[1:-1] if project_gjc.get("jsxq") else None,
                   str(project_gjc.get("sjxq"))[1:-1] if project_gjc.get("sjxq") else None,
                   str(project_gjc.get("aqxq"))[1:-1] if project_gjc.get("aqxq") else None,
                   str(project_gjc.get("ywly"))[1:-1] if project_gjc.get("ywly") else None,
                   str(project_gjc.get("hxyw"))[1:-1] if project_gjc.get("hxyw") else None,
                   str(project_gjc.get("ywxq"))[1:-1] if project_gjc.get("ywxq") else None,
                   str(project_gjc.get("ywxt"))[1:-1] if project_gjc.get("ywxt") else None,
                   str(project_gjc.get("jscj"))[1:-1] if project_gjc.get("jscj") else None,
                   str(project_gjc.get("yhfw"))[1:-1] if project_gjc.get("yhfw") else None,
                   str(project_gjc.get("mbqt"))[1:-1] if project_gjc.get("mbqt") else None,
                   str(project_gjc.get("jsnr"))[1:-1] if project_gjc.get("jsnr") else None,
                   str(project_gjc.get("gnmk"))[1:-1] if project_gjc.get("gnmk") else None,
                   str(project_gjc.get("sjgx"))[1:-1] if project_gjc.get("sjgx") else None,
                   str(project_gjc.get("znys"))[1:-1] if project_gjc.get("znys") else None))
        mysql.sql_change_msg(
            """update idc_project set dup_status=3, one_vote_veto_status=1, self_check_status=1, history_project_count=%d ,module_count=%d where project_id=%d""" % (
                xmnr_count, gnmk_count, dl[0]))
        gong_neng_mo_kuai(mysql, dl, data, er_title)


if __name__ == "__main__":
    # Fetch the list of projects to duplicate-check from the local service.
    all_path = requests.get("http://127.0.0.1:19099/check/duplicates/%s" % 320).json()
    print(all_path)

    data_list = []
    for ap in all_path.get("data"):
        # if os.path.exists(ap.get("file_path")):
        data_list.append((ap.get("project_id"), ap.get("file_path"), ap.get("project_name")))
    print(data_list)
    # NOTE(review): the fetched list is discarded by this hard-coded override — confirm intended.
    data_list = [(11, r"D:\ningda\dup_check2\dup_check\0825-丽水系统查重维度1.xlsx", "0216-2")]
    project_check(data_list)

+ 65
- 0
model_scope.py 查看文件

@@ -0,0 +1,65 @@
import os
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks


def replace_newlines(text, new_line=''):
    """Replace every CRLF, CR, and LF line break in *text* with *new_line*.

    CRLF is handled first so a Windows line ending collapses to a single
    replacement rather than two.
    """
    for separator in ("\r\n", "\r", "\n"):
        text = text.replace(separator, new_line)
    return text


# Compute text similarity with BERT-family models served via ModelScope.
class Bert_nlp(object):
    """Thin wrapper around ModelScope pipelines for sentence similarity.

    nlp_type:
        "structbert" -- pairwise sentence-similarity classifier; only compares
                        the query against the FIRST candidate.
        "corom"      -- sentence-embedding model; scores the query against
                        every candidate and picks the best.
    """

    def __init__(self, nlp_type):
        self.nlp_type = nlp_type
        if nlp_type == "structbert":
            model_id = "damo/nlp_structbert_sentence-similarity_chinese-large"
            self.semantic_cls = pipeline(Tasks.sentence_similarity, model_id)
        elif nlp_type == "corom":
            # , sequence_length=1024 /Users/kebobo/.cache/modelscope/hub/damo/nlp_corom_sentence-embedding_chinese-tiny
            model_id = "damo/nlp_corom_sentence-embedding_chinese-tiny"
            self.semantic_cls = pipeline(Tasks.sentence_embedding, model=model_id)

    def main(self, content1, contents):
        """Score *content1* against *contents*.

        Returns (score, content1, best_match_text, best_match_index);
        (0.0, content1, "", -1) when *contents* is empty.
        """
        score = 0.0
        # Fix: idx was never assigned on the "structbert" path (or when
        # nlp_type matched neither branch), so the return below raised
        # NameError. Default to -1 and set it explicitly per branch.
        idx = -1
        if len(contents) == 0:
            return score, content1, "", -1
        if self.nlp_type == "structbert":
            # structbert only scores the query against the first candidate.
            result = self.semantic_cls(input=(content1, contents[0]))
            print(result)
            labels = result["labels"]
            acq = labels.index("1")
            score = result["scores"][acq]
            idx = 0
        elif self.nlp_type == "corom":
            inputs = {
                "source_sentence": [
                    replace_newlines(content1)
                ],
                "sentences_to_compare": contents
            }
            result = self.semantic_cls(input=inputs)
            print(result)

            arr = result["scores"]
            score = max(arr)
            idx = arr.index(score)

        return score, content1, contents[idx], idx




if __name__ == "__main__":
    # Smoke test: the second candidate is an exact match of the query,
    # so it should come back as the best-scoring sentence.
    query = """主要功能为快速进行学生课堂评价及小组评价"""
    candidates = ["""用户通过建设单位账户进入建设单位门户,建设单位门户主要展示本单位项目信息、通知公告与政策文件栏、待办事项栏、本单位进行中项目栏模块。""",
                  """主要功能为快速进行学生课堂评价及小组评价"""
                  ]
    engine = Bert_nlp("corom")
    print(engine.main(query, candidates))


+ 113
- 0
mysql_pool.py 查看文件

@@ -0,0 +1,113 @@
# coding=utf-8
import pymysql
from dbutils.pooled_db import PooledDB
# from dbutils.persistent_db import PersistentDB

# MySQL connection settings shared by the connection pool below.
# SECURITY NOTE(review): credentials are hard-coded in source and committed
# to version control — move them to environment variables or an external
# config file and rotate the password.
mysqlInfo = {
    "host": '47.98.125.47',
    "user": 'root',
    "passwd": 'NingdaKeji123!',
    "db": 'idc',
    "port": 3306,
    "charset": "utf8"
}


class ConnMysql(object):
    """Helper around a shared PyMySQL connection pool.

    Each instance borrows one pooled connection and opens a dict cursor
    on it; call release() when done.
    """

    # Shared pool, lazily created on the first connection request.
    __pool = None

    def __init__(self):
        # Borrow a pooled connection and open a dict-row cursor on it.
        self.coon = ConnMysql._get_mysql_conn()
        self.cur = self.coon.cursor(cursor=pymysql.cursors.DictCursor)

    # Get a connection from the (lazily built) pool.
    @staticmethod
    def _get_mysql_conn():
        # Fix: the original declared `global __pool` and assigned a module
        # global while testing the never-assigned class attribute, so a brand
        # new PooledDB was created for every ConnMysql() and the pool was
        # never reused. Cache the pool on the class instead.
        if ConnMysql.__pool is None:
            ConnMysql.__pool = PooledDB(
                creator=pymysql,
                mincached=1,
                maxcached=5,
                maxconnections=6,
                maxshared=3,
                blocking=True,
                maxusage=None,
                setsession=[],
                ping=2,
                host=mysqlInfo['host'],
                user=mysqlInfo['user'],
                passwd=mysqlInfo['passwd'],
                db=mysqlInfo['db'],
                port=mysqlInfo['port'],
                charset=mysqlInfo['charset'])
        return ConnMysql.__pool.connection()

    # NOTE(review): callers build SQL by %-interpolation, which is
    # SQL-injection-prone — prefer cur.execute(sql, params) where possible.
    def sql_change_msg(self, sql):
        """Execute one INSERT/UPDATE/DELETE, commit, return affected-row count."""
        change_sql = self.cur.execute(sql)
        self.coon.commit()
        return change_sql

    def sql_select_one(self, sql):
        """Execute *sql* and return a single row (dict) or None."""
        self.cur.execute(sql)
        select_res = self.cur.fetchone()
        return select_res

    def sql_select_many(self, sql, count=None):
        """Execute *sql* and return all rows, or at most *count* rows."""
        self.cur.execute(sql)
        if count is None:
            select_res = self.cur.fetchall()
        else:
            select_res = self.cur.fetchmany(count)
        return select_res

    def release(self):
        """Close the cursor, then return the connection to the pool.

        Fix: the original closed the connection before the cursor.
        """
        self.cur.close()
        self.coon.close()


if __name__ == '__main__':
    # Sample output of `show tables;` kept for reference:
    # [{'Tables_in_idc': 'gjc'}, {'Tables_in_idc': 'gjc2'},
    #  {'Tables_in_idc': 'idc_dept'}, {'Tables_in_idc': 'idc_project'},
    #  {'Tables_in_idc': 'idc_project_check'},
    #  {'Tables_in_idc': 'idc_project_check_detail'},
    #  {'Tables_in_idc': 'idc_project_module'},
    #  {'Tables_in_idc': 'idc_project_module_check'},
    #  {'Tables_in_idc': 'idc_project_module_check_detail'},
    #  {'Tables_in_idc': 'idc_user'}, {'Tables_in_idc': 'idc_user_dept'},
    #  {'Tables_in_idc': 'mk2'}]
    # (Fix: the original evaluated that list as a bare, no-op expression.)
    mysql = ConnMysql()
    # Smoke check: count rows in user_history_data.
    print(len(mysql.sql_select_many("""select * from user_history_data""")))

    # Find duplicate module descriptions:
    # select * from user_history_module_data where gnms in
    #   (select gnms from user_history_module_data group by gnms having count(gnms)>1);

+ 1
- 0
nohup python3 flask_server.py >> nohup.info 2>&1 & 查看文件

@@ -0,0 +1 @@
nohup python3 flask_server.py >> nohup.info 2>&1 &

+ 5
- 0
nohup.out 查看文件

@@ -0,0 +1,5 @@
* Serving Flask app 'flask_server' (lazy loading)
* Environment: production
WARNING: This is a development server. Do not use it in a production deployment.
Use a production WSGI server instead.
* Debug mode: off

+ 63
- 0
pro_check_demo.py 查看文件

@@ -0,0 +1,63 @@
import mysql_pool
import heapq
import uuid
import os
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks


def check_demo():
    """One-off similarity sweep over projects whose super_unit contains '农'.

    For every project, embeds its intro with the CoRom sentence-embedding
    model, scores it against every OTHER project's intro, and inserts the
    top-3 matches into test_pro_check under a fresh batch UUID.

    NOTE(review): the INSERT is built by %-interpolation of raw DB text —
    a quote inside any intro breaks the statement (SQL injection risk);
    prefer parameterized queries.
    """
    batch_no = str(uuid.uuid4())
    mysql = mysql_pool.ConnMysql()
    data = mysql.sql_select_many("""select * from test_pro_info_new where super_unit LIKE '%农%'""")
    # All project info (earlier scratch: dict of intro -> name, kept commented out)
    # pro_map = []
    # for ap1 in data:
    #     print(type(ap1.get("base_proj_intro")))
    #     # pro_list.append((ap.get("base_proj_name"), ap.get("base_proj_intro")))
    #     pro_map[ap1.get("base_proj_intro")] = ap1.get("base_proj_name")
    # Load the embedding model once for the whole sweep.
    model_id = "damo/nlp_corom_sentence-embedding_chinese-tiny"
    semantic_cls = pipeline(Tasks.sentence_embedding, model=model_id)

    for pro in data:
        # try:
        # Candidates: every other project's intro, newlines stripped.
        pro_info_list = []
        # print(pro.get("base_area_code")[0:4])
        for ap in data:
            # if ap.get("base_proj_intro") != pro.get("base_proj_intro") and ap.get("base_area_code")[0:4] == pro.get("base_area_code")[0:4]:
            if ap.get("base_proj_intro") != pro.get("base_proj_intro"):
                pro_info_list.append(str(ap.get("base_proj_intro")).replace('\n', ''))
        inputs = {
            "source_sentence": [
                pro.get("base_proj_intro")
            ],
            "sentences_to_compare": pro_info_list
        }
        result = semantic_cls(input=inputs)
        print(result)

        # Top-3 similarity scores; arr.index maps each score back to its candidate.
        arr = result["scores"]
        top_3 = heapq.nlargest(3, arr)

        for ele in top_3:
            idx = arr.index(ele)
            # print(pro_info_list[idx])
            # Look up the matching project row to record its name/area/year.
            for ele1 in data:
                if ele1.get("base_proj_intro") == pro_info_list[idx]:
                    mysql.sql_change_msg(
                        """insert into test_pro_check (pro_name, pro_info, check_pro_name, check_pro_info, batch_no, score, pro_area, check_pro_area, pro_set_year, check_pro_set_year, create_time) value("%s" ,"%s", "%s", "%s", "%s", "%f", "%s", "%s", "%s", "%s", now())""" % (
                        pro.get("base_proj_name"), pro.get("base_proj_intro"), ele1.get("base_proj_name"), pro_info_list[idx], batch_no, ele, pro.get("base_area_name"), ele1.get("base_area_name"), pro.get("base_proj_set_year"), ele1.get("base_proj_set_year")))
                    break

    # except Exception:
    #     mysql.sql_change_msg(
    #         """insert into test_pro_check (pro_name, pro_info, batch_no, score, pro_area, pro_set_year, create_time) value("%s" ,"%s", "%s", "%f", "%s", "%s", now())""" % (
    #         pro.get("base_proj_name"), pro.get("base_proj_intro"), batch_no, 0, pro.get("base_area_name"), pro.get("base_proj_set_year")))


if __name__ == "__main__":
    # Run the one-off similarity sweep (hits the DB and loads the model).
    check_demo()


+ 43
- 0
requirements.txt 查看文件

@@ -0,0 +1,43 @@
certifi==2022.6.15
cffi==1.15.1
chardet==5.0.0
charset-normalizer==2.0.12
click==8.0.4
colorama==0.4.5
cryptography==3.4.7
dataclasses==0.8
DBUtils==3.0.2
et-xmlfile==1.1.0
Flask==1.0.2
idna==3.3
importlib-metadata==4.8.3
itsdangerous==2.0.1
jieba==0.42.1
Jinja2==3.0.3
joblib==1.1.0
MarkupSafe==2.0.1
numpy==1.19.5
openpyxl==3.0.10
pandas==1.1.5
pdfminer.six==20211012
pdfplumber==0.6.0
Pillow==8.4.0
pycparser==2.21
PyMySQL==0.10.1
pypiwin32==223
python-dateutil==2.8.2
pytz==2022.2.1
pywin32==304
requests~=2.31.0
scikit-learn~=1.1.3
scipy==1.5.4
six==1.16.0
threadpoolctl==3.1.0
typing_extensions==4.1.1
urllib3==1.26.12
Wand==0.6.10
Werkzeug==2.0.3
zipp==3.6.0

modelscope~=1.9.2
sklearn~=0.0.post5

+ 1626
- 0
stop_words.utf8
文件差異過大導致無法顯示
查看文件


二進制
temp/丽水市本级信息化项目建设方案模板.doc 查看文件


二進制
temp/丽水市本级信息化项目建设方案模板.docx 查看文件


+ 179
- 0
tongyici_tihuan.txt 查看文件

@@ -0,0 +1,179 @@
人民群众 居民 公众
数据仓 数据仓库 数据高铁 数据集市 数据资源库
浙江省 省 全省 我省
政务云 专有云 信创云 电子政务网络 政务云平台 信创云资源 政法云 浙江省政府数据中心政务云平台
省统建电子健康档案系统 全省电子健康档案数据管理
日浏览量 pv
日均访问量 uv
数据交换 数据共享
互联网+健康医疗 健康大脑+
安全三级测评 等保三级
安全等级保护 等保 网络安全等级保护 信息安全等级化保护 信息安全保护等级 安全保障体系 信息系统定级 等保测评 国家信息安全等级
AED 除颤器 除颤仪
电脑端 pc端 Web端
HIS 医院信息系统
监管驾驶舱 监管系统
GIS 地理信息地图 GIS地理信息技术
维护 技术支撑 维护人员 网络管理员 系统管理员 软件维护人员 系统支撑 运维团队 平台管理人员
一体化智能化数据共享平台 IRS系统 政务大数据统一支撑平台 公共数据交换平台 公共数据中心平台 电子政务基础设施一体化管控平台 一体化数字资源系统
软件 应用 系统 应用软件
接口 api 数据接口 RestfulAPI
前后端 开发人员 开发 技术人员 IT技术人员
异步接口 AJAX JSONP
巡检 巡查
驾驶舱 数字化驾驶舱 数据大屏 数据可视化平台
信创 信息技术应用创新产业
防御性验证技术 用户ID/密码 口令 人脸识别
数据备份工具 pg_dump
日志管理 日志记录
数字化改革 信息化 数字化 浙江省数字政府改革 政府数字化
易扩充 可扩展性 开放性
三单制 动态督考应用 “三单制”履职服务平台
Java消息服务 Java JMS
虚拟专用网络 VPN 专用网络技术
短信认证 短信猫 短信网关
云资源 云服务资源
缓存中间件 reids 缓存数据库
最小磁盘空间需求 MDSR
身份认证 身份鉴别
剩余信息保护 磁盘空闲数据擦除 数据完全擦除
互联互通 共享性 开放性 互联共享
数据加工 数据加工清洗平台
面向服务架构 SOA
集成融合技术 SOI
消息协议 http jms
直接连接存储 DAS
网络连接存储设备 NAS
磁盘阵列技术 RAID
双机容错技术 双机热备份技术 对子双机容错
安全防护软件 安全软件
JAAS Java认证与授权服务
WebService web应用程序分支
食品小作坊数字化监管平台 红盾智坊
政务服务网用户体系 浙里办
浙政钉用户体系 浙政钉
AU 总用户量
DC 每用户每天产生数据增量
YD 每年有效工作日
C 为存储冗余量比例
F 为系统未来3~5年的业务量发展冗余预留,发展系数设以1起算
IP地址伪装技术 NAT
VR 全景VR
离线引擎 Hive odps
在线引擎 sck
时序数据库 Druid、HBase
关系型数据库 PostgreSQL
浙里护苗 未成年人全生命周期专题库
逻辑块地址 LBA
数据块 Block
云防火墙 防火墙的硬件虚拟化技术
云杀毒 云环境下的杀毒策略
国产操作系统 国产信创麒麟操作系统 统信UOS 中标Linux 中标麒麟 麒麟桌面操作系统
项目问题 PPR ProjectProblemReport
项目干系人 Stakeholder
变更请求 CRR ChangeRequestReport 需求变更
软件产品需求基准 Baseline
新需求 NewRequirement
需求取消 CanceledRequirement
CCB ChangeControlBoard
软件问题 SPR SoftwareProblemReport
问题 Issue
软件问题 SPR
Q&A QuestionAndAnswer
质量保证员 QA
揭榜挂帅 组团赛马
B/S Browser/Server 浏览器/服务器
数据管理层 Server
用户界面层 Client
ECMAScript 欧洲计算机协会制定的js规范
JSON JavaScriptObject Notation JS 对象简谱
Hadoop 分布式文件系统 HDFS
网络数据传输方式 TCP UDP HTTP 串口
专有钉钉 专有钉钉开放平台
应用服务器 ApplicationServer
开放字体格式 WOFF
GNSS 表面位移监
MEMS 加速度计
图形工具 ArcGIS
实时数据流处理技术 Storm 连续计算
CDEM 连续-非连 Continuum Discontinuum Element Method
IVR 手机智能群呼
B/S 开发工具 Arcgis servers
BSD 风险许可证
C&C 肉鸡检测
云计算 cloud computing
增量备份 incrementalbackup
log 日志
Web应用防火墙 WAF
入侵检测系统 IDS
国产处理器芯片 RK3399
无故障时间 MTBF
平均修复时间 MTTR
单点登录功能 SSO
入侵检测 IDS
现代浏览器 Chrome Safari FireFox
Service-Oriented-Architecture SOA 面向服务的体系结构
模型-视图-控制器 MVC MODEL-VIEW-CONTROLLER
简体中文 GB2312 UNICODE UTF-7 UFT-8
BIG5 繁体中文
声音计量仪器 GPRS噪声变送器
GPS 定位与导航
可燃气体探测器 报警器
business component name 业务组件名称
EAI 企业应用集成
OGC标准服务 WMS WMS-C WFS WCS
地图瓦片服务 WMTS
表现层 UI
业务逻辑层 BLL
数据访问层 DAL
三层架构 3-tierapplication
跨站脚本漏洞 XSS
发光二极管 LED
电视信号 VCD
录像机 DVD
影碟机 LD
影像 Video
电插锁 阳极锁
阴极锁 电锁口
接入路数 接入带宽
即插即用 UPnP
SNMP 简单网络管理
NTP 网络校时
SADP 自动搜索IP地址
SMTP 邮件服务
NFS NAS网盘
iSCSI IP SAN网盘
PPPoE 拨号上网
移动终端应用 APP 移动客户端
OLT Optical Line Terminal 光纤线路终端
ONU Optical Network Unit 光纤网络单元 光网络单元
光配线网 ODN
数字沙盘 数字地理信息系统
地理信息服务总线 ESB
Geo-ESB Geographic Enterprise Service Bus 地理信息企业服务总线
JMS Java Message Service
搜索服务器 Elasticsearch群集
农家乐 民宿
城市大脑 城市治理
好龙泉 i龙泉模块
网络态势感知 Cyberspace Situation Awareness CSA
消息队列 kafka amq rmq
数据集成服务 ETL
数据抽取 Extract
转换 Transform
加载 Load
数据存储 ODS
数据仓库 DW
数值型数据 度量
角度 维度
指标 KPI
多维数据集 OLAP 数据立方
元数据 Metadata
MTBR 平均无故障率
MBTF 平均无故障时间
软件防护 防病毒软件
硬件防护 硬件防火墙
数据库管理员 SA
水政务协同 整体智治
数字创新,一网智办 政策项目管理与服务平台
龙财通 数字创新场景迭代升级建设项目

+ 81
- 0
user_dict.txt 查看文件

@@ -0,0 +1,81 @@
社会主义 ns
信息化 ns
最多跑一次 ns
零次跑 ns
多跨协同 ns
数字化改革 ns
区块链 ns
大数据 ns
物联网 ns
智能化 ns
数字化 ns
OA ns
子模块 ns
政治面貌 ns
社会背景 ns
职称职务 ns
数字政府 ns
一件事 ns
智慧监管 ns
互联网+ ns
政务云 ns
四横三纵 ns
政务钉钉 ns
数据共享 ns
业务协同 ns
数据协同 ns
钉钉 ns
数据仓 ns
领导驾驶舱 ns
条线 ns
卫健委 ns
政区划码 ns
监管信息 ns
任务调度 ns
IRS ns
JSON ns
API ns
SM3 ns
AED ns
HIS ns
GIS ns
api ns
信创 ns
日志管理 ns
权限管理 ns
角色管理 ns
短信认证 ns
短信猫 ns
短信网关 ns
云资源 ns
MDSR ns
身份认证 ns
SOA ns
DAS ns
SOI ns
http ns
揭榜挂帅 ns
专有钉钉 ns
GNSS ns
MEMS ns
ArcGIS ns
IVR ns
BSD ns
GPS ns
数字沙盘 ns
城市大脑 ns
KPI ns
一体化智能化数据共享平台 ns
三单制 ns
JAAS ns
浙里办 ns
浙政钉 ns
VR ns
浙里护苗 ns
CCB ns
B/S ns
EAI ns
ESB ns
Web ns
一张图 ns
nan ns

Loading…
取消
儲存