丽水查重代码
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

438 lines
23KB

  1. # coding=utf-8
  2. import sys
  3. import re
  4. import mysql_pool
  5. from pymysql.converters import escape_string
  6. import model_scope
  7. import pandas as pd
  8. import datetime
  9. import requests
  10. # 通过corom算法进行文本向量化对比相识度
  11. wdys1 = {
  12. "项目名称": "xmmc",
  13. "现状问题": "xzwt",
  14. "系统基础": "xtjc",
  15. "项目目标": "xmmb",
  16. "预期绩效": "yqjx",
  17. "建设需求": "jsxq",
  18. "数据需求": "sjxq",
  19. "安全需求": "aqxq",
  20. "业务领域": "ywly",
  21. "核心业务": "hxyw",
  22. "业务需求": "ywxq",
  23. "业务协同": "ywxt",
  24. "建设层级": "jscj",
  25. "用户范围": "yhfw",
  26. "目标群体": "mbqt",
  27. "建设内容": "jsnr",
  28. "功能模块": "gnmk",
  29. "数据共享": "sjgx",
  30. "智能要素": "znys"
  31. }
  32. wdys2 = {
  33. "xmmc": "项目名称",
  34. "xzwt": "现状问题",
  35. "xtjc": "系统基础",
  36. "xmmb": "项目目标",
  37. "yqjx": "预期绩效",
  38. "jsxq": "建设需求",
  39. "sjxq": "数据需求",
  40. "aqxq": "安全需求",
  41. "ywly": "业务领域",
  42. "hxyw": "核心业务",
  43. "ywxq": "业务需求",
  44. "ywxt": "业务协同",
  45. "jscj": "建设层级",
  46. "yhfw": "用户范围",
  47. "mbqt": "目标群体",
  48. "jsnr": "建设内容",
  49. "gnmk": "功能模块",
  50. "sjgx": "数据共享",
  51. "znys": "智能要素"
  52. }
  53. gnmkys = {
  54. "gnmc": "功能名称",
  55. "gnms": "功能描述"
  56. }
  57. def getFlag():
  58. data_dict = {}
  59. df = pd.read_excel("0825.xlsx")
  60. data = df.values
  61. data = list(pd.Series(data[:, 1]).dropna())
  62. for d in data:
  63. try:
  64. wd = re.search("(.*?)(.*?%)", d).group(1).strip()
  65. wdc = wdys1.get(wd)
  66. if wdc:
  67. qz = re.search(".*?((.*?%))", d).group(1)
  68. data_dict[wdc] = qz
  69. except:
  70. pass
  71. return data_dict
  72. def gong_neng_mo_kuai(mysql, dl, data, er_title, similarity_nlp):
  73. # 将excel文件中的所有第三维度内容进行拼接
  74. str_dict = {}
  75. for et in er_title:
  76. for d in data:
  77. if d[1] == et:
  78. if str_dict.get(et):
  79. str_dict[et] = str_dict.get(et) + d[3]
  80. else:
  81. str_dict[et] = d[3]
  82. for k, v in str_dict.items():
  83. mysql.sql_change_msg(
  84. """insert into idc_project_module (project_id, check_duplicate_count, module_name, module_content, create_time, update_time, tag) value(%d, 1, "%s", "%s", "%s", "%s", "模块")""" % (
  85. int(dl[0]), k, v, str(datetime.datetime.now())[:-7], str(datetime.datetime.now())[:-7]))
  86. module_id_list = mysql.sql_select_many(
  87. """select project_module_id, module_name, module_content from idc_project_module where project_id=%d""" % dl[
  88. 0])
  89. data_list = []
  90. for mil in module_id_list:
  91. data_dict = {}
  92. data_dict["project_module_id"] = mil.get("project_module_id")
  93. data_dict["gnmc"] = mil.get("module_name")
  94. data_dict["gnms"] = mil.get("module_content")
  95. data_list.append(data_dict)
  96. # print(data_list)
  97. for i in data_list:
  98. gnmk_copy1 = mysql.sql_select_many("""select * from user_history_module_data WHERE xmmc = '丽水市城市管理指挥中心信息系统(一期)项目' """)
  99. if gnmk_copy1:
  100. for gc in gnmk_copy1:
  101. print(
  102. """insert into idc_project_module_check (project_module_id, module_name, project_name, company_name, create_time, update_time) value(%d, "%s", "%s", "%s", "%s", "%s")"""
  103. % (
  104. i.get("project_module_id"), escape_string(gc.get("gnmc")), escape_string(gc.get("xmmc")), "",
  105. str(datetime.datetime.now())[:-7],
  106. str(datetime.datetime.now())[:-7]))
  107. mysql.sql_change_msg(
  108. """insert into idc_project_module_check (project_module_id, module_name, project_name, company_name, create_time, update_time) value(%d, "%s", "%s", "%s", "%s", "%s")"""
  109. % (
  110. i.get("project_module_id"), escape_string(gc.get("gnmc")), escape_string(gc.get("xmmc")), "",
  111. str(datetime.datetime.now())[:-7],
  112. str(datetime.datetime.now())[:-7]))
  113. dup_module_id = mysql.cur.lastrowid
  114. check_module_info(mysql, gc, dl, i, dup_module_id, similarity_nlp)
  115. def check_module_info(mysql, gc, dl, pro, dup_module_id, similarity_nlp):
  116. total_similarity1 = 0
  117. total_keywords1 = []
  118. total_similarity2 = 0
  119. total_keywords2 = []
  120. for j in ["gnmc", "gnms"]:
  121. # 循环遍历每一个模块名称
  122. content_x = gc.get(j)
  123. content_y = pro.get(j)
  124. if content_x and content_y:
  125. if j == "gnmc":
  126. print("功能名称 暂时不计算")
  127. # 相似度
  128. # similarity = similarity_nlp.main(content_x, content_y)
  129. # similarity = similarity * 1
  130. # # 相似度相加
  131. # total_similarity1 += similarity
  132. # mysql.sql_change_msg(
  133. # """insert into idc_project_module_check_detail (dup_module_id, project_name, module_content, dup_module_content, similarity, dimension, create_time, update_time) value (%d, "%s", "%s", "%s", %f, "%s", "%s", "%s")"""
  134. # % (dup_module_id, dl[2], escape_string(content_y), escape_string(content_x), similarity,
  135. # "功能名称",
  136. # str(datetime.datetime.now())[:-7], str(datetime.datetime.now())[:-7]))
  137. else:
  138. # 相似度
  139. similarity = similarity_nlp.main(content_x, content_y)
  140. similarity = similarity * 99
  141. # 相似度相加
  142. total_similarity2 += similarity
  143. mysql.sql_change_msg(
  144. """insert into idc_project_module_check_detail (dup_module_id, project_name, module_content, dup_module_content, similarity, dimension, create_time, update_time) value (%d, "%s", "%s", "%s", %f, "%s", "%s", "%s")"""
  145. % (dup_module_id, dl[2], escape_string(content_y), escape_string(content_x), similarity,
  146. "功能模块描述",
  147. str(datetime.datetime.now())[:-7], str(datetime.datetime.now())[:-7]))
  148. mysql.sql_change_msg("""update idc_project_module_check set similarity=%f where dup_module_id=%d""" % (
  149. total_similarity1 + total_similarity2, dup_module_id))
  150. def project_check(data_list):
  151. similarity_nlp = model_scope.Bert_nlp("structbert")
  152. mysql = mysql_pool.ConnMysql()
  153. # 读取维度和权重
  154. xmnr_count = len(mysql.sql_select_many("""select * from user_history_data"""))
  155. gnmk_count = len(mysql.sql_select_many("""select * from user_history_module_data"""))
  156. # 遍历excel存储路径
  157. for dl in data_list:
  158. # 读取路径下的excel
  159. print(dl,dl[1])
  160. df = pd.read_excel(dl[1])
  161. data = df.values
  162. # 将excel文件中的所有维度内容进行拼接
  163. join_str = ""
  164. str_dict = {}
  165. gnmk_str = []
  166. title = ""
  167. er_title = set()
  168. for d in data:
  169. if pd.notnull(d[0]):
  170. title = d[0]
  171. if title == "功能模块":
  172. er_title.add(d[1])
  173. join_str = ""
  174. for i in d[1:]:
  175. if pd.notnull(i):
  176. join_str += str(i)
  177. if title == "功能模块":
  178. if i == '功能描述':
  179. continue
  180. else:
  181. gnmk_str.append(i)
  182. str_dict[wdys1.get(title)] = join_str
  183. else:
  184. if title == "功能模块":
  185. er_title.add(d[1])
  186. for i in d[1:]:
  187. if pd.notnull(i):
  188. join_str += str(i)
  189. str_dict[wdys1.get(title)] = str_dict.get(wdys1.get(title)) + join_str
  190. # print(str_dict)
  191. gnmk = ",".join(gnmk_str)
  192. str_dict['gnmk'] = gnmk
  193. mysql.sql_change_msg(
  194. """insert into user_data (xmmc, xzwt, xtjc, xmmb, yqjx, jsxq, sjxq, aqxq, ywly, hxyw, ywxq, ywxt, jscj, yhfw, mbqt, jsnr, gnmk, sjgx, znys) value ("%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s")"""
  195. % (dl[0], str_dict.get("xzwt") if str_dict.get("xzwt") else None,
  196. str_dict.get("xtjc") if str_dict.get("xtjc") else None,
  197. str_dict.get("xmmb") if str_dict.get("xmmb") else None,
  198. str_dict.get("yqjx") if str_dict.get("yqjx") else None,
  199. str_dict.get("jsxq") if str_dict.get("jsxq") else None,
  200. str_dict.get("sjxq") if str_dict.get("sjxq") else None,
  201. str_dict.get("aqxq") if str_dict.get("aqxq") else None,
  202. str_dict.get("ywly") if str_dict.get("ywly") else None,
  203. str_dict.get("hxyw") if str_dict.get("hxyw") else None,
  204. str_dict.get("ywxq") if str_dict.get("ywxq") else None,
  205. str_dict.get("ywxt") if str_dict.get("ywxt") else None,
  206. str_dict.get("jscj") if str_dict.get("jscj") else None,
  207. str_dict.get("yhfw") if str_dict.get("yhfw") else None,
  208. str_dict.get("mbqt") if str_dict.get("mbqt") else None,
  209. str_dict.get("jsnr") if str_dict.get("jsnr") else None,
  210. str_dict.get("gnmk") if str_dict.get("gnmk") else None,
  211. str_dict.get("sjgx") if str_dict.get("sjgx") else None,
  212. str_dict.get("znys") if str_dict.get("znys") else None))
  213. # 或取所有的xmnr_copy1
  214. xmnr_copy1 = mysql.sql_select_many("""select * from user_history_data WHERE xmmc = '丽水市城市管理指挥中心信息系统(一期)项目' """)
  215. # 对比xmnr_copy1和xmnr维度是否都有
  216. if xmnr_copy1:
  217. # threads = [Thread(target=check_project_info, args=(mysql, dl, xc, str_dict)) for xc in xmnr_copy1]
  218. # for t in threads:
  219. # t.start()
  220. #
  221. # for t in threads:
  222. # t.join()
  223. for xc in xmnr_copy1:
  224. check_project_info(mysql, dl, xc, str_dict, similarity_nlp)
  225. mysql.sql_change_msg(
  226. """update idc_project set dup_status=3, one_vote_veto_status=1, self_check_status=1, history_project_count=%d ,module_count=%d where project_id=%d""" % (
  227. xmnr_count, gnmk_count, dl[0]))
  228. gong_neng_mo_kuai(mysql, dl, data, er_title, similarity_nlp)
  229. # 释放数据库资源
  230. mysql.release()
  231. def check_project_info(mysql, dl, xc, str_dict, similarity_nlp):
  232. total_keywords = {}
  233. total_similarity = 0
  234. dup_count = 0
  235. # 保存相加后的相似度到idc_project_check
  236. print(f'xmmc is {xc.get("xmmc")}')
  237. mysql.sql_change_msg(
  238. """insert into idc_project_check (project_id, dup_project_name, file_path, company_name, create_year, project_tag, project_range_tag, project_area, create_time, update_time) value ("%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s")"""
  239. % (dl[0], escape_string(xc.get("xmmc")), escape_string(dl[1]), "", "", "需求相似、业务相似", "历史项目", "",
  240. str(datetime.datetime.now())[:-7], str(datetime.datetime.now())[:-7]))
  241. dup_id = mysql.cur.lastrowid
  242. for x in list(xc.keys())[1:]:
  243. content_x = xc.get(x)
  244. content_y = str_dict.get(x)
  245. if content_x and content_y:
  246. if x == 'gnmk':
  247. continue
  248. elif x == 'jsnr':
  249. continue
  250. else:
  251. dup_count += 1
  252. if ((xc['gnmk'] == 'None' or xc['gnmk'] is None or str.strip(xc['gnmk']) == '') and (str_dict['gnmk'] is None or str.strip(str_dict['gnmk']) == '')) and (
  253. not xc['jsnr'] is None and xc['jsnr'] != 'None' and not str_dict['jsnr'] is None and len(str.strip(str_dict['jsnr'])) > 0):
  254. for x in list(xc.keys())[1:]:
  255. content_x = xc.get(x)
  256. content_y = str_dict.get(x)
  257. if content_x and content_y:
  258. if x == 'gnmk':
  259. # 循环遍历每一个维度
  260. # 相似度
  261. similarity= similarity_nlp.main(content_x, content_y)
  262. similarity = similarity * 0
  263. # print("**************相似度: %.2f%%" % similarity, "关键词: %s" % keywords_y)
  264. # 相似度相加
  265. total_similarity += similarity
  266. function_content = content_y
  267. dup_function_content = content_x
  268. # 保存每个维度对应的相似度到idc_project_check_detail
  269. mysql.sql_change_msg(
  270. """insert into idc_project_check_detail (dup_id, dimension, similarity, function_content, dup_function_content, create_time, update_time) value (%d, "%s", %f, "%s", "%s", "%s", "%s")"""
  271. % (dup_id, wdys2.get(x), similarity, escape_string(function_content),
  272. escape_string(dup_function_content), str(datetime.datetime.now())[:-7],
  273. str(datetime.datetime.now())[:-7]))
  274. # content = content.replace(gjcs, f'<span class="similarity">{gjcs.strip()}</span>')
  275. elif x == 'jsnr':
  276. # 相似度 关键词
  277. similarity = similarity_nlp.main(content_x, content_y)
  278. similarity = similarity * 40
  279. # print("**************相似度: %.2f%%" % similarity, "关键词: %s" % keywords_y)
  280. # 相似度相加
  281. total_similarity += similarity
  282. # 关键词收集
  283. function_content = content_y
  284. dup_function_content = content_x
  285. # 保存每个维度对应的相似度到idc_project_check_detail
  286. mysql.sql_change_msg(
  287. """insert into idc_project_check_detail (dup_id, dimension, similarity, function_content, dup_function_content, create_time, update_time) value (%d, "%s", %f, "%s", "%s", "%s", "%s")"""
  288. % (dup_id, wdys2.get(x), similarity, escape_string(function_content),
  289. escape_string(dup_function_content), str(datetime.datetime.now())[:-7],
  290. str(datetime.datetime.now())[:-7]))
  291. else:
  292. # 相似度 关键词
  293. similarity = similarity_nlp.main(content_x, content_y)
  294. similarity = similarity * (60 / dup_count)
  295. # 相似度相加
  296. total_similarity += similarity
  297. function_content = content_y
  298. dup_function_content = content_x
  299. # 保存每个维度对应的相似度到idc_project_check_detail
  300. mysql.sql_change_msg(
  301. """insert into idc_project_check_detail (dup_id, dimension, similarity, function_content, dup_function_content, create_time, update_time) value (%d, "%s", %f, "%s", "%s", "%s", "%s")"""
  302. % (dup_id, wdys2.get(x), similarity, escape_string(function_content),
  303. escape_string(dup_function_content), str(datetime.datetime.now())[:-7],
  304. str(datetime.datetime.now())[:-7]))
  305. elif ((xc['jsnr'] == 'None' or xc['jsnr'] is None or str.strip(xc['jsnr']) == '') and (str_dict['jsnr'] is None or str.strip(str_dict['jsnr']) == '')) and (
  306. not xc['gnmk'] is None and xc['gnmk'] != 'None' and not str_dict['gnmk'] is None and len(str.strip(str_dict['gnmk'])) > 0):
  307. for x in list(xc.keys())[1:]:
  308. content_x = xc.get(x)
  309. content_y = str_dict.get(x)
  310. if content_x and content_y:
  311. if x == 'gnmk':
  312. # 相似度 关键词
  313. similarity = similarity_nlp.main(content_x, content_y)
  314. similarity = similarity * 50
  315. # 相似度相加
  316. total_similarity += similarity
  317. function_content = content_y
  318. dup_function_content = content_x
  319. # 保存每个维度对应的相似度到idc_project_check_detail
  320. mysql.sql_change_msg(
  321. """insert into idc_project_check_detail (dup_id, dimension, similarity, function_content, dup_function_content, create_time, update_time) value (%d, "%s", %f, "%s", "%s", "%s", "%s")"""
  322. % (dup_id, wdys2.get(x), similarity, escape_string(function_content),
  323. escape_string(dup_function_content), str(datetime.datetime.now())[:-7],
  324. str(datetime.datetime.now())[:-7]))
  325. elif x == 'jsnr':
  326. # 相似度
  327. similarity = similarity_nlp.main(content_x, content_y)
  328. similarity = similarity * 0
  329. # 相似度相加
  330. total_similarity += similarity
  331. function_content = content_y
  332. dup_function_content = content_x
  333. # 保存每个维度对应的相似度到idc_project_check_detail
  334. mysql.sql_change_msg(
  335. """insert into idc_project_check_detail (dup_id, dimension, similarity, function_content, dup_function_content, create_time, update_time) value (%d, "%s", %f, "%s", "%s", "%s", "%s")"""
  336. % (dup_id, wdys2.get(x), similarity, escape_string(function_content),
  337. escape_string(dup_function_content), str(datetime.datetime.now())[:-7],
  338. str(datetime.datetime.now())[:-7]))
  339. else:
  340. # 相似度 关键词
  341. similarity = similarity_nlp.main(content_x, content_y)
  342. similarity = similarity * (50 / dup_count)
  343. # 相似度相加
  344. total_similarity += similarity
  345. function_content = content_y
  346. dup_function_content = content_x
  347. # 保存每个维度对应的相似度到idc_project_check_detail
  348. mysql.sql_change_msg(
  349. """insert into idc_project_check_detail (dup_id, dimension, similarity, function_content, dup_function_content, create_time, update_time) value (%d, "%s", %f, "%s", "%s", "%s", "%s")"""
  350. % (dup_id, wdys2.get(x), similarity, escape_string(function_content),
  351. escape_string(dup_function_content), str(datetime.datetime.now())[:-7],
  352. str(datetime.datetime.now())[:-7]))
  353. else:
  354. for x in list(xc.keys())[1:]:
  355. content_x = xc.get(x)
  356. content_y = str_dict.get(x)
  357. if content_x and content_y:
  358. if x == 'gnmk':
  359. # 相似度
  360. similarity = similarity_nlp.main(content_x, content_y)
  361. similarity = similarity * 50
  362. # 相似度相加
  363. total_similarity += similarity
  364. function_content = content_y
  365. dup_function_content = content_x
  366. # 保存每个维度对应的相似度到idc_project_check_detail
  367. mysql.sql_change_msg(
  368. """insert into idc_project_check_detail (dup_id, dimension, similarity, function_content, dup_function_content, create_time, update_time) value (%d, "%s", %f, "%s", "%s", "%s", "%s")"""
  369. % (dup_id, wdys2.get(x), similarity, escape_string(function_content),
  370. escape_string(dup_function_content), str(datetime.datetime.now())[:-7],
  371. str(datetime.datetime.now())[:-7]))
  372. # content = content.replace(gjcs, f'<span class="similarity">{gjcs.strip()}</span>')
  373. elif x == 'jsnr':
  374. # 相似度
  375. similarity = similarity_nlp.main(content_x, content_y)
  376. similarity = similarity * 40
  377. # 相似度相加
  378. total_similarity += similarity
  379. function_content = content_y
  380. dup_function_content = content_x
  381. # 保存每个维度对应的相似度到idc_project_check_detail
  382. mysql.sql_change_msg(
  383. """insert into idc_project_check_detail (dup_id, dimension, similarity, function_content, dup_function_content, create_time, update_time) value (%d, "%s", %f, "%s", "%s", "%s", "%s")"""
  384. % (dup_id, wdys2.get(x), similarity, escape_string(function_content),
  385. escape_string(dup_function_content), str(datetime.datetime.now())[:-7],
  386. str(datetime.datetime.now())[:-7]))
  387. else:
  388. # 相似度 关键词
  389. similarity = similarity_nlp.main(content_x, content_y)
  390. similarity = similarity * (10 / dup_count)
  391. # 相似度相加
  392. total_similarity += similarity
  393. function_content = content_y
  394. dup_function_content = content_x
  395. # 保存每个维度对应的相似度到idc_project_check_detail
  396. mysql.sql_change_msg(
  397. """insert into idc_project_check_detail (dup_id, dimension, similarity, function_content, dup_function_content, create_time, update_time) value (%d, "%s", %f, "%s", "%s", "%s", "%s")"""
  398. % (dup_id, wdys2.get(x), similarity, escape_string(function_content),
  399. escape_string(dup_function_content), str(datetime.datetime.now())[:-7],
  400. str(datetime.datetime.now())[:-7]))
  401. mysql.sql_change_msg(
  402. """update idc_project_check set similarity=%f where dup_id=%d""" % (total_similarity, dup_id))
  403. if __name__ == "__main__":
  404. all_path = requests.get("http://127.0.0.1:19099/check/duplicates/%s" % 599).json()
  405. # print(all_path)
  406. # dict1 = {k:v for k, v in sorted(dict.items(), key= lambda item : item[1])}
  407. # print(dict1)
  408. data_list = []
  409. for ap in all_path.get("data"):
  410. # if os.path.exists(ap.get("file_path")):
  411. data_list.append((ap.get("project_id"), ap.get("file_path"), ap.get("project_name")))
  412. print(data_list)
  413. # data_list = [(11, r"C:\Users\HUAWEI\PycharmProjects\nlp\dup_check\0825-丽水系统查重维度1.xlsx", "水路运输综合监管系统建设项目.xls")]
  414. data_list = [(11, r"D:\ningda\dup_check2\dup_check\0825-丽水系统查重维度1.xlsx", "水路运输综合监管系统建设项目.xls")]
  415. project_check(data_list)