丽水查重代码
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

550 lines
28KB

  1. # coding=utf-8
  2. import sys
  3. import re
  4. import baidu
  5. import model_scope
  6. import mysql_pool
  7. from pymysql.converters import escape_string
  8. import cosin_similarity
  9. import pandas as pd
  10. import datetime
  11. import requests
  12. import glm_utils
  13. from threading import Thread
  14. wdys1 = {
  15. "项目名称": "xmmc",
  16. "现状问题": "xzwt",
  17. "系统基础": "xtjc",
  18. "项目目标": "xmmb",
  19. "预期绩效": "yqjx",
  20. "建设需求": "jsxq",
  21. "数据需求": "sjxq",
  22. "安全需求": "aqxq",
  23. "业务领域": "ywly",
  24. "核心业务": "hxyw",
  25. "业务需求": "ywxq",
  26. "业务协同": "ywxt",
  27. "建设层级": "jscj",
  28. "用户范围": "yhfw",
  29. "目标群体": "mbqt",
  30. "建设内容": "jsnr",
  31. "功能模块": "gnmk",
  32. "数据共享": "sjgx",
  33. "智能要素": "znys"
  34. }
  35. wdys2 = {
  36. "xmmc": "项目名称",
  37. "xzwt": "现状问题",
  38. "xtjc": "系统基础",
  39. "xmmb": "项目目标",
  40. "yqjx": "预期绩效",
  41. "jsxq": "建设需求",
  42. "sjxq": "数据需求",
  43. "aqxq": "安全需求",
  44. "ywly": "业务领域",
  45. "hxyw": "核心业务",
  46. "ywxq": "业务需求",
  47. "ywxt": "业务协同",
  48. "jscj": "建设层级",
  49. "yhfw": "用户范围",
  50. "mbqt": "目标群体",
  51. "jsnr": "建设内容",
  52. "gnmk": "功能模块",
  53. "sjgx": "数据共享",
  54. "znys": "智能要素"
  55. }
  56. gnmkys = {
  57. "gnmc": "功能名称",
  58. "gnms": "功能描述"
  59. }
  60. def getFlag():
  61. data_dict = {}
  62. df = pd.read_excel("0825.xlsx")
  63. data = df.values
  64. data = list(pd.Series(data[:, 1]).dropna())
  65. for d in data:
  66. try:
  67. wd = re.search("(.*?)(.*?%)", d).group(1).strip()
  68. wdc = wdys1.get(wd)
  69. if wdc:
  70. qz = re.search(".*?((.*?%))", d).group(1)
  71. data_dict[wdc] = qz
  72. except:
  73. pass
  74. return data_dict
  75. def gong_neng_mo_kuai(mysql, dl, data, er_title, str_dict_new):
  76. nlp = model_scope.Bert_nlp("corom")
  77. # 将excel文件中的所有第三维度内容进行拼接
  78. str_dict = {}
  79. for et in er_title:
  80. for d in data:
  81. if d[1] == et:
  82. if str_dict.get(et):
  83. str_dict[et] = str_dict.get(et) + d[3]
  84. else:
  85. str_dict[et] = d[3]
  86. for k, v in str_dict.items():
  87. mysql.sql_change_msg(
  88. """insert into idc_project_module (project_id, check_duplicate_count, module_name, module_content, create_time, update_time, tag) value(%d, 1, "%s", "%s", "%s", "%s", "模块")""" % (
  89. int(dl[0]), k, v, str(datetime.datetime.now())[:-7], str(datetime.datetime.now())[:-7]))
  90. module_id_list = mysql.sql_select_many(
  91. """select project_module_id, module_name, module_content from idc_project_module where project_id=%d""" % dl[
  92. 0])
  93. data_list = []
  94. for mil in module_id_list:
  95. data_dict = {}
  96. data_dict["project_module_id"] = mil.get("project_module_id")
  97. data_dict["gnmc"] = mil.get("module_name")
  98. data_dict["gnms"] = mil.get("module_content")
  99. data_list.append(data_dict)
  100. for i in data_list:
  101. # where xmmc = '南浔区信息化项目全生命周期管理系统'
  102. gnmk_copy1 = mysql.sql_select_many("""select * from user_history_module_data """)
  103. if gnmk_copy1:
  104. # desc_info_list = []
  105. # for gc in gnmk_copy1:
  106. # if gc.get("xmmc") != dl[2]:
  107. # desc_info_list.append(gc.get("gnms"))
  108. # similarity, s1, s2, idx = nlp.main(i.get("gnms"), desc_info_list)
  109. # if idx == -1:
  110. # continue
  111. for gc in gnmk_copy1:
  112. desc = glm_utils.qwenResult(i.get("gnms"), gc.get("gnms"))
  113. similarity_result, count = similarity_result_check(desc)
  114. similarity = count
  115. mysql.sql_change_msg(
  116. """insert into idc_project_module_check (project_module_id, module_name, project_name, company_name, create_time, update_time, similarity_result) value(%d, "%s", "%s", "%s", "%s", "%s", "%s")"""
  117. % (
  118. i.get("project_module_id"), escape_string(gc.get("gnmc")), escape_string(gc.get("xmmc")), "",
  119. str(datetime.datetime.now())[:-7],
  120. str(datetime.datetime.now())[:-7], similarity_result))
  121. dup_module_id = mysql.cur.lastrowid
  122. check_module_info(mysql, gc, dl, i, dup_module_id, similarity)
  123. def check_module_info(mysql, gc, dl, pro, dup_module_id, score):
  124. total_similarity1 = 0
  125. total_similarity2 = 0
  126. for j in ["gnmc", "gnms"]:
  127. # 循环遍历每一个模块名称
  128. content_x = gc.get(j)
  129. content_y = pro.get(j)
  130. if content_x and content_y:
  131. if j == "gnmc":
  132. # print("功能名称对比")
  133. similarity, check_desc = glm_utils.AutoDLResult(f"""请帮我分析以下两段重复语句重复的地方: \n第一段话是:'{content_y}', \n ----------------- \n 第二段话是:'{content_x}'""")
  134. # # 相似度相加
  135. if similarity is None:
  136. similarity = 0
  137. print(f"similarity is {similarity}")
  138. total_similarity1 += similarity/100
  139. mysql.sql_change_msg(
  140. """insert into idc_project_module_check_detail (dup_module_id, project_name, module_content, dup_module_content, similarity, dimension, create_time, update_time, check_desc) value (%d, "%s", "%s", "%s", %f, "%s", "%s", "%s", "%s")"""
  141. % (dup_module_id, dl[2], escape_string(content_y), escape_string(content_x), similarity,
  142. "功能名称",
  143. str(datetime.datetime.now())[:-7], str(datetime.datetime.now())[:-7], escape_string(check_desc)))
  144. else:
  145. check_desc = glm_utils.AutoDLResultNoNum(f"""请帮我分析以下两段重复语句重复的地方: \n第一段话是:'{content_y}', \n ----------------- \n 第二段话是:'{content_x}'""")
  146. similarity = score
  147. # 相似度相加 gnms
  148. total_similarity2 += similarity
  149. module_content = pro.get("gnms")
  150. dup_module_content = gc.get("gnms")
  151. mysql.sql_change_msg(
  152. """insert into idc_project_module_check_detail (dup_module_id, project_name, module_content, dup_module_content, similarity, dimension, create_time, update_time, check_desc) value (%d, "%s", "%s", "%s", %f, "%s", "%s", "%s", "%s")"""
  153. % (dup_module_id, dl[2], escape_string(module_content), escape_string(dup_module_content),
  154. similarity,
  155. "功能模块描述",
  156. str(datetime.datetime.now())[:-7], str(datetime.datetime.now())[:-7],
  157. escape_string(check_desc)))
  158. mysql.sql_change_msg("""update idc_project_module_check set similarity=%f where dup_module_id=%d""" % (
  159. total_similarity1 + total_similarity2, dup_module_id))
  160. def project_check(data_list):
  161. mysql = mysql_pool.ConnMysql()
  162. # mysql.sql_select_many("""select * from mkgjc""")
  163. # 读取历史数据
  164. xmnr_count = len(mysql.sql_select_many("""select * from user_history_data"""))
  165. gnmk_count = len(mysql.sql_select_many("""select * from user_history_module_data"""))
  166. nlp = model_scope.Bert_nlp("corom")
  167. # 遍历excel存储路径
  168. for dl in data_list:
  169. # path = "0825-丽水系统查重维度1.xlsx"
  170. # 读取路径下的excel
  171. print(dl,dl[1])
  172. df = pd.read_excel(dl[1])
  173. data = df.values
  174. # 将excel文件中的所有维度内容进行拼接
  175. join_str = ""
  176. str_dict = {}
  177. gnmk_str = []
  178. title = ""
  179. er_title = set()
  180. for d in data:
  181. # if pd.notnull(d[0]):
  182. # title = d[0]
  183. # if title == "功能模块":
  184. # er_title.add(d[1])
  185. # join_str = ""
  186. # for i in d[1:]:
  187. # if pd.notnull(i):
  188. # join_str += str(i)
  189. # str_dict[wdys1.get(title)] = join_str
  190. if pd.notnull(d[0]):
  191. title = d[0]
  192. if title == "功能模块":
  193. er_title.add(d[1])
  194. join_str = ""
  195. for i in d[1:]:
  196. if pd.notnull(i):
  197. join_str += str(i)
  198. if title == "功能模块":
  199. if i == '功能描述':
  200. continue
  201. else:
  202. gnmk_str.append(i)
  203. str_dict[wdys1.get(title)] = join_str
  204. else:
  205. if title == "功能模块":
  206. er_title.add(d[1])
  207. for i in d[1:]:
  208. if pd.notnull(i):
  209. join_str += str(i)
  210. str_dict[wdys1.get(title)] = str_dict.get(wdys1.get(title)) + join_str
  211. # print(str_dict)
  212. gnmk = ",".join(gnmk_str)
  213. str_dict['gnmk'] = gnmk
  214. mysql.sql_change_msg(
  215. """insert into user_data (xmmc, xzwt, xtjc, xmmb, yqjx, jsxq, sjxq, aqxq, ywly, hxyw, ywxq, ywxt, jscj, yhfw, mbqt, jsnr, gnmk, sjgx, znys) value ("%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s")"""
  216. % (dl[0], str_dict.get("xzwt") if str_dict.get("xzwt") else None,
  217. str_dict.get("xtjc") if str_dict.get("xtjc") else None,
  218. str_dict.get("xmmb") if str_dict.get("xmmb") else None,
  219. str_dict.get("yqjx") if str_dict.get("yqjx") else None,
  220. str_dict.get("jsxq") if str_dict.get("jsxq") else None,
  221. str_dict.get("sjxq") if str_dict.get("sjxq") else None,
  222. str_dict.get("aqxq") if str_dict.get("aqxq") else None,
  223. str_dict.get("ywly") if str_dict.get("ywly") else None,
  224. str_dict.get("hxyw") if str_dict.get("hxyw") else None,
  225. str_dict.get("ywxq") if str_dict.get("ywxq") else None,
  226. str_dict.get("ywxt") if str_dict.get("ywxt") else None,
  227. str_dict.get("jscj") if str_dict.get("jscj") else None,
  228. str_dict.get("yhfw") if str_dict.get("yhfw") else None,
  229. str_dict.get("mbqt") if str_dict.get("mbqt") else None,
  230. str_dict.get("jsnr") if str_dict.get("jsnr") else None,
  231. str_dict.get("gnmk") if str_dict.get("gnmk") else None,
  232. str_dict.get("sjgx") if str_dict.get("sjgx") else None,
  233. str_dict.get("znys") if str_dict.get("znys") else None))
  234. # 或取所有的xmnr_copy1 where xmmc = '南浔区信息化项目全生命周期管理系统'
  235. xmnr_copy1 = mysql.sql_select_many("""select * from user_history_data where xmmc = '富阳未来社区(乡村)一体化数智平台' """)
  236. # 对比xmnr_copy1和xmnr维度是否都有
  237. if xmnr_copy1:
  238. # threads = [Thread(target=check_project_info, args=(mysql, dl, xc, str_dict)) for xc in xmnr_copy1]
  239. # for t in threads:
  240. # t.start()
  241. #
  242. # for t in threads:
  243. # t.join()
  244. # pro_ths = []
  245. # for xc in xmnr_copy1:
  246. # # check_project_info(mysql, dl, xc, str_dict)
  247. # p = Thread(target=check_project_info, args=(mysql, dl, xc, str_dict))
  248. # pro_ths.append(p)
  249. # p.start()
  250. # for p in pro_ths:
  251. # p.join()
  252. xmnr_copy1_new = []
  253. for xc in xmnr_copy1:
  254. if xc["xmmc"] == str_dict.get("xmmc"):
  255. continue
  256. check_project_info(mysql, dl, xc, str_dict, nlp)
  257. # 找出相识对最高的项目通过glm分析
  258. mysql.sql_change_msg(
  259. """update idc_project set dup_status=3, one_vote_veto_status=1, self_check_status=1, history_project_count=%d ,module_count=%d where project_id=%d""" % (
  260. xmnr_count, gnmk_count, dl[0]))
  261. gong_neng_mo_kuai(mysql, dl, data, er_title, str_dict)
  262. def check_project_info(mysql, dl, xc, str_dict, nlp):
  263. total_keywords = {}
  264. total_similarity = 0
  265. dup_count = 0
  266. # 保存相加后的相似度到idc_project_check
  267. mysql.sql_change_msg(
  268. """insert into idc_project_check (project_id, dup_project_name, file_path, company_name, create_year, project_tag, project_range_tag, project_area, create_time, update_time) value ("%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s")"""
  269. % (dl[0], escape_string(xc.get("xmmc")), escape_string(dl[1]), "", "", "需求相似、业务相似", "历史项目", "",
  270. str(datetime.datetime.now())[:-7], str(datetime.datetime.now())[:-7]))
  271. dup_id = mysql.cur.lastrowid
  272. for x in list(xc.keys())[1:]:
  273. content_x = xc.get(x)
  274. content_y = str_dict.get(x)
  275. if content_x and content_y:
  276. if x == 'gnmk':
  277. continue
  278. elif x == 'jsnr':
  279. continue
  280. else:
  281. dup_count += 1
  282. if ((xc['gnmk'] == 'None' or xc['gnmk'] is None or str.strip(xc['gnmk']) == '') and (str_dict['gnmk'] is None or str.strip(str_dict['gnmk']) == '')) and (
  283. not xc['jsnr'] is None and xc['jsnr'] != 'None' and not str_dict['jsnr'] is None and len(str.strip(str_dict['jsnr'])) > 0):
  284. for x in list(xc.keys())[1:]:
  285. content_x = xc.get(x)
  286. content_y = str_dict.get(x)
  287. if content_x and content_y:
  288. if x == 'gnmk':
  289. # 循环遍历每一个维度
  290. # contents_y = []
  291. # contents_y.append(content_y)
  292. # similarity, content1, content2, idx = nlp.main(content_x, contents_y)
  293. desc = glm_utils.qwenResult(content_y, content_x)
  294. similarity_result, count = similarity_result_check(desc)
  295. similarity = count * 0
  296. # print("**************相似度: %.2f%%" % similarity, "关键词: %s" % keywords_y)
  297. # 相似度相加
  298. total_similarity += similarity
  299. function_content = content_y
  300. dup_function_content = content_x
  301. # 保存每个维度对应的相似度到idc_project_check_detail
  302. mysql.sql_change_msg(
  303. """insert into idc_project_check_detail (dup_id, dimension, similarity, function_content, dup_function_content, create_time, update_time, similarity_result) value (%d, "%s", %f, "%s", "%s", "%s", "%s", "%s")"""
  304. % (dup_id, wdys2.get(x), similarity, escape_string(function_content),
  305. escape_string(dup_function_content), str(datetime.datetime.now())[:-7],
  306. str(datetime.datetime.now())[:-7], similarity_result))
  307. elif x == 'jsnr':
  308. # 循环遍历每一个维度
  309. # contents_y = []
  310. # contents_y.append(content_y)
  311. # similarity, content1, content2, idx = nlp.main(content_x, contents_y)
  312. desc = glm_utils.qwenResult(content_y, content_x)
  313. similarity_result, count = similarity_result_check(desc)
  314. similarity = count * 40
  315. # print("**************相似度: %.2f%%" % similarity, "关键词: %s" % keywords_y)
  316. # 相似度相加
  317. total_similarity += similarity
  318. function_content = content_y
  319. dup_function_content = content_x
  320. # 保存每个维度对应的相似度到idc_project_check_detail
  321. mysql.sql_change_msg(
  322. """insert into idc_project_check_detail (dup_id, dimension, similarity, function_content, dup_function_content, create_time, update_time, similarity_result) value (%d, "%s", %f, "%s", "%s", "%s", "%s", "%s")"""
  323. % (dup_id, wdys2.get(x), similarity, escape_string(function_content),
  324. escape_string(dup_function_content), str(datetime.datetime.now())[:-7],
  325. str(datetime.datetime.now())[:-7], similarity_result))
  326. else:
  327. # 循环遍历每一个维度
  328. # contents_y = []
  329. # contents_y.append(content_y)
  330. # similarity, content1, content2, idx = nlp.main(content_x, contents_y)
  331. desc = glm_utils.qwenResult(content_y, content_x)
  332. similarity_result, count = similarity_result_check(desc)
  333. similarity = count * (60 / dup_count)
  334. # print("**************相似度: %.2f%%" % similarity, "关键词: %s" % keywords_y)
  335. # 相似度相加
  336. total_similarity += similarity
  337. function_content = content_y
  338. dup_function_content = content_x
  339. # 保存每个维度对应的相似度到idc_project_check_detail
  340. mysql.sql_change_msg(
  341. """insert into idc_project_check_detail (dup_id, dimension, similarity, function_content, dup_function_content, create_time, update_time, similarity_result) value (%d, "%s", %f, "%s", "%s", "%s", "%s", "%s")"""
  342. % (dup_id, wdys2.get(x), similarity, escape_string(function_content),
  343. escape_string(dup_function_content), str(datetime.datetime.now())[:-7],
  344. str(datetime.datetime.now())[:-7], similarity_result))
  345. elif ((xc['jsnr'] == 'None' or xc['jsnr'] is None or str.strip(xc['jsnr']) == '') and (str_dict['jsnr'] is None or str.strip(str_dict['jsnr']) == '')) and (
  346. not xc['gnmk'] is None and xc['gnmk'] != 'None' and not str_dict['gnmk'] is None and len(str.strip(str_dict['gnmk'])) > 0):
  347. for x in list(xc.keys())[1:]:
  348. content_x = xc.get(x)
  349. content_y = str_dict.get(x)
  350. if content_x and content_y:
  351. if x == 'gnmk':
  352. # 循环遍历每一个维度
  353. # contents_y = []
  354. # contents_y.append(content_y)
  355. # similarity, content1, content2, idx = nlp.main(content_x, contents_y)
  356. desc = glm_utils.qwenResult(content_y, content_x)
  357. similarity_result, count = similarity_result_check(desc)
  358. similarity = count * 50
  359. # print("**************相似度: %.2f%%" % similarity, "关键词: %s" % keywords_y)
  360. # 相似度相加
  361. total_similarity += similarity
  362. function_content = content_y
  363. dup_function_content = content_x
  364. # 保存每个维度对应的相似度到idc_project_check_detail
  365. mysql.sql_change_msg(
  366. """insert into idc_project_check_detail (dup_id, dimension, similarity, function_content, dup_function_content, create_time, update_time, similarity_result) value (%d, "%s", %f, "%s", "%s", "%s", "%s", "%s")"""
  367. % (dup_id, wdys2.get(x), similarity, escape_string(function_content),
  368. escape_string(dup_function_content), str(datetime.datetime.now())[:-7],
  369. str(datetime.datetime.now())[:-7], similarity_result))
  370. elif x == 'jsnr':
  371. # 循环遍历每一个维度
  372. # contents_y = []
  373. # contents_y.append(content_y)
  374. # similarity, content1, content2, idx = nlp.main(content_x, contents_y)
  375. desc = glm_utils.qwenResult(content_y, content_x)
  376. similarity_result, count = similarity_result_check(desc)
  377. similarity = count * 0
  378. # print("**************相似度: %.2f%%" % similarity, "关键词: %s" % keywords_y)
  379. # 相似度相加
  380. total_similarity += similarity
  381. function_content = content_y
  382. dup_function_content = content_x
  383. # 保存每个维度对应的相似度到idc_project_check_detail
  384. mysql.sql_change_msg(
  385. """insert into idc_project_check_detail (dup_id, dimension, similarity, function_content, dup_function_content, create_time, update_time, similarity_result) value (%d, "%s", %f, "%s", "%s", "%s", "%s", "%s")"""
  386. % (dup_id, wdys2.get(x), similarity, escape_string(function_content),
  387. escape_string(dup_function_content), str(datetime.datetime.now())[:-7],
  388. str(datetime.datetime.now())[:-7], similarity_result))
  389. else:
  390. # 循环遍历每一个维度
  391. # contents_y = []
  392. # contents_y.append(content_y)
  393. # similarity, content1, content2, idx = nlp.main(content_x, contents_y)
  394. desc = glm_utils.qwenResult(content_y, content_x)
  395. similarity_result, count = similarity_result_check(desc)
  396. similarity = count * (50 / dup_count)
  397. # print("**************相似度: %.2f%%" % similarity, "关键词: %s" % keywords_y)
  398. # 相似度相加
  399. total_similarity += similarity
  400. function_content = content_y
  401. dup_function_content = content_x
  402. # 保存每个维度对应的相似度到idc_project_check_detail
  403. mysql.sql_change_msg(
  404. """insert into idc_project_check_detail (dup_id, dimension, similarity, function_content, dup_function_content, create_time, update_time, similarity_result) value (%d, "%s", %f, "%s", "%s", "%s", "%s", "%s")"""
  405. % (dup_id, wdys2.get(x), similarity, escape_string(function_content),
  406. escape_string(dup_function_content), str(datetime.datetime.now())[:-7],
  407. str(datetime.datetime.now())[:-7], similarity_result))
  408. else:
  409. for x in list(xc.keys())[1:]:
  410. content_x = xc.get(x)
  411. content_y = str_dict.get(x)
  412. if content_x and content_y:
  413. if x == 'gnmk':
  414. # 循环遍历每一个维度
  415. # contents_y = []
  416. # contents_y.append(content_y)
  417. # similarity, content1, content2, idx = nlp.main(content_x, contents_y)
  418. desc = glm_utils.qwenResult(content_y, content_x)
  419. similarity_result, count = similarity_result_check(desc)
  420. similarity = count * 50
  421. # 相似度相加
  422. total_similarity += similarity
  423. function_content = content_y
  424. dup_function_content = content_x
  425. # 保存每个维度对应的相似度到idc_project_check_detail
  426. mysql.sql_change_msg(
  427. """insert into idc_project_check_detail (dup_id, dimension, similarity, function_content, dup_function_content, create_time, update_time, similarity_result) value (%d, "%s", %f, "%s", "%s", "%s", "%s", "%s")"""
  428. % (dup_id, wdys2.get(x), similarity, escape_string(function_content),
  429. escape_string(dup_function_content), str(datetime.datetime.now())[:-7],
  430. str(datetime.datetime.now())[:-7], similarity_result))
  431. elif x == 'jsnr':
  432. # 循环遍历每一个维度
  433. # contents_y = []
  434. # contents_y.append(content_y)
  435. # similarity, content1, content2, idx = nlp.main(content_x, contents_y)
  436. desc = glm_utils.qwenResult(content_y, content_x)
  437. similarity_result, count = similarity_result_check(desc)
  438. similarity = count * 40
  439. # 相似度相加
  440. total_similarity += similarity
  441. function_content = content_y
  442. dup_function_content = content_x
  443. # 保存每个维度对应的相似度到idc_project_check_detail
  444. mysql.sql_change_msg(
  445. """insert into idc_project_check_detail (dup_id, dimension, similarity, function_content, dup_function_content, create_time, update_time, similarity_result) value (%d, "%s", %f, "%s", "%s", "%s", "%s", "%s")"""
  446. % (dup_id, wdys2.get(x), similarity, escape_string(function_content),
  447. escape_string(dup_function_content), str(datetime.datetime.now())[:-7],
  448. str(datetime.datetime.now())[:-7], similarity_result))
  449. else:
  450. # 循环遍历每一个维度
  451. # contents_y = []
  452. # contents_y.append(content_y)
  453. # similarity, content1, content2, idx = nlp.main(content_x, contents_y)
  454. desc = glm_utils.qwenResult(content_y, content_x)
  455. similarity_result, count = similarity_result_check(desc)
  456. similarity = count * (10 / dup_count)
  457. # print("**************相似度: %.2f%%" % similarity, "关键词: %s" % keywords_y)
  458. # 相似度相加
  459. total_similarity += similarity
  460. function_content = content_y
  461. dup_function_content = content_x
  462. # 保存每个维度对应的相似度到idc_project_check_detail
  463. mysql.sql_change_msg(
  464. """insert into idc_project_check_detail (dup_id, dimension, similarity, function_content, dup_function_content, create_time, update_time, similarity_result) value (%d, "%s", %f, "%s", "%s", "%s", "%s", "%s")"""
  465. % (dup_id, wdys2.get(x), similarity, escape_string(function_content),
  466. escape_string(dup_function_content), str(datetime.datetime.now())[:-7],
  467. str(datetime.datetime.now())[:-7], similarity_result))
  468. mysql.sql_change_msg(
  469. """update idc_project_check set similarity=%f where dup_id=%d""" % (total_similarity, dup_id))
  470. if __name__ == "__main__":
  471. all_path = requests.get("http://127.0.0.1:19099/check/duplicates/%s" % 599).json()
  472. # print(all_path)
  473. # dict1 = {k:v for k, v in sorted(dict.items(), key= lambda item : item[1])}
  474. # print(dict1)
  475. data_list = []
  476. for ap in all_path.get("data"):
  477. # if os.path.exists(ap.get("file_path")):
  478. data_list.append((ap.get("project_id"), ap.get("file_path"), ap.get("project_name")))
  479. print(data_list)
  480. # data_list = [(11, r"C:\Users\HUAWEI\PycharmProjects\nlp\dup_check\0825-丽水系统查重维度1.xlsx", "水路运输综合监管系统建设项目.xls")]
  481. data_list = [(11, r"D:\ningda\dup_check2\dup_check\0825-丽水系统查重维度1.xlsx", "水路运输综合监管系统建设项目.xls")]
  482. project_check(data_list)
  483. # 对比相似度
  484. def similarity_result_check(desc):
  485. similarity_result = ""
  486. similarity_result_count = 0
  487. if len(desc) > 7:
  488. if desc[6:7] == "高":
  489. similarity_result = "非常相似"
  490. similarity_result_count = 90
  491. elif desc[6:7] == "中":
  492. similarity_result = "比较相似"
  493. similarity_result_count = 60
  494. elif desc[6:7] == "低":
  495. similarity_result = "相似度低"
  496. similarity_result_count = 30
  497. return similarity_result, similarity_result_count