丽水查重代码
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

721 lines
41KB

  1. # coding=utf-8
  2. import sys
  3. import re
  4. import baidu
  5. import mysql_pool
  6. from pymysql.converters import escape_string
  7. import cosin_similarity
  8. import pandas as pd
  9. import datetime
  10. import requests
  11. import glm_utils
  12. from threading import Thread
  13. wdys1 = {
  14. "项目名称": "xmmc",
  15. "现状问题": "xzwt",
  16. "系统基础": "xtjc",
  17. "项目目标": "xmmb",
  18. "预期绩效": "yqjx",
  19. "建设需求": "jsxq",
  20. "数据需求": "sjxq",
  21. "安全需求": "aqxq",
  22. "业务领域": "ywly",
  23. "核心业务": "hxyw",
  24. "业务需求": "ywxq",
  25. "业务协同": "ywxt",
  26. "建设层级": "jscj",
  27. "用户范围": "yhfw",
  28. "目标群体": "mbqt",
  29. "建设内容": "jsnr",
  30. "功能模块": "gnmk",
  31. "数据共享": "sjgx",
  32. "智能要素": "znys"
  33. }
  34. wdys2 = {
  35. "xmmc": "项目名称",
  36. "xzwt": "现状问题",
  37. "xtjc": "系统基础",
  38. "xmmb": "项目目标",
  39. "yqjx": "预期绩效",
  40. "jsxq": "建设需求",
  41. "sjxq": "数据需求",
  42. "aqxq": "安全需求",
  43. "ywly": "业务领域",
  44. "hxyw": "核心业务",
  45. "ywxq": "业务需求",
  46. "ywxt": "业务协同",
  47. "jscj": "建设层级",
  48. "yhfw": "用户范围",
  49. "mbqt": "目标群体",
  50. "jsnr": "建设内容",
  51. "gnmk": "功能模块",
  52. "sjgx": "数据共享",
  53. "znys": "智能要素"
  54. }
  55. gnmkys = {
  56. "gnmc": "功能名称",
  57. "gnms": "功能描述"
  58. }
  59. def getFlag():
  60. data_dict = {}
  61. df = pd.read_excel("0825.xlsx")
  62. data = df.values
  63. data = list(pd.Series(data[:, 1]).dropna())
  64. for d in data:
  65. try:
  66. wd = re.search("(.*?)(.*?%)", d).group(1).strip()
  67. wdc = wdys1.get(wd)
  68. if wdc:
  69. qz = re.search(".*?((.*?%))", d).group(1)
  70. data_dict[wdc] = qz
  71. except:
  72. pass
  73. return data_dict
  74. def gong_neng_mo_kuai(mysql, dl, data, er_title):
  75. # 将excel文件中的所有第三维度内容进行拼接
  76. str_dict = {}
  77. for et in er_title:
  78. for d in data:
  79. if d[1] == et:
  80. if str_dict.get(et):
  81. str_dict[et] = str_dict.get(et) + d[3]
  82. else:
  83. str_dict[et] = d[3]
  84. for k, v in str_dict.items():
  85. mysql.sql_change_msg(
  86. """insert into idc_project_module (project_id, check_duplicate_count, module_name, module_content, create_time, update_time, tag) value(%d, 1, "%s", "%s", "%s", "%s", "模块")""" % (
  87. int(dl[0]), k, v, str(datetime.datetime.now())[:-7], str(datetime.datetime.now())[:-7]))
  88. module_id_list = mysql.sql_select_many(
  89. """select project_module_id, module_name, module_content from idc_project_module where project_id=%d""" % dl[
  90. 0])
  91. data_list = []
  92. for mil in module_id_list:
  93. data_dict = {}
  94. data_dict["project_module_id"] = mil.get("project_module_id")
  95. data_dict["gnmc"] = mil.get("module_name")
  96. # data_dict["glm_desc"] = baidu.CallResult(mil.get("module_content"))
  97. data_dict["gnms"] = mil.get("module_content")
  98. # print(f'module_content = ({mil.get("module_content")}), glm_desc = ({data_dict["glm_desc"]})')
  99. data_list.append(data_dict)
  100. # print(data_list)
  101. for i in data_list:
  102. gnmk_copy1 = mysql.sql_select_many("""select * from user_history_module_data where xmmc = '南浔区信息化项目全生命周期管理系统'""")
  103. if gnmk_copy1:
  104. for gc in gnmk_copy1:
  105. print(
  106. """insert into idc_project_module_check (project_module_id, module_name, project_name, company_name, create_time, update_time) value(%d, "%s", "%s", "%s", "%s", "%s")"""
  107. % (
  108. i.get("project_module_id"), escape_string(gc.get("gnmc")), escape_string(gc.get("xmmc")), "",
  109. str(datetime.datetime.now())[:-7],
  110. str(datetime.datetime.now())[:-7]))
  111. mysql.sql_change_msg(
  112. """insert into idc_project_module_check (project_module_id, module_name, project_name, company_name, create_time, update_time) value(%d, "%s", "%s", "%s", "%s", "%s")"""
  113. % (
  114. i.get("project_module_id"), escape_string(gc.get("gnmc")), escape_string(gc.get("xmmc")), "",
  115. str(datetime.datetime.now())[:-7],
  116. str(datetime.datetime.now())[:-7]))
  117. dup_module_id = mysql.cur.lastrowid
  118. check_module_info(mysql, gc, dl, i, dup_module_id)
  119. gnmk_gjc = {}
  120. for a in ["gnmc", "glm_desc"]:
  121. if i.get(a):
  122. content_x = i.get(a)
  123. content_y = i.get(a)
  124. if a == "gnmc":
  125. similarity = cosin_similarity.CosineSimilarity(content_x, content_y)
  126. # 相似度 关键词
  127. similarity, keyword_x, keywords = similarity.main()
  128. # 去重
  129. keywords = list(set(keywords))
  130. gnmk_gjc[a] = keywords
  131. else:
  132. similarity = cosin_similarity.CosineSimilarity(content_x, content_y)
  133. # 相似度 关键词
  134. similarity, keyword_x, keywords = similarity.main()
  135. # 去重
  136. keywords = list(set(keywords))
  137. gnmk_gjc[a] = keywords
  138. mysql.sql_change_msg("""insert into user_module_keywords (xmmc, gnmc, gnms) value("%s", "%s", "%s")""" % (
  139. dl[2], str(gnmk_gjc.get("gnmc"))[1:-1] if gnmk_gjc.get("gnmc") else None,
  140. str(gnmk_gjc.get("gnms"))[1:-1] if gnmk_gjc.get("gnms") else None))
  141. def check_module_info(mysql, gc, dl, pro, dup_module_id):
  142. total_similarity1 = 0
  143. total_keywords1 = []
  144. total_similarity2 = 0
  145. total_keywords2 = []
  146. for j in ["gnmc", "gnms"]:
  147. # 循环遍历每一个模块名称
  148. content_x = gc.get(j)
  149. content_y = pro.get(j)
  150. if content_x and content_y:
  151. if j == "gnmc":
  152. # similarity = cosin_similarity.CosineSimilarity(content_x, content_y)
  153. # 相似度 关键词
  154. # similarity, keyword_x, keywords = similarity.main()
  155. # similarity = similarity * 1
  156. # total_keywords1 += keywords
  157. # print("######################相似度: %.2f%%" % similarity, "关键词: %s" % keywords)
  158. similarity, check_desc = glm_utils.AutoDLResult(f"""告诉我下面两段话的重复率百分比是多少: \n第一段话是:'{content_y}', \n ----------------- \n 第二段话是:'{content_x}'""")
  159. # similarity, check_desc = baidu.CallResult(
  160. # f"""告诉我下面两段话的重复率百分比是多少: 第一段话是:'{content_x}', ----------------- 第二段话是:'{content_y}'""")
  161. # 相似度相加
  162. if similarity is None:
  163. similarity = 0
  164. print(f"similarity is {similarity}")
  165. total_similarity1 += similarity/100
  166. mysql.sql_change_msg(
  167. """insert into idc_project_module_check_detail (dup_module_id, project_name, module_content, dup_module_content, similarity, dimension, create_time, update_time, check_desc) value (%d, "%s", "%s", "%s", %f, "%s", "%s", "%s", "%s")"""
  168. % (dup_module_id, dl[2], escape_string(content_y), escape_string(content_x), similarity,
  169. "功能名称",
  170. str(datetime.datetime.now())[:-7], str(datetime.datetime.now())[:-7], escape_string(check_desc)))
  171. else:
  172. # similarity = cosin_similarity.CosineSimilarity(content_x, content_y)
  173. # # 相似度 关键词
  174. # similarity, keyword_x, keywords = similarity.main()
  175. # similarity = similarity * 99
  176. # total_keywords2 += keywords
  177. similarity, check_desc = glm_utils.AutoDLResult(f"""告诉我下面两段话的重复率百分比是多少: \n第一段话是:'{content_y}', \n ----------------- \n 第二段话是:'{content_x}'""")
  178. # similarity, check_desc = baidu.CallResult(
  179. # f"""告诉我下面两段话的重复率百分比是多少: 第一段话是:'{content_x}', ----------------- 第二段话是:'{content_y}'""")
  180. # 临时写入文件
  181. # check_desc = str(check_desc).replace("\n", " ")
  182. # prompt = f"""告诉我下面两段话的重复率百分比是多少: 第一段话是:'{content_x}', ----------------- 第二段话是:'{content_y}'"""
  183. # prompt = prompt.replace("\n", " ")
  184. # with open('train.json', 'a') as file:
  185. # file.write("{" + f"""
  186. # "content": "{prompt}",
  187. # "summary": "{check_desc}"
  188. # """ + "}\n")
  189. if similarity is None:
  190. similarity = 0
  191. print(f"similarity is {similarity}")
  192. similarity = similarity * 0.99
  193. # print("######################相似度: %.2f%%" % similarity, "关键词: %s" % keywords)
  194. # 相似度相加 gnms
  195. total_similarity2 += similarity
  196. # module_content = pro.get("gnms") + "/n" + content_y
  197. # dup_module_content = gc.get("gnms") + "/n" + content_x
  198. module_content = pro.get("gnms")
  199. dup_module_content = gc.get("gnms")
  200. mysql.sql_change_msg(
  201. """insert into idc_project_module_check_detail (dup_module_id, project_name, module_content, dup_module_content, similarity, dimension, create_time, update_time, check_desc) value (%d, "%s", "%s", "%s", %f, "%s", "%s", "%s", "%s")"""
  202. % (dup_module_id, dl[2], escape_string(module_content), escape_string(dup_module_content),
  203. similarity,
  204. "功能模块描述",
  205. str(datetime.datetime.now())[:-7], str(datetime.datetime.now())[:-7],
  206. escape_string(check_desc)))
  207. mysql.sql_change_msg("""update idc_project_module_check set similarity=%f where dup_module_id=%d""" % (
  208. total_similarity1 + total_similarity2, dup_module_id))
  209. def project_check(data_list):
  210. mysql = mysql_pool.ConnMysql()
  211. # mysql.sql_select_many("""select * from mkgjc""")
  212. # 读取维度和权重
  213. # xmnr_count = len(mysql.sql_select_many("""select * from xmnr_copy1"""))
  214. # gnmk_count = len(mysql.sql_select_many("""select * from gnmk_copy1"""))
  215. xmnr_count = len(mysql.sql_select_many("""select * from user_history_data"""))
  216. gnmk_count = len(mysql.sql_select_many("""select * from user_history_module_data"""))
  217. get_data_dict = getFlag()
  218. # 遍历excel存储路径
  219. for dl in data_list:
  220. # path = "0825-丽水系统查重维度1.xlsx"
  221. # 读取路径下的excel
  222. print(dl,dl[1])
  223. df = pd.read_excel(dl[1])
  224. data = df.values
  225. # 将excel文件中的所有维度内容进行拼接
  226. join_str = ""
  227. str_dict = {}
  228. gnmk_str = []
  229. title = ""
  230. er_title = set()
  231. for d in data:
  232. # if pd.notnull(d[0]):
  233. # title = d[0]
  234. # if title == "功能模块":
  235. # er_title.add(d[1])
  236. # join_str = ""
  237. # for i in d[1:]:
  238. # if pd.notnull(i):
  239. # join_str += str(i)
  240. # str_dict[wdys1.get(title)] = join_str
  241. if pd.notnull(d[0]):
  242. title = d[0]
  243. if title == "功能模块":
  244. er_title.add(d[1])
  245. join_str = ""
  246. for i in d[1:]:
  247. if pd.notnull(i):
  248. join_str += str(i)
  249. if title == "功能模块":
  250. if i == '功能描述':
  251. continue
  252. else:
  253. gnmk_str.append(i)
  254. str_dict[wdys1.get(title)] = join_str
  255. else:
  256. if title == "功能模块":
  257. er_title.add(d[1])
  258. for i in d[1:]:
  259. if pd.notnull(i):
  260. join_str += str(i)
  261. str_dict[wdys1.get(title)] = str_dict.get(wdys1.get(title)) + join_str
  262. # print(str_dict)
  263. gnmk = ",".join(gnmk_str)
  264. str_dict['gnmk'] = gnmk
  265. mysql.sql_change_msg(
  266. """insert into user_data (xmmc, xzwt, xtjc, xmmb, yqjx, jsxq, sjxq, aqxq, ywly, hxyw, ywxq, ywxt, jscj, yhfw, mbqt, jsnr, gnmk, sjgx, znys) value ("%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s")"""
  267. % (dl[0], str_dict.get("xzwt") if str_dict.get("xzwt") else None,
  268. str_dict.get("xtjc") if str_dict.get("xtjc") else None,
  269. str_dict.get("xmmb") if str_dict.get("xmmb") else None,
  270. str_dict.get("yqjx") if str_dict.get("yqjx") else None,
  271. str_dict.get("jsxq") if str_dict.get("jsxq") else None,
  272. str_dict.get("sjxq") if str_dict.get("sjxq") else None,
  273. str_dict.get("aqxq") if str_dict.get("aqxq") else None,
  274. str_dict.get("ywly") if str_dict.get("ywly") else None,
  275. str_dict.get("hxyw") if str_dict.get("hxyw") else None,
  276. str_dict.get("ywxq") if str_dict.get("ywxq") else None,
  277. str_dict.get("ywxt") if str_dict.get("ywxt") else None,
  278. str_dict.get("jscj") if str_dict.get("jscj") else None,
  279. str_dict.get("yhfw") if str_dict.get("yhfw") else None,
  280. str_dict.get("mbqt") if str_dict.get("mbqt") else None,
  281. str_dict.get("jsnr") if str_dict.get("jsnr") else None,
  282. str_dict.get("gnmk") if str_dict.get("gnmk") else None,
  283. str_dict.get("sjgx") if str_dict.get("sjgx") else None,
  284. str_dict.get("znys") if str_dict.get("znys") else None))
  285. # 或取所有的xmnr_copy1
  286. xmnr_copy1 = mysql.sql_select_many("""select * from user_history_data WHERE xmmc = '南浔区信息化项目全生命周期管理系统'""")
  287. # 对比xmnr_copy1和xmnr维度是否都有
  288. if xmnr_copy1:
  289. # threads = [Thread(target=check_project_info, args=(mysql, dl, xc, str_dict)) for xc in xmnr_copy1]
  290. # for t in threads:
  291. # t.start()
  292. #
  293. # for t in threads:
  294. # t.join()
  295. # pro_ths = []
  296. # for xc in xmnr_copy1:
  297. # # check_project_info(mysql, dl, xc, str_dict)
  298. # p = Thread(target=check_project_info, args=(mysql, dl, xc, str_dict))
  299. # pro_ths.append(p)
  300. # p.start()
  301. # for p in pro_ths:
  302. # p.join()
  303. for xc in xmnr_copy1:
  304. check_project_info(mysql, dl, xc, str_dict)
  305. project_gjc = {}
  306. for w in wdys2.keys():
  307. content_x = str_dict.get(w)
  308. content_y = str_dict.get(w)
  309. if content_x and content_y:
  310. # 循环遍历每一个维度
  311. similarity = cosin_similarity.CosineSimilarity(content_x, content_y)
  312. # 相似度 关键词
  313. similarity, keywords_x, keywords = similarity.main()
  314. # 去重
  315. keywords = list(set(keywords))
  316. project_gjc[w] = keywords
  317. mysql.sql_change_msg(
  318. """insert into user_keyword (xmmc, xzwt, xtjc, xmmb, yqjx, jsxq, sjxq, aqxq, ywly, hxyw, ywxq, ywxt, jscj, yhfw, mbqt, jsnr, gnmk, sjgx, znys) value ("%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s")"""
  319. % (dl[0], str(project_gjc.get("xzwt"))[1:-1] if project_gjc.get("xzwt") else None,
  320. str(project_gjc.get("xtjc"))[1:-1] if project_gjc.get("xtjc") else None,
  321. str(project_gjc.get("xmmb"))[1:-1] if project_gjc.get("xmmb") else None,
  322. str(project_gjc.get("yqjx"))[1:-1] if project_gjc.get("yqjx") else None,
  323. str(project_gjc.get("jsxq"))[1:-1] if project_gjc.get("jsxq") else None,
  324. str(project_gjc.get("sjxq"))[1:-1] if project_gjc.get("sjxq") else None,
  325. str(project_gjc.get("aqxq"))[1:-1] if project_gjc.get("aqxq") else None,
  326. str(project_gjc.get("ywly"))[1:-1] if project_gjc.get("ywly") else None,
  327. str(project_gjc.get("hxyw"))[1:-1] if project_gjc.get("hxyw") else None,
  328. str(project_gjc.get("ywxq"))[1:-1] if project_gjc.get("ywxq") else None,
  329. str(project_gjc.get("ywxt"))[1:-1] if project_gjc.get("ywxt") else None,
  330. str(project_gjc.get("jscj"))[1:-1] if project_gjc.get("jscj") else None,
  331. str(project_gjc.get("yhfw"))[1:-1] if project_gjc.get("yhfw") else None,
  332. str(project_gjc.get("mbqt"))[1:-1] if project_gjc.get("mbqt") else None,
  333. str(project_gjc.get("jsnr"))[1:-1] if project_gjc.get("jsnr") else None,
  334. str(project_gjc.get("gnmk"))[1:-1] if project_gjc.get("gnmk") else None,
  335. str(project_gjc.get("sjgx"))[1:-1] if project_gjc.get("sjgx") else None,
  336. str(project_gjc.get("znys"))[1:-1] if project_gjc.get("znys") else None))
  337. mysql.sql_change_msg(
  338. """update idc_project set dup_status=3, one_vote_veto_status=1, self_check_status=1, history_project_count=%d ,module_count=%d where project_id=%d""" % (
  339. xmnr_count, gnmk_count, dl[0]))
  340. gong_neng_mo_kuai(mysql, dl, data, er_title)
  341. def check_project_info(mysql, dl, xc, str_dict):
  342. total_keywords = {}
  343. total_similarity = 0
  344. dup_count = 0
  345. # 保存相加后的相似度到idc_project_check
  346. mysql.sql_change_msg(
  347. """insert into idc_project_check (project_id, dup_project_name, file_path, company_name, create_year, project_tag, project_range_tag, project_area, create_time, update_time) value ("%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s")"""
  348. % (dl[0], escape_string(xc.get("xmmc")), escape_string(dl[1]), "", "", "需求相似、业务相似", "历史项目", "",
  349. str(datetime.datetime.now())[:-7], str(datetime.datetime.now())[:-7]))
  350. dup_id = mysql.cur.lastrowid
  351. for x in list(xc.keys())[1:]:
  352. content_x = xc.get(x)
  353. content_y = str_dict.get(x)
  354. if content_x and content_y:
  355. if x == 'gnmk':
  356. continue
  357. elif x == 'jsnr':
  358. continue
  359. else:
  360. dup_count += 1
  361. if ((xc['gnmk'] == 'None' or xc['gnmk'] is None or str.strip(xc['gnmk']) == '') and (str_dict['gnmk'] is None or str.strip(str_dict['gnmk']) == '')) and (
  362. not xc['jsnr'] is None and xc['jsnr'] != 'None' and not str_dict['jsnr'] is None and len(str.strip(str_dict['jsnr'])) > 0):
  363. for x in list(xc.keys())[1:]:
  364. content_x = xc.get(x)
  365. content_y = str_dict.get(x)
  366. if content_x and content_y:
  367. if x == 'gnmk':
  368. # 匹配到历史数据,次数加1
  369. # dup_count += dup_file_test
  370. # 循环遍历每一个维度
  371. similarity = cosin_similarity.CosineSimilarity(content_x, content_y)
  372. # 相似度 关键词
  373. similarity, keywords_x, keywords_y = similarity.main()
  374. similarity = similarity * 0
  375. # print("**************相似度: %.2f%%" % similarity, "关键词: %s" % keywords_y)
  376. # 相似度相加
  377. total_similarity += similarity
  378. # 去重
  379. keywords_y = list(set(keywords_y))
  380. # 去重
  381. keywords_x = list(set(keywords_x))
  382. # 关键词收集
  383. total_keywords[x] = keywords_y
  384. function_content = content_y
  385. dup_function_content = content_x
  386. for word_y in keywords_y:
  387. word_y = word_y.strip().strip("'").strip('"')
  388. if word_y != '':
  389. function_content = str(function_content.replace("\"", "'")).replace(word_y,
  390. f'<span class="similarity">{word_y.strip()}</span>')
  391. for word_x in keywords_x:
  392. word_x = word_x.strip().strip("'").strip('"')
  393. if word_x != '':
  394. dup_function_content = str(dup_function_content.replace("\"", "'")).replace(word_x,
  395. f'<span class="similarity">{word_x.strip()}</span>')
  396. # 保存每个维度对应的相似度到idc_project_check_detail
  397. mysql.sql_change_msg(
  398. """insert into idc_project_check_detail (dup_id, dimension, similarity, function_content, dup_function_content, create_time, update_time) value (%d, "%s", %f, "%s", "%s", "%s", "%s")"""
  399. % (dup_id, wdys2.get(x), similarity, escape_string(function_content),
  400. escape_string(dup_function_content), str(datetime.datetime.now())[:-7],
  401. str(datetime.datetime.now())[:-7]))
  402. # content = content.replace(gjcs, f'<span class="similarity">{gjcs.strip()}</span>')
  403. elif x == 'jsnr':
  404. similarity = cosin_similarity.CosineSimilarity(content_x, content_y)
  405. # 相似度 关键词
  406. similarity, keywords_x, keywords_y = similarity.main()
  407. similarity = similarity * 40
  408. # print("**************相似度: %.2f%%" % similarity, "关键词: %s" % keywords_y)
  409. # 相似度相加
  410. total_similarity += similarity
  411. # 去重
  412. keywords_y = list(set(keywords_y))
  413. # 去重
  414. keywords_x = list(set(keywords_x))
  415. # 关键词收集
  416. total_keywords[x] = keywords_y
  417. function_content = content_y
  418. dup_function_content = content_x
  419. for word_y in keywords_y:
  420. word_y = word_y.strip().strip("'").strip('"')
  421. if word_y != '':
  422. function_content = str(function_content.replace("\"", "'")).replace(word_y,
  423. f'<span class="similarity">{word_y.strip()}</span>')
  424. for word_x in keywords_x:
  425. word_x = word_x.strip().strip("'").strip('"')
  426. if word_x != '':
  427. dup_function_content = str(dup_function_content.replace("\"", "'")).replace(word_x,
  428. f'<span class="similarity">{word_x.strip()}</span>')
  429. # 保存每个维度对应的相似度到idc_project_check_detail
  430. mysql.sql_change_msg(
  431. """insert into idc_project_check_detail (dup_id, dimension, similarity, function_content, dup_function_content, create_time, update_time) value (%d, "%s", %f, "%s", "%s", "%s", "%s")"""
  432. % (dup_id, wdys2.get(x), similarity, escape_string(function_content),
  433. escape_string(dup_function_content), str(datetime.datetime.now())[:-7],
  434. str(datetime.datetime.now())[:-7]))
  435. else:
  436. similarity = cosin_similarity.CosineSimilarity(content_x, content_y)
  437. # 相似度 关键词
  438. similarity, keywords_x, keywords_y = similarity.main()
  439. similarity = similarity * (60 / dup_count)
  440. # print("**************相似度: %.2f%%" % similarity, "关键词: %s" % keywords_y)
  441. # 相似度相加
  442. total_similarity += similarity
  443. # 去重
  444. keywords_y = list(set(keywords_y))
  445. # 去重
  446. keywords_x = list(set(keywords_x))
  447. # 关键词收集
  448. total_keywords[x] = keywords_y
  449. function_content = content_y
  450. dup_function_content = content_x
  451. for word_y in keywords_y:
  452. word_y = word_y.strip().strip("'").strip('"')
  453. if word_y != '':
  454. function_content = str(function_content.replace("\"", "'")).replace(word_y,
  455. f'<span class="similarity">{word_y.strip()}</span>')
  456. for word_x in keywords_x:
  457. word_x = word_x.strip().strip("'").strip('"')
  458. if word_x != '':
  459. dup_function_content = str(dup_function_content.replace("\"", "'")).replace(word_x,
  460. f'<span class="similarity">{word_x.strip()}</span>')
  461. # 保存每个维度对应的相似度到idc_project_check_detail
  462. mysql.sql_change_msg(
  463. """insert into idc_project_check_detail (dup_id, dimension, similarity, function_content, dup_function_content, create_time, update_time) value (%d, "%s", %f, "%s", "%s", "%s", "%s")"""
  464. % (dup_id, wdys2.get(x), similarity, escape_string(function_content),
  465. escape_string(dup_function_content), str(datetime.datetime.now())[:-7],
  466. str(datetime.datetime.now())[:-7]))
  467. elif ((xc['jsnr'] == 'None' or xc['jsnr'] is None or str.strip(xc['jsnr']) == '') and (str_dict['jsnr'] is None or str.strip(str_dict['jsnr']) == '')) and (
  468. not xc['gnmk'] is None and xc['gnmk'] != 'None' and not str_dict['gnmk'] is None and len(str.strip(str_dict['gnmk'])) > 0):
  469. for x in list(xc.keys())[1:]:
  470. content_x = xc.get(x)
  471. content_y = str_dict.get(x)
  472. if content_x and content_y:
  473. if x == 'gnmk':
  474. # 匹配到历史数据,次数加1
  475. # dup_count += dup_file_test
  476. # 循环遍历每一个维度
  477. similarity = cosin_similarity.CosineSimilarity(content_x, content_y)
  478. # 相似度 关键词
  479. similarity, keywords_x, keywords_y = similarity.main()
  480. similarity = similarity * 50
  481. # print("**************相似度: %.2f%%" % similarity, "关键词: %s" % keywords_y)
  482. # 相似度相加
  483. total_similarity += similarity
  484. # 去重
  485. keywords_y = list(set(keywords_y))
  486. # 去重
  487. keywords_x = list(set(keywords_x))
  488. # 关键词收集
  489. total_keywords[x] = keywords_y
  490. function_content = content_y
  491. dup_function_content = content_x
  492. for word_y in keywords_y:
  493. word_y = word_y.strip().strip("'").strip('"')
  494. if word_y != '':
  495. function_content = str(function_content.replace("\"", "'")).replace(word_y,
  496. f'<span class="similarity">{word_y.strip()}</span>')
  497. for word_x in keywords_x:
  498. word_x = word_x.strip().strip("'").strip('"')
  499. if word_x != '':
  500. dup_function_content = str(dup_function_content.replace("\"", "'")).replace(word_x,
  501. f'<span class="similarity">{word_x.strip()}</span>')
  502. # 保存每个维度对应的相似度到idc_project_check_detail
  503. mysql.sql_change_msg(
  504. """insert into idc_project_check_detail (dup_id, dimension, similarity, function_content, dup_function_content, create_time, update_time) value (%d, "%s", %f, "%s", "%s", "%s", "%s")"""
  505. % (dup_id, wdys2.get(x), similarity, escape_string(function_content),
  506. escape_string(dup_function_content), str(datetime.datetime.now())[:-7],
  507. str(datetime.datetime.now())[:-7]))
  508. # content = content.replace(gjcs, f'<span class="similarity">{gjcs.strip()}</span>')
  509. elif x == 'jsnr':
  510. similarity = cosin_similarity.CosineSimilarity(content_x, content_y)
  511. # 相似度 关键词
  512. similarity, keywords_x, keywords_y = similarity.main()
  513. similarity = similarity * 0
  514. # print("**************相似度: %.2f%%" % similarity, "关键词: %s" % keywords_y)
  515. # 相似度相加
  516. total_similarity += similarity
  517. # 去重
  518. keywords_y = list(set(keywords_y))
  519. # 去重
  520. keywords_x = list(set(keywords_x))
  521. # 关键词收集
  522. total_keywords[x] = keywords_y
  523. function_content = content_y
  524. dup_function_content = content_x
  525. for word_y in keywords_y:
  526. word_y = word_y.strip().strip("'").strip('"')
  527. if word_y != '':
  528. function_content = str(function_content.replace("\"", "'")).replace(word_y,
  529. f'<span class="similarity">{word_y.strip()}</span>')
  530. for word_x in keywords_x:
  531. word_x = word_x.strip().strip("'").strip('"')
  532. if word_x != '':
  533. dup_function_content = str(dup_function_content.replace("\"", "'")).replace(word_x,
  534. f'<span class="similarity">{word_x.strip()}</span>')
  535. # 保存每个维度对应的相似度到idc_project_check_detail
  536. mysql.sql_change_msg(
  537. """insert into idc_project_check_detail (dup_id, dimension, similarity, function_content, dup_function_content, create_time, update_time) value (%d, "%s", %f, "%s", "%s", "%s", "%s")"""
  538. % (dup_id, wdys2.get(x), similarity, escape_string(function_content),
  539. escape_string(dup_function_content), str(datetime.datetime.now())[:-7],
  540. str(datetime.datetime.now())[:-7]))
  541. else:
  542. similarity = cosin_similarity.CosineSimilarity(content_x, content_y)
  543. # 相似度 关键词
  544. similarity, keywords_x, keywords_y = similarity.main()
  545. similarity = similarity * (50 / dup_count)
  546. # print("**************相似度: %.2f%%" % similarity, "关键词: %s" % keywords_y)
  547. # 相似度相加
  548. total_similarity += similarity
  549. # 去重
  550. keywords_y = list(set(keywords_y))
  551. # 去重
  552. keywords_x = list(set(keywords_x))
  553. # 关键词收集
  554. total_keywords[x] = keywords_y
  555. function_content = content_y
  556. dup_function_content = content_x
  557. for word_y in keywords_y:
  558. word_y = word_y.strip().strip("'").strip('"')
  559. if word_y != '':
  560. function_content = str(function_content.replace("\"", "'")).replace(word_y,
  561. f'<span class="similarity">{word_y.strip()}</span>')
  562. for word_x in keywords_x:
  563. word_x = word_x.strip().strip("'").strip('"')
  564. if word_x != '':
  565. dup_function_content = str(dup_function_content.replace("\"", "'")).replace(word_x,
  566. f'<span class="similarity">{word_x.strip()}</span>')
  567. # 保存每个维度对应的相似度到idc_project_check_detail
  568. mysql.sql_change_msg(
  569. """insert into idc_project_check_detail (dup_id, dimension, similarity, function_content, dup_function_content, create_time, update_time) value (%d, "%s", %f, "%s", "%s", "%s", "%s")"""
  570. % (dup_id, wdys2.get(x), similarity, escape_string(function_content),
  571. escape_string(dup_function_content), str(datetime.datetime.now())[:-7],
  572. str(datetime.datetime.now())[:-7]))
  573. else:
  574. for x in list(xc.keys())[1:]:
  575. content_x = xc.get(x)
  576. content_y = str_dict.get(x)
  577. if content_x and content_y:
  578. if x == 'gnmk':
  579. # 匹配到历史数据,次数加1
  580. # dup_count += dup_file_test
  581. # 循环遍历每一个维度
  582. similarity = cosin_similarity.CosineSimilarity(content_x, content_y)
  583. # 相似度 关键词
  584. similarity, keywords_x, keywords_y = similarity.main()
  585. similarity = similarity * 50
  586. # 相似度相加
  587. total_similarity += similarity
  588. # 去重
  589. keywords_y = list(set(keywords_y))
  590. # 去重
  591. keywords_x = list(set(keywords_x))
  592. print("**************相似度: %.2f%%" % similarity, "关键词: %s" % keywords_y)
  593. # 关键词收集
  594. total_keywords[x] = keywords_y
  595. function_content = content_y
  596. dup_function_content = content_x
  597. for word_y in keywords_y:
  598. word_y = word_y.strip().strip("'").strip('"')
  599. if word_y != '':
  600. function_content = str(function_content.replace("\"", "'")).replace(word_y,
  601. f'<span class="similarity">{word_y.strip()}</span>')
  602. for word_x in keywords_x:
  603. word_x = word_x.strip().strip("'").strip('"')
  604. if word_x != '':
  605. dup_function_content = str(dup_function_content.replace("\"", "'")).replace(word_x,
  606. f'<span class="similarity">{word_x.strip()}</span>')
  607. # 保存每个维度对应的相似度到idc_project_check_detail
  608. mysql.sql_change_msg(
  609. """insert into idc_project_check_detail (dup_id, dimension, similarity, function_content, dup_function_content, create_time, update_time) value (%d, "%s", %f, "%s", "%s", "%s", "%s")"""
  610. % (dup_id, wdys2.get(x), similarity, escape_string(function_content),
  611. escape_string(dup_function_content), str(datetime.datetime.now())[:-7],
  612. str(datetime.datetime.now())[:-7]))
  613. # content = content.replace(gjcs, f'<span class="similarity">{gjcs.strip()}</span>')
  614. elif x == 'jsnr':
  615. similarity = cosin_similarity.CosineSimilarity(content_x, content_y)
  616. # 相似度 关键词
  617. similarity, keywords_x, keywords_y = similarity.main()
  618. similarity = similarity * 40
  619. # 相似度相加
  620. total_similarity += similarity
  621. # 去重
  622. keywords_y = list(set(keywords_y))
  623. # 去重
  624. keywords_x = list(set(keywords_x))
  625. print("**************相似度: %.2f%%" % similarity, "关键词: %s" % keywords_y)
  626. # 关键词收集
  627. total_keywords[x] = keywords_y
  628. function_content = content_y
  629. dup_function_content = content_x
  630. for word_y in keywords_y:
  631. word_y = word_y.strip().strip("'").strip('"')
  632. if word_y != '':
  633. function_content = str(function_content.replace("\"", "'")).replace(word_y,
  634. f'<span class="similarity">{word_y.strip()}</span>')
  635. for word_x in keywords_x:
  636. word_x = word_x.strip().strip("'").strip('"')
  637. if word_x != '':
  638. dup_function_content = str(dup_function_content.replace("\"", "'")).replace(word_x,
  639. f'<span class="similarity">{word_x.strip()}</span>')
  640. # 保存每个维度对应的相似度到idc_project_check_detail
  641. mysql.sql_change_msg(
  642. """insert into idc_project_check_detail (dup_id, dimension, similarity, function_content, dup_function_content, create_time, update_time) value (%d, "%s", %f, "%s", "%s", "%s", "%s")"""
  643. % (dup_id, wdys2.get(x), similarity, escape_string(function_content),
  644. escape_string(dup_function_content), str(datetime.datetime.now())[:-7],
  645. str(datetime.datetime.now())[:-7]))
  646. else:
  647. similarity = cosin_similarity.CosineSimilarity(content_x, content_y)
  648. # 相似度 关键词
  649. similarity, keywords_x, keywords_y = similarity.main()
  650. similarity = similarity * (10 / dup_count)
  651. # print("**************相似度: %.2f%%" % similarity, "关键词: %s" % keywords_y)
  652. # 相似度相加
  653. total_similarity += similarity
  654. # 去重
  655. keywords_y = list(set(keywords_y))
  656. # 去重
  657. keywords_x = list(set(keywords_x))
  658. # 关键词收集
  659. total_keywords[x] = keywords_y
  660. function_content = content_y
  661. dup_function_content = content_x
  662. for word_y in keywords_y:
  663. word_y = word_y.strip().strip("'").strip('"')
  664. if word_y != '':
  665. function_content = str(function_content.replace("\"", "'")).replace(word_y, f'<span class="similarity">{word_y.strip()}</span>')
  666. for word_x in keywords_x:
  667. word_x = word_x.strip().strip("'").strip('"')
  668. if word_x != '':
  669. dup_function_content = str(dup_function_content.replace("\"", "'")).replace(word_x, f'<span class="similarity">{word_x.strip()}</span>')
  670. # 保存每个维度对应的相似度到idc_project_check_detail
  671. mysql.sql_change_msg(
  672. """insert into idc_project_check_detail (dup_id, dimension, similarity, function_content, dup_function_content, create_time, update_time) value (%d, "%s", %f, "%s", "%s", "%s", "%s")"""
  673. % (dup_id, wdys2.get(x), similarity, escape_string(function_content),
  674. escape_string(dup_function_content), str(datetime.datetime.now())[:-7],
  675. str(datetime.datetime.now())[:-7]))
  676. mysql.sql_change_msg(
  677. """update idc_project_check set similarity=%f where dup_id=%d""" % (total_similarity, dup_id))
  678. if __name__ == "__main__":
  679. all_path = requests.get("http://127.0.0.1:19099/check/duplicates/%s" % 599).json()
  680. # print(all_path)
  681. # dict1 = {k:v for k, v in sorted(dict.items(), key= lambda item : item[1])}
  682. # print(dict1)
  683. data_list = []
  684. for ap in all_path.get("data"):
  685. # if os.path.exists(ap.get("file_path")):
  686. data_list.append((ap.get("project_id"), ap.get("file_path"), ap.get("project_name")))
  687. print(data_list)
  688. # data_list = [(11, r"C:\Users\HUAWEI\PycharmProjects\nlp\dup_check\0825-丽水系统查重维度1.xlsx", "水路运输综合监管系统建设项目.xls")]
  689. data_list = [(11, r"D:\ningda\dup_check2\dup_check\0825-丽水系统查重维度1.xlsx", "水路运输综合监管系统建设项目.xls")]
  690. project_check(data_list)