丽水查重代码
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

676 lines
38KB

  1. # coding=utf-8
  2. import sys
  3. import re
  4. import mysql_pool
  5. from pymysql.converters import escape_string
  6. import cosin_similarity
  7. import pandas as pd
  8. import datetime
  9. import requests
  # Display name of each review dimension -> short column name. The short
  # names are the column names used by the SQL statements in this file.
  wdys1 = {
      "项目名称": "xmmc",
      "现状问题": "xzwt",
      "系统基础": "xtjc",
      "项目目标": "xmmb",
      "预期绩效": "yqjx",
      "建设需求": "jsxq",
      "数据需求": "sjxq",
      "安全需求": "aqxq",
      "业务领域": "ywly",
      "核心业务": "hxyw",
      "业务需求": "ywxq",
      "业务协同": "ywxt",
      "建设层级": "jscj",
      "用户范围": "yhfw",
      "目标群体": "mbqt",
      "建设内容": "jsnr",
      "功能模块": "gnmk",
      "数据共享": "sjgx",
      "智能要素": "znys",
  }

  # Reverse lookup (column name -> display name). Derived from wdys1 so the
  # two tables cannot drift apart; key order is the same as the original literal.
  wdys2 = {column: label for label, column in wdys1.items()}

  # Column name -> display name for the per-module fields.
  gnmkys = {
      "gnmc": "功能名称",
      "gnms": "功能描述",
  }
  def getFlag():
      """Read the dimension weights from the second column of ``0825.xlsx``.

      Every non-empty text cell is expected to look like
      ``<dimension name>(<weight>%)``; both ASCII and full-width parentheses
      are accepted. Cells that do not match, or whose name is not a key of
      ``wdys1``, are skipped.

      Returns:
          dict mapping the short column name (a value of ``wdys1``) to the
          weight text found inside the parentheses, e.g. ``"10%"``.
      """
      data_dict = {}
      df = pd.read_excel("0825.xlsx")
      labels = pd.Series(df.values[:, 1]).dropna()
      # The previous pattern "(.*?)(.*?%)" used the parentheses as regex groups,
      # so the lazy first group always matched the empty string and no weight
      # was ever recorded. Match the literal parentheses instead.
      pattern = re.compile(r"^(.*?)[(（](.*?%)[)）]")
      for label in labels:
          if not isinstance(label, str):
              # Non-text cells cannot carry a "name(weight%)" label.
              continue
          match = pattern.search(label)
          if match is None:
              continue
          wdc = wdys1.get(match.group(1).strip())
          if wdc:
              data_dict[wdc] = match.group(2)
      return data_dict
  def gong_neng_mo_kuai(mysql, dl, data, er_title):
      """Store the project's functional modules and compare them with history.

      Steps, in order:
        1. Concatenate the column-3 text of every spreadsheet row whose
           column 1 equals a name in ``er_title`` and insert one
           ``idc_project_module`` row per name.
        2. Re-read the project's module rows, and for each of them insert an
           ``idc_project_module_check`` row per history module and delegate the
           scoring to ``check_module_info``.
        3. Extract keywords of the module's own name/description and insert
           them into ``user_module_keywords``.

      Args:
          mysql: database helper exposing ``sql_change_msg``,
              ``sql_select_many`` and ``cur.lastrowid``.
          dl: sequence whose item 0 is the project id and item 2 the project name.
          data: rows of the project spreadsheet (indexable sequences).
          er_title: collection of module names taken from the spreadsheet.
      """
      # Concatenate the third-level text of each module.
      module_text = {}
      for et in er_title:
          for d in data:
              if d[1] != et:
                  continue
              # An empty (NaN) cell used to break the string concatenation.
              piece = str(d[3]) if len(d) > 3 and pd.notnull(d[3]) else ""
              module_text[et] = module_text.get(et, "") + piece

      for name, text in module_text.items():
          # strftime instead of str(now)[:-7]: the slice cuts into the seconds
          # whenever the microsecond part happens to be zero.
          now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
          mysql.sql_change_msg(
              """insert into idc_project_module (project_id, check_duplicate_count, module_name, module_content, create_time, update_time, tag) value(%d, 1, "%s", "%s", "%s", "%s", "模块")""" % (
                  int(dl[0]), escape_string(str(name)), escape_string(text), now, now))

      module_rows = mysql.sql_select_many(
          """select project_module_id, module_name, module_content from idc_project_module where project_id=%d""" % int(
              dl[0]))
      # The history table is not modified below, so read it once for all modules.
      history_modules = mysql.sql_select_many("""select * from user_history_module_data""")

      for mil in module_rows:
          module = {
              "project_module_id": mil.get("project_module_id"),
              "gnmc": mil.get("module_name"),
              "gnms": mil.get("module_content"),
          }
          for gc in history_modules or []:
              now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
              sql = (
                  """insert into idc_project_module_check (project_module_id, module_name, project_name, company_name, create_time, update_time) value(%d, "%s", "%s", "%s", "%s", "%s")"""
                  % (module.get("project_module_id"), escape_string(gc.get("gnmc")),
                     escape_string(gc.get("xmmc")), "", now, now))
              print(sql)
              mysql.sql_change_msg(sql)
              dup_module_id = mysql.cur.lastrowid
              check_module_info(mysql, gc, dl, module, dup_module_id)

          # Comparing a text with itself is only used to obtain its keywords.
          gnmk_gjc = {}
          for field in ("gnmc", "gnms"):
              content = module.get(field)
              if content:
                  _, _, keywords = cosin_similarity.CosineSimilarity(content, content).main()
                  # De-duplicate.
                  gnmk_gjc[field] = list(set(keywords))
          name_keywords = gnmk_gjc.get("gnmc")
          text_keywords = gnmk_gjc.get("gnms")
          mysql.sql_change_msg("""insert into user_module_keywords (xmmc, gnmc, gnms) value("%s", "%s", "%s")""" % (
              escape_string(str(dl[2])),
              escape_string(str(name_keywords)[1:-1]) if name_keywords else None,
              escape_string(str(text_keywords)[1:-1]) if text_keywords else None))
  def check_module_info(mysql, gc, dl, pro, dup_module_id):
      """Score one project module against one history module.

      For the module name (``gnmc``, weight 1) and the module description
      (``gnms``, weight 99) the weighted similarity is written to
      ``idc_project_module_check_detail``; the sum of both is then stored on
      the ``idc_project_module_check`` row identified by ``dup_module_id``.
      A field is skipped when it is empty on either side.

      Args:
          mysql: database helper exposing ``sql_change_msg``.
          gc: history module record (mapping with ``gnmc`` / ``gnms``).
          dl: sequence whose item 2 is the project name.
          pro: current project module (mapping with ``gnmc`` / ``gnms``).
          dup_module_id: id of the ``idc_project_module_check`` row to update.
      """
      # (field, weight, dimension label written to the detail row)
      dimensions = (
          ("gnmc", 1, "功能名称"),
          ("gnms", 99, "功能模块描述"),
      )
      total_similarity = 0
      for field, weight, label in dimensions:
          content_x = gc.get(field)
          content_y = pro.get(field)
          if not (content_x and content_y):
              continue
          similarity, _, _ = cosin_similarity.CosineSimilarity(content_x, content_y).main()
          similarity = similarity * weight
          total_similarity += similarity
          # strftime instead of str(now)[:-7]: the slice cuts into the seconds
          # whenever the microsecond part happens to be zero.
          now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
          mysql.sql_change_msg(
              """insert into idc_project_module_check_detail (dup_module_id, project_name, module_content, dup_module_content, similarity, dimension, create_time, update_time) value (%d, "%s", "%s", "%s", %f, "%s", "%s", "%s")"""
              % (dup_module_id, escape_string(str(dl[2])), escape_string(content_y),
                 escape_string(content_x), similarity, label, now, now))
      mysql.sql_change_msg("""update idc_project_module_check set similarity=%f where dup_module_id=%d""" % (
          total_similarity, dup_module_id))
  def project_check(data_list):
      """Run the duplicate check for every project in ``data_list``.

      For each project the spreadsheet is read, the text of every dimension is
      collected into a dict keyed by short column name, and then:
        * the texts are inserted into ``user_data``;
        * every row of ``user_history_data`` is compared through
          ``check_project_info``;
        * the keywords of each dimension are inserted into ``user_keyword``;
        * the ``idc_project`` row is updated with status flags and counts;
        * the functional modules are handled by ``gong_neng_mo_kuai``.

      Args:
          data_list: iterable of sequences ``(project_id, file_path, ...)``;
              ``file_path`` is passed to ``pandas.read_excel``.
      """
      # Same order as the column lists of the two insert statements below.
      columns = ("xmmc", "xzwt", "xtjc", "xmmb", "yqjx", "jsxq", "sjxq", "aqxq", "ywly", "hxyw",
                 "ywxq", "ywxt", "jscj", "yhfw", "mbqt", "jsnr", "gnmk", "sjgx", "znys")

      mysql = mysql_pool.ConnMysql()
      xmnr_count = len(mysql.sql_select_many("""select * from user_history_data"""))
      gnmk_count = len(mysql.sql_select_many("""select * from user_history_module_data"""))
      # NOTE(review): the weights returned by getFlag() are not used anywhere
      # below; the call is kept only because the original made it.
      getFlag()

      for dl in data_list:
          print(dl, dl[1])
          data = pd.read_excel(dl[1]).values

          # Collect the text of every dimension. A row with a value in column 0
          # starts a new dimension; rows without one continue the current one.
          str_dict = {}
          gnmk_str = []
          title = ""
          er_title = set()
          for d in data:
              cells = [c for c in d[1:] if pd.notnull(c)]
              row_text = "".join(str(c) for c in cells)
              starts_dimension = pd.notnull(d[0])
              if starts_dimension:
                  title = d[0]
              if title == "功能模块":
                  er_title.add(d[1])
              key = wdys1.get(title)
              if starts_dimension:
                  if title == "功能模块":
                      gnmk_str.extend(c for c in cells if c != '功能描述')
                  str_dict[key] = row_text
              else:
                  # Append only this row's text. The previous code kept adding to
                  # the running buffer and appended the whole buffer again, which
                  # duplicated earlier rows; it also failed with None + str when
                  # the sheet started with a row that has no title.
                  str_dict[key] = (str_dict.get(key) or "") + row_text
          str_dict['gnmk'] = ",".join(str(c) for c in gnmk_str)

          # Empty dimensions are stored as the text "None", as before.
          row_values = [dl[0]] + [str_dict.get(c) if str_dict.get(c) else None for c in columns[1:]]
          mysql.sql_change_msg(
              """insert into user_data (xmmc, xzwt, xtjc, xmmb, yqjx, jsxq, sjxq, aqxq, ywly, hxyw, ywxq, ywxt, jscj, yhfw, mbqt, jsnr, gnmk, sjgx, znys) value ("%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s")"""
              % tuple(escape_string(str(v)) for v in row_values))

          # Compare against every history project.
          xmnr_copy1 = mysql.sql_select_many("""select * from user_history_data""")
          for xc in xmnr_copy1 or []:
              check_project_info(mysql, dl, xc, str_dict)

          # Comparing a text with itself is only used to obtain its keywords.
          project_gjc = {}
          for w in columns:
              content = str_dict.get(w)
              if content:
                  _, _, keywords = cosin_similarity.CosineSimilarity(content, content).main()
                  # De-duplicate.
                  project_gjc[w] = list(set(keywords))
          keyword_values = [dl[0]] + [
              str(project_gjc.get(c))[1:-1] if project_gjc.get(c) else None for c in columns[1:]]
          mysql.sql_change_msg(
              """insert into user_keyword (xmmc, xzwt, xtjc, xmmb, yqjx, jsxq, sjxq, aqxq, ywly, hxyw, ywxq, ywxt, jscj, yhfw, mbqt, jsnr, gnmk, sjgx, znys) value ("%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s")"""
              % tuple(escape_string(str(v)) for v in keyword_values))

          mysql.sql_change_msg(
              """update idc_project set dup_status=3, one_vote_veto_status=1, self_check_status=1, history_project_count=%d ,module_count=%d where project_id=%d""" % (
                  xmnr_count, gnmk_count, dl[0]))
          gong_neng_mo_kuai(mysql, dl, data, er_title)
  def _is_blank(value, none_text_is_blank=False):
      """Return True when ``value`` is None or whitespace (optionally also the text 'None')."""
      if value is None:
          return True
      if none_text_is_blank and value == 'None':
          return True
      return str(value).strip() == ''


  def _highlight_keywords(text, keywords):
      """Wrap every keyword occurrence in ``text`` in a ``similarity`` span.

      All keywords are substituted in one regex pass (longest first), so text
      inside an already inserted ``<span ...>`` is never matched again and the
      result does not depend on the iteration order of the keyword set. As
      before, double quotes in ``text`` become single quotes, but only when
      there is at least one non-empty keyword.
      """
      words = {w.strip().strip("'").strip('"') for w in keywords}
      words.discard('')
      if not words:
          return text
      text = text.replace("\"", "'")
      ordered = sorted(words, key=lambda w: (-len(w), w))
      pattern = re.compile("|".join(re.escape(w) for w in ordered))
      return pattern.sub(lambda m: f'<span class="similarity">{m.group(0).strip()}</span>', text)


  def _record_dimension(mysql, dup_id, x, content_x, content_y, weight, show_score=False, show_words=False):
      """Score one dimension, insert its ``idc_project_check_detail`` row, return the weighted score."""
      similarity, keywords_x, keywords_y = cosin_similarity.CosineSimilarity(content_x, content_y).main()
      similarity = similarity * weight
      # De-duplicate.
      keywords_y = list(set(keywords_y))
      keywords_x = list(set(keywords_x))
      if show_score:
          print("**************相似度: %.2f%%" % similarity, "关键词: %s" % keywords_y)
      if show_words:
          for word_y in keywords_y:
              word_y = word_y.strip().strip("'").strip('"')
              print(f'word_y = {word_y}')
      function_content = _highlight_keywords(content_y, keywords_y)
      dup_function_content = _highlight_keywords(content_x, keywords_x)
      # strftime instead of str(now)[:-7]: the slice cuts into the seconds
      # whenever the microsecond part happens to be zero.
      now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
      mysql.sql_change_msg(
          """insert into idc_project_check_detail (dup_id, dimension, similarity, function_content, dup_function_content, create_time, update_time) value (%d, "%s", %f, "%s", "%s", "%s", "%s")"""
          % (dup_id, wdys2.get(x), similarity, escape_string(function_content),
             escape_string(dup_function_content), now, now))
      return similarity


  def check_project_info(mysql, dl, xc, str_dict):
      """Compare the current project with one history project and store the result.

      An ``idc_project_check`` row is inserted first. Every column of ``xc``
      except the first that has text on both sides is then scored and written
      to ``idc_project_check_detail``; the weighted sum is finally stored on
      the ``idc_project_check`` row.

      Weights (functional modules ``gnmk`` / construction content ``jsnr`` /
      all remaining dimensions together, split evenly):
        * ``gnmk`` blank on both sides and ``jsnr`` present: 0 / 40 / 60
        * ``jsnr`` blank on both sides and ``gnmk`` present: 50 / 0 / 50
        * otherwise: 50 / 40 / 10

      Args:
          mysql: database helper exposing ``sql_change_msg`` and ``cur.lastrowid``.
          dl: sequence whose item 0 is the project id and item 1 the file path.
          xc: history project record (mapping of column name to text).
          str_dict: current project texts keyed by the same column names.
      """
      print(f'xmmc is {xc.get("xmmc")}')
      now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
      mysql.sql_change_msg(
          """insert into idc_project_check (project_id, dup_project_name, file_path, company_name, create_year, project_tag, project_range_tag, project_area, create_time, update_time) value ("%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s")"""
          % (dl[0], escape_string(xc.get("xmmc")), escape_string(dl[1]), "", "", "需求相似、业务相似", "历史项目", "",
             now, now))
      dup_id = mysql.cur.lastrowid

      # Dimensions that have text on both sides (the first column is skipped).
      shared = []
      for x in list(xc.keys())[1:]:
          content_x = xc.get(x)
          content_y = str_dict.get(x)
          if content_x and content_y:
              shared.append((x, content_x, content_y))
      # Number of ordinary dimensions that share the remaining weight.
      dup_count = sum(1 for x, _, _ in shared if x not in ('gnmk', 'jsnr'))

      # .get() instead of [] so a dimension missing from either side counts as
      # blank instead of raising KeyError.
      hist_gnmk, hist_jsnr = xc.get('gnmk'), xc.get('jsnr')
      cur_gnmk, cur_jsnr = str_dict.get('gnmk'), str_dict.get('jsnr')
      gnmk_blank = _is_blank(hist_gnmk, True) and _is_blank(cur_gnmk)
      jsnr_blank = _is_blank(hist_jsnr, True) and _is_blank(cur_jsnr)
      jsnr_given = hist_jsnr is not None and hist_jsnr != 'None' and not _is_blank(cur_jsnr)
      gnmk_given = hist_gnmk is not None and hist_gnmk != 'None' and not _is_blank(cur_gnmk)

      if gnmk_blank and jsnr_given:
          gnmk_weight, jsnr_weight, rest_weight, verbose = 0, 40, 60, False
      elif jsnr_blank and gnmk_given:
          gnmk_weight, jsnr_weight, rest_weight, verbose = 50, 0, 50, False
      else:
          # Only this branch printed debug output in the original.
          gnmk_weight, jsnr_weight, rest_weight, verbose = 50, 40, 10, True

      total_similarity = 0
      for x, content_x, content_y in shared:
          if x == 'gnmk':
              total_similarity += _record_dimension(
                  mysql, dup_id, x, content_x, content_y, gnmk_weight, show_score=verbose)
          elif x == 'jsnr':
              total_similarity += _record_dimension(
                  mysql, dup_id, x, content_x, content_y, jsnr_weight, show_score=verbose)
          else:
              # dup_count >= 1 here because this dimension was counted above.
              total_similarity += _record_dimension(
                  mysql, dup_id, x, content_x, content_y, (rest_weight / dup_count), show_words=verbose)

      # Store the summed similarity on the idc_project_check row.
      mysql.sql_change_msg(
          """update idc_project_check set similarity=%f where dup_id=%d""" % (total_similarity, dup_id))
  if __name__ == "__main__":
      # Ask the local service for the projects to check. A timeout is set so a
      # dead service cannot block the script forever.
      all_path = requests.get("http://127.0.0.1:19099/check/duplicates/%s" % 599, timeout=30).json()
      # A response without a "data" list yields an empty work list instead of
      # a TypeError.
      data_list = [
          (ap.get("project_id"), ap.get("file_path"), ap.get("project_name"))
          for ap in all_path.get("data") or []
      ]
      print(data_list)
      # NOTE(review): the list fetched above is replaced by a hard-coded local
      # sample, so only this one workbook is checked. Looks like a debugging
      # leftover; confirm before removing.
      data_list = [(11, r"D:\ningda\dup_check2\dup_check\0825-丽水系统查重维度1.xlsx", "水路运输综合监管系统建设项目.xls")]
      project_check(data_list)