You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

578 lines
37KB

  1. # coding=utf-8
  2. import sys
  3. import re
  4. import mysql_pool
  5. from pymysql.converters import escape_string
  6. import cosin_similarity
  7. import pandas as pd
  8. import datetime
  9. import requests
  10. import os
  11. import pymysql
  12. wdys1 = {
  13. "项目名称": "xmmc",
  14. "现状问题": "xzwt",
  15. "系统基础": "xtjc",
  16. "项目目标": "xmmb",
  17. "预期绩效": "yqjx",
  18. "建设需求": "jsxq",
  19. "数据需求": "sjxq",
  20. "安全需求": "aqxq",
  21. "业务领域": "ywly",
  22. "核心业务": "hxyw",
  23. "业务需求": "ywxq",
  24. "业务协同": "ywxt",
  25. "建设层级": "jscj",
  26. "用户范围": "yhfw",
  27. "目标群体": "mbqt",
  28. "建设内容": "jsnr",
  29. "功能模块": "gnmk",
  30. "数据共享": "sjgx",
  31. "智能要素": "znys"
  32. }
  33. wdys2 = {
  34. "xmmc": "项目名称",
  35. "xzwt": "现状问题",
  36. "xtjc": "系统基础",
  37. "xmmb": "项目目标",
  38. "yqjx": "预期绩效",
  39. "jsxq": "建设需求",
  40. "sjxq": "数据需求",
  41. "aqxq": "安全需求",
  42. "ywly": "业务领域",
  43. "hxyw": "核心业务",
  44. "ywxq": "业务需求",
  45. "ywxt": "业务协同",
  46. "jscj": "建设层级",
  47. "yhfw": "用户范围",
  48. "mbqt": "目标群体",
  49. "jsnr": "建设内容",
  50. "gnmk": "功能模块",
  51. "sjgx": "数据共享",
  52. "znys": "智能要素"
  53. }
  54. gnmkys = {
  55. "gnmc": "功能名称",
  56. "gnms": "功能描述"
  57. }
  58. def getFlag():
  59. data_dict = {}
  60. df = pd.read_excel("0825-丽水系统查重维度.xlsx")
  61. data = df.values
  62. data = list(pd.Series(data[:, 1]).dropna())
  63. for d in data:
  64. try:
  65. wd = re.search("(.*?)(.*?%)", d).group(1).strip()
  66. wdc = wdys1.get(wd)
  67. if wdc:
  68. qz = re.search(".*?((.*?%))", d).group(1)
  69. data_dict[wdc] = qz
  70. except:
  71. pass
  72. return data_dict
  73. def gong_neng_mo_kuai(mysql, dl, data, er_title):
  74. # 将excel文件中的所有第三维度内容进行拼接
  75. str_dict = {}
  76. for et in er_title:
  77. for d in data:
  78. if d[1] == et:
  79. if str_dict.get(et):
  80. str_dict[et] = str_dict.get(et) + d[3]
  81. else:
  82. str_dict[et] = d[3]
  83. for k, v in str_dict.items():
  84. mysql.sql_change_msg(
  85. """insert into idc_project_module (project_id, check_duplicate_count, module_name, module_content, create_time, update_time, tag) value(%d, dup_file_test, "%s", "%s", "%s", "%s", "模块")""" % (
  86. int(dl[0]), k, v, str(datetime.datetime.now())[:-7], str(datetime.datetime.now())[:-7]))
  87. module_id_list = mysql.sql_select_many(
  88. """select project_module_id, module_name, module_content from idc_project_module where project_id=%d""" % dl[
  89. 0])
  90. data_list = []
  91. for mil in module_id_list:
  92. data_dict = {}
  93. data_dict["project_module_id"] = mil.get("project_module_id")
  94. data_dict["gnmc"] = mil.get("module_name")
  95. data_dict["gnms"] = mil.get("module_content")
  96. data_list.append(data_dict)
  97. # print(data_list)
  98. for i in data_list:
  99. gnmk_copy1 = mysql.sql_select_many("""select * from user_history_module_data""")
  100. if gnmk_copy1:
  101. for gc in gnmk_copy1:
  102. total_similarity1 = 0
  103. total_keywords1 = []
  104. total_similarity2 = 0
  105. total_keywords2 = []
  106. mysql.sql_change_msg(
  107. """insert into idc_project_module_check (project_module_id, module_name, project_name, company_name, create_time, update_time) value(%d, "%s", "%s", "%s", "%s", "%s")"""
  108. % (
  109. i.get("project_module_id"), gc.get("gnmc"), gc.get("xmmc"), "",
  110. str(datetime.datetime.now())[:-7],
  111. str(datetime.datetime.now())[:-7]))
  112. dup_module_id = mysql.cur.lastrowid
  113. for j in ["gnmc", "gnms"]:
  114. # 循环遍历每一个模块名称
  115. content_x = gc.get(j)
  116. content_y = i.get(j)
  117. if content_x and content_y:
  118. if j == "gnmc":
  119. similarity = cosin_similarity.CosineSimilarity(content_x, content_y)
  120. # 相似度 关键词
  121. similarity, keyword_x, keywords = similarity.main()
  122. similarity = similarity * 1
  123. total_keywords1 += keywords
  124. #print("######################相似度: %.2f%%" % similarity, "关键词: %s" % keywords)
  125. # 相似度相加
  126. total_similarity1 += similarity
  127. mysql.sql_change_msg(
  128. """insert into idc_project_module_check_detail (dup_module_id, project_name, module_content, dup_module_content, similarity, dimension, create_time, update_time) value (%d, "%s", "%s", "%s", %f, "%s", "%s", "%s")"""
  129. % (dup_module_id, dl[2], escape_string(content_y), escape_string(content_x), similarity,
  130. "功能名称",
  131. str(datetime.datetime.now())[:-7], str(datetime.datetime.now())[:-7]))
  132. else:
  133. similarity = cosin_similarity.CosineSimilarity(content_x, content_y)
  134. # 相似度 关键词
  135. similarity, keyword_x, keywords = similarity.main()
  136. similarity = similarity * 99
  137. total_keywords2 += keywords
  138. #print("######################相似度: %.2f%%" % similarity, "关键词: %s" % keywords)
  139. # 相似度相加
  140. total_similarity2 += similarity
  141. mysql.sql_change_msg(
  142. """insert into idc_project_module_check_detail (dup_module_id, project_name, module_content, dup_module_content, similarity, dimension, create_time, update_time) value (%d, "%s", "%s", "%s", %f, "%s", "%s", "%s")"""
  143. % (dup_module_id, dl[2], escape_string(content_y), escape_string(content_x), similarity,
  144. "功能模块描述",
  145. str(datetime.datetime.now())[:-7], str(datetime.datetime.now())[:-7]))
  146. mysql.sql_change_msg("""update idc_project_module_check set similarity=%f where dup_module_id=%d""" % (
  147. total_similarity1 + total_similarity2, dup_module_id))
  148. gnmk_gjc = {}
  149. for a in ["gnmc", "gnms"]:
  150. if i.get(a):
  151. content_x = i.get(a)
  152. content_y = i.get(a)
  153. if a == "gnmc":
  154. similarity = cosin_similarity.CosineSimilarity(content_x, content_y)
  155. # 相似度 关键词
  156. similarity, keyword_x, keywords = similarity.main()
  157. gnmk_gjc[a] = keywords
  158. else:
  159. similarity = cosin_similarity.CosineSimilarity(content_x, content_y)
  160. # 相似度 关键词
  161. similarity, keyword_x, keywords = similarity.main()
  162. gnmk_gjc[a] = keywords
  163. mysql.sql_change_msg("""insert into user_module_keywords (xmmc, gnmc, gnms) value("%s", "%s", "%s")""" % (
  164. dl[2], str(gnmk_gjc.get("gnmc"))[1:-1] if gnmk_gjc.get("gnmc") else None,
  165. str(gnmk_gjc.get("gnms"))[1:-1] if gnmk_gjc.get("gnms") else None))
  166. def project_check(data_list):
  167. mysql = mysql_pool.ConnMysql()
  168. # mysql.sql_select_many("""select * from mkgjc""")
  169. # 读取维度和权重
  170. # xmnr_count = len(mysql.sql_select_many("""select * from xmnr_copy1"""))
  171. # gnmk_count = len(mysql.sql_select_many("""select * from gnmk_copy1"""))
  172. xmnr_count = len(mysql.sql_select_many("""select * from user_history_data"""))
  173. gnmk_count = len(mysql.sql_select_many("""select * from user_history_module_data"""))
  174. get_data_dict = getFlag()
  175. # 遍历excel存储路径
  176. for dl in data_list:
  177. # path = "0825-丽水系统查重维度1.xlsx"
  178. # 读取路径下的excel
  179. print(dl,dl[1])
  180. df = pd.read_excel(dl[1])
  181. data = df.values
  182. # 将excel文件中的所有维度内容进行拼接
  183. join_str = ""
  184. str_dict = {}
  185. title = ""
  186. er_title = set()
  187. for d in data:
  188. if pd.notnull(d[0]):
  189. title = d[0]
  190. if title == "功能模块":
  191. er_title.add(d[1])
  192. join_str = ""
  193. for i in d[1:]:
  194. if pd.notnull(i):
  195. join_str += i
  196. str_dict[wdys1.get(title)] = join_str
  197. else:
  198. if title == "功能模块":
  199. er_title.add(d[1])
  200. for i in d[1:]:
  201. if pd.notnull(i):
  202. join_str += i
  203. str_dict[wdys1.get(title)] = str_dict.get(wdys1.get(title)) + join_str
  204. print(str_dict)
  205. mysql.sql_change_msg(
  206. """insert into user_data (xmmc, xzwt, xtjc, xmmb, yqjx, jsxq, sjxq, aqxq, ywly, hxyw, ywxq, ywxt, jscj, yhfw, mbqt, jsnr, gnmk, sjgx, znys) value ("%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s")"""
  207. % (dl[0], str_dict.get("xzwt") if str_dict.get("xzwt") else None,
  208. str_dict.get("xtjc") if str_dict.get("xtjc") else None,
  209. str_dict.get("xmmb") if str_dict.get("xmmb") else None,
  210. str_dict.get("yqjx") if str_dict.get("yqjx") else None,
  211. str_dict.get("jsxq") if str_dict.get("jsxq") else None,
  212. str_dict.get("sjxq") if str_dict.get("sjxq") else None,
  213. str_dict.get("aqxq") if str_dict.get("aqxq") else None,
  214. str_dict.get("ywly") if str_dict.get("ywly") else None,
  215. str_dict.get("hxyw") if str_dict.get("hxyw") else None,
  216. str_dict.get("ywxq") if str_dict.get("ywxq") else None,
  217. str_dict.get("ywxt") if str_dict.get("ywxt") else None,
  218. str_dict.get("jscj") if str_dict.get("jscj") else None,
  219. str_dict.get("yhfw") if str_dict.get("yhfw") else None,
  220. str_dict.get("mbqt") if str_dict.get("mbqt") else None,
  221. str_dict.get("jsnr") if str_dict.get("jsnr") else None,
  222. str_dict.get("gnmk") if str_dict.get("gnmk") else None,
  223. str_dict.get("sjgx") if str_dict.get("sjgx") else None,
  224. str_dict.get("znys") if str_dict.get("znys") else None))
  225. # 或取所有的xmnr_copy1
  226. xmnr_copy1 = mysql.sql_select_many("""select * from user_history_data""")
  227. # 对比xmnr_copy1和xmnr维度是否都有
  228. if xmnr_copy1:
  229. for xc in xmnr_copy1:
  230. total_keywords = {}
  231. total_similarity = 0
  232. dup_count = 0
  233. # 保存相加后的相似度到idc_project_check
  234. mysql.sql_change_msg(
  235. """insert into idc_project_check (project_id, dup_project_name, file_path, company_name, create_year, project_tag, project_range_tag, project_area, create_time, update_time) value ("%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s")"""
  236. % (dl[0], xc.get("xmmc"), dl[1], "", "", "需求相似、业务相似", "历史项目", "",
  237. str(datetime.datetime.now())[:-7], str(datetime.datetime.now())[:-7]))
  238. dup_id = mysql.cur.lastrowid
  239. for x in list(xc.keys())[1:]:
  240. content_x = xc.get(x)
  241. content_y = str_dict.get(x)
  242. if content_x and content_y:
  243. if x == 'gnmk':
  244. continue
  245. elif x == 'jsnr':
  246. continue
  247. else:
  248. dup_count += 1
  249. if xc.get('gnmk')==' ' and str_dict.get('gnmk')==' ':
  250. for x in list(xc.keys())[1:]:
  251. content_x = xc.get(x)
  252. content_y = str_dict.get(x)
  253. if content_x and content_y:
  254. if x == 'gnmk':
  255. # 匹配到历史数据,次数加1
  256. # dup_count += dup_file_test
  257. # 循环遍历每一个维度
  258. similarity = cosin_similarity.CosineSimilarity(content_x, content_y)
  259. # 相似度 关键词
  260. similarity, keywords_x, keywords_y = similarity.main()
  261. similarity = similarity * 0
  262. #print("**************相似度: %.2f%%" % similarity, "关键词: %s" % keywords_y)
  263. # 相似度相加
  264. total_similarity += similarity
  265. # 关键词收集
  266. total_keywords[x] = keywords_y
  267. function_content = content_y
  268. dup_function_content = content_x
  269. for word_y in keywords_y:
  270. word_y = word_y.strip().strip("'").strip('"')
  271. function_content = str(function_content.replace("\"", "'")).replace(word_y,
  272. f'<span class="similarity">{word_y.strip()}</span>')
  273. for word_x in keywords_x:
  274. word_x = word_x.strip().strip("'").strip('"')
  275. dup_function_content = str(dup_function_content.replace("\"", "'")).replace(word_x,
  276. f'<span class="similarity">{word_x.strip()}</span>')
  277. # 保存每个维度对应的相似度到idc_project_check_detail
  278. mysql.sql_change_msg(
  279. """insert into idc_project_check_detail (dup_id, dimension, similarity, function_content, dup_function_content, create_time, update_time) value (%d, "%s", %f, "%s", "%s", "%s", "%s")"""
  280. % (dup_id, wdys2.get(x), similarity, escape_string(function_content),
  281. escape_string(dup_function_content), str(datetime.datetime.now())[:-7],
  282. str(datetime.datetime.now())[:-7]))
  283. # content = content.replace(gjcs, f'<span class="similarity">{gjcs.strip()}</span>')
  284. elif x == 'jsnr':
  285. similarity = cosin_similarity.CosineSimilarity(content_x, content_y)
  286. # 相似度 关键词
  287. similarity, keywords_x, keywords_y = similarity.main()
  288. similarity = similarity * 40
  289. #print("**************相似度: %.2f%%" % similarity, "关键词: %s" % keywords_y)
  290. # 相似度相加
  291. total_similarity += similarity
  292. # 关键词收集
  293. total_keywords[x] = keywords_y
  294. function_content = content_y
  295. dup_function_content = content_x
  296. for word_y in keywords_y:
  297. word_y = word_y.strip().strip("'").strip('"')
  298. function_content = str(function_content.replace("\"", "'")).replace(word_y,
  299. f'<span class="similarity">{word_y.strip()}</span>')
  300. for word_x in keywords_x:
  301. word_x = word_x.strip().strip("'").strip('"')
  302. dup_function_content = str(dup_function_content.replace("\"", "'")).replace(word_x,
  303. f'<span class="similarity">{word_x.strip()}</span>')
  304. # 保存每个维度对应的相似度到idc_project_check_detail
  305. mysql.sql_change_msg(
  306. """insert into idc_project_check_detail (dup_id, dimension, similarity, function_content, dup_function_content, create_time, update_time) value (%d, "%s", %f, "%s", "%s", "%s", "%s")"""
  307. % (dup_id, wdys2.get(x), similarity, escape_string(function_content),
  308. escape_string(dup_function_content), str(datetime.datetime.now())[:-7],
  309. str(datetime.datetime.now())[:-7]))
  310. else:
  311. similarity = cosin_similarity.CosineSimilarity(content_x, content_y)
  312. # 相似度 关键词
  313. similarity, keywords_x, keywords_y = similarity.main()
  314. similarity = similarity * (60 / dup_count)
  315. #print("**************相似度: %.2f%%" % similarity, "关键词: %s" % keywords_y)
  316. # 相似度相加
  317. total_similarity += similarity
  318. # 关键词收集
  319. total_keywords[x] = keywords_y
  320. function_content = content_y
  321. dup_function_content = content_x
  322. for word_y in keywords_y:
  323. word_y = word_y.strip().strip("'").strip('"')
  324. function_content = str(function_content.replace("\"", "'")).replace(word_y,
  325. f'<span class="similarity">{word_y.strip()}</span>')
  326. for word_x in keywords_x:
  327. word_x = word_x.strip().strip("'").strip('"')
  328. dup_function_content = str(dup_function_content.replace("\"", "'")).replace(word_x,
  329. f'<span class="similarity">{word_x.strip()}</span>')
  330. # 保存每个维度对应的相似度到idc_project_check_detail
  331. mysql.sql_change_msg(
  332. """insert into idc_project_check_detail (dup_id, dimension, similarity, function_content, dup_function_content, create_time, update_time) value (%d, "%s", %f, "%s", "%s", "%s", "%s")"""
  333. % (dup_id, wdys2.get(x), similarity, escape_string(function_content),
  334. escape_string(dup_function_content), str(datetime.datetime.now())[:-7],
  335. str(datetime.datetime.now())[:-7]))
  336. elif xc['jsnr'] == ' ' and str_dict['jsnr'] == ' ':
  337. for x in list(xc.keys())[1:]:
  338. content_x = xc.get(x)
  339. content_y = str_dict.get(x)
  340. if content_x and content_y:
  341. if x == 'gnmk':
  342. # 匹配到历史数据,次数加1
  343. # dup_count += dup_file_test
  344. # 循环遍历每一个维度
  345. similarity = cosin_similarity.CosineSimilarity(content_x, content_y)
  346. # 相似度 关键词
  347. similarity, keywords_x, keywords_y = similarity.main()
  348. similarity = similarity * 50
  349. #print("**************相似度: %.2f%%" % similarity, "关键词: %s" % keywords_y)
  350. # 相似度相加
  351. total_similarity += similarity
  352. # 关键词收集
  353. total_keywords[x] = keywords_y
  354. function_content = content_y
  355. dup_function_content = content_x
  356. for word_y in keywords_y:
  357. word_y = word_y.strip().strip("'").strip('"')
  358. function_content = str(function_content.replace("\"", "'")).replace(word_y,
  359. f'<span class="similarity">{word_y.strip()}</span>')
  360. for word_x in keywords_x:
  361. word_x = word_x.strip().strip("'").strip('"')
  362. dup_function_content = str(dup_function_content.replace("\"", "'")).replace(word_x,
  363. f'<span class="similarity">{word_x.strip()}</span>')
  364. # 保存每个维度对应的相似度到idc_project_check_detail
  365. mysql.sql_change_msg(
  366. """insert into idc_project_check_detail (dup_id, dimension, similarity, function_content, dup_function_content, create_time, update_time) value (%d, "%s", %f, "%s", "%s", "%s", "%s")"""
  367. % (dup_id, wdys2.get(x), similarity, escape_string(function_content),
  368. escape_string(dup_function_content), str(datetime.datetime.now())[:-7],
  369. str(datetime.datetime.now())[:-7]))
  370. # content = content.replace(gjcs, f'<span class="similarity">{gjcs.strip()}</span>')
  371. elif x == 'jsnr':
  372. similarity = cosin_similarity.CosineSimilarity(content_x, content_y)
  373. # 相似度 关键词
  374. similarity, keywords_x, keywords_y = similarity.main()
  375. similarity = similarity * 0
  376. #print("**************相似度: %.2f%%" % similarity, "关键词: %s" % keywords_y)
  377. # 相似度相加
  378. total_similarity += similarity
  379. # 关键词收集
  380. total_keywords[x] = keywords_y
  381. function_content = content_y
  382. dup_function_content = content_x
  383. for word_y in keywords_y:
  384. word_y = word_y.strip().strip("'").strip('"')
  385. function_content = str(function_content.replace("\"", "'")).replace(word_y,
  386. f'<span class="similarity">{word_y.strip()}</span>')
  387. for word_x in keywords_x:
  388. word_x = word_x.strip().strip("'").strip('"')
  389. dup_function_content = str(dup_function_content.replace("\"", "'")).replace(word_x,
  390. f'<span class="similarity">{word_x.strip()}</span>')
  391. # 保存每个维度对应的相似度到idc_project_check_detail
  392. mysql.sql_change_msg(
  393. """insert into idc_project_check_detail (dup_id, dimension, similarity, function_content, dup_function_content, create_time, update_time) value (%d, "%s", %f, "%s", "%s", "%s", "%s")"""
  394. % (dup_id, wdys2.get(x), similarity, escape_string(function_content),
  395. escape_string(dup_function_content), str(datetime.datetime.now())[:-7],
  396. str(datetime.datetime.now())[:-7]))
  397. else:
  398. similarity = cosin_similarity.CosineSimilarity(content_x, content_y)
  399. # 相似度 关键词
  400. similarity, keywords_x, keywords_y = similarity.main()
  401. similarity = similarity * (50 / dup_count)
  402. #print("**************相似度: %.2f%%" % similarity, "关键词: %s" % keywords_y)
  403. # 相似度相加
  404. total_similarity += similarity
  405. # 关键词收集
  406. total_keywords[x] = keywords_y
  407. function_content = content_y
  408. dup_function_content = content_x
  409. for word_y in keywords_y:
  410. word_y = word_y.strip().strip("'").strip('"')
  411. function_content = str(function_content.replace("\"", "'")).replace(word_y,
  412. f'<span class="similarity">{word_y.strip()}</span>')
  413. for word_x in keywords_x:
  414. word_x = word_x.strip().strip("'").strip('"')
  415. dup_function_content = str(dup_function_content.replace("\"", "'")).replace(word_x,
  416. f'<span class="similarity">{word_x.strip()}</span>')
  417. # 保存每个维度对应的相似度到idc_project_check_detail
  418. mysql.sql_change_msg(
  419. """insert into idc_project_check_detail (dup_id, dimension, similarity, function_content, dup_function_content, create_time, update_time) value (%d, "%s", %f, "%s", "%s", "%s", "%s")"""
  420. % (dup_id, wdys2.get(x), similarity, escape_string(function_content),
  421. escape_string(dup_function_content), str(datetime.datetime.now())[:-7],
  422. str(datetime.datetime.now())[:-7]))
  423. else:
  424. for x in list(xc.keys())[1:]:
  425. content_x = xc.get(x)
  426. content_y = str_dict.get(x)
  427. if content_x and content_y:
  428. if x == 'gnmk':
  429. # 匹配到历史数据,次数加1
  430. # dup_count += dup_file_test
  431. # 循环遍历每一个维度
  432. similarity = cosin_similarity.CosineSimilarity(content_x, content_y)
  433. # 相似度 关键词
  434. similarity, keywords_x, keywords_y = similarity.main()
  435. similarity = similarity * 50
  436. #print("**************相似度: %.2f%%" % similarity, "关键词: %s" % keywords_y)
  437. # 相似度相加
  438. total_similarity += similarity
  439. # 关键词收集
  440. total_keywords[x] = keywords_y
  441. function_content = content_y
  442. dup_function_content = content_x
  443. for word_y in keywords_y:
  444. word_y = word_y.strip().strip("'").strip('"')
  445. function_content = str(function_content.replace("\"", "'")).replace(word_y,
  446. f'<span class="similarity">{word_y.strip()}</span>')
  447. for word_x in keywords_x:
  448. word_x = word_x.strip().strip("'").strip('"')
  449. dup_function_content = str(dup_function_content.replace("\"", "'")).replace(word_x,
  450. f'<span class="similarity">{word_x.strip()}</span>')
  451. # 保存每个维度对应的相似度到idc_project_check_detail
  452. mysql.sql_change_msg(
  453. """insert into idc_project_check_detail (dup_id, dimension, similarity, function_content, dup_function_content, create_time, update_time) value (%d, "%s", %f, "%s", "%s", "%s", "%s")"""
  454. % (dup_id, wdys2.get(x), similarity, escape_string(function_content),
  455. escape_string(dup_function_content), str(datetime.datetime.now())[:-7],
  456. str(datetime.datetime.now())[:-7]))
  457. # content = content.replace(gjcs, f'<span class="similarity">{gjcs.strip()}</span>')
  458. elif x == 'jsnr':
  459. similarity = cosin_similarity.CosineSimilarity(content_x, content_y)
  460. # 相似度 关键词
  461. similarity, keywords_x, keywords_y = similarity.main()
  462. similarity = similarity * 40
  463. #print("**************相似度: %.2f%%" % similarity, "关键词: %s" % keywords_y)
  464. # 相似度相加
  465. total_similarity += similarity
  466. # 关键词收集
  467. total_keywords[x] = keywords_y
  468. function_content = content_y
  469. dup_function_content = content_x
  470. for word_y in keywords_y:
  471. word_y = word_y.strip().strip("'").strip('"')
  472. function_content = str(function_content.replace("\"", "'")).replace(word_y,
  473. f'<span class="similarity">{word_y.strip()}</span>')
  474. for word_x in keywords_x:
  475. word_x = word_x.strip().strip("'").strip('"')
  476. dup_function_content = str(dup_function_content.replace("\"", "'")).replace(word_x,
  477. f'<span class="similarity">{word_x.strip()}</span>')
  478. # 保存每个维度对应的相似度到idc_project_check_detail
  479. mysql.sql_change_msg(
  480. """insert into idc_project_check_detail (dup_id, dimension, similarity, function_content, dup_function_content, create_time, update_time) value (%d, "%s", %f, "%s", "%s", "%s", "%s")"""
  481. % (dup_id, wdys2.get(x), similarity, escape_string(function_content),
  482. escape_string(dup_function_content), str(datetime.datetime.now())[:-7],
  483. str(datetime.datetime.now())[:-7]))
  484. else:
  485. similarity = cosin_similarity.CosineSimilarity(content_x, content_y)
  486. # 相似度 关键词
  487. similarity, keywords_x, keywords_y = similarity.main()
  488. similarity = similarity * (10 / dup_count)
  489. #print("**************相似度: %.2f%%" % similarity, "关键词: %s" % keywords_y)
  490. # 相似度相加
  491. total_similarity += similarity
  492. # 关键词收集
  493. total_keywords[x] = keywords_y
  494. function_content = content_y
  495. dup_function_content = content_x
  496. for word_y in keywords_y:
  497. word_y = word_y.strip().strip("'").strip('"')
  498. function_content = str(function_content.replace("\"", "'")).replace(word_y,
  499. f'<span class="similarity">{word_y.strip()}</span>')
  500. for word_x in keywords_x:
  501. word_x = word_x.strip().strip("'").strip('"')
  502. dup_function_content = str(dup_function_content.replace("\"", "'")).replace(word_x,
  503. f'<span class="similarity">{word_x.strip()}</span>')
  504. # 保存每个维度对应的相似度到idc_project_check_detail
  505. mysql.sql_change_msg(
  506. """insert into idc_project_check_detail (dup_id, dimension, similarity, function_content, dup_function_content, create_time, update_time) value (%d, "%s", %f, "%s", "%s", "%s", "%s")"""
  507. % (dup_id, wdys2.get(x), similarity, escape_string(function_content),
  508. escape_string(dup_function_content), str(datetime.datetime.now())[:-7],
  509. str(datetime.datetime.now())[:-7]))
  510. mysql.sql_change_msg(
  511. """update idc_project_check set similarity=%f where dup_id=%d""" % (total_similarity, dup_id))
  512. project_gjc = {}
  513. for w in wdys2.keys():
  514. content_x = str_dict.get(w)
  515. content_y = str_dict.get(w)
  516. if content_x and content_y:
  517. # 循环遍历每一个维度
  518. similarity = cosin_similarity.CosineSimilarity(content_x, content_y)
  519. # 相似度 关键词
  520. similarity, keywords_x, keywords = similarity.main()
  521. project_gjc[w] = keywords
  522. mysql.sql_change_msg(
  523. """insert into user_keyword (xmmc, xzwt, xtjc, xmmb, yqjx, jsxq, sjxq, aqxq, ywly, hxyw, ywxq, ywxt, jscj, yhfw, mbqt, jsnr, gnmk, sjgx, znys) value ("%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s")"""
  524. % (dl[0], str(project_gjc.get("xzwt"))[1:-1] if project_gjc.get("xzwt") else None,
  525. str(project_gjc.get("xtjc"))[1:-1] if project_gjc.get("xtjc") else None,
  526. str(project_gjc.get("xmmb"))[1:-1] if project_gjc.get("xmmb") else None,
  527. str(project_gjc.get("yqjx"))[1:-1] if project_gjc.get("yqjx") else None,
  528. str(project_gjc.get("jsxq"))[1:-1] if project_gjc.get("jsxq") else None,
  529. str(project_gjc.get("sjxq"))[1:-1] if project_gjc.get("sjxq") else None,
  530. str(project_gjc.get("aqxq"))[1:-1] if project_gjc.get("aqxq") else None,
  531. str(project_gjc.get("ywly"))[1:-1] if project_gjc.get("ywly") else None,
  532. str(project_gjc.get("hxyw"))[1:-1] if project_gjc.get("hxyw") else None,
  533. str(project_gjc.get("ywxq"))[1:-1] if project_gjc.get("ywxq") else None,
  534. str(project_gjc.get("ywxt"))[1:-1] if project_gjc.get("ywxt") else None,
  535. str(project_gjc.get("jscj"))[1:-1] if project_gjc.get("jscj") else None,
  536. str(project_gjc.get("yhfw"))[1:-1] if project_gjc.get("yhfw") else None,
  537. str(project_gjc.get("mbqt"))[1:-1] if project_gjc.get("mbqt") else None,
  538. str(project_gjc.get("jsnr"))[1:-1] if project_gjc.get("jsnr") else None,
  539. str(project_gjc.get("gnmk"))[1:-1] if project_gjc.get("gnmk") else None,
  540. str(project_gjc.get("sjgx"))[1:-1] if project_gjc.get("sjgx") else None,
  541. str(project_gjc.get("znys"))[1:-1] if project_gjc.get("znys") else None))
  542. mysql.sql_change_msg(
  543. """update idc_project set dup_status=3, one_vote_veto_status=dup_file_test, self_check_status=dup_file_test, history_project_count=%d ,module_count=%d where project_id=%d""" % (
  544. xmnr_count, gnmk_count, dl[0]))
  545. gong_neng_mo_kuai(mysql, dl, data, er_title)
  546. if __name__ == "__main__":
  547. # all_path = requests.get("http://127.0.0.1:19099/check/duplicates/%s" % 15).json()
  548. # print(all_path)
  549. # data_list = []
  550. # for ap in all_path.get("data"):
  551. # # if os.path.exists(ap.get("file_path")):
  552. # data_list.append((ap.get("project_id"), ap.get("file_path"), ap.get("project_name")))
  553. # print(data_list)
  554. data_list = [(11, r"C:\Users\HUAWEI\PycharmProjects\nlp\dup_check\0825-丽水系统查重维度1.xlsx", "水路运输综合监管系统建设项目.xls")]
  555. project_check(data_list)
  556. """
  557. """