丽水查重代码
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

411 lines
18KB

  1. # coding=utf-8
  2. import re
  3. import mysql_pool
  4. from pymysql.converters import escape_string
  5. import cosin_similarity
  6. import pandas as pd
  7. import glm_utils
  8. import os
  9. import json
  10. wdys1 = {
  11. "项目名称": "xmmc",
  12. "现状问题": "xzwt",
  13. "系统基础": "xtjc",
  14. "项目目标": "xmmb",
  15. "预期绩效": "yqjx",
  16. "建设需求": "jsxq",
  17. "数据需求": "sjxq",
  18. "安全需求": "aqxq",
  19. "业务领域": "ywly",
  20. "核心业务": "hxyw",
  21. "业务需求": "ywxq",
  22. "业务协同": "ywxt",
  23. "建设层级": "jscj",
  24. "用户范围": "yhfw",
  25. "目标群体": "mbqt",
  26. "建设内容": "jsnr",
  27. "功能模块": "gnmk",
  28. "数据共享": "sjgx",
  29. "智能要素": "znys",
  30. "申报单位": "sbdw",
  31. "所属地区": "ssdq",
  32. "预算年度": "ysnd"
  33. }
  34. wdys2 = {
  35. "xmmc": "项目名称",
  36. "xzwt": "现状问题",
  37. "xtjc": "系统基础",
  38. "xmmb": "项目目标",
  39. "yqjx": "预期绩效",
  40. "jsxq": "建设需求",
  41. "sjxq": "数据需求",
  42. "aqxq": "安全需求",
  43. "ywly": "业务领域",
  44. "hxyw": "核心业务",
  45. "ywxq": "业务需求",
  46. "ywxt": "业务协同",
  47. "jscj": "建设层级",
  48. "yhfw": "用户范围",
  49. "mbqt": "目标群体",
  50. "jsnr": "建设内容",
  51. "gnmk": "功能模块",
  52. "sjgx": "数据共享",
  53. "znys": "智能要素",
  54. "sbdw": "申报单位",
  55. "ssdq": "所属地区",
  56. "ysnd": "预算年度"
  57. }
  58. gnmkys = {
  59. "gnmc": "功能名称",
  60. "gnms": "功能描述"
  61. }
  62. def getFlag():
  63. data_dict = {}
  64. df = pd.read_excel("0825-丽水系统查重维度.xlsx")
  65. data = df.values
  66. data = list(pd.Series(data[:, 1]).dropna())
  67. for d in data:
  68. try:
  69. wd = re.search("(.*?)(.*?%)", d).group(1).strip()
  70. wdc = wdys1.get(wd)
  71. if wdc:
  72. qz = re.search(".*?((.*?%))", d).group(1)
  73. data_dict[wdc] = qz
  74. except:
  75. pass
  76. return data_dict
  77. # getFlag()
  78. def gong_neng_mo_kuai(xmmc, mysql, dl, data, er_title, line):
  79. # 将excel文件中的所有第三维度内容进行拼接
  80. str_dict = {}
  81. for et in er_title:
  82. for d in data:
  83. if d[1] == et:
  84. if str_dict.get(et):
  85. str_dict[et] = str_dict.get(et) + d[3]
  86. else:
  87. str_dict[et] = d[3]
  88. for k, v in str_dict.items():
  89. mysql.sql_change_msg("""insert into user_history_module_data(xmmc,gnmc,gnms,line, remark) value("%s", "%s", "%s", "%s", "%s")""" % (
  90. escape_string(xmmc), escape_string(k), escape_string(v), line, ""))
  91. #
  92. similarity = cosin_similarity.CosineSimilarity(v, v)
  93. similarity, keywords_x, keywords_y = similarity.main()
  94. mysql.sql_change_msg("""insert into user_history_module_keywords (xmmc,gnmc,gnms,line) value("%s" ,"%s", "%s", "%s")""" % (
  95. xmmc, escape_string(k), str(keywords_y)[1:-1], line))
  96. def project_check(data_list, line):
  97. mysql = mysql_pool.ConnMysql()
  98. # 读取维度和权重
  99. # get_data_dict = getFlag()
  100. # 遍历excel存储路径
  101. for dl in data_list:
  102. # path = "0825-丽水系统查重维度1.xlsx"
  103. # 读取路径下的excel
  104. print(dl)
  105. df = pd.read_excel(dl[1])
  106. xmmc = df.keys()
  107. # print(type(xmmc[dup_file_test]))
  108. xmmc=xmmc[1]
  109. # print(type(xmmc))
  110. # xmmc1=''
  111. if "可研报告"or "可研性报告"or "可行性研究报告" in xmmc:
  112. xmmc=xmmc.replace('可研报告','')
  113. xmmc=xmmc.replace('可研性报告','')
  114. xmmc=xmmc.replace('可行性研究报告','')
  115. # print(xmmc)
  116. data = df.values
  117. # 将excel文件中的所有维度内容进行拼接
  118. join_str = ""
  119. str_dict = {}
  120. title = ""
  121. er_title = set()
  122. # for d in data:
  123. # # print(d)
  124. # if pd.notnull(d[0]):
  125. # title = d[0]
  126. # if title == "功能模块":
  127. # er_title.add(d[dup_file_test])
  128. # join_str = ""
  129. # for i in d[dup_file_test:]:
  130. # if pd.notnull(i):
  131. # join_str += i
  132. # str_dict[wdys1.get(title)] = join_str
  133. # else:
  134. # if title == "功能模块":
  135. # er_title.add(d[dup_file_test])
  136. # for i in d[dup_file_test:]:
  137. # if pd.notnull(i):
  138. # join_str += i
  139. # str_dict[wdys1.get(title)] = str_dict.get(wdys1.get(title)) + join_str
  140. # print(str_dict)
  141. gnmk_str = []
  142. # print(data)
  143. for d in data:
  144. if pd.notnull(d[0]):
  145. title = d[0]
  146. if title == "功能模块":
  147. er_title.add(d[1])
  148. join_str = ""
  149. for i in d[1:]:
  150. # print(type(i))
  151. # i=str(i)
  152. if pd.notnull(i):
  153. join_str += str(i)
  154. if title == "功能模块":
  155. # for j in d[3:]:
  156. if i == '功能描述':
  157. continue
  158. else:
  159. gnmk_str.append(i)
  160. str_dict[wdys1.get(title)] = join_str
  161. # print(str_dict.get(wdys1.get(title)))
  162. else:
  163. if title == "功能模块":
  164. er_title.add(d[1])
  165. for i in d[3:]:
  166. if pd.notnull(i):
  167. join_str += str(i)
  168. if title == "功能模块":
  169. gnmk_str.append(i)
  170. str_dict[wdys1.get(title)] = str_dict.get(wdys1.get(title)) + join_str
  171. # gnmk="".join(gnmk_str)
  172. # str_dict['gnmk']=gnmk
  173. gnmk = ",".join(gnmk_str)
  174. str_dict['gnmk'] = gnmk
  175. # print(str_dict)
  176. # print(str_dict.get("xzwt")if str_dict.get("xzwt") else None)
  177. # print(str_dict.get('gnmk')if str_dict.get('gnmk')else None)
  178. mysql.sql_change_msg(
  179. """insert into user_history_data (xmmc, xzwt, xtjc, xmmb, yqjx, jsxq, sjxq, aqxq, ywly, hxyw, ywxq, ywxt, jscj, yhfw, mbqt, jsnr, gnmk, sjgx, znys,sbdw,ssdq,ysnd,line,remark) value ("%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s","%s","%s","%s","%s","%s")"""
  180. % (escape_string(xmmc),
  181. escape_string(str_dict.get("xzwt")) if str_dict.get("xzwt") else None,
  182. escape_string(str_dict.get("xtjc")) if str_dict.get("xtjc") else None,
  183. escape_string(str_dict.get("xmmb")) if str_dict.get("xmmb") else None,
  184. escape_string(str_dict.get("yqjx")) if str_dict.get("yqjx") else None,
  185. escape_string(str_dict.get("jsxq")) if str_dict.get("jsxq") else None,
  186. escape_string(str_dict.get("sjxq")) if str_dict.get("sjxq") else None,
  187. escape_string(str_dict.get("aqxq")) if str_dict.get("aqxq") else None,
  188. escape_string(str_dict.get("ywly")) if str_dict.get("ywly") else None,
  189. escape_string(str_dict.get("hxyw")) if str_dict.get("hxyw") else None,
  190. escape_string(str_dict.get("ywxq")) if str_dict.get("ywxq") else None,
  191. escape_string(str_dict.get("ywxt")) if str_dict.get("ywxt") else None,
  192. escape_string(str_dict.get("jscj")) if str_dict.get("jscj") else None,
  193. escape_string(str_dict.get("yhfw")) if str_dict.get("yhfw") else None,
  194. escape_string(str_dict.get("mbqt")) if str_dict.get("mbqt") else None,
  195. escape_string(str_dict.get("jsnr")) if str_dict.get("jsnr") else None,
  196. escape_string(str_dict.get("gnmk")) if str_dict.get("gnmk") else None,
  197. escape_string(str_dict.get("sjgx")) if str_dict.get("sjgx") else None,
  198. escape_string(str_dict.get("znys")) if str_dict.get("znys") else None,
  199. escape_string(str_dict.get("sbdw")) if str_dict.get("sbdw") else None,
  200. escape_string(str_dict.get("ssdq")) if str_dict.get("ssdq") else None,
  201. escape_string(str_dict.get("ysnd")) if str_dict.get("ysnd") else None,
  202. line, ""))
  203. project_gjc = {}
  204. for w in wdys2.keys():
  205. content_x = str_dict.get(w)
  206. content_y = str_dict.get(w)
  207. if content_x and content_y:
  208. # 循环遍历每一个维度
  209. similarity = cosin_similarity.CosineSimilarity(content_x, content_y)
  210. # 相似度 关键词
  211. similarity, keywords_x, keywords_y = similarity.main()
  212. project_gjc[w] = keywords_y
  213. mysql.sql_change_msg(
  214. """insert into user_history_keywords (xmmc, xzwt, xtjc, xmmb, yqjx, jsxq, sjxq, aqxq, ywly, hxyw, ywxq, ywxt, jscj, yhfw, mbqt, jsnr, gnmk, sjgx, znys, line) value ("%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s")"""
  215. % (xmmc, str(project_gjc.get("xzwt"))[1:-1] if project_gjc.get("xzwt") else None,
  216. str(project_gjc.get("xtjc"))[1:-1] if project_gjc.get("xtjc") else None,
  217. str(project_gjc.get("xmmb"))[1:-1] if project_gjc.get("xmmb") else None,
  218. str(project_gjc.get("yqjx"))[1:-1] if project_gjc.get("yqjx") else None,
  219. str(project_gjc.get("jsxq"))[1:-1] if project_gjc.get("jsxq") else None,
  220. str(project_gjc.get("sjxq"))[1:-1] if project_gjc.get("sjxq") else None,
  221. str(project_gjc.get("aqxq"))[1:-1] if project_gjc.get("aqxq") else None,
  222. str(project_gjc.get("ywly"))[1:-1] if project_gjc.get("ywly") else None,
  223. str(project_gjc.get("hxyw"))[1:-1] if project_gjc.get("hxyw") else None,
  224. str(project_gjc.get("ywxq"))[1:-1] if project_gjc.get("ywxq") else None,
  225. str(project_gjc.get("ywxt"))[1:-1] if project_gjc.get("ywxt") else None,
  226. str(project_gjc.get("jscj"))[1:-1] if project_gjc.get("jscj") else None,
  227. str(project_gjc.get("yhfw"))[1:-1] if project_gjc.get("yhfw") else None,
  228. str(project_gjc.get("mbqt"))[1:-1] if project_gjc.get("mbqt") else None,
  229. str(project_gjc.get("jsnr"))[1:-1] if project_gjc.get("jsnr") else None,
  230. str(project_gjc.get("gnmk"))[1:-1] if project_gjc.get("gnmk") else None,
  231. str(project_gjc.get("sjgx"))[1:-1] if project_gjc.get("sjgx") else None,
  232. str(project_gjc.get("znys"))[1:-1] if project_gjc.get("znys") else None,
  233. line))
  234. gong_neng_mo_kuai(xmmc, mysql, dl, data, er_title, line)
  235. def update_desc():
  236. mysql = mysql_pool.ConnMysql()
  237. module_list = mysql.sql_select_many("""select id, gnms from user_history_module_data where xmmc = '丽水市城市管理指挥中心信息系统(一期)项目'""")
  238. for module in module_list:
  239. # 通过chatglm进行提取信息
  240. gnms = module.get("gnms")
  241. content = glm_utils.CallContentResultNew(gnms)
  242. mysql.sql_change_msg(
  243. """UPDATE user_history_module_data SET glm_desc = "%s" WHERE id = %d""" % (
  244. content if content else None,
  245. module.get("id")))
  246. print(content)
  247. def update_desc1():
  248. mysql = mysql_pool.ConnMysql()
  249. module_list = mysql.sql_select_many("""select id, gnms from gnms_gml where xmmc = '丽水花园云(城市大脑)数字驾驶舱项目'""")
  250. for module in module_list:
  251. # 通过chatglm进行提取信息
  252. gnms = module.get("gnms")
  253. content = glm_utils.CallContentResultNew(gnms)
  254. mysql.sql_change_msg(
  255. """UPDATE gnms_gml SET glm_desc = "%s" WHERE id = %d""" % (
  256. content if content else None,
  257. module.get("id")))
  258. print(content)
  259. def info_word_project():
  260. mysql = mysql_pool.ConnMysql()
  261. module_list1 = mysql.sql_select_many(
  262. """select jsnr from user_history_data where xmmc = '2023年丽水市云和县数字法治门户建设项目' """)
  263. module_list2 = mysql.sql_select_many(
  264. """select jsnr from user_history_data where xmmc IN ('浙江省第二监狱重点罪犯管控模型项目',
  265. '浙江省农村水电站管理数字化应用',
  266. '浙江省河湖库保护数字化应用建设项目',
  267. '浙江省环境遥感监测业务智治',
  268. '平台项目',
  269. '浙江林业智媒平台项目',
  270. '未来e家应用建设方案',
  271. '浙江省智慧林业云平台升级改造项目建设方案',
  272. '为侨服务“全球通”平台二期建设项目')""")
  273. json_objects = []
  274. for module_info1 in module_list1:
  275. for jsnr1Ele in module_info1["jsnr"].split('-----》'):
  276. for module_info2 in module_list2:
  277. for jsnr2Ele in module_info2["jsnr"].split('-----》'):
  278. str = "A:%s\nB:%s" % (jsnr1Ele, jsnr2Ele)
  279. data = {
  280. "instruction": "现在你是一个政府提案的查重检查人员,给定两段话A和B:让我们一步步思考并判断是否相似。请以相似度从高、中、低三个等级进行评价,并给出理由。",
  281. "input": str,
  282. "output": ""
  283. }
  284. json_objects.append(data)
  285. with open('其他-建设内容.json', 'w') as f:
  286. for json_obj in json_objects:
  287. json_str = json.dumps(json_obj, ensure_ascii=False) # 将JSON对象转换为字符串
  288. f.write(json_str + '\n') # 写入字符串,并添加换行符
  289. def info_word1():
  290. mysql = mysql_pool.ConnMysql()
  291. # module_list1 = mysql.sql_select_many("""select gnms from user_history_module_data where xmmc = '莲智社区' """)
  292. # module_list2 = mysql.sql_select_many("""select gnms from user_history_module_data where xmmc IN ('古堰画乡智慧客厅项目—未来社区智慧服务平台', '未来e家')""")
  293. module_list1 = mysql.sql_select_many("""select gnms from user_history_module_data where xmmc = '丽水市遂昌县政法委数字法治综合应用' """)
  294. module_list2 = mysql.sql_select_many("""select gnms from user_history_module_data where xmmc IN ('浙江省第二监狱重点罪犯管控模型项目',
  295. '浙江省农村水电站管理数字化应用',
  296. '浙江省河湖库保护数字化应用建设项目',
  297. '浙江省环境遥感监测业务智治',
  298. '平台项目',
  299. '浙江林业智媒平台项目',
  300. '未来e家应用建设方案',
  301. '浙江省智慧林业云平台升级改造项目建设方案',
  302. '为侨服务“全球通”平台二期建设项目')""")
  303. json_objects = []
  304. for module_info1 in module_list1:
  305. for module_info2 in module_list2:
  306. str = "A:%s\nB:%s" % (module_info1["gnms"], module_info2["gnms"])
  307. data = {
  308. "instruction": "现在你是一个政府提案的查重检查人员,给定两段话A和B:让我们一步步思考并判断是否相似。请以相似度从高、中、低三个等级进行评价,并给出理由。",
  309. "input": str,
  310. "output": ""
  311. }
  312. json_objects.append(data)
  313. with open('其他-功能模块对比.json', 'w') as f:
  314. for json_obj in json_objects:
  315. json_str = json.dumps(json_obj, ensure_ascii=False) # 将JSON对象转换为字符串
  316. f.write(json_str + '\n') # 写入字符串,并添加换行符
  317. def info_word_project_yw():
  318. mysql = mysql_pool.ConnMysql()
  319. module_list1 = mysql.sql_select_many(
  320. """select jsnr from user_history_data where xmmc = '2023年丽水市云和县数字法治门户建设项目' """)
  321. module_list2 = mysql.sql_select_many(
  322. """select jsnr from user_history_data where xmmc IN ('2023年丽水市云和县数字法治门户建设项目', '浙江省司法厅全域数字法治监督应用系统(一期)', '丽水市遂昌县政法委数字法治综合应用', '丽水市龙泉市政法委法治龙泉门户', '庆元县数字法治综合门户')""")
  323. json_objects = []
  324. for module_info1 in module_list1:
  325. for jsnr1Ele in module_info1["jsnr"].split('-----》'):
  326. for module_info2 in module_list2:
  327. for jsnr2Ele in module_info2["jsnr"].split('-----》'):
  328. str = "A:%s\nB:%s" % (jsnr1Ele, jsnr2Ele)
  329. data = {
  330. "instruction": "现在你是一个政府提案的查重检查人员,给定两段话A和B:让我们一步步思考并判断是否相似。请以相似度从高、中、低三个等级进行评价,并给出理由。",
  331. "input": str,
  332. "output": ""
  333. }
  334. json_objects.append(data)
  335. with open('其他-建设内容.json', 'w') as f:
  336. for json_obj in json_objects:
  337. json_str = json.dumps(json_obj, ensure_ascii=False) # 将JSON对象转换为字符串
  338. f.write(json_str + '\n') # 写入字符串,并添加换行符
  339. if __name__ == "__main__":
  340. info_word1()
  341. print("ok.......")
  342. path = r"/Users/kebobo/Downloads/丽水/未来社区"
  343. data_list = os.listdir(path)
  344. for file in data_list:
  345. if file != '.DS_Store':
  346. data_list = [(0, path + '/' + file, "")]
  347. project_check(data_list, "2024-07-27-数字法治")
  348. print("已存入************************************* %s" % file)
  349. """
  350. 建设目标,业务功能
  351. gnmk_str = []
  352. for d in data:
  353. if pd.notnull(d[0]):
  354. title = d[0]
  355. if title == "功能模块":
  356. er_title.add(d[dup_file_test])
  357. join_str = ""
  358. for i in d[dup_file_test:]:
  359. if pd.notnull(i):
  360. join_str += i
  361. if title == "功能模块":
  362. gnmk_str.append(i)
  363. str_dict[wdys1.get(title)] = join_str
  364. else:
  365. if title == "功能模块":
  366. er_title.add(d[dup_file_test])
  367. for i in d[dup_file_test:]:
  368. if pd.notnull(i):
  369. join_str += i
  370. if title == "功能模块":
  371. gnmk_str.append(i)
  372. str_dict[wdys1.get(title)] = str_dict.get(wdys1.get(title)) + join_str
  373. gnmk = "".join(gnmk_str)
  374. """