丽水查重代码
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

124 lines
4.4KB

  1. import os
  2. import docx
  3. import mysql_pool
  4. from pymysql.converters import escape_string
  5. import xlsxwriter
  6. import uuid
  7. def read_docx(file_path):
  8. mysql = mysql_pool.ConnMysql()
  9. # print(os.path.abspath('丽水市本级信息化项目建设方案模板.docx'))
  10. # url = "http://jobapi.ningdatech.com/prometheus-yw_file_service/files/20240919/669f323c5c824f89a34bf04a66105902.doc"
  11. # file_name = "丽水市本级信息化项目建设方案模板.docx"
  12. # file_path = os.path.join("temp", file_name)
  13. excel_path = f"/Users/kebobo/fsdownload/dup_check/temp/{uuid.uuid4()}.xls"
  14. try:
  15. # r = requests.get(url)
  16. # with open(file_path, "wb") as code:
  17. # code.write(r.content)
  18. # # 转化文件格式
  19. # convert_doc_to_docx(file_path, file_path.replace('.doc', '.docx'))
  20. # file_path = file_path.replace('.doc', '.docx')
  21. # 读取文件
  22. # doc = docx.Document(os.path.abspath(file_path))
  23. doc = docx.Document(file_path)
  24. # 是否开始获取文本
  25. is_acquire = 0
  26. is_project_name = 0
  27. content = []
  28. # 功能模块
  29. feature_map = {}
  30. # 功能名
  31. feature_name = ""
  32. # 项目名
  33. xmmc = ""
  34. for para in doc.paragraphs:
  35. style = para.style.name
  36. print(f"style: {para.style.name}, value: {para.text}")
  37. if str(style).find('toc') == 1:
  38. continue
  39. # 获取文档项目名称
  40. if para.text.find('项目名称') != -1:
  41. is_project_name = 1
  42. elif para.text.find('项目类型') != -1:
  43. is_project_name = 0
  44. if is_project_name == 1:
  45. if str(style).find('Heading') == -1 and str(style).find('toc') == -1:
  46. xmmc = para.text
  47. if para.text == '3.1.2 建设内容':
  48. is_acquire = 1
  49. elif para.text == '3.2 整体架构设计':
  50. is_acquire = 0
  51. if is_acquire == 1:
  52. if str(style).find('Heading') == -1:
  53. # print(f"content: {para.text}, style: {para.style.name}")
  54. feature_map[feature_name] = para.text
  55. # 重置功能名
  56. feature_name = ""
  57. content.append(para.text)
  58. else:
  59. feature_map[para.text] = ""
  60. feature_name = para.text
  61. # 使用next函数逐个获取元素
  62. # for key, value in feature_map.items():
  63. # if key != "3.1.2 建设内容" and key != "":
  64. # print(f"Key: {key}, Value: {value}")
  65. # # 将功能描述入库
  66. # mysql.sql_change_msg(
  67. # """insert into user_history_module_data(xmmc,gnmc,gnms,line, remark) value("%s", "%s", "%s", "%s", "%s")""" % (
  68. # escape_string(xmmc), escape_string(key), escape_string(value), "", "自动拆解导入"))
  69. # 生成excel文件(兼容老版本, 减少改动量)
  70. workbook = xlsxwriter.Workbook(excel_path)
  71. worksheet = workbook.add_worksheet("Sheet1")
  72. worksheet.write('A1', '项目名称')
  73. worksheet.write('B1', escape_string(xmmc))
  74. i = 0
  75. for key, value in feature_map.items():
  76. if key != "3.1.2 建设内容" and key != "":
  77. i = i + 1
  78. worksheet.write(f"A{i + 1}", '功能模块')
  79. worksheet.write(f"B{i + 1}", escape_string(key))
  80. worksheet.write(f"C{i + 1}", '功能描述')
  81. worksheet.write(f"D{i + 1}", escape_string(value))
  82. workbook.close()
  83. return excel_path
  84. finally:
  85. # os.remove(file_path)
  86. # print("删除文件")
  87. print("转换完成")
  88. return "\n".join(content)
  89. def convert_doc_to_docx(doc_file, docx_file):
  90. try:
  91. if doc_file.endswith('.doc'):
  92. # 创建一个新的.docx文件
  93. docx_document = docx.Document()
  94. # 读取.doc文件的内容
  95. with open(doc_file, 'rb') as doc:
  96. content = doc.read()
  97. # 将.doc文件的内容写入.docx文件
  98. docx_document.add_paragraph(content)
  99. # 保存.docx文件
  100. docx_document.save(docx_file)
  101. finally:
  102. os.remove(doc_file)
  103. # file_path = "丽水市本级信息化项目建设方案模板.docx"
  104. # doc_content = read_docx()
  105. # print(doc_content)