"""Extract the project name and feature list from a .docx plan into a workbook."""

import os
import uuid

import docx
import xlsxwriter
from pymysql.converters import escape_string

import mysql_pool

# Directory the generated workbook is written to unless the caller overrides it.
_DEFAULT_EXCEL_DIR = "/Users/kebobo/fsdownload/dup_check/temp"

# A paragraph containing the first marker starts the project-name region;
# a paragraph containing the second marker ends it.
_PROJECT_NAME_MARKER = '项目名称'
_PROJECT_TYPE_MARKER = '项目类型'

# Paragraphs whose text equals these strings open / close the feature section.
_FEATURES_START = '3.1.2 建设内容'
_FEATURES_END = '3.2 整体架构设计'


def _parse_paragraphs(paragraphs):
    """Scan *paragraphs* and return ``(project_name, feature_map)``.

    ``feature_map`` maps each heading found inside the feature section to the
    first non-heading paragraph that follows it. Later body paragraphs under
    the same heading are stored under the ``""`` key, which the workbook
    writer ignores.
    """
    in_features = False
    in_project_name = False
    feature_map = {}
    current_feature = ""
    project_name = ""

    for para in paragraphs:
        style_name = str(para.style.name)
        text = para.text
        print(f"style: {style_name}, value: {text}")

        # Skip table-of-contents entries. The previous test compared
        # ``find('toc')`` with 1, which only matched "toc" at index 1 and so
        # never skipped styles named like "toc 1".
        if 'toc' in style_name:
            continue

        is_heading = 'Heading' in style_name

        # Track the region between the project-name and project-type markers;
        # the last non-heading paragraph seen inside it wins.
        if _PROJECT_NAME_MARKER in text:
            in_project_name = True
        elif _PROJECT_TYPE_MARKER in text:
            in_project_name = False
        if in_project_name and not is_heading:
            project_name = text

        if text == _FEATURES_START:
            in_features = True
        elif text == _FEATURES_END:
            in_features = False
        if not in_features:
            continue

        if is_heading:
            # A heading opens a new feature with an empty description.
            feature_map[text] = ""
            current_feature = text
        else:
            feature_map[current_feature] = text
            # Reset so that only the first paragraph describes the feature.
            current_feature = ""

    return project_name, feature_map


def _write_feature_workbook(excel_path, project_name, feature_map):
    """Write the project name and one row per feature to *excel_path*."""
    # NOTE(review): escape_string is a SQL escaping helper; it is kept on the
    # cell values because whatever consumes this workbook may rely on
    # pre-escaped text - confirm before removing.
    # The context manager closes the workbook even if a write raises.
    with xlsxwriter.Workbook(excel_path) as workbook:
        worksheet = workbook.add_worksheet("Sheet1")
        worksheet.write('A1', '项目名称')
        worksheet.write('B1', escape_string(project_name))

        # The section title itself and the "" placeholder key are not features.
        features = (
            (name, description)
            for name, description in feature_map.items()
            if name != "3.1.2 建设内容" and name != ""
        )
        # Row 1 holds the project name, so feature rows start at row 2.
        for row, (name, description) in enumerate(features, start=2):
            worksheet.write(f"A{row}", '功能模块')
            worksheet.write(f"B{row}", escape_string(name))
            worksheet.write(f"C{row}", '功能描述')
            worksheet.write(f"D{row}", escape_string(description))


def read_docx(file_path, output_dir=_DEFAULT_EXCEL_DIR):
    """Convert the feature section of a .docx plan into a workbook.

    Args:
        file_path: Path of the .docx document to read.
        output_dir: Directory that receives the generated workbook. It is
            created when missing. Defaults to the path that used to be
            hard-coded.

    Returns:
        The path of the generated workbook. The file name is a random UUID
        with an ``.xls`` suffix; note that xlsxwriter produces the xlsx
        format regardless of the suffix.

    Raises:
        Whatever ``docx.Document``, ``os.makedirs`` or xlsxwriter raise when
        the document cannot be opened or the workbook cannot be written.

    Earlier revisions also downloaded the document, converted .doc to .docx,
    and inserted every feature into ``user_history_module_data``. Those steps
    were already disabled, so the unused database connection is no longer
    opened here.

    NOTE(review): the original ended with ``return "\\n".join(content)`` after
    the ``finally`` block. It is treated as unreachable dead code and removed;
    confirm that no caller expects the joined text instead of the path.
    """
    excel_path = os.path.join(output_dir, f"{uuid.uuid4()}.xls")
    try:
        document = docx.Document(file_path)
        project_name, feature_map = _parse_paragraphs(document.paragraphs)

        # The workbook is generated to stay compatible with the older flow.
        os.makedirs(output_dir, exist_ok=True)
        _write_feature_workbook(excel_path, project_name, feature_map)
        return excel_path
    finally:
        # Printed on success and on failure alike, as before.
        print("转换完成")


def convert_doc_to_docx(doc_file, docx_file):
    """Write the content of *doc_file* into a new .docx and delete the source.

    Args:
        doc_file: Path of the source file. Only paths ending in ``.doc`` are
            processed.
        docx_file: Path the new .docx document is saved to.

    The source file is removed in every case: after success, after a failure,
    and when the path does not end in ``.doc``.

    NOTE(review): the raw bytes of the .doc file are passed to
    ``add_paragraph``. This does not look like a real format conversion and
    presumably fails or yields unreadable text for binary Word files -
    confirm, and consider an external converter.
    """
    try:
        if doc_file.endswith('.doc'):
            # Create a new, empty .docx document.
            docx_document = docx.Document()
            # Read the raw content of the .doc file.
            with open(doc_file, 'rb') as doc:
                content = doc.read()
            # Add that content to the new document as a single paragraph.
            docx_document.add_paragraph(content)
            # Save the .docx file.
            docx_document.save(docx_file)
    finally:
        os.remove(doc_file)