feat(api&rag): add wps and xls doc parser

Added support for WPS and XLS formats, converting WPS and XLS documents to supported DOCX and XLSX formats when uploading BREAKING CHANGE: support wps and xls parser
jhaiq · Nov 30, 2024 · 98313b9 · 98313b9
1 parent 1d0a560
commit 98313b9
Show file tree

Hide file tree

Showing 4 changed files with 178 additions and 8 deletions.
diff --git a/api/apps/api_app.py b/api/apps/api_app.py
@@ -415,15 +415,27 @@ def upload():
             DocumentService.query,
             name=file.filename,
             kb_id=kb_id)
+
         filetype = filename_type(filename)
         if not filetype:
             return get_data_error_result(
                 retmsg="This type of file has not been supported yet!")
 
+        ##add by jhq for wps covert to docx
+        blob = file.read()
+        if re.search(r"\.wps$", filename, re.IGNORECASE):
+            output_format = "docx"  # »òÕß "pdf"
+            blob = convert_stream_to_format(blob,output_format)
+            filename=filename.replace(".wps",'(wps).docx')
+        if re.search(r"\.xls$", filename, re.IGNORECASE):
+            output_format = "xlsx"  # »òÕß "pdf"
+            blob = convert_stream_to_format(blob,output_format)
+            filename=filename.replace(".xls",'(xls).xlsx')
+
         location = filename
         while STORAGE_IMPL.obj_exist(kb_id, location):
             location += "_"
-        blob = request.files['file'].read()
+        # blob = request.files['file'].read() ,delet by jhq for wps covert to docx
         STORAGE_IMPL.put(kb_id, location, blob)
         doc = {
             "id": get_uuid(),

diff --git a/api/apps/file_app.py b/api/apps/file_app.py
@@ -35,7 +35,7 @@
 from rag.nlp import search
 from rag.utils.es_conn import ELASTICSEARCH
 from rag.utils.storage_factory import STORAGE_IMPL
-
+from api.utils.cover_wps2docx import convert_stream_to_format
 
 @manager.route('/upload', methods=['POST'])
 @login_required
@@ -97,15 +97,29 @@ def upload():
 
             # file type
             filetype = filename_type(file_obj_names[file_len - 1])
+
+            ##add by jhq for wps covert to docx
+            blob = file_obj.read()
+            filename = file_obj_names[file_len - 1]
+            if re.search(r"\.wps$", filename, re.IGNORECASE):
+                output_format = "docx"  # »òÕß "pdf"
+                blob = convert_stream_to_format(blob,output_format)
+                filename=filename.replace(".wps",'(wps).docx')
+            if re.search(r"\.xls$", filename, re.IGNORECASE):
+                output_format = "xlsx"  # »òÕß "pdf"
+                blob = convert_stream_to_format(blob,output_format)
+                filename=filename.replace(".xls",'(xls).xlsx')
+
             location = file_obj_names[file_len - 1]
             while STORAGE_IMPL.obj_exist(last_folder.id, location):
                 location += "_"
-            blob = file_obj.read()
+            # blob = file_obj.read() ,delet by jhq for wps covert to docx
             filename = duplicate_name(
                 FileService.query,
-                name=file_obj_names[file_len - 1],
+                # name=file_obj_names[file_len - 1],   delet by jhq for wps covert to docx
+                name=filename,  ##add by jhq for wps covert to docx
                 parent_id=last_folder.id)
-            file = {
+            infile = {
                 "id": get_uuid(),
                 "parent_id": last_folder.id,
                 "tenant_id": current_user.id,
@@ -115,7 +129,7 @@ def upload():
                 "location": location,
                 "size": len(blob),
             }
-            file = FileService.insert(file)
+            file = FileService.insert(infile)
             STORAGE_IMPL.put(last_folder.id, location, blob)
             file_res.append(file.to_json())
         return get_json_result(data=file_res)

diff --git a/api/db/services/file_service.py b/api/db/services/file_service.py
@@ -27,6 +27,7 @@
 from api.db.services.file2document_service import File2DocumentService
 from api.utils import get_uuid
 from api.utils.file_utils import filename_type, thumbnail_img
+from api.utils.cover_wps2docx import convert_stream_to_format
 from rag.utils.storage_factory import STORAGE_IMPL
 
 
@@ -348,11 +349,22 @@ def upload_document(self, kb, file_objs, user_id):
                 filetype = filename_type(filename)
                 if filetype == FileType.OTHER.value:
                     raise RuntimeError("This type of file has not been supported yet!")
-
+
+                ##add by jhq for wps covert to docx
+                blob = file.read()
+                if re.search(r"\.wps$", filename, re.IGNORECASE):
+                    output_format = "docx"  # 或者 "pdf"
+                    blob = convert_stream_to_format(blob,output_format)
+                    filename=filename.replace(".wps",'(wps).docx')
+                if re.search(r"\.xls$", filename, re.IGNORECASE):
+                    output_format = "xlsx"  # 或者 "pdf"
+                    blob = convert_stream_to_format(blob,output_format)
+                    filename=filename.replace(".xls",'(xls).xlsx')
+
                 location = filename
                 while STORAGE_IMPL.obj_exist(kb.id, location):
                     location += "_"
-                blob = file.read()
+                # blob = file.read() ,delet by jhq for wps covert to docx
                 STORAGE_IMPL.put(kb.id, location, blob)
 
                 doc_id = get_uuid()

diff --git a/api/utils/cover_wps2docx.py b/api/utils/cover_wps2docx.py
@@ -0,0 +1,132 @@
+import subprocess
+import logging
+import os
+import platform
+import time
+import tempfile
+
+
+# 配置日志
+logger = logging.getLogger(__name__)
+#########安装libreoffice#######################
+#sudo apt-get update
+#sudo apt install libreoffice
+##验证是否安装成功
+##libreoffice --version
+
+####如果报如下错误，需要安装dialog组件：debconf: unable to initialize frontend: Dialog
+###sudo apt-get install dialog
+
+##soffice --headless --convert-to docx W020240423576340405124.wps
+
+def convert_to_docx(input_file_path, output_file_path):
+    with open(input_file_path, 'rb') as f:
+        with tempfile.NamedTemporaryFile(suffix='.docx', delete=False) as temp_docx:
+            command = f"libreoffice --headless --convert-to docx:writer_docx_Export --outdir {os.path.dirname(output_file_path)} {input_file_path}"
+            subprocess.run(command, shell=True)
+            # 移动临时文件到最终位置
+            os.rename(temp_docx.name, output_file_path)
+
+def execute_libreoffice_command(command):
+    try:
+        logger.debug(f"execute_libreoffice_command cmd : {command}")
+        process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+
+        # 等待命令执行完成
+        stdout, stderr = process.communicate()
+
+        # 检查进程退出状态
+        exit_status = process.returncode
+        logger.debug(f"i----{exit_status}")
+
+        if exit_status != 0:
+            logger.error(f"execute_libreoffice_command cmd exitStatus {exit_status}")
+            if stderr:
+                logger.error(f"Error: {stderr.decode('utf-8')}")
+            return False
+        else:
+            logger.debug(f"execute_libreoffice_command cmd exitStatus {exit_status}")
+            if stdout:
+                logger.debug(f"Output: {stdout.decode('utf-8')}")
+
+    except subprocess.CalledProcessError as e:
+        logger.error(f"execute_libreoffice_command {command} error {e}")
+        return False
+    except Exception as e:
+        logger.error(f"An error occurred: {e}")
+        return False
+
+    logger.info("转化结束.......")
+    return True
+
+
+def convert_office_to_docxorpdf(input_file, output_file,cover_file_type='docx'):
+
+    # if cover_file_type !='docx' and cover_file_type !='pdf'
+    #     return
+
+    start_time = time.time()
+    command = ""
+    os_name = platform.system()
+    if os_name == "Windows":
+        command = f"cmd /c start soffice --headless --invisible --convert-to {cover_file_type} {input_file} --outdir {output_file}"
+    else:
+        command = f"libreoffice --headless --invisible --convert-to {cover_file_type} {input_file} --outdir {output_file}"
+
+    flag = execute_libreoffice_command(command)
+    end_time = time.time()
+    logger.debug(f"用时: {end_time - start_time} 秒")
+
+    return flag
+
+
+def convert_stream_to_format(input_stream, output_format):
+    # 创建一个临时文件
+    with tempfile.NamedTemporaryFile(delete=False) as tmp_file:
+        # 将文件流写入临时文件
+        tmp_file.write(input_stream)
+        tmp_file.flush()
+
+        try:
+            # 构建 LibreOffice 命令
+            command = f"libreoffice --headless --convert-to {output_format} --outdir {os.path.dirname(tmp_file.name)} {tmp_file.name}"
+            subprocess.run(command, shell=True, check=True)
+
+            # 构建输出文件的路径
+            output_file = os.path.splitext(tmp_file.name)[0] + f".{output_format.split(':')[-1]}"
+
+            # 返回转换后的文件内容
+            with open(output_file, 'rb') as f:
+                return f.read()
+        except Exception as e:
+            logger.error(f"convert_stream_to_format error occurred: {e}")
+            return None
+
+        finally:
+            # 删除临时文件
+            os.unlink(tmp_file.name)
+            if os.path.exists(output_file):
+                os.unlink(output_file)
+
+
+# if __name__ == "__main__":
+
+#     # 使用示例
+#     # input_file_path = 'W020240722613678243393.wps'
+#     # out_file_path= 'W020240722613678243393.docx'
+#     # output_format = "docx"  # 或者 "docx"
+
+#     input_file_path = '20141117165909284.xls'
+#     out_file_path= '20141117165909284.xlsx'
+#     output_format = "xlsx"  # 或者 "docx"
+
+
+#     with open(input_file_path, 'rb') as f:
+#         input_stream = f.read()
+#         output_stream = converted_content = convert_stream_to_format(input_stream, output_format)
+#         with open(out_file_path, 'wb') as tmp_file:
+#             # 将文件流写入临时文件
+#             tmp_file.write(output_stream)
+#             tmp_file.flush()
+
+