From b68edb36c3c255f99444db97c973f2fcd492015d Mon Sep 17 00:00:00 2001 From: Kevin Hu Date: Wed, 18 Sep 2024 19:13:37 +0800 Subject: [PATCH] fix parser for pptx of which files are from filemanager (#2482) ### What problem does this PR solve? #2467 ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) --- api/apps/file2document_app.py | 3 +-- api/db/services/file_service.py | 18 +++++++++--------- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/api/apps/file2document_app.py b/api/apps/file2document_app.py index 124f9c57e0..1e4b2c9ad3 100644 --- a/api/apps/file2document_app.py +++ b/api/apps/file2document_app.py @@ -77,7 +77,7 @@ def convert(): doc = DocumentService.insert({ "id": get_uuid(), "kb_id": kb.id, - "parser_id": kb.parser_id, + "parser_id": FileService.get_parser(file.type, file.name, kb.parser_id), "parser_config": kb.parser_config, "created_by": current_user.id, "type": file.type, @@ -85,7 +85,6 @@ def convert(): "location": file.location, "size": file.size }) - FileService.set_constant_parser(doc, file.name) file2document = File2DocumentService.insert({ "id": get_uuid(), "file_id": id, diff --git a/api/db/services/file_service.py b/api/db/services/file_service.py index a81f0c6d7f..48796f6b38 100644 --- a/api/db/services/file_service.py +++ b/api/db/services/file_service.py @@ -357,7 +357,7 @@ def upload_document(self, kb, file_objs, user_id): doc = { "id": get_uuid(), "kb_id": kb.id, - "parser_id": kb.parser_id, + "parser_id": self.get_parser(filetype, filename, kb.parser_id), "parser_config": kb.parser_config, "created_by": user_id, "type": filetype, @@ -366,7 +366,6 @@ def upload_document(self, kb, file_objs, user_id): "size": len(blob), "thumbnail": thumbnail(filename, blob) } - self.set_constant_parser(doc, filename) DocumentService.insert(doc) FileService.add_file_from_kb(doc, kb_folder["id"], kb.tenant_id) @@ -377,12 +376,13 @@ def upload_document(self, kb, file_objs, user_id): return err, files @staticmethod - def set_constant_parser(doc, filename): - if doc["type"] == FileType.VISUAL: - doc["parser_id"] = ParserType.PICTURE.value - if doc["type"] == FileType.AURAL: - doc["parser_id"] = ParserType.AUDIO.value + def get_parser(doc_type, filename, default): + if doc_type == FileType.VISUAL: + return ParserType.PICTURE.value + if doc_type == FileType.AURAL: + return ParserType.AUDIO.value if re.search(r"\.(ppt|pptx|pages)$", filename): - doc["parser_id"] = ParserType.PRESENTATION.value + return ParserType.PRESENTATION.value if re.search(r"\.(eml)$", filename): - doc["parser_id"] = ParserType.EMAIL.value \ No newline at end of file + return ParserType.EMAIL.value + return default \ No newline at end of file