From fcfdc1d201c0c892abb674b8723964ce97b55015 Mon Sep 17 00:00:00 2001 From: Kevin Hu Date: Thu, 7 Mar 2024 17:20:47 +0800 Subject: [PATCH] refine presentation parser --- api/apps/conversation_app.py | 5 ++++- rag/app/presentation.py | 11 ++++++++--- rag/nlp/search.py | 6 +++--- 3 files changed, 15 insertions(+), 7 deletions(-) diff --git a/api/apps/conversation_app.py b/api/apps/conversation_app.py index 1f7e6cef6e..a0c2369979 100644 --- a/api/apps/conversation_app.py +++ b/api/apps/conversation_app.py @@ -212,14 +212,17 @@ def chat(dialog, messages, **kwargs): if "max_tokens" in gen_conf: gen_conf["max_tokens"] = min(gen_conf["max_tokens"], llm.max_tokens - used_token_count) answer = chat_mdl.chat(prompt_config["system"].format(**kwargs), msg, gen_conf) + stat_logger.info("User: {}|Assistant: {}".format(msg[-1]["content"], answer)) if knowledges: - answer = retrievaler.insert_citations(answer, + answer, idx = retrievaler.insert_citations(answer, [ck["content_ltks"] for ck in kbinfos["chunks"]], [ck["vector"] for ck in kbinfos["chunks"]], embd_mdl, tkweight=1 - dialog.vector_similarity_weight, vtweight=dialog.vector_similarity_weight) + idx = set([kbinfos["chunks"][int(i)]["doc_id"] for i in idx]) + kbinfos["doc_aggs"] = [d for d in kbinfos["doc_aggs"] if d["doc_id"] in idx] for c in kbinfos["chunks"]: if c.get("vector"): del c["vector"] return {"answer": answer, "reference": kbinfos} diff --git a/rag/app/presentation.py b/rag/app/presentation.py index 16c11bd76e..7525946a3c 100644 --- a/rag/app/presentation.py +++ b/rag/app/presentation.py @@ -88,20 +88,25 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca res = [] if re.search(r"\.pptx?$", filename, re.IGNORECASE): ppt_parser = Ppt() - for txt,img in ppt_parser(filename if not binary else binary, from_page, 1000000, callback): + for pn, (txt,img) in enumerate(ppt_parser(filename if not binary else binary, from_page, 1000000, callback)): d = copy.deepcopy(doc) + pn += from_page d["image"] = img - tokenize(d, txt, ppt_parser.is_english) + d["page_num_int"] = [pn+1] + d["top_int"] = [0] + d["position_int"] = [(pn + 1, 0, img.size[0], 0, img.size[1])] + tokenize(d, txt, eng) res.append(d) return res elif re.search(r"\.pdf$", filename, re.IGNORECASE): pdf_parser = Pdf() for pn, (txt,img) in enumerate(pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page, callback=callback)): d = copy.deepcopy(doc) + pn += from_page d["image"] = img d["page_num_int"] = [pn+1] d["top_int"] = [0] - d["position_int"].append((pn + 1, 0, img.size[0], 0, img.size[1])) + d["position_int"] = [(pn + 1, 0, img.size[0], 0, img.size[1])] tokenize(d, txt, eng) res.append(d) return res diff --git a/rag/nlp/search.py b/rag/nlp/search.py index bce7db4169..b87bb27a81 100644 --- a/rag/nlp/search.py +++ b/rag/nlp/search.py @@ -243,7 +243,7 @@ def insert_citations(self, answer, chunks, chunk_v, res += f" ##{c}$$" seted.add(c) - return res + return res, seted def rerank(self, sres, query, tkweight=0.3, vtweight=0.7, cfield="content_ltks"): @@ -290,7 +290,7 @@ def retrieval(self, question, embd_mdl, tenant_id, kb_ids, page, page_size, simi start_idx -= 1 if start_idx >= 0: continue - if len(ranks["chunks"]) == page_size: + if len(ranks["chunks"]) >= page_size: if aggs: continue break @@ -322,7 +322,7 @@ def retrieval(self, question, embd_mdl, tenant_id, kb_ids, page, page_size, simi if dnm not in ranks["doc_aggs"]: ranks["doc_aggs"][dnm] = {"doc_id": did, "count": 0} ranks["doc_aggs"][dnm]["count"] += 1 - ranks["doc_aggs"] = []#[{"doc_name": k, "doc_id": v["doc_id"], "count": v["count"]} for k,v in sorted(ranks["doc_aggs"].items(), key=lambda x:x[1]["count"]*-1)] + ranks["doc_aggs"] = [{"doc_name": k, "doc_id": v["doc_id"], "count": v["count"]} for k,v in sorted(ranks["doc_aggs"].items(), key=lambda x:x[1]["count"]*-1)] return ranks