Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix too long query exception #1195

Merged
Merged 1 commit on Jun 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 13 additions & 8 deletions deepdoc/parser/docx_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,19 +113,24 @@ def blockType(b):
def __call__(self, fnm, from_page=0, to_page=100000):
self.doc = Document(fnm) if isinstance(
fnm, str) else Document(BytesIO(fnm))
pn = 0
secs = []
pn = 0 # parsed page
secs = [] # parsed contents
for p in self.doc.paragraphs:
if pn > to_page:
break
if from_page <= pn < to_page and p.text.strip():
secs.append((p.text, p.style.name))

runs_within_single_paragraph = [] # save runs within the range of pages
for run in p.runs:
if 'lastRenderedPageBreak' in run._element.xml:
pn += 1
continue
if 'w:br' in run._element.xml and 'type="page"' in run._element.xml:
if pn > to_page:
break
if from_page <= pn < to_page and p.text.strip():
runs_within_single_paragraph.append(run.text) # append run.text first

# wrap page break checker into a static method
if RAGFlowDocxParser.has_page_break(run._element.xml):
pn += 1

secs.append(("".join(runs_within_single_paragraph), p.style.name)) # then concat run.text as part of the paragraph

tbls = [self.__extract_table_content(tb) for tb in self.doc.tables]
return secs, tbls
7 changes: 3 additions & 4 deletions rag/app/qa.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,7 @@ def beAdoc(d, q, a, eng):
d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
return d


def mdQuestionLevel(s):
match = re.match(r'#*', s)
return (len(match.group(0)), s.lstrip('#').lstrip()) if match else (0, s)
Expand Down Expand Up @@ -244,7 +245,6 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
break
txt += l
lines = txt.split("\n")
comma, tab = 0, 0
last_question, last_answer = "", ""
question_stack, level_stack = [], []
code_block = False
Expand All @@ -262,7 +262,7 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
last_answer = f'{last_answer}\n{l}'
else: # is a question
if last_answer:
sum_question = ('\n').join(question_stack)
sum_question = '\n'.join(question_stack)
if sum_question:
res.append(beAdoc(deepcopy(doc), sum_question, last_answer, eng))
last_answer = ''
Expand All @@ -274,12 +274,11 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
question_stack.append(question)
level_stack.append(question_level)
if last_answer:
sum_question = ('\n').join(question_stack)
sum_question = '\n'.join(question_stack)
if sum_question:
res.append(beAdoc(deepcopy(doc), sum_question, last_answer, eng))
return res


raise NotImplementedError(
"Excel, csv(txt), pdf and markdown format files are supported.")

Expand Down
1 change: 1 addition & 0 deletions rag/nlp/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,7 @@ def need_fine_grained_tokenize(tk):
sm = []

keywords.append(re.sub(r"[ \\\"']+", "", tk))
if len(keywords) >= 12: break

tk_syns = self.syn.lookup(tk)
tk = EsQueryer.subSpecialChar(tk)
Expand Down
4 changes: 2 additions & 2 deletions rag/nlp/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ def add_filters(bqry):
if not qst:
if not req.get("sort"):
s = s.sort(
{"create_time": {"order": "desc", "unmapped_type": "date"}},
#{"create_time": {"order": "desc", "unmapped_type": "date"}},
{"create_timestamp_flt": {
"order": "desc", "unmapped_type": "float"}}
)
Expand All @@ -108,7 +108,7 @@ def add_filters(bqry):
"mode": "avg", "numeric_type": "double"}},
{"top_int": {"order": "asc", "unmapped_type": "float",
"mode": "avg", "numeric_type": "double"}},
{"create_time": {"order": "desc", "unmapped_type": "date"}},
#{"create_time": {"order": "desc", "unmapped_type": "date"}},
{"create_timestamp_flt": {
"order": "desc", "unmapped_type": "float"}}
)
Expand Down