Upgrades Document Layout Analysis model. (infiniflow#4054)
### What problem does this PR solve?

infiniflow#4052

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
KevinHuSh authored Dec 17, 2024
1 parent b5e4a55 commit ce1e855
Showing 5 changed files with 120 additions and 3 deletions.
4 changes: 2 additions & 2 deletions api/db/services/task_service.py
@@ -247,8 +247,8 @@ def new_task():
         task["progress"] = 0.0
 
     prev_tasks = TaskService.get_tasks(doc["id"])
+    ck_num = 0
     if prev_tasks:
-        ck_num = 0
         for task in tsks:
             ck_num += reuse_prev_task_chunks(task, prev_tasks, chunking_config)
         TaskService.filter_delete([Task.doc_id == doc["id"]])
@@ -258,7 +258,7 @@ def new_task():
                 chunk_ids.extend(task["chunk_ids"].split())
         if chunk_ids:
             settings.docStoreConn.delete({"id": chunk_ids}, search.index_name(chunking_config["tenant_id"]), chunking_config["kb_id"])
-        DocumentService.update_by_id(doc["id"], {"chunk_num": ck_num})
+    DocumentService.update_by_id(doc["id"], {"chunk_num": ck_num})
 
     bulk_insert_into_db(Task, tsks, True)
     DocumentService.begin2parse(doc["id"])
2 changes: 2 additions & 0 deletions conf/infinity_mapping.json
@@ -16,6 +16,8 @@
   "content_with_weight": {"type": "varchar", "default": ""},
   "content_ltks": {"type": "varchar", "default": "", "analyzer": "whitespace"},
   "content_sm_ltks": {"type": "varchar", "default": "", "analyzer": "whitespace"},
+  "authors_tks": {"type": "varchar", "default": "", "analyzer": "whitespace"},
+  "authors_sm_tks": {"type": "varchar", "default": "", "analyzer": "whitespace"},
   "page_num_int": {"type": "varchar", "default": ""},
   "top_int": {"type": "varchar", "default": ""},
   "position_int": {"type": "varchar", "default": ""},
3 changes: 2 additions & 1 deletion deepdoc/vision/__init__.py
@@ -15,9 +15,10 @@
 
 from .ocr import OCR
 from .recognizer import Recognizer
-from .layout_recognizer import LayoutRecognizer
+from .layout_recognizer import LayoutRecognizer4YOLOv10 as LayoutRecognizer
 from .table_structure_recognizer import TableStructureRecognizer
 
+
 def init_in_out(args):
     from PIL import Image
     import os
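Because the module now re-exports `LayoutRecognizer4YOLOv10` under the old name, call sites that import `LayoutRecognizer` from `deepdoc.vision` pick up the new model without any changes. A quick check, assuming the package and its dependencies are importable in your environment:

```python
from deepdoc.vision import LayoutRecognizer
from deepdoc.vision.layout_recognizer import LayoutRecognizer4YOLOv10

# The public name now resolves to the YOLOv10-based implementation.
assert LayoutRecognizer is LayoutRecognizer4YOLOv10
```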
88 changes: 88 additions & 0 deletions deepdoc/vision/layout_recognizer.py
@@ -14,11 +14,14 @@
import re
from collections import Counter
from copy import deepcopy

import cv2
import numpy as np
from huggingface_hub import snapshot_download

from api.utils.file_utils import get_project_base_directory
from deepdoc.vision import Recognizer
from deepdoc.vision.operators import nms


class LayoutRecognizer(Recognizer):
@@ -149,3 +152,88 @@ def findLayout(ty):

        ocr_res = [b for b in ocr_res if b["text"].strip() not in garbag_set]
        return ocr_res, page_layout


class LayoutRecognizer4YOLOv10(LayoutRecognizer):
    labels = [
        "title",
        "Text",
        "Reference",
        "Figure",
        "Figure caption",
        "Table",
        "Table caption",
        "Table caption",
        "Equation",
        "Figure caption",
    ]

    def __init__(self, domain):
        domain = "layout"
        super().__init__(domain)
        self.auto = False
        self.scaleFill = False
        self.scaleup = True
        self.stride = 32
        self.center = True

    def preprocess(self, image_list):
        inputs = []
        new_shape = self.input_shape  # height, width
        for img in image_list:
            shape = img.shape[:2]  # current shape [height, width]
            # Scale ratio (new / old)
            r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])
            # Compute padding
            new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r))
            dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[1]  # wh padding
            dw /= 2  # divide padding into 2 sides
            dh /= 2
            ww, hh = new_unpad
            img = np.array(cv2.cvtColor(img, cv2.COLOR_BGR2RGB)).astype(np.float32)
            img = cv2.resize(img, new_unpad, interpolation=cv2.INTER_LINEAR)
            top, bottom = int(round(dh - 0.1)) if self.center else 0, int(round(dh + 0.1))
            left, right = int(round(dw - 0.1)) if self.center else 0, int(round(dw + 0.1))
            img = cv2.copyMakeBorder(
                img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=(114, 114, 114)
            )  # add border
            img /= 255.0
            img = img.transpose(2, 0, 1)
            img = img[np.newaxis, :, :, :].astype(np.float32)
            inputs.append({self.input_names[0]: img, "scale_factor": [shape[1]/ww, shape[0]/hh, dw, dh]})

        return inputs

    def postprocess(self, boxes, inputs, thr):
        thr = 0.08
        boxes = np.squeeze(boxes)
        scores = boxes[:, 4]
        boxes = boxes[scores > thr, :]
        scores = scores[scores > thr]
        if len(boxes) == 0:
            return []
        class_ids = boxes[:, -1].astype(int)
        boxes = boxes[:, :4]
        boxes[:, 0] -= inputs["scale_factor"][2]
        boxes[:, 2] -= inputs["scale_factor"][2]
        boxes[:, 1] -= inputs["scale_factor"][3]
        boxes[:, 3] -= inputs["scale_factor"][3]
        input_shape = np.array([inputs["scale_factor"][0], inputs["scale_factor"][1], inputs["scale_factor"][0],
                                inputs["scale_factor"][1]])
        boxes = np.multiply(boxes, input_shape, dtype=np.float32)

        unique_class_ids = np.unique(class_ids)
        indices = []
        for class_id in unique_class_ids:
            class_indices = np.where(class_ids == class_id)[0]
            class_boxes = boxes[class_indices, :]
            class_scores = scores[class_indices]
            class_keep_boxes = nms(class_boxes, class_scores, 0.45)
            indices.extend(class_indices[class_keep_boxes])

        return [{
            "type": self.label_list[class_ids[i]].lower(),
            "bbox": [float(t) for t in boxes[i].tolist()],
            "score": float(scores[i])
        } for i in indices]
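For readers unfamiliar with YOLO-style letterboxing, the sketch below reproduces the round trip that `preprocess` and `postprocess` perform: resize with the aspect ratio preserved, pad to the model input size with the constant color 114, then subtract the padding offsets and rescale a detection back to original-image coordinates. The helper names (`letterbox`, `unmap_box`) and the 1024×1024 target are illustrative assumptions, not repository API:

```python
import cv2
import numpy as np

def letterbox(img, target=(1024, 1024), color=(114, 114, 114)):
    h, w = img.shape[:2]
    r = min(target[0] / h, target[1] / w)          # scale ratio (new / old)
    nw, nh = int(round(w * r)), int(round(h * r))  # unpadded size
    dw, dh = (target[1] - nw) / 2, (target[0] - nh) / 2
    img = cv2.resize(img, (nw, nh), interpolation=cv2.INTER_LINEAR)
    top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1))
    left, right = int(round(dw - 0.1)), int(round(dw + 0.1))
    img = cv2.copyMakeBorder(img, top, bottom, left, right,
                             cv2.BORDER_CONSTANT, value=color)
    # scale_factor mirrors the one stored by preprocess: [w/nw, h/nh, dw, dh]
    return img, [w / nw, h / nh, dw, dh]

def unmap_box(box, scale_factor):
    # Subtract the padding offsets, then rescale, as postprocess does before NMS.
    sx, sy, dw, dh = scale_factor
    x1, y1, x2, y2 = box
    return [(x1 - dw) * sx, (y1 - dh) * sy, (x2 - dw) * sx, (y2 - dh) * sy]

page = np.full((1400, 900, 3), 255, dtype=np.uint8)  # dummy portrait page
padded, sf = letterbox(page)
print(padded.shape)                         # (1024, 1024, 3)
print(unmap_box([200, 100, 600, 500], sf))  # box mapped back onto the 900x1400 page
```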

26 changes: 26 additions & 0 deletions deepdoc/vision/operators.py
@@ -709,3 +709,29 @@ def preprocess(im, preprocess_ops):
    for operator in preprocess_ops:
        im, im_info = operator(im, im_info)
    return im, im_info


def nms(bboxes, scores, iou_thresh):
    import numpy as np
    x1 = bboxes[:, 0]
    y1 = bboxes[:, 1]
    x2 = bboxes[:, 2]
    y2 = bboxes[:, 3]
    areas = (y2 - y1) * (x2 - x1)

    indices = []
    index = scores.argsort()[::-1]
    while index.size > 0:
        i = index[0]
        indices.append(i)
        x11 = np.maximum(x1[i], x1[index[1:]])
        y11 = np.maximum(y1[i], y1[index[1:]])
        x22 = np.minimum(x2[i], x2[index[1:]])
        y22 = np.minimum(y2[i], y2[index[1:]])
        w = np.maximum(0, x22 - x11 + 1)
        h = np.maximum(0, y22 - y11 + 1)
        overlaps = w * h
        ious = overlaps / (areas[i] + areas[index[1:]] - overlaps)
        idx = np.where(ious <= iou_thresh)[0]
        index = index[idx + 1]
    return indices
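A quick sanity check of the helper with hand-made boxes (dummy values, not from the repository's tests): the second box is a near-duplicate of the first and should be suppressed at the 0.45 IoU threshold that `postprocess` passes in.

```python
import numpy as np

from deepdoc.vision.operators import nms

boxes = np.array([
    [0, 0, 100, 100],      # box A, highest score
    [5, 5, 105, 105],      # near-duplicate of A, lower score
    [200, 200, 300, 300],  # disjoint box B
], dtype=np.float32)
scores = np.array([0.9, 0.8, 0.7], dtype=np.float32)

keep = nms(boxes, scores, iou_thresh=0.45)
print(keep)  # keeps A and B; the lower-scored duplicate is dropped
```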
