Skip to content

Commit

Permalink
Add --text-line-images option for debugging
Browse files Browse the repository at this point in the history
This extracts the lines of text identified in the input image and saves them to
the `lines` directory. This is useful for debugging text line recognition by
exporting the images that will be passed to the line recognition module. Note
that these images are not currently rescaled to 64px high as the actual
recognition model inputs are.
  • Loading branch information
robertknight committed Feb 25, 2024
1 parent bf640a9 commit 1550b4f
Showing 1 changed file with 45 additions and 3 deletions.
48 changes: 45 additions & 3 deletions ocrs-cli/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ use std::io::BufWriter;

use anyhow::{anyhow, Context};
use ocrs::{DecodeMethod, OcrEngine, OcrEngineParams};
use rten_imageproc::{bounding_rect, RotatedRect};
use rten_tensor::prelude::*;
use rten_tensor::{NdTensor, NdTensorView};

Expand Down Expand Up @@ -69,6 +70,30 @@ fn image_from_tensor(tensor: NdTensorView<f32, 3>) -> Vec<u8> {
.collect()
}

/// Extract images of individual text lines from `img` and save them as PNG
/// files in `output_dir`.
///
/// `line_rects` holds one entry per text line, each entry being the rotated
/// rects of the words on that line. For every line, the axis-aligned bounding
/// rect of its words is cropped out of `img` and written to
/// `{output_dir}/line-{index}.png`, where `index` is the line's position in
/// `line_rects`.
///
/// The first axis of `img` is kept whole; the second and third axes are
/// cropped by the rect's vertical and horizontal extent respectively
/// (presumably a channels/height/width layout - confirm against the caller).
///
/// Lines are skipped, without producing a file, when no bounding rect can be
/// computed for them or when the rect does not overlap the image.
///
/// # Errors
///
/// Returns an error if `output_dir` cannot be created or if writing any of
/// the line images fails.
fn write_text_line_images(
    img: NdTensorView<f32, 3>,
    line_rects: &[Vec<RotatedRect>],
    output_dir: &str,
) -> anyhow::Result<()> {
    std::fs::create_dir_all(output_dir)
        .with_context(|| format!("Failed to create dir {}/", output_dir))?;

    let [_, img_height, img_width] = img.shape();

    for (line_index, word_rects) in line_rects.iter().enumerate() {
        let line_rect = match bounding_rect(word_rects.iter()) {
            Some(rect) => rect,
            None => continue,
        };

        // Negative coordinates are clamped to zero before the cast to `usize`.
        let [top, left, bottom, right] = line_rect.tlbr().map(|x| x.max(0.).round() as usize);

        // Rects may also extend past the bottom/right edge of the image.
        // Clamp them so that the slice below stays within bounds.
        let (top, bottom) = (top.min(img_height), bottom.min(img_height));
        let (left, right) = (left.min(img_width), right.min(img_width));

        // Nothing left to export if the rect is degenerate or lies entirely
        // outside the image.
        if top >= bottom || left >= right {
            continue;
        }

        let filename = format!("{}/line-{}.png", output_dir, line_index);
        let line_img: NdTensorView<f32, 3> = img.slice((.., top..bottom, left..right));
        write_image(&filename, line_img)
            .with_context(|| format!("Failed to write line image to {}", filename))?;
    }

    Ok(())
}

struct Args {
/// Path to text detection model.
detection_model: Option<String>,
Expand All @@ -92,6 +117,9 @@ struct Args {

/// Generate a text probability map.
text_map: bool,

/// Extract each text line found and save as a PNG image.
text_line_images: bool,
}

fn parse_args() -> Result<Args, lexopt::Error> {
Expand All @@ -105,6 +133,7 @@ fn parse_args() -> Result<Args, lexopt::Error> {
let mut output_path = None;
let mut recognition_model = None;
let mut text_map = false;
let mut text_line_images = false;

let mut parser = lexopt::Parser::from_env();
while let Some(arg) = parser.next()? {
Expand All @@ -131,6 +160,9 @@ fn parse_args() -> Result<Args, lexopt::Error> {
Long("rec-model") => {
recognition_model = Some(parser.value()?.string()?);
}
Long("text-line-images") => {
text_line_images = true;
}
Long("text-map") => {
text_map = true;
}
Expand Down Expand Up @@ -172,15 +204,19 @@ Advanced options:
--beam
Use beam search for decoding.
Use beam search for decoding
--debug
Enable debug logging.
Enable debug logging
--text-map
Generate a text probability map for the input image.
Generate a text probability map for the input image
--text-line-images
Export images of identified text lines
",
bin_name = parser.bin_name().unwrap_or("ocrs")
);
Expand All @@ -203,6 +239,7 @@ Advanced options:
image: values.pop_front().ok_or("missing `<image>` arg")?,
recognition_model,
text_map,
text_line_images,
})
}

Expand Down Expand Up @@ -267,7 +304,12 @@ fn main() -> Result<(), Box<dyn Error>> {
}

let word_rects = engine.detect_words(&ocr_input)?;

let line_rects = engine.find_text_lines(&ocr_input, &word_rects);
if args.text_line_images {
write_text_line_images(color_img.view(), &line_rects, "lines")?;
}

let line_texts = engine.recognize_text(&ocr_input, &line_rects)?;

let write_output_str = |content: String| -> Result<(), Box<dyn Error>> {
Expand Down

0 comments on commit 1550b4f

Please sign in to comment.