From e0c9244079e19d9879fa0648e7068526033fff0f Mon Sep 17 00:00:00 2001 From: Koji Ishii Date: Mon, 25 Apr 2022 16:41:14 +0900 Subject: [PATCH] Add JavaScript `HTMLProcessor` class (#58) This class provides extended capabilities when applying to DOM, but with larger code size. Please see the JSDoc for more details. * tushuhei review --- javascript/README.md | 19 ++ javascript/src/html_processor.ts | 420 ++++++++++++++++++++++++ javascript/tests/test_html_processor.ts | 259 +++++++++++++++ 3 files changed, 698 insertions(+) create mode 100644 javascript/src/html_processor.ts create mode 100644 javascript/tests/test_html_processor.ts diff --git a/javascript/README.md b/javascript/README.md index ac4c7a74..7216fca8 100644 --- a/javascript/README.md +++ b/javascript/README.md @@ -63,6 +63,25 @@ console.log(ele.outerHTML); //

今日はとても天気です。

``` +There is another way to apply the process to an HTML element. + +```javascript +import { HTMLProcessor } from 'budoux'; +const ele = document.querySelector('p.budou-this'); +const applier = new HTMLProcessor(parser); +applier.applyToElement(ele); +``` + +The [`HTMLProcessor`] class +recognizes separate or nested paragraphs more correctly, +its output is generally more memory efficient for browsers, +and it can customize its output such as inserting a space at boundaries +which is often useful for accessibility, +but the bundle code size is larger. +Please see the JSDoc for more details. + +[`HTMLProcessor`]: https://github.com/google/budoux/blob/main/javascript/src/html_processor.ts + ### Loading a custom model You can load your own custom model as follows. diff --git a/javascript/src/html_processor.ts b/javascript/src/html_processor.ts new file mode 100644 index 00000000..6c8b16d9 --- /dev/null +++ b/javascript/src/html_processor.ts @@ -0,0 +1,420 @@ +/** + * @license + * Copyright 2021 Google LLC + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import {Parser, DEFAULT_THRES} from './parser'; + +const assert = console.assert; + +const ZWSP = '\u200B'; // U+200B ZERO WIDTH SPACE + +// We could use `Node.TEXT_NODE` and `Node.ELEMENT_NODE` in a browser context, +// but we define the same here for Node.js environments. 
const NodeType = {
  ELEMENT_NODE: 1,
  TEXT_NODE: 3,
};

/**
 * The possible classifications of an element, as returned by
 * {@link actionForElement}.
 *
 * Within this section: `Block` is returned for elements whose computed
 * `display` is not `inline`, `Skip` for hidden or non-wrapping elements,
 * `Break` for `<br>` and `<hr>`, and `Inline` for everything else.
 * How each action is consumed is outside this section.
 */
const DomAction = {
  Inline: 0,
  Block: 1,
  Skip: 2,
  Break: 3,
} as const;
type DomAction = typeof DomAction[keyof typeof DomAction];

/**
 * Determines the action from an element name, as defined in
 * {@link https://html.spec.whatwg.org/multipage/rendering.html HTML Rendering}.
 * See also {@link actionForElement}.
 *
 * Keys are upper-case node names. This is a plain object, so look names up
 * with an own-property check; otherwise inherited `Object.prototype` members
 * (e.g. `constructor`) would be mistaken for entries.
 */
const domActions: {[name: string]: DomAction} = {
  // Hidden elements
  // https://html.spec.whatwg.org/multipage/rendering.html#hidden-elements
  AREA: DomAction.Skip,
  BASE: DomAction.Skip,
  BASEFONT: DomAction.Skip,
  DATALIST: DomAction.Skip,
  HEAD: DomAction.Skip,
  LINK: DomAction.Skip,
  META: DomAction.Skip,
  NOEMBED: DomAction.Skip,
  NOFRAMES: DomAction.Skip,
  PARAM: DomAction.Skip,
  RP: DomAction.Skip,
  SCRIPT: DomAction.Skip,
  STYLE: DomAction.Skip,
  TEMPLATE: DomAction.Skip,
  TITLE: DomAction.Skip,
  NOSCRIPT: DomAction.Skip,

  // Flow content
  // https://html.spec.whatwg.org/multipage/rendering.html#flow-content-3
  HR: DomAction.Break,
  // Disable if `white-space: pre`.
  LISTING: DomAction.Skip,
  PLAINTEXT: DomAction.Skip,
  PRE: DomAction.Skip,
  XMP: DomAction.Skip,

  // Phrasing content
  // https://html.spec.whatwg.org/multipage/rendering.html#phrasing-content-3
  BR: DomAction.Break,
  RT: DomAction.Skip,

  // Form controls
  // https://html.spec.whatwg.org/multipage/rendering.html#form-controls
  INPUT: DomAction.Skip,
  SELECT: DomAction.Skip,
  BUTTON: DomAction.Skip,
  TEXTAREA: DomAction.Skip,

  // Other elements where the phrase-based line breaking should be disabled.
  // https://github.com/google/budoux/blob/main/budoux/skip_nodes.json
  ABBR: DomAction.Skip,
  CODE: DomAction.Skip,
  IFRAME: DomAction.Skip,
  TIME: DomAction.Skip,
  VAR: DomAction.Skip,
};

/**
 * Determine the action for an element.
 *
 * The element name is looked up in {@link domActions} first. If it is not
 * listed and `getComputedStyle` is available, the computed style decides:
 * `white-space: nowrap | pre` yields `Skip`, and a non-empty `display` other
 * than `inline` yields `Block`. In every other case the result is `Inline`.
 *
 * @param element An element to determine the action for.
 * @returns The {@link DomAction} for the element.
 */
function actionForElement(element: Element): DomAction {
  const nodeName = element.nodeName;
  // Own-property lookup: element names such as `constructor` or `toString`
  // (possible in XML documents) must not resolve to `Object.prototype`
  // members and be returned as if they were actions.
  const listed = Object.prototype.hasOwnProperty.call(domActions, nodeName)
    ? domActions[nodeName]
    : undefined;
  if (listed !== undefined) return listed;

  // jsdom does not have `getComputedStyle`.
  if (typeof getComputedStyle === 'function') {
    const style = getComputedStyle(element);
    switch (style.whiteSpace) {
      case 'nowrap':
      case 'pre':
        return DomAction.Skip;
    }

    // `display` is an empty string when no style can be computed (e.g. the
    // element is not connected). Treat that as "unknown" and fall through to
    // the same default as when `getComputedStyle` is missing, instead of
    // classifying every such element as a block.
    const display = style.display;
    if (display && display !== 'inline') return DomAction.Block;
  }
  return DomAction.Inline;
}

/**
 * Represents a "paragraph", broken by block boundaries or forced breaks.
 *
 * A CSS
 * {@link https://drafts.csswg.org/css2/#inline-formatting inline formatting context}
 * is usually a "paragraph", but it can be broken into multiple paragraphs by
 * forced breaks such as `<br>`.
 */
class Paragraph {
  // The element this paragraph was created for.
  // NOTE(review): presumably the containing block; confirm against the caller.
  element: HTMLElement;
  // Text nodes of this paragraph. Starts empty; nothing in this class adds
  // to it.
  textNodes: Text[] = [];

  /**
   * @param element The element to associate with this paragraph.
   */
  constructor(element: HTMLElement) {
    this.element = element;
  }

  /**
   * @returns `true` if {@link textNodes} contains at least one node.
   */
  hasText(): boolean {
    return this.textNodes.length > 0;
  }
}

/**
 * Options for {@link HTMLProcessor}.
 */
export interface HTMLProcessorOptions {
  /**
   * This class name is added to the containing block
   * when the BudouX is applied.
   *
   * The caller is responsible for defining the class.
   * {@link defineClassAs} can append a `