From fcc91564ec0def510674543770bb19dd4785fadf Mon Sep 17 00:00:00 2001 From: Koji Ishii Date: Fri, 13 May 2022 06:35:23 +0900 Subject: [PATCH] Change `applyElement` to call `HTMLProcessor` (#60) This patch changes `Parser.applyElement` to call the `HTMLProcessor` class. The `HTMLProcessorOptions.separator` is changed to accept a `Node`. This is because `undefined` had double meanings; i.e., use the default (ZWSP) and use the `<wbr>` element. --- javascript/README.md | 19 ++++------ javascript/src/html_processor.ts | 22 ++++++----- javascript/src/parser.ts | 50 ++++--------------------- javascript/tests/test_html_processor.ts | 22 +++++++++++ 4 files changed, 49 insertions(+), 64 deletions(-) diff --git a/javascript/README.md b/javascript/README.md index 7216fca8..3cfa0ef1 100644 --- a/javascript/README.md +++ b/javascript/README.md @@ -63,23 +63,20 @@ console.log(ele.outerHTML); //

今日はとても天気です。

``` -There is another way to apply the process to an HTML element. +Internally, the `applyElement` calls the [`HTMLProcessor`] class +with a `<wbr>` element as the separator. +You can use the [`HTMLProcessor`] class directly if desired. +For example: ```javascript import { HTMLProcessor } from 'budoux'; const ele = document.querySelector('p.budou-this'); -const applier = new HTMLProcessor(parser); -applier.applyToElement(ele); +const htmlProcessor = new HTMLProcessor(parser, { + separator: ' ' +}); +htmlProcessor.applyToElement(ele); ``` -The [`HTMLProcessor`] class -recognizes separate or nested paragraphs more correctly, -its output is generally more memory efficient for browsers, -and it can customize its output such as inserting a space at boundaries -which is often useful for accessibility, -but the bundle code size is larger. -Please see the JSDoc for more details. - [`HTMLProcessor`]: https://github.com/google/budoux/blob/main/javascript/src/html_processor.ts ### Loading a custom model diff --git a/javascript/src/html_processor.ts b/javascript/src/html_processor.ts index 6c8b16d9..8215aaef 100644 --- a/javascript/src/html_processor.ts +++ b/javascript/src/html_processor.ts @@ -153,11 +153,12 @@ export interface HTMLProcessorOptions { className?: string; /** * The separator to insert at each semantics boundary. - * The default value is U+200B ZERO WIDTH SPACE. * - * When falsy, a `<wbr>` element is inserted. + * When it's a {@link Node}, a clone of the {@link Node} will be inserted. + * + * The default value is U+200B ZERO WIDTH SPACE. */ - separator?: string; + separator?: string | Node; /** * The threshold score to control the granularity of chunks. * See {@link Parser.parse}. @@ -182,7 +183,7 @@ export class HTMLProcessor { /** See {@link HTMLProcessorOptions.className}. */ className?: string; /** See {@link HTMLProcessorOptions.separator}. */ - separator?: string = ZWSP; + separator: string | Node = ZWSP; /** See {@link HTMLProcessorOptions.threshold}. 
*/ threshold: number = DEFAULT_THRES; @@ -373,14 +374,15 @@ export class HTMLProcessor { assert(chunks.length > 1); assert(node.nodeValue === chunks.join('')); - // If the `separator` string is specified, insert it at each boundary. - if (this.separator) { - node.nodeValue = chunks.join(this.separator); + const separator = this.separator; + if (typeof separator === 'string') { + // If the `separator` is a string, insert it at each boundary. + node.nodeValue = chunks.join(separator); return; } - // Otherwise create a `Text` node for each chunk, with `<wbr>` between them, - // and replace the `node` with them. + // Otherwise create a `Text` node for each chunk, with the separator node + // between them, and replace the `node` with them. const document = node.ownerDocument; let nodes = []; for (const chunk of chunks) { @@ -388,7 +390,7 @@ export class HTMLProcessor { nodes.push(null); } nodes.pop(); - nodes = nodes.map(n => (n ? n : document.createElement('wbr'))); + nodes = nodes.map(n => (n ? n : separator.cloneNode(true))); node.replaceWith(...nodes); } diff --git a/javascript/src/parser.ts b/javascript/src/parser.ts index 85ce754e..cec18908 100644 --- a/javascript/src/parser.ts +++ b/javascript/src/parser.ts @@ -15,11 +15,11 @@ */ import {unicodeBlocks} from './data/unicode_blocks'; -import {skipNodes} from './data/skip_nodes'; import {model as jaKNBCModel} from './data/models/ja-knbc'; import {model as zhHansModel} from './data/models/zh-hans'; import {parseFromString} from './dom'; -import {bisectRight, SEP, INVALID} from './utils'; +import {HTMLProcessor} from './html_processor'; +import {bisectRight, INVALID} from './utils'; /** * The default threshold value for the parser. @@ -33,8 +33,6 @@ const NODETYPE = { TEXT: 3, }; -const SKIP_NODES = new Set(skipNodes); - export class Parser { model; @@ -195,45 +193,11 @@ export class Parser { * @param threshold The threshold score to control the granularity of chunks. 
*/ applyElement(parentElement: HTMLElement, threshold = DEFAULT_THRES) { - parentElement.style.wordBreak = 'keep-all'; - parentElement.style.overflowWrap = 'break-word'; - const chunks = this.parse(parentElement.textContent || '', threshold); - let charsToProcess = chunks.join(SEP); - const ownerDocument = parentElement.ownerDocument; - - const processChildren = (parent: HTMLElement) => { - const toSkip = SKIP_NODES.has(parent.nodeName); - const children = [...parent.childNodes]; - for (const child of children) { - if (child.nodeType === NODETYPE.TEXT) { - let textNodeContent = ''; - const nodesToAdd: (HTMLElement | Text)[] = []; - (child.textContent || '').split('').forEach(char => { - if (toSkip) { - textNodeContent += char; - charsToProcess = charsToProcess.slice( - charsToProcess[0] === SEP ? 2 : 1 - ); - } else if (char === charsToProcess[0]) { - textNodeContent += char; - charsToProcess = charsToProcess.slice(1); - } else if (charsToProcess[0] === SEP) { - nodesToAdd.push(ownerDocument.createTextNode(textNodeContent)); - nodesToAdd.push(ownerDocument.createElement('wbr')); - charsToProcess = charsToProcess.slice(2); - textNodeContent = char; - } - }); - if (textNodeContent) { - nodesToAdd.push(ownerDocument.createTextNode(textNodeContent)); - } - child.replaceWith(...nodesToAdd); - } else if (child.nodeType === NODETYPE.ELEMENT) { - processChildren(child as HTMLElement); - } - } - }; - processChildren(parentElement); + const htmlProcessor = new HTMLProcessor(this, { + separator: parentElement.ownerDocument.createElement('wbr'), + threshold: threshold, + }); + htmlProcessor.applyToElement(parentElement); } /** diff --git a/javascript/tests/test_html_processor.ts b/javascript/tests/test_html_processor.ts index ae8f70cc..5a78544f 100644 --- a/javascript/tests/test_html_processor.ts +++ b/javascript/tests/test_html_processor.ts @@ -84,6 +84,28 @@ describe('HTMLProcessor.applyToElement', () => { } }); +describe('HTMLProcessor.applyToElement.separator.node', () => 
{ + const dom = new JSDOM('<div>今日は良い天気</div>'); + const document = dom.window.document; + const separator = document.createElement('span'); + separator.style.whiteSpace = 'normal'; + separator.textContent = '\u200B'; + const processor = new MockHTMLProcessorBase({ + separator: separator, + className: 'applied', + }); + processor.applyToElement(document.body); + it('should clone separator element deeply', () => { + expect(document.body.innerHTML).toEqual( + '<div class="applied">今日は' + + '<span style="white-space: normal;">\u200B</span>' + + '良い' + + '<span style="white-space: normal;">\u200B</span>' + + '天気</div>' + ); + }); +}); + describe('HTMLProcessor.getBlocks', () => { const getBlocks = (html: string) => { const dom = new JSDOM(html);