From fcc91564ec0def510674543770bb19dd4785fadf Mon Sep 17 00:00:00 2001
From: Koji Ishii <kojii@google.com>
Date: Fri, 13 May 2022 06:35:23 +0900
Subject: [PATCH] Change `applyElement` to call `HTMLProcessor` (#60)

This patch changes `Parser.applyElement` to call the `HTMLProcessor` class.

The `HTMLProcessorOptions.separator` option is changed to also accept a `Node`. This is because `undefined` had two conflicting meanings: "use the default (ZWSP)" and "insert a `<wbr>` element".
---
 javascript/README.md                    | 19 ++++------
 javascript/src/html_processor.ts        | 22 ++++++-----
 javascript/src/parser.ts                | 50 ++++---------------------
 javascript/tests/test_html_processor.ts | 22 +++++++++++
 4 files changed, 49 insertions(+), 64 deletions(-)
diff --git a/javascript/README.md b/javascript/README.md
index 7216fca8..3cfa0ef1 100644
--- a/javascript/README.md
+++ b/javascript/README.md
@@ -63,23 +63,20 @@ console.log(ele.outerHTML);
 // <p class="budou-this" style="word-break: keep-all; overflow-wrap: break-word;">今日は<b><wbr>とても<wbr>天気</b>です。</p>
 ```
 
-There is another way to apply the process to an HTML element.
+Internally, `applyElement` uses the [`HTMLProcessor`] class
+with a `<wbr>` element as the separator.
+You can also use the [`HTMLProcessor`] class directly,
+for example to insert a space instead of `<wbr>` at each boundary:
 
 ```javascript
 import { HTMLProcessor } from 'budoux';
 const ele = document.querySelector('p.budou-this');
-const applier = new HTMLProcessor(parser);
-applier.applyToElement(ele);
+const htmlProcessor = new HTMLProcessor(parser, {
+  separator: ' '
+});
+htmlProcessor.applyToElement(ele);
 ```
 
-The [`HTMLProcessor`] class
-recognizes separate or nested paragraphs more correctly,
-its output is generally more memory efficient for browsers,
-and it can customize its output such as inserting a space at boundaries
-which is often useful for accessibility,
-but the bundle code size is larger.
-Please see the JSDoc for more details.
-
 [`HTMLProcessor`]: https://github.com/google/budoux/blob/main/javascript/src/html_processor.ts
 
 ### Loading a custom model
diff --git a/javascript/src/html_processor.ts b/javascript/src/html_processor.ts
index 6c8b16d9..8215aaef 100644
--- a/javascript/src/html_processor.ts
+++ b/javascript/src/html_processor.ts
@@ -153,11 +153,12 @@ export interface HTMLProcessorOptions {
   className?: string;
   /**
    * The separator to insert at each semantics boundary.
-   * The default value is U+200B ZERO WIDTH SPACE.
    *
-   * When falsy, a `<wbr>` element is inserted.
+   * A string is inserted as text; a {@link Node} is deep-cloned and inserted.
+   *
+   * The default value is U+200B ZERO WIDTH SPACE.
    */
-  separator?: string;
+  separator?: string | Node;
   /**
    * The threshold score to control the granularity of chunks.
    * See {@link Parser.parse}.
@@ -182,7 +183,7 @@ export class HTMLProcessor {
   /** See {@link HTMLProcessorOptions.className}. */
   className?: string;
   /** See {@link HTMLProcessorOptions.separator}. */
-  separator?: string = ZWSP;
+  separator: string | Node = ZWSP;
   /** See {@link HTMLProcessorOptions.threshold}. */
   threshold: number = DEFAULT_THRES;
 
@@ -373,14 +374,15 @@ export class HTMLProcessor {
     assert(chunks.length > 1);
     assert(node.nodeValue === chunks.join(''));
 
-    // If the `separator` string is specified, insert it at each boundary.
-    if (this.separator) {
-      node.nodeValue = chunks.join(this.separator);
+    const separator = this.separator;
+    if (typeof separator === 'string') {
+      // If the `separator` is a string, insert it at each boundary.
+      node.nodeValue = chunks.join(separator);
       return;
     }
 
-    // Otherwise create a `Text` node for each chunk, with `<wbr>` between them,
-    // and replace the `node` with them.
+    // Otherwise create a `Text` node for each chunk, with the separator node
+    // between them, and replace the `node` with them.
     const document = node.ownerDocument;
     let nodes = [];
     for (const chunk of chunks) {
@@ -388,7 +390,7 @@ export class HTMLProcessor {
       nodes.push(null);
     }
     nodes.pop();
-    nodes = nodes.map(n => (n ? n : document.createElement('wbr')));
+    nodes = nodes.map(n => (n ? n : separator.cloneNode(true)));
     node.replaceWith(...nodes);
   }
 
diff --git a/javascript/src/parser.ts b/javascript/src/parser.ts
index 85ce754e..cec18908 100644
--- a/javascript/src/parser.ts
+++ b/javascript/src/parser.ts
@@ -15,11 +15,11 @@
  */
 
 import {unicodeBlocks} from './data/unicode_blocks';
-import {skipNodes} from './data/skip_nodes';
 import {model as jaKNBCModel} from './data/models/ja-knbc';
 import {model as zhHansModel} from './data/models/zh-hans';
 import {parseFromString} from './dom';
-import {bisectRight, SEP, INVALID} from './utils';
+import {HTMLProcessor} from './html_processor';
+import {bisectRight, INVALID} from './utils';
 
 /**
  * The default threshold value for the parser.
@@ -33,8 +33,6 @@ const NODETYPE = {
   TEXT: 3,
 };
 
-const SKIP_NODES = new Set(skipNodes);
-
 export class Parser {
   model;
 
@@ -195,45 +193,11 @@ export class Parser {
    * @param threshold The threshold score to control the granularity of chunks.
    */
   applyElement(parentElement: HTMLElement, threshold = DEFAULT_THRES) {
-    parentElement.style.wordBreak = 'keep-all';
-    parentElement.style.overflowWrap = 'break-word';
-    const chunks = this.parse(parentElement.textContent || '', threshold);
-    let charsToProcess = chunks.join(SEP);
-    const ownerDocument = parentElement.ownerDocument;
-
-    const processChildren = (parent: HTMLElement) => {
-      const toSkip = SKIP_NODES.has(parent.nodeName);
-      const children = [...parent.childNodes];
-      for (const child of children) {
-        if (child.nodeType === NODETYPE.TEXT) {
-          let textNodeContent = '';
-          const nodesToAdd: (HTMLElement | Text)[] = [];
-          (child.textContent || '').split('').forEach(char => {
-            if (toSkip) {
-              textNodeContent += char;
-              charsToProcess = charsToProcess.slice(
-                charsToProcess[0] === SEP ? 2 : 1
-              );
-            } else if (char === charsToProcess[0]) {
-              textNodeContent += char;
-              charsToProcess = charsToProcess.slice(1);
-            } else if (charsToProcess[0] === SEP) {
-              nodesToAdd.push(ownerDocument.createTextNode(textNodeContent));
-              nodesToAdd.push(ownerDocument.createElement('wbr'));
-              charsToProcess = charsToProcess.slice(2);
-              textNodeContent = char;
-            }
-          });
-          if (textNodeContent) {
-            nodesToAdd.push(ownerDocument.createTextNode(textNodeContent));
-          }
-          child.replaceWith(...nodesToAdd);
-        } else if (child.nodeType === NODETYPE.ELEMENT) {
-          processChildren(child as HTMLElement);
-        }
-      }
-    };
-    processChildren(parentElement);
+    const htmlProcessor = new HTMLProcessor(this, {
+      separator: parentElement.ownerDocument.createElement('wbr'),
+      threshold: threshold,
+    });
+    htmlProcessor.applyToElement(parentElement);
   }
 
   /**
diff --git a/javascript/tests/test_html_processor.ts b/javascript/tests/test_html_processor.ts
index ae8f70cc..5a78544f 100644
--- a/javascript/tests/test_html_processor.ts
+++ b/javascript/tests/test_html_processor.ts
@@ -84,6 +84,28 @@ describe('HTMLProcessor.applyToElement', () => {
   }
 });
 
+describe('HTMLProcessor.applyToElement.separator.node', () => {
+  const dom = new JSDOM('<div>今日は良い天気</div>');
+  const document = dom.window.document;
+  const separator = document.createElement('span');
+  separator.style.whiteSpace = 'normal';
+  separator.textContent = '\u200B';
+  const processor = new MockHTMLProcessorBase({
+    separator: separator,
+    className: 'applied',
+  });
+  processor.applyToElement(document.body);
+  it('should clone separator element deeply', () => {
+    expect(document.body.innerHTML).toEqual(
+      '<div class="applied">今日は' +
+        '<span style="white-space: normal;">\u200B</span>' +
+        '良い' +
+        '<span style="white-space: normal;">\u200B</span>' +
+        '天気</div>'
+    );
+  });
+});
+
 describe('HTMLProcessor.getBlocks', () => {
   const getBlocks = (html: string) => {
     const dom = new JSDOM(html);