Skip to content

Commit

Permalink
Change applyElement to call HTMLProcessor (#60)
Browse files Browse the repository at this point in the history
This patch changes `Parser.applyElement` to call the `HTMLProcessor` class.

`HTMLProcessorOptions.separator` now also accepts a `Node`. Previously, `undefined` was ambiguous: it could mean either "use the default separator (ZWSP)" or "insert a `<wbr>` element".
  • Loading branch information
kojiishi authored May 12, 2022
1 parent 055535d commit fcc9156
Show file tree
Hide file tree
Showing 4 changed files with 49 additions and 64 deletions.
19 changes: 8 additions & 11 deletions javascript/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -63,23 +63,20 @@ console.log(ele.outerHTML);
// <p class="budou-this" style="word-break: keep-all; overflow-wrap: break-word;">今日は<b><wbr>とても<wbr>天気</b>です。</p>
```

There is another way to apply the process to an HTML element.
Internally, the `applyElement` calls the [`HTMLProcessor`] class
with a `<wbr>` element as the separator.
You can use the [`HTMLProcessor`] class directly if desired.
For example:

```javascript
import { HTMLProcessor } from 'budoux';
const ele = document.querySelector('p.budou-this');
const applier = new HTMLProcessor(parser);
applier.applyToElement(ele);
const htmlProcessor = new HTMLProcessor(parser, {
separator: ' '
});
htmlProcessor.applyToElement(ele);
```

The [`HTMLProcessor`] class
recognizes separate or nested paragraphs more correctly,
its output is generally more memory efficient for browsers,
and it can customize its output such as inserting a space at boundaries
which is often useful for accessibility,
but the bundle code size is larger.
Please see the JSDoc for more details.

[`HTMLProcessor`]: https://github.com/google/budoux/blob/main/javascript/src/html_processor.ts

### Loading a custom model
Expand Down
22 changes: 12 additions & 10 deletions javascript/src/html_processor.ts
Original file line number Diff line number Diff line change
Expand Up @@ -153,11 +153,12 @@ export interface HTMLProcessorOptions {
className?: string;
/**
* The separator to insert at each semantics boundary.
* The default value is U+200B ZERO WIDTH SPACE.
*
* When falsy, a `<wbr>` element is inserted.
* When it's a {@link Node}, a clone of the {@link Node} will be inserted.
*
* The default value is U+200B ZERO WIDTH SPACE.
*/
separator?: string;
separator?: string | Node;
/**
* The threshold score to control the granularity of chunks.
* See {@link Parser.parse}.
Expand All @@ -182,7 +183,7 @@ export class HTMLProcessor {
/** See {@link HTMLProcessorOptions.className}. */
className?: string;
/** See {@link HTMLProcessorOptions.separator}. */
separator?: string = ZWSP;
separator: string | Node = ZWSP;
/** See {@link HTMLProcessorOptions.threshold}. */
threshold: number = DEFAULT_THRES;

Expand Down Expand Up @@ -373,22 +374,23 @@ export class HTMLProcessor {
assert(chunks.length > 1);
assert(node.nodeValue === chunks.join(''));

// If the `separator` string is specified, insert it at each boundary.
if (this.separator) {
node.nodeValue = chunks.join(this.separator);
const separator = this.separator;
if (typeof separator === 'string') {
// If the `separator` is a string, insert it at each boundary.
node.nodeValue = chunks.join(separator);
return;
}

// Otherwise create a `Text` node for each chunk, with `<wbr>` between them,
// and replace the `node` with them.
// Otherwise create a `Text` node for each chunk, with the separator node
// between them, and replace the `node` with them.
const document = node.ownerDocument;
let nodes = [];
for (const chunk of chunks) {
if (chunk) nodes.push(document.createTextNode(chunk));
nodes.push(null);
}
nodes.pop();
nodes = nodes.map(n => (n ? n : document.createElement('wbr')));
nodes = nodes.map(n => (n ? n : separator.cloneNode(true)));
node.replaceWith(...nodes);
}

Expand Down
50 changes: 7 additions & 43 deletions javascript/src/parser.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,11 @@
*/

import {unicodeBlocks} from './data/unicode_blocks';
import {skipNodes} from './data/skip_nodes';
import {model as jaKNBCModel} from './data/models/ja-knbc';
import {model as zhHansModel} from './data/models/zh-hans';
import {parseFromString} from './dom';
import {bisectRight, SEP, INVALID} from './utils';
import {HTMLProcessor} from './html_processor';
import {bisectRight, INVALID} from './utils';

/**
* The default threshold value for the parser.
Expand All @@ -33,8 +33,6 @@ const NODETYPE = {
TEXT: 3,
};

const SKIP_NODES = new Set(skipNodes);

export class Parser {
model;

Expand Down Expand Up @@ -195,45 +193,11 @@ export class Parser {
* @param threshold The threshold score to control the granularity of chunks.
*/
applyElement(parentElement: HTMLElement, threshold = DEFAULT_THRES) {
// NOTE(review): this span comes from a rendered diff without +/- markers.
// The statements from here through `processChildren(parentElement);` look
// like the pre-commit implementation, and the trailing `HTMLProcessor`
// delegation looks like its replacement — confirm against the commit before
// treating both as live code.
//
// Legacy path: set the line-breaking styles directly on the element.
parentElement.style.wordBreak = 'keep-all';
parentElement.style.overflowWrap = 'break-word';
// Parse the element's whole text content, then join the chunks with SEP so
// that `charsToProcess` is the expected character stream with a SEP marker
// at every chunk boundary.
const chunks = this.parse(parentElement.textContent || '', threshold);
let charsToProcess = chunks.join(SEP);
const ownerDocument = parentElement.ownerDocument;

// Recursively walks `parent`'s children, consuming `charsToProcess` in step
// with the characters of each text node.
const processChildren = (parent: HTMLElement) => {
// Text inside nodes listed in SKIP_NODES is consumed but never split.
const toSkip = SKIP_NODES.has(parent.nodeName);
// Snapshot the child list because `replaceWith` below mutates it.
const children = [...parent.childNodes];
for (const child of children) {
if (child.nodeType === NODETYPE.TEXT) {
let textNodeContent = '';
const nodesToAdd: (HTMLElement | Text)[] = [];
(child.textContent || '').split('').forEach(char => {
if (toSkip) {
// Keep the character and advance the stream, also dropping a pending
// SEP so that no `<wbr>` is emitted here.
textNodeContent += char;
charsToProcess = charsToProcess.slice(
charsToProcess[0] === SEP ? 2 : 1
);
} else if (char === charsToProcess[0]) {
// Same chunk: accumulate the character.
textNodeContent += char;
charsToProcess = charsToProcess.slice(1);
} else if (charsToProcess[0] === SEP) {
// Chunk boundary: flush the accumulated text, emit a `<wbr>`, and start
// the next text run with the current character. `slice(2)` drops the
// SEP plus this character (presumably SEP is one UTF-16 unit — verify
// in ./utils).
nodesToAdd.push(ownerDocument.createTextNode(textNodeContent));
nodesToAdd.push(ownerDocument.createElement('wbr'));
charsToProcess = charsToProcess.slice(2);
textNodeContent = char;
}
});
// Flush the trailing text run, if any.
if (textNodeContent) {
nodesToAdd.push(ownerDocument.createTextNode(textNodeContent));
}
child.replaceWith(...nodesToAdd);
} else if (child.nodeType === NODETYPE.ELEMENT) {
processChildren(child as HTMLElement);
}
}
};
processChildren(parentElement);
// Replacement path: delegate to `HTMLProcessor`, passing a `<wbr>` element
// created in the element's own document as the separator and forwarding the
// threshold.
const htmlProcessor = new HTMLProcessor(this, {
separator: parentElement.ownerDocument.createElement('wbr'),
threshold: threshold,
});
htmlProcessor.applyToElement(parentElement);
}

/**
Expand Down
22 changes: 22 additions & 0 deletions javascript/tests/test_html_processor.ts
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,28 @@ describe('HTMLProcessor.applyToElement', () => {
}
});

// Checks that an element passed as the `separator` option shows up between
// the chunks as a deep copy, i.e. with its inline style and its text child.
describe('HTMLProcessor.applyToElement.separator.node', () => {
  const {document: doc} = new JSDOM('<div>今日は良い天気</div>').window;

  // The separator under test: a styled <span> wrapping a ZERO WIDTH SPACE.
  const separatorElement = doc.createElement('span');
  separatorElement.style.whiteSpace = 'normal';
  separatorElement.textContent = '\u200B';

  const htmlProcessor = new MockHTMLProcessorBase({
    separator: separatorElement,
    className: 'applied',
  });
  htmlProcessor.applyToElement(doc.body);

  it('should clone separator element deeply', () => {
    // Serialized form expected for every inserted copy of the separator.
    const separatorHTML = '<span style="white-space: normal;">\u200B</span>';
    const expectedHTML =
      '<div class="applied">' +
      ['今日は', '良い', '天気'].join(separatorHTML) +
      '</div>';
    expect(doc.body.innerHTML).toEqual(expectedHTML);
  });
});

describe('HTMLProcessor.getBlocks', () => {
const getBlocks = (html: string) => {
const dom = new JSDOM(html);
Expand Down

0 comments on commit fcc9156

Please sign in to comment.