Skip to content

Commit

Permalink
fix: Codeblocks apply markdown formatting to contents (fixes #22)
Browse files Browse the repository at this point in the history
  • Loading branch information
nonara committed Jul 26, 2021
1 parent 9ad08a9 commit 14fb90b
Show file tree
Hide file tree
Showing 6 changed files with 134 additions and 19 deletions.
15 changes: 10 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,8 @@ import { NodeHtmlMarkdown, NodeHtmlMarkdownOptions } from 'node-html-markdown'
NodeHtmlMarkdown.translate(
/* html */ `<b>hello</b>`,
/* options (optional) */ {},
/* customTranslators (optional) */ undefined
/* customTranslators (optional) */ undefined,
/* customCodeBlockTranslators (optional) */ undefined
);

// Multiple files
Expand All @@ -90,7 +91,8 @@ NodeHtmlMarkdown.translate(
'file2.html': `<b>goodbye</b>`
},
/* options (optional) */ {},
/* customTranslators (optional) */ undefined
/* customTranslators (optional) */ undefined,
/* customCodeBlockTranslators (optional) */ undefined
);


Expand All @@ -101,7 +103,8 @@ NodeHtmlMarkdown.translate(

const nhm = new NodeHtmlMarkdown(
/* options (optional) */ {},
/* customTransformers (optional) */ undefined
/* customTranslators (optional) */ undefined,
/* customCodeBlockTranslators (optional) */ undefined
);

// Single file
Expand Down Expand Up @@ -160,12 +163,12 @@ export interface NodeHtmlMarkdownOptions {
/**
* Supplied elements will be ignored (ignores inner text and does not parse children)
*/
readonly ignore?: string[],
ignore?: string[],

/**
* Supplied elements will be treated as blocks (surrounded with blank lines)
*/
readonly blockElements?: string[],
blockElements?: string[],

/**
* Max consecutive new lines allowed
Expand Down Expand Up @@ -225,6 +228,8 @@ __For detail on how to use them see__:
- [translator.ts](https://github.com/crosstype/node-html-markdown/blob/master/src/translator.ts) - Documentation for `TranslatorConfig`
- [config.ts](https://github.com/crosstype/node-html-markdown/blob/master/src/config.ts) - Translators in `defaultTranslators`

The `NodeHtmlMarkdown#codeBlockTranslators` property is a collection of translators which handle elements within a `<pre><code>` block.

## Further improvements

Being a performance-centric library, we're always interested in further improvements.
Expand Down
13 changes: 12 additions & 1 deletion src/config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -171,11 +171,13 @@ export const defaultTranslators: TranslatorConfigObject = {
noEscape: true,
prefix: codeFence + language + '\n',
postfix: '\n' + codeFence,
childTranslators: visitor.instance.codeBlockTranslators
}
} else {
return {
noEscape: true,
postprocess: ({ content }) => content.replace(/^/gm, ' ')
postprocess: ({ content }) => content.replace(/^/gm, ' '),
childTranslators: visitor.instance.codeBlockTranslators
}
}
},
Expand Down Expand Up @@ -215,6 +217,15 @@ export const defaultTranslators: TranslatorConfigObject = {
},
}

/**
 * Default translator configs for elements found inside a code block.
 *
 * Keys are comma-separated tag names; values are the translator config applied to those tags.
 * NOTE(review): entry order is preserved here on purpose — consumers iterate this object with
 * `Object.entries`, so confirm before reordering.
 */
export const defaultCodeBlockTranslators: TranslatorConfigObject = {
// Line break becomes a literal newline; children are not visited
'br': { content: `\n`, recurse: false },
// Horizontal rule is emitted as the literal text `---`; children are not visited
'hr': { content: '---', recurse: false },
// Headings keep their text, wrapped in square brackets instead of markdown `#` markers
'h1,h2,h3,h4,h5,h6': { prefix: '[', postfix: ']' },
// Lists and list items reuse the very same config objects as the regular (non-code-block) translators
'ol,ul': defaultTranslators['ol,ul'],
'li': defaultTranslators['li'],
// Images define no content and are not recursed into
'img': { recurse: false }
}

// endregion


Expand Down
29 changes: 21 additions & 8 deletions src/main.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import { NodeHtmlMarkdownOptions } from './options';
import { TranslatorCollection, TranslatorConfigObject } from './translator';
import { defaultBlockElements, defaultIgnoreElements, defaultOptions, defaultTranslators } from './config';
import {
defaultBlockElements, defaultCodeBlockTranslators, defaultIgnoreElements, defaultOptions, defaultTranslators
} from './config';
import { parseHTML } from './utilities';
import { getMarkdownForHtmlNodes } from './visitor';

Expand All @@ -21,22 +23,33 @@ type Options = Partial<NodeHtmlMarkdownOptions>

export class NodeHtmlMarkdown {
public translators = new TranslatorCollection();
public codeBlockTranslators = new TranslatorCollection();
public readonly options: NodeHtmlMarkdownOptions

constructor(options?: Options, customTranslators?: TranslatorConfigObject) {
constructor(options?: Options, customTranslators?: TranslatorConfigObject, customCodeBlockTranslators?: TranslatorConfigObject) {
/* Setup Options */
this.options = { ...defaultOptions, ...options };
const ignoredElements = this.options.ignore?.concat(defaultIgnoreElements) ?? defaultIgnoreElements;
const blockElements = this.options.blockElements?.concat(defaultBlockElements) ?? defaultBlockElements;

/* Setup Translator Bases */
ignoredElements?.forEach(el => this.translators.set(el, { ignore: true, recurse: false }));
blockElements?.forEach(el => this.translators.set(el, { surroundingNewlines: 2 }));
ignoredElements?.forEach(el => {
this.translators.set(el, { ignore: true, recurse: false });
this.codeBlockTranslators.set(el, { ignore: true, recurse: false });
})

blockElements?.forEach(el => {
this.translators.set(el, { surroundingNewlines: 2 });
this.codeBlockTranslators.set(el, { surroundingNewlines: 2 });
});

/* Add and merge bases with default and custom translator configs */
for (const [ elems, cfg ] of Object.entries({ ...defaultTranslators, ...customTranslators }))
this.translators.set(elems, cfg, true);

for (const [ elems, cfg ] of Object.entries({ ...defaultCodeBlockTranslators, ...customCodeBlockTranslators }))
this.codeBlockTranslators.set(elems, cfg, true);

// TODO - Workaround for upstream issue (may not be fixed) - https://github.com/taoqf/node-html-parser/issues/78
if (!this.options.textReplace) this.options.textReplace = [];
this.options.textReplace.push([ /^<!DOCTYPE.*>/gmi, '' ]);
Expand All @@ -50,15 +63,15 @@ export class NodeHtmlMarkdown {
/**
* Translate HTML source text to markdown
*/
static translate(html: string, options?: Options, customTranslators?: TranslatorConfigObject): string
static translate(html: string, options?: Options, customTranslators?: TranslatorConfigObject, customCodeBlockTranslators?: TranslatorConfigObject): string
/**
* Translate collection of HTML source text to markdown
*/
static translate(files: FileCollection, options?: Options, customTranslators?: TranslatorConfigObject): FileCollection
static translate(htmlOrFiles: string | FileCollection, opt?: Options, trans?: TranslatorConfigObject):
static translate(files: FileCollection, options?: Options, customTranslators?: TranslatorConfigObject, customCodeBlockTranslators?: TranslatorConfigObject): FileCollection
static translate(htmlOrFiles: string | FileCollection, opt?: Options, customTranslators?: TranslatorConfigObject, customCodeBlockTranslators?: TranslatorConfigObject):
string | FileCollection
{
return NodeHtmlMarkdown.prototype.translateWorker.call(new NodeHtmlMarkdown(opt, trans), htmlOrFiles);
return NodeHtmlMarkdown.prototype.translateWorker.call(new NodeHtmlMarkdown(opt, customTranslators, customCodeBlockTranslators), htmlOrFiles);
}

// endregion
Expand Down
5 changes: 5 additions & 0 deletions src/translator.ts
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,11 @@ export type TranslatorConfig = {
* Keep whitespace as it is
*/
preserveWhitespace?: boolean

/**
* Custom translator collection to use for child HTML nodes
*/
childTranslators?: TranslatorCollection
}

export enum PostProcessResult {
Expand Down
11 changes: 6 additions & 5 deletions src/visitor.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ import { ElementNode, HtmlNode, isElementNode, isTextNode } from './nodes';
import { getChildNodes, getTrailingWhitespaceInfo, perfStart, perfStop, trimNewLines } from './utilities';
import {
createTranslatorContext, isTranslatorConfig, PostProcessResult, TranslatorConfig, TranslatorConfigFactory,
TranslatorContext
TranslatorConfigObject, TranslatorContext
} from './translator';
import { NodeHtmlMarkdownOptions } from './options';
import { contentlessElements } from './config';
Expand All @@ -19,6 +19,7 @@ export interface NodeMetadata {
listItemNumber?: number
noEscape?: boolean
preserveWhitespace?: boolean
translators?: TranslatorConfigObject
}

export type NodeMetadataMap = Map<ElementNode, NodeMetadata>
Expand Down Expand Up @@ -160,7 +161,7 @@ export class Visitor {
if (textOnly || !isElementNode(node)) return;

/* Handle element node */
const { instance: { translators } } = this;
const translators = metadata?.translators ?? this.instance.translators;
const translatorCfgOrFactory = translators[node.tagName] as TranslatorConfig | TranslatorConfigFactory;

/* Update metadata with list detail */
Expand Down Expand Up @@ -202,9 +203,9 @@ export class Visitor {
// Skip and don't check children if ignore flag set
if (cfg.ignore) return;

/* Update metadata for noEscape flag */
if (cfg.noEscape && !metadata?.noEscape) {
metadata = { ...metadata, noEscape: true };
/* Update metadata if needed */
if ((cfg.noEscape && !metadata?.noEscape) || (cfg.childTranslators && !metadata?.translators)) {
metadata = { ...metadata, noEscape: cfg.noEscape, translators: cfg.childTranslators };
this.nodeMetadata.set(node, metadata);
}

Expand Down
80 changes: 80 additions & 0 deletions test/default-tags-codeblock.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
// noinspection HtmlUnknownTarget

import { NodeHtmlMarkdown } from '../src';


/* ****************************************************************************************************************** *
* Tests
* ****************************************************************************************************************** */

// Test suite: checks how elements nested inside `<pre><code>` are translated by a default-configured
// NodeHtmlMarkdown instance. Expected strings are whitespace-sensitive — do not reformat them.
// Note: Newline handling for block elements within code blocks is not very clean. This can be fixed later.
describe(`Default Tags`, () => {
let instance: NodeHtmlMarkdown;
// Wraps the supplied HTML in `<pre><code>` and translates it with the shared instance
const translateAsBlock = (html: string) => instance.translate(`<pre><code>${html}</code></pre>`);
// Wraps the expected body in a triple-backtick fence (no language tag)
const getExpected = (s: string) => '```\n' + s + '\n```';
// A single instance with default options/translators is created once for the whole suite
beforeAll(() => {
instance = new NodeHtmlMarkdown();
});

// <br> is expected to become a single newline
test(`Line Break (br)`, () => {
const res = translateAsBlock(`a<br>b`);
expect(res).toBe(getExpected(`a\nb`));
});

// <hr> is expected to become `---` with a blank line on each side
test(`Horizontal Rule (hr)`, () => {
const res = translateAsBlock(`a<hr>b`);
expect(res).toBe(getExpected(`a\n\n---\n\nb`));
});

// These tags are expected to contribute only their text (no markdown markers);
// per the expected string, `pre` and `blockquote` still get surrounding blank lines
test(`Non-processed Elements (b, strong, del, s, strike, em, i, pre, code, blockquote, a)`, () => {
const tags = [ 'b', 'strong', 'del', 's', 'strike', 'em', 'i', 'code', 'a', 'pre', 'blockquote' ];
const html = tags.map(t => `<${t}>${t}</${t}>`).join(' ');
const exp = 'b strong del s strike em i code a \n\npre\n\n blockquote\n\n';

const res = translateAsBlock(html);
expect(res).toBe(getExpected(exp));
});

// <img> is expected to produce no output at all
test(`Image (img)`, () => {
const res = translateAsBlock(`a<img src="https://www.google.com/">b`);
expect(res).toBe(getExpected(`ab`));
});

// The loop deliberately runs through h7: h1-h6 are expected as `[a]`, while the
// non-existent <h7> is expected to fall through as plain `a`
test(`Headings (h1, h2, h3, h4, h5, h6)`, () => {
let nodes: string[] = [];
for (let i = 1; i < 8; i++) nodes.push(`<h${i}>a</h${i}>`);
const res = translateAsBlock(nodes.join(''));
expect(res).toBe(getExpected('\n[a]\n'.repeat(6) + '\na'));
});

// Note: Newline handling here for block elements is unusual
describe(`Lists (ol + li, ul + li)`, () => {
// Ordered outer list containing a nested ordered and a nested unordered list
test(`Multi-level Ordered List`, () => {
const res = translateAsBlock(`
<ol>
<li>a<br><br><s>b</s></li>
<li> </li> <!-- Elided due to whitespace -->
<li>b
<ol><li>c<br>d</li></ol>
<ul><li>e<br>f</li></ul>
</li>
</ol>
`);
expect(res).toBe(getExpected(` \n \n1. a \nb\n \n \n2. b \n 1. c \n d \n \n * e \n f\n \n `));
});

// Unordered outer list containing a nested unordered and a nested ordered list
test(`Multi-level Unordered List`, () => {
const res = translateAsBlock(`
<ul>
<li>a<br><br><s>b</s></li>
<li> </li> <!-- Elided due to whitespace -->
<li>b
<ul><li>c<br>d</li></ul>
<ol><li>e<br>f</li></ol>
</li>
</ul>
`);
expect(res).toBe(getExpected(` \n \n* a \nb\n \n \n* b \n * c \n d \n \n 1. e \n f\n \n `));
});
});
});

0 comments on commit 14fb90b

Please sign in to comment.