Better encoding detection
Fix #7
yishn committed Feb 3, 2020
1 parent 1975f01 commit 742d8d1
Showing 5 changed files with 101 additions and 30 deletions.
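The change in a nutshell: instead of committing to whatever jschardet guesses from the first 100 bytes, the parser now samples the values of text-bearing properties and runs detection on those, so files that carry no CA[] (charset) property decode correctly. A caller-level sketch of the new behavior, assuming the published package name @sabaki/sgf (the fixture path and expected text come from the new test below):

    const sgf = require('@sabaki/sgf')

    // tests/no-ca.sgf is GB-encoded Chinese with no CA[] property.
    // Previously its comments decoded as mojibake; now the encoding is
    // detected from sampled property values before parsing commits to it.
    let gameTrees = sgf.parseFile('tests/no-ca.sgf')
    console.log(gameTrees[0].data.C[0]) // expected to start with '【第三型】'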
src/iconv-lite.js (2 additions, 1 deletion)
@@ -7,7 +7,8 @@ module.exports = (() => {
   } catch (err) {
     return {
       encodingExists: () => true,
-      decode: buffer => buffer.toString()
+      decode: buffer => buffer.toString(),
+      encode: (str, _) => str
     }
   }
 })()
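The identity encode stub matters because tokenize.js (below) re-encodes already-decoded property values to recover their original bytes for detection; with the real iconv-lite that round-trip is faithful, and with the fallback it degrades to a no-op instead of crashing. A minimal sketch of the round-trip, assuming iconv-lite is installed (the byte values are the GB encoding of '古力'):

    const iconv = require('iconv-lite')

    let raw = Buffer.from([0xb9, 0xc5, 0xc1, 0xa6]) // '古力' in GB18030
    let text = iconv.decode(raw, 'GB18030') // decode with a guessed encoding
    let back = iconv.encode(text, 'GB18030') // recover the original bytes
    console.log(back.equals(raw)) // true, so detection sees the real bytes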
src/jschardet.js (20 additions, 2 deletions)
@@ -1,12 +1,30 @@
-module.exports = (() => {
+let jschardet = (() => {
   try {
     let m = require('jschardet')
     if (m == null) throw new Error()
 
     return m
   } catch (err) {
     return {
-      detect: () => ({encoding: 'UTF-8'})
+      detect: () => ({encoding: 'UTF-8'}),
+      detectBuffers() {
+        return this.detect()
+      }
     }
   }
 })()
+
+module.exports = {
+  detectBuffers(buffers) {
+    let u = new jschardet.UniversalDetector()
+    u.reset()
+
+    for (let buf of buffers) {
+      u.feed(buf.toString('binary'))
+    }
+
+    u.close()
+    return u.result
+  },
+  ...jschardet
+}
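detectBuffers pushes several byte samples through a single UniversalDetector pass, so short values like player names accumulate into a sample large enough for a confident verdict. A usage sketch, assuming iconv-lite is available to fabricate GB-encoded input (the sample strings are taken from the test fixtures):

    const iconv = require('iconv-lite')
    const jschardet = require('./jschardet')

    // Encode a few short strings the way they appear in a GB-encoded SGF file
    let buffers = ['柯洁', '古力', '【第三型】'].map(s => iconv.encode(s, 'GB18030'))

    let result = jschardet.detectBuffers(buffers)
    // With jschardet installed, result is UniversalDetector's verdict,
    // e.g. {encoding: 'GB2312', confidence: ...}; with the stub above it
    // is always {encoding: 'UTF-8'}.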
src/tokenize.js (40 additions, 17 deletions)
@@ -3,7 +3,19 @@ const iconv = require('./iconv-lite')
 const jschardet = require('./jschardet')
 const {unescapeString} = require('./helper')
 
-const tokenizeInner = createTokenizer({
+const encodingDetectionProps = [
+  'EV',
+  'GN',
+  'GC',
+  'AN',
+  'BT',
+  'WT',
+  'PW',
+  'PB',
+  'C'
+]
+
+const _tokenize = createTokenizer({
   rules: [
     regexRule('_whitespace', /\s+/y, {lineBreaks: true}),
     regexRule('parenthesis', /(\(|\))/y),
@@ -16,7 +28,7 @@ const tokenizeInner = createTokenizer({
 exports.tokenizeIter = function*(contents) {
   let length = contents.length
 
-  for (let token of tokenizeInner(contents)) {
+  for (let token of _tokenize(contents)) {
     token.progress = token.pos / (length - 1)
     delete token.length
 
@@ -34,51 +46,62 @@ exports.tokenizeBufferIter = function*(buffer, {encoding = null} = {}) {
 
   // Guess encoding
 
-  let detectedEncoding = jschardet.detect(buffer.slice(0, 100)).encoding
+  let detectedEncoding = jschardet.detect(buffer).encoding
   let contents = iconv.decode(buffer, detectedEncoding)
   let tokens = exports.tokenizeIter(contents)
 
   // Search for encoding
 
   let prelude = []
-  let secondSemicolon = false
-  let givenEncoding = detectedEncoding
+  let testBuffers = []
 
   while (true) {
     let next = tokens.next()
     if (next.done) break
 
     let {type, value} = next.value
-    let i = prelude.length
+    let lastToken = prelude[prelude.length - 1]
 
     prelude.push(next.value)
 
-    if (type === 'semicolon') {
-      if (!secondSemicolon) secondSemicolon = true
-      else break
-    } else if (
+    if (
       type === 'c_value_type' &&
-      i > 0 &&
-      prelude[i - 1].type === 'prop_ident' &&
-      prelude[i - 1].value === 'CA'
+      lastToken != null &&
+      lastToken.type === 'prop_ident' &&
+      lastToken.value === 'CA'
     ) {
-      givenEncoding = unescapeString(value.slice(1, -1))
+      encoding = unescapeString(value.slice(1, -1))
       break
+    } else if (
+      type === 'c_value_type' &&
+      lastToken != null &&
+      lastToken.type === 'prop_ident' &&
+      encodingDetectionProps.includes(lastToken.value)
+    ) {
+      testBuffers.push(iconv.encode(value.slice(1, -1), detectedEncoding))
+
+      if (testBuffers.reduce((sum, buf) => sum + buf.length, 0) > 100) break
     }
   }
 
+  if (encoding == null && testBuffers.length > 0) {
+    encoding = jschardet.detectBuffers(testBuffers).encoding
+  }
+
   if (
-    detectedEncoding !== givenEncoding &&
-    iconv.encodingExists(givenEncoding)
+    encoding != null &&
+    encoding != detectedEncoding &&
+    iconv.encodingExists(encoding)
   ) {
-    yield* exports.tokenizeIter(iconv.decode(buffer, givenEncoding))
+    yield* exports.tokenizeIter(iconv.decode(buffer, encoding))
   } else {
     yield* prelude
     yield* tokens
   }
 }
 
 exports.tokenize = contents => [...exports.tokenizeIter(contents)]
 
 exports.tokenizeBuffer = (buffer, opts) => [
   ...exports.tokenizeBufferIter(buffer, opts)
 ]
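Taken together, tokenizeBufferIter now works in three stages: decode with jschardet's whole-buffer guess, scan the leading tokens for an explicit CA[] value, and otherwise collect re-encoded values of the properties in encodingDetectionProps until more than 100 bytes are available for detectBuffers; only when the final verdict differs from the initial guess (and iconv-lite knows the encoding) is the buffer re-decoded and re-tokenized from scratch. A caller-level sketch:

    const fs = require('fs')
    const {tokenizeBuffer} = require('./tokenize')

    // No CA[] needed: PW/PB/C values in the prelude are sampled, a GB
    // encoding is detected, and the whole buffer is tokenized again with it.
    let tokens = tokenizeBuffer(fs.readFileSync('tests/no-ca.sgf'))
    // Each token carries at least {type, value, pos, progress}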
tests/no-ca.sgf (18 additions, 0 deletions)
@@ -0,0 +1,18 @@
+(;AB[dd][gc][rc][qc][pc][oc][pd][pe][pf][qe][of][og]AW[qf][rg][re][rd][qd][pg][ph]
+[nc][ob][pb]C[�������͡� ����
+���Ǵ�ѹ���Ƕ�ʽ�ݱ�������Ρ������������ںڱ��ϵİ������أ�
+
+]
+AP[MultiGo:4.2.1]SZ[19]MULTIGOGM[1]
+
+(;B[mc]
+(;W[nd];B[mb];W[nb];B[md];W[ne];B[lf]C[����
+��1����������һ�֣�Ҳ����ǿ�Ĺ�������2ֻ�ܳ�������3�������֣�Ȼ����5��7λǿ�⣬�����׳���ȥ�����Ե���])
+(;W[mb];B[nd];W[nb];B[lc];W[lb];B[kb]LB[kc:A]C[�仯
+������1λ�⣬��˳����2λ���ִ�֮��ڿ���ǿ����4��6λӲ�ԣ��ײ������ܡ�����Aλ�ĶϺڲ��¡�]))
+(;B[lc];W[mc];B[ld];W[lb];B[kb];W[ma]LB[md:A]C[ƽ��
+����1λ����һ���˵ĸо���������������������2λ������4��6λ����޶�������ֶΣ��׻���Aλ������ܵ��ֶΡ�])
+(;B[nd];W[mc];B[md];W[lc]C[����
+��1������֣���2��4�����󣬺ں����ջ񡣺ڵ������߷��Ǻܳ����ġ�])
+(;B[mb];W[mc];B[lb];W[nb];B[lc];W[me]C[�޲�
+��1�㲻��Ҫ�죬��2ѹ��4ճ����5�պ󣬰�6�����������������ӳ������޷����ס�]))
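The comment text above renders as mojibake because the fixture is deliberately saved in a GB encoding while this page assumes UTF-8; that is exactly the situation the commit fixes. A sketch of how such a fixture could be produced, assuming iconv-lite (the file name and SGF body here are hypothetical, only '【第三型】' is taken from the new test):

    const fs = require('fs')
    const iconv = require('iconv-lite')

    // Write an SGF with a Chinese comment and no CA[] property
    let sgfText = '(;GM[1]SZ[19]C[【第三型】])'
    fs.writeFileSync('tests/no-ca-sample.sgf', iconv.encode(sgfText, 'GB18030'))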
tests/parse.test.js (21 additions, 10 deletions)
@@ -1,4 +1,5 @@
 const t = require('tap')
+const path = require('path')
 const sgf = require('..')
 
 function getJSON(tree) {
@@ -128,7 +129,7 @@ t.test('should convert lower case properties', t => {
 })
 
 t.test('should parse a relatively complex file', t => {
-  let trees = sgf.parseFile(__dirname + '/complex.sgf')
+  let trees = sgf.parseFile(path.resolve(__dirname, 'complex.sgf'))
 
   t.equal(trees.length, 1)
   t.end()
@@ -182,8 +183,8 @@ let languageMap = {
 for (let language in languageMap) {
   t.test('should be able to decode non-UTF-8 text nodes', t => {
     t.equal(
-      sgf.parseFile(`${__dirname}/${language}.sgf`)[0].children[0].children[0]
-        .data.C[0],
+      sgf.parseFile(path.resolve(__dirname, `${language}.sgf`))[0].children[0]
+        .children[0].data.C[0],
       `${languageMap[language]} is fun`
     )
 
@@ -192,17 +193,17 @@
 }
 
 t.test('should be able to go back and re-parse attributes set before CA', t => {
-  t.equal(sgf.parseFile(__dirname + '/chinese.sgf')[0].data.PW[0], '柯洁')
-
-  t.equal(sgf.parseFile(__dirname + '/chinese.sgf')[0].data.PB[0], '古力')
+  let gameTrees = sgf.parseFile(path.resolve(__dirname, 'chinese.sgf'))
+
+  t.equal(gameTrees[0].data.PW[0], '柯洁')
+  t.equal(gameTrees[0].data.PB[0], '古力')
   t.end()
 })
 
 t.test('should ignore unknown encodings', t => {
   t.notEqual(
-    sgf.parseFile(__dirname + '/japanese_bad.sgf')[0].children[0].children[0]
-      .data.C[0],
+    sgf.parseFile(path.resolve(__dirname, 'japanese_bad.sgf'))[0].children[0]
+      .children[0].data.C[0],
     `${languageMap['japanese']} is fun`
   )
 
@@ -211,16 +212,26 @@ t.test('should ignore unknown encodings', t => {
 
 t.test('should ignore BOM markers', t => {
   t.doesNotThrow(() => {
-    sgf.parseFile(__dirname + '/utf8bom.sgf')
+    sgf.parseFile(path.resolve(__dirname, 'utf8bom.sgf'))
   })
 
   t.end()
 })
 
 t.test('should parse a UTF-16 LE file correctly', t => {
   t.doesNotThrow(() => {
-    sgf.parseFile(__dirname + '/utf16le.sgf')
+    sgf.parseFile(path.resolve(__dirname, 'utf16le.sgf'))
   })
 
   t.end()
 })
+
+t.test('should detect encoding automatically', t => {
+  t.ok(
+    sgf
+      .parseFile(path.resolve(__dirname, 'no-ca.sgf'))[0]
+      .data.C[0].startsWith('【第三型】')
+  )
+
+  t.end()
+})
