Skip to content

Commit

Permalink
Check if given encodings exist against the denormalized list of supported encodings
Browse files Browse the repository at this point in the history
  • Loading branch information
aadsm authored Jan 14, 2022
1 parent af66fa6 commit 7897929
Show file tree
Hide file tree
Showing 7 changed files with 78 additions and 5 deletions.
4 changes: 4 additions & 0 deletions src/charsetgroupprober.js
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,10 @@ function CharSetGroupProber() {
return this._mBestGuessProber.getCharsetName();
}

/**
 * Placeholder for the list of charset names this group prober can report.
 *
 * This implementation never returns: it always throws.
 * NOTE(review): presumably concrete group probers replace this method — confirm.
 *
 * @throws {Error} Always, with the message
 *     "Unimplemented method getSupportedCharsetNames()".
 */
this.getSupportedCharsetNames = function() {
    const message = "Unimplemented method getSupportedCharsetNames()";
    throw new Error(message);
};

this.feed = function(aBuf) {
for( var i = 0, prober; prober = this._mProbers[i]; i++ ) {
if( !prober || !prober.active ) continue;
Expand Down
4 changes: 4 additions & 0 deletions src/charsetprober.js
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,10 @@ function CharSetProber() {
return null;
}

/**
 * Placeholder for the list of charset names this prober can report.
 *
 * Calling it unconditionally raises an error.
 * NOTE(review): presumably concrete probers replace this method — confirm.
 *
 * @throws {Error} Always, with the message
 *     "Unimplemented method getSupportedCharsetNames()".
 */
this.getSupportedCharsetNames = function() {
    const unimplemented = new Error("Unimplemented method getSupportedCharsetNames()");
    throw unimplemented;
};

// No-op: this implementation ignores aBuf and returns undefined.
// NOTE(review): presumably concrete probers replace this method with one
// that actually consumes the buffer — confirm against the subclasses.
this.feed = function(aBuf) {
}

Expand Down
8 changes: 8 additions & 0 deletions src/escprober.js
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,10 @@ function EscCharSetProber() {
new CodingStateMachine(escsm.ISO2022JPSMModel),
new CodingStateMachine(escsm.ISO2022KRSMModel)
];
self._supportedCharsetNames = [];
for (const codingSM of self._mCodingSM) {
self._supportedCharsetNames.push(codingSM.getCodingStateMachine());
}
self.reset();
}

Expand All @@ -62,6 +66,10 @@ function EscCharSetProber() {
return this._mDetectedCharset;
}

/**
 * Returns the list stored on self._supportedCharsetNames.
 *
 * The stored array itself is handed back (not a copy), so every call
 * yields the same array object.
 *
 * @returns {Array} The value of self._supportedCharsetNames.
 */
this.getSupportedCharsetNames = function() {
    const names = self._supportedCharsetNames;
    return names;
};

this.getConfidence = function() {
if( this._mDetectedCharset ) {
return 0.99;
Expand Down
4 changes: 4 additions & 0 deletions src/latin1prober.js
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,10 @@ function Latin1Prober() {
return "windows-1252";
}

/**
 * Lists the charsets this prober can report.
 *
 * The list always has exactly one entry: whatever getCharsetName()
 * returns. A fresh array is created on every call.
 *
 * @returns {Array} Single-element array holding this.getCharsetName().
 */
this.getSupportedCharsetNames = function() {
    const onlyCharset = this.getCharsetName();
    return [onlyCharset];
};

this.feed = function(aBuf) {
aBuf = this.filterWithEnglishLetters(aBuf);
for( var i = 0; i < aBuf.length; i++ ) {
Expand Down
10 changes: 10 additions & 0 deletions src/mbcsgroupprober.js
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,16 @@ function MBCSGroupProber() {
new Big5Prober(),
new EUCTWProber()
];
// Collect the charset name reported by each prober registered in
// this._mProbers. The list is computed once, right here, so the getter
// below can hand back a real array. (Wrapping this in a function that is
// never invoked would make the getter return the function itself, and
// `this` inside a plain function would not refer to this prober.)
const supportedCharsetNames = this._mProbers.map(function(prober) {
    return prober.getCharsetName();
});

/**
 * Lists the charset names of the probers in this group.
 *
 * @returns {Array} The names collected from this._mProbers above; the
 *     same array object is returned on every call.
 */
this.getSupportedCharsetNames = function() {
    return supportedCharsetNames;
};
this.reset();
}
MBCSGroupProber.prototype = new CharSetGroupProber();
Expand Down
9 changes: 9 additions & 0 deletions src/sbcsgroupprober.js
Original file line number Diff line number Diff line change
Expand Up @@ -64,9 +64,18 @@ function SBCSGroupProber() {
hebrewProber.setModelProbers(logicalHebrewProber, visualHebrewProber);
self._mProbers.push(hebrewProber, logicalHebrewProber, visualHebrewProber);

self._supportedCharsetNames = [];
for (const prober of self._mProbers) {
self._supportedCharsetNames.push(prober.getCharsetName())
}

self.reset();
}

/**
 * Returns the charset names gathered into self._supportedCharsetNames.
 *
 * No copy is made: callers receive the stored array itself.
 *
 * @returns {Array} The value of self._supportedCharsetNames.
 */
this.getSupportedCharsetNames = function() {
    const charsetNames = self._supportedCharsetNames;
    return charsetNames;
};

init();
}
SBCSGroupProber.prototype = new CharSetGroupProber();
Expand Down
44 changes: 39 additions & 5 deletions src/universaldetector.js
Original file line number Diff line number Diff line change
Expand Up @@ -35,13 +35,50 @@ var constants = require('./constants');
var MBCSGroupProber = require('./mbcsgroupprober');
var SBCSGroupProber = require('./sbcsgroupprober');
var Latin1Prober = require('./latin1prober');
var EscCharSetProber = require('./escprober')
var EscCharSetProber = require('./escprober');
var logger = require('./logger');

/**
 * Every encoding name the detector can report, built once at module load.
 *
 * The list starts with a fixed set of Unicode encoding names and is then
 * extended with whatever each prober returns from
 * getSupportedCharsetNames(). Duplicates are removed (first occurrence
 * wins) because the list is interpolated into a user-facing error message.
 */
const supportedEncodings = (function() {
    // Fixed Unicode names; each name is listed exactly once.
    const BOM_UTF = [
        "UTF-8", "UTF-32LE", "UTF-32BE", "UTF-16LE", "UTF-16BE",
        "X-ISO-10646-UCS-4-3412", "X-ISO-10646-UCS-4-2143"
    ];
    const probers = [
        new EscCharSetProber(),
        new MBCSGroupProber(),
        new SBCSGroupProber(),
        new Latin1Prober()
    ];
    const encodings = BOM_UTF.slice(0);
    for (const prober of probers) {
        Array.prototype.push.apply(encodings, prober.getSupportedCharsetNames());
    }
    // A Set keeps insertion order, so this drops repeats without reordering.
    return Array.from(new Set(encodings));
})();

/**
 * Lower-cased spellings accepted when validating user-supplied encoding
 * names.
 *
 * Each entry of supportedEncodings contributes its lower-case name and,
 * when that differs, the lower-case name with every "-" removed
 * (e.g. "UTF-8" yields "utf-8" and "utf8").
 */
const supportedEncodingsDenormalized = (function() {
    // Declared locally: assigning to an undeclared name would create an
    // implicit global, and throws a ReferenceError in strict mode.
    const denormalizedEncodings = [];
    for (const encoding of supportedEncodings) {
        // toLowerCase() is locale-independent and matches the lower-casing
        // used when this list is searched; toLocaleLowerCase() can map "I"
        // differently under some host locales (e.g. Turkish).
        const lowerCased = encoding.toLowerCase();
        const withoutDashes = lowerCased.replace(/-/g, "");
        denormalizedEncodings.push(lowerCased);
        if (withoutDashes !== lowerCased) {
            denormalizedEncodings.push(withoutDashes);
        }
    }
    return denormalizedEncodings;
})();

function UniversalDetector(options) {
if (!options) options = {};
if (!options.minimumThreshold) options.minimumThreshold = 0.20;

if (options.detectEncodings) {
for (const encoding of options.detectEncodings) {
if (!supportedEncodingsDenormalized.includes(encoding.toLowerCase())) {
throw new Error(`Encoding ${encoding} is not supported. Supported encodings: ${supportedEncodings}.`);
}
}
}

var _state = {
pureAscii : 0,
escAscii : 1,
Expand All @@ -61,10 +98,7 @@ function UniversalDetector(options) {
if (!options.detectEncodings) {
return true;
}
// TODO: we probably should normalize detectEncodings
// and/or somehow indicate to the user that a given
// encoding does not exist.
return options.detectEncodings.includes(encoding);
return options.detectEncodings.includes(encoding.toLowerCase());
}

this.reset = function() {
Expand Down

0 comments on commit 7897929

Please sign in to comment.