From 6ba92e423a57ecb25ffd97688eca3770973b2bb6 Mon Sep 17 00:00:00 2001
From: Nick Frasser
Date: Tue, 23 Nov 2021 19:04:52 -0500
Subject: [PATCH] Script to automatically update tlds.js

---
Reviewer note: tasks/update-tlds.js was revised after this patch was
generated (HTTP status and error handling, empty-list guard, single file
write). The diffstat and hunk header below reflect the revision; the `index`
blob id of that file does not.

 package-lock.json                   |   1 +
 package.json                        |   4 +-
 packages/linkifyjs/src/core/tlds.js |  39 ++++------
 tasks/update-tlds.js                | 106 ++++++++++++++++++++++++++++
 4 files changed, 125 insertions(+), 25 deletions(-)
 create mode 100644 tasks/update-tlds.js

diff --git a/package-lock.json b/package-lock.json
index 4bd9e654..b7a4e165 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -40,6 +40,7 @@
         "mocha-lcov-reporter": "^1.3.0",
         "npm-run-all": "^4.1.5",
         "nyc": "^15.1.0",
+        "punycode": "^2.1.1",
         "qunit": "^2.14.1",
         "react": "^17.0.1",
         "react-dom": "^17.0.1",
diff --git a/package.json b/package.json
index de346bcf..0412df2e 100644
--- a/package.json
+++ b/package.json
@@ -17,7 +17,8 @@
     "dist:ci": "run-s clean build:ci copy",
     "lint": "eslint *.js packages/**/*.js test/**/*.js",
     "test": "nyc --reporter=lcov --reporter=text mocha test/index.js",
-    "test:ci": "karma start test/ci.conf.js --single-run"
+    "test:ci": "karma start test/ci.conf.js --single-run",
+    "tlds": "node tasks/update-tlds.js"
   },
   "author": "Hypercontext",
   "license": "MIT",
@@ -48,6 +49,7 @@
     "mocha-lcov-reporter": "^1.3.0",
     "npm-run-all": "^4.1.5",
     "nyc": "^15.1.0",
+    "punycode": "^2.1.1",
     "qunit": "^2.14.1",
     "react": "^17.0.1",
     "react-dom": "^17.0.1",
diff --git a/packages/linkifyjs/src/core/tlds.js b/packages/linkifyjs/src/core/tlds.js
index 0948c41d..512bb84c 100644
--- a/packages/linkifyjs/src/core/tlds.js
+++ b/packages/linkifyjs/src/core/tlds.js
@@ -1,13 +1,5 @@
-// NOTE: punycode versions of IDNs are not included here because these will not
-// be as commonly used without the http prefix anyway and linkify will already
-// force-encode those.
-
-// NOTE: vermögensberater vermögensberatung are special cases because they're
-// the only ones in this list that contain non-ASCII characters
-
-// To be updated with the values in this list
-// http://data.iana.org/TLD/tlds-alpha-by-domain.txt
-// Version 2021101300, Last Updated Wed Oct 13 07:07:01 2021 UTC
+// THIS FILE IS AUTOMATICALLY GENERATED DO NOT EDIT DIRECTLY
+// https://data.iana.org/TLD/tlds-alpha-by-domain.txt
 export const tlds = 'aaa \
 aarp \
 abarth \
@@ -830,6 +822,7 @@ mtn \
 mtr \
 mu \
 museum \
+music \
 mutual \
 mv \
 mw \
@@ -1351,7 +1344,6 @@ zm \
 zone \
 zuerich \
 zw'.split(' ');
-
 // Internationalized domain names containing non-ASCII
 export const utlds = 'ελ \
 ευ \
@@ -1361,7 +1353,6 @@ export const utlds = 'ελ \
 ею \
 католик \
 ком \
-қаз \
 мкд \
 мон \
 москва \
@@ -1372,7 +1363,7 @@ export const utlds = 'ελ \
 сайт \
 срб \
 укр \
-გე \
+қаз \
 հայ \
 ישראל \
 קום \
@@ -1389,10 +1380,8 @@ export const utlds = 'ελ \
 ایران \
 بارت \
 بازار \
-بھارت \
 بيتك \
-پاکستان \
-ڀارت \
+بھارت \
 تونس \
 سودان \
 سورية \
@@ -1409,6 +1398,8 @@ export const utlds = 'ελ \
 موريتانيا \
 موقع \
 همراه \
+پاکستان \
+ڀارت \
 कॉम \
 नेट \
 भारत \
@@ -1431,19 +1422,16 @@ export const utlds = 'ελ \
 คอม \
 ไทย \
 ລາວ \
-닷넷 \
-닷컴 \
-삼성 \
-한국 \
+გე \
+みんな \
 アマゾン \
-グーグル \
 クラウド \
+グーグル \
 コム \
 ストア \
 セール \
 ファッション \
 ポイント \
-みんな \
 世界 \
 中信 \
 中国 \
@@ -1465,7 +1453,6 @@ export const utlds = 'ελ \
 嘉里 \
 嘉里大酒店 \
 在线 \
-大众汽车 \
 大拿 \
 天主教 \
 娱乐 \
@@ -1504,4 +1491,8 @@ export const utlds = 'ελ \
 食品 \
 餐厅 \
 香格里拉 \
-香港'.split(' ');
+香港 \
+닷넷 \
+닷컴 \
+삼성 \
+한국'.split(' ');
diff --git a/tasks/update-tlds.js b/tasks/update-tlds.js
new file mode 100644
index 00000000..d0db0956
--- /dev/null
+++ b/tasks/update-tlds.js
@@ -0,0 +1,106 @@
+'use strict';
+
+// Regenerates packages/linkifyjs/src/core/tlds.js from the official IANA TLD
+// list. Run from the repository root with `npm run tlds`.
+const https = require('https');
+const fs = require('fs');
+const punycode = require('punycode/');
+
+const tldsListUrl = 'https://data.iana.org/TLD/tlds-alpha-by-domain.txt';
+const tldsjs = 'packages/linkifyjs/src/core/tlds.js';
+
+// NOTE: vermögensberater and vermögensberatung are special cases because
+// they're the only ones that contain a mix of ASCII and non-ASCII characters.
+// They are written, in Unicode form, alongside the ASCII TLDs.
+const specialTlds = ['XN--VERMGENSBERATER-CTB', 'XN--VERMGENSBERATUNG-PWB'];
+
+/**
+ * Download a text resource over HTTPS.
+ * @param {string} url
+ * @returns {Promise} resolves with the response body (string); rejects on a
+ * network error or a non-200 status so an error page is never parsed as TLDs
+ */
+function download(url) {
+	return new Promise((resolve, reject) => {
+		https.get(url, (response) => {
+			if (response.statusCode !== 200) {
+				response.resume(); // discard the body so the socket is released
+				reject(new Error(`GET ${url} failed with status ${response.statusCode}`));
+				return;
+			}
+			let body = '';
+			response.setEncoding('utf8');
+			response.on('data', (chunk) => { body += chunk; });
+			response.on('end', () => resolve(body));
+			response.on('error', reject);
+		}).on('error', reject);
+	});
+}
+
+/**
+ * Split the IANA list into ASCII TLDs and internationalized (Unicode) TLDs.
+ *
+ * NOTE: punycode versions of IDNs (e.g., `XN--...`) do not get included in
+ * the TLDs list because these will not be as commonly used without the http
+ * prefix anyway and linkify will already force-encode those.
+ *
+ * @param {string} contents raw text of the IANA list, one TLD per line
+ * @returns {{tlds: string[], utlds: string[]}} lower-cased, sorted lists
+ */
+function parseTlds(contents) {
+	const tlds = specialTlds.map((tld) => punycode.toUnicode(tld.toLowerCase()));
+	const utlds = [];
+
+	for (const rawLine of contents.split('\n')) {
+		const line = rawLine.trim();
+		// Skip blank lines, `#` comment lines and the special cases seeded above
+		if (!line || line[0] === '#' || specialTlds.includes(line)) { continue; }
+		if (/^XN--/.test(line)) {
+			utlds.push(punycode.toUnicode(line.toLowerCase()));
+		} else {
+			tlds.push(line.toLowerCase());
+		}
+	}
+	return { tlds: tlds.sort(), utlds: utlds.sort() };
+}
+
+/**
+ * Render an `export const NAME = '...'.split(' ');` statement, one TLD per line.
+ * @param {string} name exported identifier
+ * @param {string[]} list TLDs to embed
+ * @returns {string} a complete statement terminated by a newline
+ */
+function renderList(name, list) {
+	return `export const ${name} = '${list.join(' \\\n')}'.split(' ');\n`;
+}
+
+/**
+ * Download the IANA list and overwrite tlds.js with the regenerated lists.
+ * @returns {Promise} settles once the file is written or the update fails
+ */
+function main() {
+	console.log(`Downloading ${tldsListUrl}...`);
+	return download(tldsListUrl).then((contents) => {
+		const { tlds, utlds } = parseTlds(contents);
+		if (tlds.length <= specialTlds.length || utlds.length === 0) {
+			// Never overwrite the checked-in file with an empty list
+			throw new Error(`No TLDs found in ${tldsListUrl}`);
+		}
+
+		console.log(`Downloaded. Re-generating ${tldsjs}...`);
+		// A single write leaves no file descriptor open if writing fails
+		fs.writeFileSync(tldsjs, [
+			'// THIS FILE IS AUTOMATICALLY GENERATED DO NOT EDIT DIRECTLY\n',
+			`// ${tldsListUrl}\n`,
+			renderList('tlds', tlds),
+			'// Internationalized domain names containing non-ASCII\n',
+			renderList('utlds', utlds)
+		].join(''));
+		console.log('Done');
+	});
+}
+
+main().catch((error) => {
+	console.error(error);
+	process.exitCode = 1;
+});