Skip to content

Commit

Permalink
Script to automatically update tlds.js
Browse files Browse the repository at this point in the history
  • Loading branch information
Nick Frasser committed Nov 24, 2021
1 parent 1ae5f69 commit 6ba92e4
Show file tree
Hide file tree
Showing 4 changed files with 84 additions and 25 deletions.
1 change: 1 addition & 0 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 3 additions & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@
"dist:ci": "run-s clean build:ci copy",
"lint": "eslint *.js packages/**/*.js test/**/*.js",
"test": "nyc --reporter=lcov --reporter=text mocha test/index.js",
"test:ci": "karma start test/ci.conf.js --single-run"
"test:ci": "karma start test/ci.conf.js --single-run",
"tlds": "node tasks/update-tlds.js"
},
"author": "Hypercontext",
"license": "MIT",
Expand Down Expand Up @@ -48,6 +49,7 @@
"mocha-lcov-reporter": "^1.3.0",
"npm-run-all": "^4.1.5",
"nyc": "^15.1.0",
"punycode": "^2.1.1",
"qunit": "^2.14.1",
"react": "^17.0.1",
"react-dom": "^17.0.1",
Expand Down
39 changes: 15 additions & 24 deletions packages/linkifyjs/src/core/tlds.js
Original file line number Diff line number Diff line change
@@ -1,13 +1,5 @@
// NOTE: punycode versions of IDNs are not included here because these will not
// be as commonly used without the http prefix anyway and linkify will already
// force-encode those.

// NOTE: vermögensberater vermögensberatung are special cases because they're
// the only ones in this list that contain non-ASCII characters

// To be updated with the values in this list
// http://data.iana.org/TLD/tlds-alpha-by-domain.txt
// Version 2021101300, Last Updated Wed Oct 13 07:07:01 2021 UTC
// THIS FILE IS AUTOMATICALLY GENERATED DO NOT EDIT DIRECTLY
// https://data.iana.org/TLD/tlds-alpha-by-domain.txt
export const tlds = 'aaa \
aarp \
abarth \
Expand Down Expand Up @@ -830,6 +822,7 @@ mtn \
mtr \
mu \
museum \
music \
mutual \
mv \
mw \
Expand Down Expand Up @@ -1351,7 +1344,6 @@ zm \
zone \
zuerich \
zw'.split(' ');

// Internationalized domain names containing non-ASCII
export const utlds = 'ελ \
ευ \
Expand All @@ -1361,7 +1353,6 @@ export const utlds = 'ελ \
ею \
католик \
ком \
қаз \
мкд \
мон \
москва \
Expand All @@ -1372,7 +1363,7 @@ export const utlds = 'ελ \
сайт \
срб \
укр \
გე \
қаз \
հայ \
ישראל \
קום \
Expand All @@ -1389,10 +1380,8 @@ export const utlds = 'ελ \
ایران \
بارت \
بازار \
بھارت \
بيتك \
پاکستان \
ڀارت \
بھارت \
تونس \
سودان \
سورية \
Expand All @@ -1409,6 +1398,8 @@ export const utlds = 'ελ \
موريتانيا \
موقع \
همراه \
پاکستان \
ڀارت \
कॉम \
नेट \
भारत \
Expand All @@ -1431,19 +1422,16 @@ export const utlds = 'ελ \
คอม \
ไทย \
ລາວ \
닷넷 \
닷컴 \
삼성 \
한국 \
გე \
みんな \
アマゾン \
グーグル \
クラウド \
グーグル \
コム \
ストア \
セール \
ファッション \
ポイント \
みんな \
世界 \
中信 \
中国 \
Expand All @@ -1465,7 +1453,6 @@ export const utlds = 'ελ \
嘉里 \
嘉里大酒店 \
在线 \
大众汽车 \
大拿 \
天主教 \
娱乐 \
Expand Down Expand Up @@ -1504,4 +1491,8 @@ export const utlds = 'ελ \
食品 \
餐厅 \
香格里拉 \
香港'.split(' ');
香港 \
닷넷 \
닷컴 \
삼성 \
한국'.split(' ');
65 changes: 65 additions & 0 deletions tasks/update-tlds.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
const http = require('https'); // or 'https' for https:// URLs
const fs = require('fs');
const punycode = require('punycode/');

const tldsListUrl = 'https://data.iana.org/TLD/tlds-alpha-by-domain.txt';
const tldsjs = 'packages/linkifyjs/src/core/tlds.js';

// NOTE: vermögensberater and vermögensberatung are special cases because
// they're the only ones that contain a mix of ASCII and non-ASCII characters.
// They are skipped while scanning the list and then appended, in Unicode
// form, to the `tlds` list rather than to `utlds`.
const specialTlds = ['XN--VERMGENSBERATER-CTB', 'XN--VERMGENSBERATUNG-PWB'];

/**
 * Split the contents of the downloaded TLD list into the two sorted lists
 * written to the generated module.
 *
 * Blank lines and lines starting with `#` are ignored. Entries starting with
 * `XN--` are decoded to Unicode and collected in `utlds`; every other entry
 * is lower-cased and collected in `tlds`.
 *
 * NOTE: punycode versions of IDNs (e.g., `XN--...`) do not get included in
 * the output because these will not be as commonly used without the http
 * prefix anyway and linkify will already force-encode those.
 *
 * @param {string} contents - Raw text of the TLD list, one entry per line.
 * @returns {{tlds: string[], utlds: string[]}} Both lists, sorted.
 */
function parseTldsList(contents) {
	const tlds = [];
	const utlds = [];

	for (const rawLine of contents.split('\n')) {
		const line = rawLine.trim();
		if (!line || line[0] === '#' || specialTlds.includes(line)) { continue; }
		if (/^XN--/.test(line)) {
			utlds.push(punycode.toUnicode(line.toLowerCase()));
		} else {
			tlds.push(line.toLowerCase());
		}
	}

	const specialUtlds = specialTlds.map((tld) => punycode.toUnicode(tld.toLowerCase()));
	return {
		tlds: tlds.concat(specialUtlds).sort(),
		utlds: utlds.sort()
	};
}

/**
 * Build the full source text of the generated module.
 *
 * Each list is emitted as a single-quoted string literal with one entry per
 * line (joined by a space and a backslash line continuation) that is split
 * on spaces at load time.
 *
 * @param {{tlds: string[], utlds: string[]}} lists - Sorted lists to emit.
 * @returns {string} Source text, terminated by a newline.
 */
function renderTldsModule({ tlds, utlds }) {
	const separator = ' \\\n';
	return [
		'// THIS FILE IS AUTOMATICALLY GENERATED DO NOT EDIT DIRECTLY',
		`// ${tldsListUrl}`,
		`export const tlds = '${tlds.join(separator)}'.split(' ');`,
		'// Internationalized domain names containing non-ASCII',
		`export const utlds = '${utlds.join(separator)}'.split(' ');`,
		''
	].join('\n');
}

/**
 * Report a failure and make the process exit with a non-zero status, so the
 * generated file is never overwritten silently with bad data.
 *
 * @param {string} message - Description of what went wrong.
 */
function fail(message) {
	console.error(message);
	process.exitCode = 1;
}

console.log(`Downloading ${tldsListUrl}...`);
http.get(tldsListUrl, (response) => {
	if (response.statusCode !== 200) {
		// Drain the body so the socket is released, and leave the existing
		// generated file untouched instead of parsing an error page.
		response.resume();
		fail(`Failed to download ${tldsListUrl}: HTTP ${response.statusCode}`);
		return;
	}

	// Decode as text so chunks are concatenated as strings, not Buffers.
	response.setEncoding('utf8');
	let tldsListContents = '';
	response.on('data', (chunk) => { tldsListContents += chunk; });
	response.on('error', (error) => {
		fail(`Failed to download ${tldsListUrl}: ${error.message}`);
	});
	response.on('end', () => {
		if (!response.complete) {
			// Connection ended before the whole list arrived; a partial list
			// would silently drop TLDs from the generated file.
			fail(`Failed to download ${tldsListUrl}: response was truncated`);
			return;
		}
		console.log(`Downloaded. Re-generating ${tldsjs}...`);

		// A single write replaces the open/write/close sequence, so no file
		// descriptor can be left open if a write throws part-way through.
		fs.writeFileSync(tldsjs, renderTldsModule(parseTldsList(tldsListContents)));

		console.log('Done');
	});
}).on('error', (error) => {
	fail(`Failed to download ${tldsListUrl}: ${error.message}`);
});

0 comments on commit 6ba92e4

Please sign in to comment.