1253 lines
64 KiB
JavaScript
1253 lines
64 KiB
JavaScript
|
|
'use strict';
|
|||
|
|
|
|||
|
|
// created 2023-09-25T01:01:55.148Z
|
|||
|
|
// compressed base64-encoded blob for include-ens data
|
|||
|
|
// source: https://github.com/adraffy/ens-normalize.js/blob/main/src/make.js
|
|||
|
|
// see: https://github.com/adraffy/ens-normalize.js#security
|
|||
|
|
// SHA-256: 0565ed049b9cf1614bb9e11ba7d8ac6a6fb96c893253d890f7e2b2884b9ded32
|
|||
|
|
var COMPRESSED$1 = 'AEEUdwmgDS8BxQKKAP4BOgDjATAAngDUAIMAoABoAOAAagCOAEQAhABMAHIAOwA9ACsANgAmAGIAHgAuACgAJwAXAC0AGgAjAB8ALwAUACkAEgAeAAkAGwARABkAFgA5ACgALQArADcAFQApABAAHgAiABAAGgAeABMAGAUhBe8BFxREN8sF2wC5AK5HAW8ArQkDzQCuhzc3NzcBP68NEfMABQdHBuw5BV8FYAA9MzkI9r4ZBg7QyQAWA9CeOwLNCjcCjqkChuA/lm+RAsXTAoP6ASfnEQDytQFJAjWVCkeXAOsA6godAB/cwdAUE0WlBCN/AQUCQRjFD/MRBjHxDQSJbw0jBzUAswBxme+tnIcAYwabAysG8QAjAEMMmxcDqgPKQyDXCMMxA7kUQwD3NXOrAKmFIAAfBC0D3x4BJQDBGdUFAhEgVD8JnwmQJiNWYUzrg0oAGwAUAB0AFnNcACkAFgBP9h3gPfsDOWDKneY2ChglX1UDYD30ABsAFAAdABZzIGRAnwDD8wAjAEEMzRbDqgMB2sAFYwXqAtCnAsS4AwpUJKRtFHsadUz9AMMVbwLpABM1NJEX0ZkCgYMBEyMAxRVvAukAEzUBUFAtmUwSAy4DBTER33EftQHfSwB5MxJ/AjkWKQLzL8E/cwBB6QH9LQDPDtO9ASNriQC5DQANAwCK21EFI91zHwCoL9kBqQcHBwcHKzUDowBvAQohPvU3fAQgHwCyAc8CKQMA5zMSezr7ULgFmDp/LzVQBgEGAi8FYQVgt8AFcTtlQhpCWEmfe5tmZ6IAExsDzQ8t+X8rBKtTAltbAn0jsy8Bl6utPWMDTR8Ei2kRANkDBrNHNysDBzECQWUAcwFpJ3kAiyUhAJ0BUb8AL3EfAbfNAz81KUsFWwF3YQZtAm0A+VEfAzEJDQBRSQCzAQBlAHsAM70GD/v3IZWHBwARKQAxALsjTwHZAeMPEzmXgIHwABIAGQA8AEUAQDt3gdvIEGcQZAkGTRFMdEIVEwK0D64L7REdDNkq09PgADSxB/MDWwfzA1sDWwfzB/MDWwfzA1sDWwNbA1scEvAi28gQZw9QBHUFlgWTBN4IiyZREYkHMAjaVBV0JhxPA00BBCMtSSQ7mzMTJUpMFE0LCAQ2SmyvfUADTzGzVP2QqgPTMlc5dAkGHnkSqAAyD3skNb1OhnpPcagKU0+2tYdJak5vAsY6sEAACikJm2/Dd1YGRRAfJ6kQ+ww3AbkBPw3xS9wE9QY/BM0fgRkdD9GVoAipLeEM8SbnLqWAXiP5KocF8Uv4POELUVFsD10LaQnnOmeBUgMlAREijwrhDT0IcRD3Cs1vDekRSQc9A9lJngCpBwULFR05FbkmFGKwCw05ewb/GvoLkyazEy17AAXXGiUGUQEtGwMA0y7rhbRaNVwgT2MGBwspI8sUrFAkDSlAu3hMGh8HGSWtApVDdEqLUToelyH6PEENai4XUYAH+TwJGVMLhTyiRq9FEhHWPpE9TCJNTDAEOYMsMyePCdMPiQy9fHYBXQklCbUMdRM1ERs3yQg9Bx0xlygnGQglRplgngT7owP3E9UDDwVDCUUHFwO5HDETMhUtBRGBKNsC9zbZLrcCk1aEARsFzw8pH+MQVEfkDu0InwJpA4cl7wAxFSUAGyKfCEdnAGOP3FMJLs8Iy2pwI3gDaxTrZRF3B5UOWwerHDcVwxzlcMxeD4YMKKezCV8BeQmdAWME5wgNNV+MpCBFZ1eLXBifIGVBQ14AAjUMaRWjRMGHfAKPD28SHwE5AXcHPQ0FAnsR8RFvEJkI74YINbkz/DopBFMhhyAVCisDU2zSCysm/Qz8bQGnEmYDEDRBd/Jnr2C6KBgBBx0yyUFkIfULlk/RDKAaxRhGVDIZ6AfDA/ca9yfuQVsGAwOnBxc6UTPyBMELbQiPCUMATQ6nGwfbGG4KdYzUATWPAbudA1uVhwJzkwY7Bw8Aaw+LBX3p
ACECqwinAAkA0wNbAD0CsQehAB0AiUUBQQMrMwEl6QKTA5cINc8BmTMB9y0EH8cMGQD7O25OAsO1AoBuZqYF4VwCkgJNOQFRKQQJUktVA7N15QDfAE8GF+NLARmvTs8e50cB43MvAMsA/wAJOQcJRQHRAfdxALsBYws1Caa3uQFR7S0AhwAZbwHbAo0A4QA5AIP1AVcAUQVd/QXXAlNNARU1HC9bZQG/AyMBNwERAH0Gz5GpzQsjBHEH1wIQHxXlAu8yB7kFAyLjE9FCyQK94lkAMhoKPAqrCqpgX2Q3CjV2PVQAEh+sPss/UgVVO1c7XDtXO1w7VztcO1c7XDtXO1wDm8Pmw+YKcF9JYe8Mqg3YRMw6TRPfYFVgNhPMLbsUxRXSJVoZQRrAJwkl6FUNDwgt12Y0CDA0eRfAAEMpbINFY4oeNApPHOtTlVT8LR8AtUumM7MNsBsZREQFS3XxYi4WEgomAmSFAmJGX1GzAV83JAKh+wJonAJmDQKfiDgfDwJmPwJmKgRyBIMDfxcDfpY5Cjl7GzmGOicnAmwhAjI6OA4CbcsCbbLzjgM3a0kvAWsA4gDlAE4JB5wMkQECD8YAEbkCdzMCdqZDAnlPRwJ4viFg30WyRvcCfEMCeswCfQ0CfPRIBEiBZygALxlJXEpfGRtK0ALRBQLQ0EsrA4hTA4fqRMmRNgLypV0HAwOyS9JMMSkH001QTbMCi0MCitzFHwshR2sJuwKOOwKOYESbhQKO3QKOYHxRuFM5AQ5S2FSJApP/ApMQAO0AIFUiVbNV1AosHymZijLleGpFPz0Cl6MC77ZYJawAXSkClpMCloCgAK1ZsFoNhVEAPwKWuQKWUlxIXNUCmc8CmWhczl0LHQKcnznGOqECnBoCn58CnryOACETNS4TAp31Ap6WALlBYThh8wKe1wKgcgGtAp6jIwKeUqljzGQrKS8CJ7MCJoICoP8CoFDbAqYzAqXSAqgDAIECp/ZogGi1AAdNaiBq1QKs5wKssgKtawKtBgJXIQJV4AKx5dsDH1JsmwKywRECsuwbbORtZ21MYwMl0QK2YD9DbpQDKUkCuGICuUsZArkue3A6cOUCvR0DLbYDMhUCvoxyBgMzdQK+HnMmc1MCw88CwwhzhnRPOUl05AM8qwEDPJ4DPcMCxYACxksCxhSNAshtVQLISALJUwLJMgJkoQLd1nh9ZXiyeSlL1AMYp2cGAmH4GfeVKHsPXpZevxUCz28Cz3AzT1fW9xejAMqxAs93AS3uA04Wfk8JAtwrAtuOAtJTA1JgA1NjAQUDVZCAjUMEzxrxZEl5A4LSg5EC2ssC2eKEFIRNp0ADhqkAMwNkEoZ1Xf0AWQLfaQLevHd7AuIz7RgB8zQrAfSfAfLWiwLr9wLpdH0DAur9AuroAP1LAb0C7o0C66CWrpcHAu5DA4XkmH1w5HGlAvMHAG0DjhqZlwL3FwORcgOSiwL3nAL53QL4apogmq+/O5siA52HAv7+AR8APZ8gAZ+3AwWRA6ZuA6bdANXJAwZuoYyiCQ0DDE0BEwEjB3EGZb1rCQC/BG/DFY8etxEAG3k9ACcDNxJRA42DAWcrJQCM8wAlAOanC6OVCLsGI6fJBgCvBRnDBvElRUYFFoAFcD9GSDNCKUK8X3kZX8QAls0FOgCQVCGbwTsuYDoZutcONxjOGJHJ/gVfBWAFXwVgBWsFYAVfBWAFXwVgBV8FYAVfBWBOHQjfjW8KCgoKbF7xMwTRA7kGN8PDAMMEr8MA70gxFroFTj5xPnhCR0K+X30/X/AAWBkzswCNBsxzzASm70aCRS4rDDMeLz49fnXfcsH5GcoscQFz13Y4HwVnBXLJycnACNdRYwgICAqEXoWTxgA7P4kACxbZBu21Kw0AjMsTAwkVAOVtJUUsJ1JCuULESUArXy9gPi9AKwnJRQYKTD9LPoA+iT54PnkCkULEUUpDX9NWV3JVEjQAc1w3A3IBE3YnX+g7QiMJb6MKaisz
RCUuQrNCxDPMCcwEX9EWJzYREBEEBwIHKn6l33JCNVIfybPJtAltydPUCmhBZw/tEKsZAJOVJU1CLRuxbUHOQAo7P0s+eEJ
|
|||
|
|
// Code points that must not lead, trail, or touch another fenced code point
// within a label, mapped to the name used when reporting a placement error.
const FENCED = new Map([
	[8217, "apostrophe"],
	[8260, "fraction slash"],
	[12539, "middle dot"],
]);
// NOTE(review): presumably the longest permitted run of non-spacing marks; the
// code that consumes this limit is outside the visible section — confirm.
const NSM_MAX = 4;
|
|||
|
|
|
|||
|
|
/**
 * Decode an arithmetic-coded byte stream into an array of integers.
 *
 * Layout of `bytes` as consumed below:
 *   1. u16 symbol count, followed by one u16 frequency for every symbol after
 *      the first (the first symbol is the end mark and has frequency 1)
 *   2. u16 payload length, followed by that many payload bytes
 *   3. the arithmetic-coded bit stream (zero padded past the end of the buffer)
 *
 * The last three symbols are escapes: they pull 1, 2 or 3 bytes from the
 * payload to form a larger value. Every other symbol `x` decodes to `x - 1`.
 *
 * @param {ArrayLike<number>} bytes - encoded input
 * @returns {number[]} decoded values, end mark excluded
 */
function decode_arithmetic(bytes) {
	let pos = 0;
	// big-endian 16-bit read
	function u16() { return (bytes[pos++] << 8) | bytes[pos++]; }

	// decode the frequency table
	const symbol_count = u16();
	let total = 1;
	const acc = [0, 1]; // first symbol has frequency 1
	for (let i = 1; i < symbol_count; i++) {
		acc.push(total += u16());
	}

	// skip the sized-payload that the last 3 symbols index into
	const skip = u16();
	let pos_payload = pos;
	pos += skip;

	let read_width = 0;
	let read_buffer = 0;
	function read_bit() {
		if (read_width === 0) {
			// this will read beyond end of buffer
			// but (undefined|0) => zero pad
			read_buffer = (read_buffer << 8) | bytes[pos++];
			read_width = 8;
		}
		return (read_buffer >> --read_width) & 1;
	}

	const N = 31;
	const FULL = 2**N;
	const HALF = FULL >>> 1;
	const QRTR = HALF >> 1;
	const MASK = FULL - 1;

	// fill register
	let register = 0;
	for (let i = 0; i < N; i++) register = (register << 1) | read_bit();

	const symbols = [];
	let low = 0;
	let range = FULL; // treat like a float
	while (true) {
		const value = Math.floor((((register - low + 1) * total) - 1) / range);
		let start = 0;
		let end = symbol_count;
		while (end - start > 1) { // binary search
			const mid = (start + end) >>> 1;
			if (value < acc[mid]) {
				end = mid;
			} else {
				start = mid;
			}
		}
		if (start === 0) break; // first symbol is end mark
		symbols.push(start);
		let a = low + Math.floor(range * acc[start]   / total);
		let b = low + Math.floor(range * acc[start+1] / total) - 1;
		// shift out the leading bits on which both interval bounds agree
		while (((a ^ b) & HALF) === 0) {
			register = (register << 1) & MASK | read_bit();
			a = (a << 1) & MASK;
			b = (b << 1) & MASK | 1;
		}
		// widen the interval while the bounds straddle the midpoint
		while (a & ~b & QRTR) {
			register = (register & HALF) | ((register << 1) & (MASK >>> 1)) | read_bit();
			a = (a << 1) ^ HALF;
			b = ((b ^ HALF) << 1) | HALF | 1;
		}
		low = a;
		range = 1 + b - a;
	}
	const offset = symbol_count - 4;
	return symbols.map(x => { // index into payload
		switch (x - offset) {
			case 3: return offset + 0x10100 + ((bytes[pos_payload++] << 16) | (bytes[pos_payload++] << 8) | bytes[pos_payload++]);
			case 2: return offset + 0x100 + ((bytes[pos_payload++] << 8) | bytes[pos_payload++]);
			case 1: return offset + bytes[pos_payload++];
			default: return x - 1;
		}
	});
}
|
|||
|
|
|
|||
|
|
/**
 * Wrap an indexable sequence in a "next" function.
 * @param {ArrayLike<number>} v - decoded symbols
 * @returns {function(): (number|undefined)} each call returns the following
 *   element; `undefined` once the sequence is exhausted
 */
function read_payload(v) {
	let cursor = 0;
	return () => v[cursor++];
}
|
|||
|
|
/**
 * Turn a Base64 blob into a symbol reader: Base64-decode, arithmetic-decode,
 * then wrap the resulting symbols with read_payload().
 * @param {string} s - unpadded Base64 text
 * @returns {function(): (number|undefined)} reader over the decoded symbols
 */
function read_compressed_payload(s) {
	const bytes = unsafe_atob(s);
	const symbols = decode_arithmetic(bytes);
	return read_payload(symbols);
}
|
|||
|
|
|
|||
|
|
// unsafe in the sense:
// expected well-formed Base64 w/o padding
// 20220922: added for https://github.com/adraffy/ens-normalize.js/issues/4
/**
 * Minimal Base64 decoder.
 * @param {string} s - well-formed Base64 text without `=` padding
 * @returns {Uint8Array} decoded bytes; trailing bits that do not fill a byte are dropped
 */
function unsafe_atob(s) {
	const alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/';
	const lookup = []; // char code => 6-bit value
	for (let i = 0; i < alphabet.length; i++) {
		lookup[alphabet.charCodeAt(i)] = i;
	}
	const n = s.length;
	const ret = new Uint8Array((6 * n) >> 3);
	let pos = 0;
	let width = 0; // number of not-yet-emitted bits held in carry
	let carry = 0;
	for (let i = 0; i < n; i++) {
		carry = (carry << 6) | lookup[s.charCodeAt(i)];
		width += 6;
		if (width >= 8) {
			width -= 8;
			ret[pos++] = carry >> width; // the typed array keeps only the low 8 bits
		}
	}
	return ret;
}
|
|||
|
|
|
|||
|
|
/**
 * Map a non-negative integer onto the signed integers, alternating sign.
 * eg. [0,1,2,3,...] => [0,-1,1,-2,...]
 * @param {number} i
 * @returns {number}
 */
function signed(i) {
	if (i & 1) {
		return ~i >> 1; // odd => negative
	}
	return i >> 1; // even => non-negative
}
|
|||
|
|
|
|||
|
|
/**
 * Read `n` signed deltas and return their running sums.
 * @param {number} n - number of values to read
 * @param {function(): number} next - symbol reader
 * @returns {number[]} cumulative values, starting from 0
 */
function read_deltas(n, next) {
	const v = Array(n);
	let running = 0;
	for (let i = 0; i < n; i++) {
		running += signed(next());
		v[i] = running;
	}
	return v;
}
|
|||
|
|
|
|||
|
|
/**
 * Read a run-length encoded, ascending list of integers.
 *
 * The stream is a sequence of (gap, count) pairs and ends at the first pair
 * whose count is falsy. Each pair emits `count` consecutive integers starting
 * `gap` past the current position; the position then moves one past the run.
 * eg. (0,3) (1,1) (0,0) => [0,1,2,5]
 *
 * @param {function(): number} next - symbol reader
 * @param {number} [prev=0] - starting position
 * @returns {number[]}
 */
function read_sorted(next, prev = 0) {
	const ret = [];
	while (true) {
		const x = next();
		const n = next();
		if (!n) break;
		prev += x;
		for (let i = 0; i < n; i++) {
			ret.push(prev + i);
		}
		prev += n + 1;
	}
	return ret;
}
|
|||
|
|
|
|||
|
|
/**
 * Read consecutive read_sorted() lists until an empty one is encountered.
 * @param {function(): number} next - symbol reader
 * @returns {number[][]} the non-empty lists, in stream order
 */
function read_sorted_arrays(next) {
	return read_array_while(() => {
		const list = read_sorted(next);
		// an empty list yields undefined, which ends read_array_while()
		if (list.length) return list;
	});
}
|
|||
|
|
|
|||
|
|
/**
 * Read a list of [x, ys] mapping entries.
 *
 * The stream holds a run of linear tables (ended by a width of 0) followed by a
 * run of replacement tables (ended by a stored width of 0; stored widths are
 * offset by one).
 *
 * @param {function(): number} next - symbol reader
 * @returns {Array} flat list of [x, ys] entries, suitable for `new Map(...)`
 */
function read_mapped(next) {
	const tables = [];
	while (true) {
		const w = next();
		if (w === 0) break;
		tables.push(read_linear_table(w, next));
	}
	while (true) {
		const w = next() - 1;
		if (w < 0) break;
		tables.push(read_replacement_table(w, next));
	}
	return tables.flat();
}
|
|||
|
|
|
|||
|
|
/**
 * Collect values from `next` until it returns something falsy.
 * @param {function(number): *} next - producer; receives the number of values
 *   collected so far
 * @returns {Array} the truthy values, in order
 */
function read_array_while(next) {
	const v = [];
	for (let x = next(v.length); x; x = next(v.length)) {
		v.push(x);
	}
	return v;
}
|
|||
|
|
|
|||
|
|
/**
 * Read `w` columns of length `n` (each column delta-encoded) and return them
 * as `n` rows of length `w`.
 * @param {number} n - number of rows
 * @param {number} w - number of columns
 * @param {function(): number} next - symbol reader
 * @returns {number[][]}
 */
function read_transposed(n, w, next) {
	const rows = Array(n).fill().map(() => []);
	for (let col = 0; col < w; col++) {
		read_deltas(n, next).forEach((x, row) => rows[row].push(x));
	}
	return rows;
}
|
|||
|
|
|
|||
|
|
/**
 * Read a table of linear runs.
 * returns [[x, ys], [x+dx, ys+dy], [x+2*dx, ys+2*dy], ...]
 * where dx/dy = steps, n = run size, w = length of y
 * @param {number} w - number of y values per entry
 * @param {function(): number} next - symbol reader
 * @returns {Array} list of [x, ys] entries
 */
function read_linear_table(w, next) {
	const dx = 1 + next();
	const dy = next();
	const vN = read_array_while(next); // run sizes
	const m = read_transposed(vN.length, 1+w, next); // one [x, ...ys] seed per run
	return m.flatMap((seed, i) => {
		const [x, ...ys] = seed;
		return Array(vN[i]).fill().map((_, j) => {
			const j_dy = j * dy;
			return [x + j * dx, ys.map(y => y + j_dy)];
		});
	});
}
|
|||
|
|
|
|||
|
|
/**
 * Read a table of individual replacements.
 * return [[x, ys], ...] where w = length of ys
 * @param {number} w - number of y values per entry
 * @param {function(): number} next - symbol reader
 * @returns {Array} list of [x, ys] entries
 */
function read_replacement_table(w, next) {
	const n = 1 + next(); // the stored entry count is offset by one
	const rows = read_transposed(n, 1+w, next);
	return rows.map(row => [row[0], row.slice(1)]);
}
|
|||
|
|
|
|||
|
|
|
|||
|
|
/**
 * Read a serialized trie and expand it into the list of sequences it encodes.
 *
 * The stream starts with a sorted alphabet (read_sorted) followed by the nodes
 * in depth-first order. Every node carries a state `S` (bit flags) and a list
 * of child buckets; a bucket is the set of alphabet entries leading to a child.
 *   S & 1: the path leading to this node is emitted
 *   S & 2: the last code point of the path is remembered ("saved")
 *   S & 4: the node is skipped when the last code point equals the saved one
 *
 * @param {function(): number} next - symbol reader
 * @returns {number[][]} emitted sequences (not sorted)
 */
function read_trie(next) {
	const ret = [];
	const sorted = read_sorted(next);

	// read one node; Q = characters that lead into this node
	function decode(Q) {
		const S = next(); // state: valid, save, check
		const B = read_array_while(() => { // buckets leading to new nodes
			const cps = read_sorted(next).map(i => sorted[i]);
			if (cps.length) return decode(cps);
		});
		return {S, B, Q};
	}

	// walk a node and collect every sequence it marks as valid
	function expand({S, B}, cps, saved) {
		if (S & 4 && saved === cps[cps.length-1]) return;
		if (S & 2) saved = cps[cps.length-1];
		if (S & 1) ret.push(cps);
		for (const br of B) {
			for (const cp of br.Q) {
				expand(br, [...cps, cp], saved);
			}
		}
	}

	expand(decode([]), []);
	return ret; // not sorted
}
|
|||
|
|
|
|||
|
|
/**
 * Format a code point as upper-case hexadecimal, left-padded to at least two digits.
 * @param {number} cp
 * @returns {string} eg. 10 => "0A"
 */
function hex_cp(cp) {
	const hex = cp.toString(16).toUpperCase();
	return hex.padStart(2, '0');
}
|
|||
|
|
|
|||
|
|
/**
 * Wrap the hex form of a code point in braces.
 * raffy convention: like "\u{X}" w/o the "\u"
 * @param {number} cp
 * @returns {string} eg. 0x3000 => "{3000}"
 */
function quote_cp(cp) {
	return `{${hex_cp(cp)}}`;
}
|
|||
|
|
|
|||
|
|
/*
|
|||
|
|
export function explode_cp(s) {
|
|||
|
|
return [...s].map(c => c.codePointAt(0));
|
|||
|
|
}
|
|||
|
|
*/
|
|||
|
|
/**
 * Split a string into its Unicode code points.
 * Loop form of `[...s].map(c => c.codePointAt(0))`; this is about 2x faster.
 * @param {string} s
 * @returns {number[]} code points; an unpaired surrogate is returned as-is
 */
function explode_cp(s) {
	const cps = [];
	const len = s.length;
	let pos = 0;
	while (pos < len) {
		const cp = s.codePointAt(pos);
		cps.push(cp);
		pos += cp < 0x10000 ? 1 : 2; // astral code points occupy two UTF-16 units
	}
	return cps;
}
|
|||
|
|
|
|||
|
|
/**
 * Build a string from code points.
 * Long inputs are converted in chunks so that no single call spreads more than
 * `chunk` arguments into String.fromCodePoint().
 * @param {number[]} cps
 * @returns {string}
 */
function str_from_cps(cps) {
	const chunk = 4096;
	const len = cps.length;
	if (len < chunk) return String.fromCodePoint(...cps);
	const parts = [];
	for (let i = 0; i < len; i += chunk) {
		parts.push(String.fromCodePoint(...cps.slice(i, i + chunk)));
	}
	return parts.join('');
}
|
|||
|
|
|
|||
|
|
/**
 * Comparator for numeric arrays: shorter arrays sort first, arrays of equal
 * length are ordered by their first differing element.
 * @param {number[]} a
 * @param {number[]} b
 * @returns {number} negative, zero or positive
 */
function compare_arrays(a, b) {
	const n = a.length;
	let c = n - b.length;
	for (let i = 0; c === 0 && i < n; i++) {
		c = a[i] - b[i];
	}
	return c;
}
|
|||
|
|
|
|||
|
|
// created 2023-09-25T01:01:55.148Z
|
|||
|
|
// compressed base64-encoded blob for include-nf data
|
|||
|
|
// source: https://github.com/adraffy/ens-normalize.js/blob/main/src/make.js
|
|||
|
|
// see: https://github.com/adraffy/ens-normalize.js#security
|
|||
|
|
// SHA-256: a974b6f8541fc29d919bc85118af0a44015851fab5343f8679cb31be2bdb209e
|
|||
|
|
var COMPRESSED = 'AEUDTAHBCFQATQDRADAAcgAgADQAFAAsABQAHwAOACQADQARAAoAFwAHABIACAAPAAUACwAFAAwABAAQAAMABwAEAAoABQAIAAIACgABAAQAFAALAAIACwABAAIAAQAHAAMAAwAEAAsADAAMAAwACgANAA0AAwAKAAkABAAdAAYAZwDSAdsDJgC0CkMB8xhZAqfoC190UGcThgBurwf7PT09Pb09AjgJum8OjDllxHYUKXAPxzq6tABAxgK8ysUvWAgMPT09PT09PSs6LT2HcgWXWwFLoSMEEEl5RFVMKvO0XQ8ExDdJMnIgsj26PTQyy8FfEQ8AY8IPAGcEbwRwBHEEcgRzBHQEdQR2BHcEeAR6BHsEfAR+BIAEgfndBQoBYgULAWIFDAFiBNcE2ATZBRAFEQUvBdALFAsVDPcNBw13DYcOMA4xDjMB4BllHI0B2grbAMDpHLkQ7QHVAPRNQQFnGRUEg0yEB2uaJF8AJpIBpob5AERSMAKNoAXqaQLUBMCzEiACnwRZEkkVsS7tANAsBG0RuAQLEPABv9HICTUBXigPZwRBApMDOwAamhtaABqEAY8KvKx3LQ4ArAB8UhwEBAVSagD8AEFZADkBIadVj2UMUgx5Il4ANQC9AxIB1BlbEPMAs30CGxlXAhwZKQIECBc6EbsCoxngzv7UzRQA8M0BawL6ZwkN7wABAD33OQRcsgLJCjMCjqUChtw/km+NAsXPAoP2BT84PwURAK0RAvptb6cApQS/OMMey5HJS84UdxpxTPkCogVFITaTOwERAK5pAvkNBOVyA7q3BKlOJSALAgUIBRcEdASpBXqzABXFSWZOawLCOqw//AolCZdvv3dSBkEQGyelEPcMMwG1ATsN7UvYBPEGOwTJH30ZGQ/NlZwIpS3dDO0m4y6hgFoj9SqDBe1L9DzdC01RaA9ZC2UJ4zpjgU4DIQENIosK3Q05CG0Q8wrJaw3lEUUHOQPVSZoApQcBCxEdNRW1JhBirAsJOXcG+xr2C48mrxMpevwF0xohBk0BKRr/AM8u54WwWjFcHE9fBgMLJSPHFKhQIA0lQLd4SBobBxUlqQKRQ3BKh1E2HpMh9jw9DWYuE1F8B/U8BRlPC4E8nkarRQ4R0j6NPUgiSUwsBDV/LC8niwnPD4UMuXxyAVkJIQmxDHETMREXN8UIOQcZLZckJxUIIUaVYJoE958D8xPRAwsFPwlBBxMDtRwtEy4VKQUNgSTXAvM21S6zAo9WgAEXBcsPJR/fEFBH4A7pCJsCZQODJesALRUhABcimwhDYwBfj9hTBS7LCMdqbCN0A2cU52ERcweRDlcHpxwzFb8c4XDIXguGCCijrwlbAXUJmQFfBOMICTVbjKAgQWdTi1gYmyBhQT9d/AIxDGUVn0S9h3gCiw9rEhsBNQFzBzkNAQJ3Ee0RaxCVCOuGBDW1M/g6JQRPIYMgEQonA09szgsnJvkM+GkBoxJiAww0PXfuZ6tgtiQX/QcZMsVBYCHxC5JPzQycGsEYQlQuGeQHvwPzGvMn6kFXBf8DowMTOk0z7gS9C2kIiwk/AEkOoxcH1xhqCnGM0AExiwG3mQNXkYMCb48GNwcLAGcLhwV55QAdAqcIowAFAM8DVwA5Aq0HnQAZAIVBAT0DJy8BIeUCjwOTCDHLAZUvAfMpBBvDDBUA9zduSgLDsQKAamaiBd1YAo4CSTUBTSUEBU5HUQOvceEA2wBLBhPfRwEVq0rLGuNDAd9vKwDHAPsABTUHBUEBzQHzbQC3AV8LMQmis7UBTekpAIMAFWsB1wKJAN0ANQB/8QFTAE0FWfkF0wJPSQERMRgrV2EBuwMfATMBDQB5BsuNpckHHwRtB9MCEBsV4QLvLge1AQMi3xPNQsUCvd5VoWACZIECYkJbTa9bNyACofcCaJgCZgkCn4Q4GwsCZjsCZiYEbgR/A38TA36SOQY5dxc5gjojIwJsHQIyNjgKAm3HAm
2u74ozZ0UrAWcA3gDhAEoFB5gMjQD+C8IADbUCdy8CdqI/AnlLQwJ4uh1c20WuRtcCfD8CesgCfQkCfPAFWQUgSABIfWMkAoFtAoAAAoAFAn+uSVhKWxUXSswC0QEC0MxLJwOITwOH5kTFkTIC8qFdAwMDrkvOTC0lA89NTE2vAos/AorYwRsHHUNnBbcCjjcCjlxAl4ECjtkCjlx4UbRTNQpS1FSFApP7ApMMAOkAHFUeVa9V0AYsGymVhjLheGZFOzkCl58C77JYIagAWSUClo8ClnycAKlZrFoJgU0AOwKWtQKWTlxEXNECmcsCmWRcyl0HGQKcmznCOp0CnBYCn5sCnriKAB0PMSoPAp3xAp6SALU9YTRh7wKe0wKgbgGpAp6fHwKeTqVjyGQnJSsCJ68CJn4CoPsCoEwCot0CocQCpi8Cpc4Cp/8AfQKn8mh8aLEAA0lqHGrRAqzjAqyuAq1nAq0CAlcdAlXcArHh1wMfTmyXArK9DQKy6Bds4G1jbUhfAyXNArZcOz9ukAMpRQK4XgK5RxUCuSp3cDZw4QK9GQK72nCWAzIRAr6IcgIDM3ECvhpzInNPAsPLAsMEc4J0SzVFdOADPKcDPJoDPb8CxXwCxkcCxhCJAshpUQLIRALJTwLJLgJknQLd0nh5YXiueSVL0AMYo2cCAmH0GfOVJHsLXpJeuxECz2sCz2wvS1PS8xOfAMatAs9zASnqA04SfksFAtwnAtuKAtJPA1JcA1NfAQEDVYyAiT8AyxbtYEWCHILTgs6DjQLaxwLZ3oQQhEmnPAOGpQAvA2QOhnFZ+QBVAt9lAt64c3cC4i/tFAHzMCcB9JsB8tKHAuvzAulweQLq+QLq5AD5RwG5Au6JAuuclqqXAwLuPwOF4Jh5cOBxoQLzAwBpA44WmZMC9xMDkW4DkocC95gC+dkC+GaaHJqruzebHgOdgwL++gEbADmfHJ+zAwWNA6ZqA6bZANHFAwZqoYiiBQkDDEkCwAA/AwDhQRdTARHzA2sHl2cFAJMtK7evvdsBiZkUfxEEOQH7KQUhDp0JnwCS/SlXxQL3AZ0AtwW5AG8LbUEuFCaNLgFDAYD8AbUmAHUDDgRtACwCFgyhAAAKAj0CagPdA34EkQEgRQUhfAoABQBEABMANhICdwEABdUDa+8KxQIA9wqfJ7+xt+UBkSFBQgHpFH8RNMCJAAQAGwBaAkUChIsABjpTOpSNbQC4Oo860ACNOME63AClAOgAywE6gTo7Ofw5+Tt2iTpbO56JOm85GAFWATMBbAUvNV01njWtNWY1dTW2NcU1gjWRNdI14TWeNa017jX9NbI1wTYCNhE1xjXVNhY2JzXeNe02LjY9Ni41LSE2OjY9Njw2yTcIBJA8VzY4Nt03IDcPNsogN4k3MAoEsDxnNiQ3GTdsOo03IULUQwdC4EMLHA8PCZsobShRVQYA6X8A6bABFCnXAukBowC9BbcAbwNzBL8MDAMMAQgDAAkKCwsLCQoGBAVVBI/DvwDz9b29kaUCb0QtsRTNLt4eGBcSHAMZFhYZEhYEARAEBUEcQRxBHEEcQRxBHEEaQRxBHEFCSTxBPElISUhBNkM2QTYbNklISVmBVIgBFLWZAu0BhQCjBcEAbykBvwGJAaQcEZ0ePCklMAAhMvAIMAL54gC7Bm8EescjzQMpARQpKgDUABavAj626xQAJP0A3etzuf4NNRA7efy2Z9NQrCnC0OSyANz5BBIbJ5IFDR6miIavYS6tprjjmuKebxm5C74Q225X1pkaYYPb6f1DK4k3xMEBb9S2WMjEibTNWhsRJIA+vwNVEiXTE5iXs/wezV66oFLfp9NZGYW+Gk19J2+bCT6Ye2w6LDYdgzKMUabk595eLBCXANz9HUpWbATq9vqXVx9XDg+Pc9Xp4+bsS005SVM/BJBM4687WUuf+Uj9dEi8aDNaPxtpbDxcG1THTImUMZq4UCaaNYpsVqraNyKLJXDYsFZ/5jl7bL
RtO88t7P3xZaAxhb5OdPMXqsSkp1WCieG8jXm1U99+blvLlXzPCS+M93VnJCiK+09LfaSaBAVBomyDgJua8dfUzR7ga34Iv
|
|||
|
|
|
|||
|
|
// https://unicode.org/reports/tr15/
|
|||
|
|
// for reference implementation
|
|||
|
|
// see: /derive/nf.js
|
|||
|
|
|
|||
|
|
|
|||
|
|
// algorithmic hangul
|
|||
|
|
// https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf (page 144)
|
|||
|
|
// First code point of each Hangul range: precomposed syllables (S), leading
// consonants (L), vowels (V). T0 is one below the first trailing consonant so
// that a trailing index of 0 means "no trailing consonant".
const S0 = 0xAC00;
const L0 = 0x1100;
const V0 = 0x1161;
const T0 = 0x11A7;
// Number of code points in each range.
const L_COUNT = 19;
const V_COUNT = 21;
const T_COUNT = 28;
const N_COUNT = V_COUNT * T_COUNT; // syllables per leading consonant
const S_COUNT = L_COUNT * N_COUNT; // total precomposed syllables
// Exclusive upper bound of each range.
const S1 = S0 + S_COUNT;
const L1 = L0 + L_COUNT;
const V1 = V0 + V_COUNT;
const T1 = T0 + T_COUNT;
|
|||
|
|
|
|||
|
|
/**
 * Extract the rank stored in the top byte of a packed value.
 * @param {number} packed - rank in bits 24-31, code point in bits 0-23
 * @returns {number} 0-255
 */
function unpack_cc(packed) {
	return (packed >> 24) & 0xFF;
}
|
|||
|
|
/**
 * Extract the code point stored in the low 24 bits of a packed value.
 * @param {number} packed - rank in bits 24-31, code point in bits 0-23
 * @returns {number}
 */
function unpack_cp(packed) {
	return packed & 0xFFFFFF;
}
|
|||
|
|
|
|||
|
|
// Normalization tables, left undefined until init$1() assigns all four.
//   SHIFTED_RANK: Map of code point => (rank index + 1) << 24, ready to OR into a packed value
//   EXCLUSIONS:   Set of code points that are never registered in RECOMP
//   DECOMP:       Map of code point => its mapped code points, stored reversed
//   RECOMP:       Map of first code point => Map of second code point => composed code point
let SHIFTED_RANK, EXCLUSIONS, DECOMP, RECOMP;
|
|||
|
|
|
|||
|
|
/**
 * Decode COMPRESSED and populate SHIFTED_RANK, EXCLUSIONS, DECOMP and RECOMP.
 * 20230905: 11ms
 */
function init$1() {
	//console.time('nf');
	const r = read_compressed_payload(COMPRESSED);
	// the i-th sorted array holds the code points of rank i+1
	SHIFTED_RANK = new Map(read_sorted_arrays(r).flatMap((v, i) => v.map(x => [x, (i+1) << 24]))); // pre-shifted
	EXCLUSIONS = new Set(read_sorted(r));
	DECOMP = new Map();
	RECOMP = new Map();
	for (const [cp, cps] of read_mapped(r)) {
		// only non-excluded pairs can be recomposed
		if (!EXCLUSIONS.has(cp) && cps.length === 2) {
			const [a, b] = cps;
			let bucket = RECOMP.get(a);
			if (!bucket) {
				bucket = new Map();
				RECOMP.set(a, bucket);
			}
			bucket.set(b, cp);
		}
		DECOMP.set(cp, cps.reverse()); // stored reversed
	}
	//console.timeEnd('nf');
}
|
|||
|
|
|
|||
|
|
/**
 * Test whether a code point lies in the precomposed Hangul syllable range [S0, S1).
 * @param {number} cp
 * @returns {boolean}
 */
function is_hangul(cp) {
	return S0 <= cp && cp < S1;
}
|
|||
|
|
|
|||
|
|
/**
 * Compose two code points into one, if a composition exists.
 *
 * Handles, in order: Hangul leading consonant + vowel, Hangul syllable without
 * trailing consonant + trailing consonant, then the RECOMP table.
 *
 * @param {number} a - first code point
 * @param {number} b - second code point
 * @returns {number} the composed code point, or -1 when the pair does not compose
 */
function compose_pair(a, b) {
	if (a >= L0 && a < L1 && b >= V0 && b < V1) {
		return S0 + (a - L0) * N_COUNT + (b - V0) * T_COUNT;
	}
	if (is_hangul(a) && b > T0 && b < T1 && (a - S0) % T_COUNT === 0) {
		return a + (b - T0);
	}
	const bucket = RECOMP.get(a);
	if (bucket) {
		const composed = bucket.get(b);
		if (composed) {
			return composed;
		}
	}
	return -1;
}
|
|||
|
|
|
|||
|
|
/**
 * Fully decompose code points and put combining marks into rank order.
 *
 * @param {Iterable<number>} cps - input code points
 * @returns {number[]} packed values: the code point in the low 24 bits and,
 *   for code points found in SHIFTED_RANK, their rank in the top byte
 *   (see unpack_cc / unpack_cp)
 */
function decomposed(cps) {
	if (!SHIFTED_RANK) init$1();
	const ret = [];
	const buf = []; // pending code points of a mapping, consumed from the end
	let check_order = false; // set once any ranked code point is emitted
	function add(cp) {
		const cc = SHIFTED_RANK.get(cp);
		if (cc) {
			check_order = true;
			cp |= cc;
		}
		ret.push(cp);
	}
	for (let cp of cps) {
		while (true) {
			if (cp < 0x80) {
				ret.push(cp); // ASCII is emitted unchanged without any lookup
			} else if (is_hangul(cp)) {
				// algorithmic decomposition of a precomposed syllable
				const s_index = cp - S0;
				const l_index = s_index / N_COUNT | 0;
				const v_index = (s_index % N_COUNT) / T_COUNT | 0;
				const t_index = s_index % T_COUNT;
				add(L0 + l_index);
				add(V0 + v_index);
				if (t_index > 0) add(T0 + t_index);
			} else {
				const mapped = DECOMP.get(cp);
				if (mapped) {
					buf.push(...mapped); // stored reversed, so pop() yields them in order
				} else {
					add(cp);
				}
			}
			if (!buf.length) break;
			cp = buf.pop();
		}
	}
	if (check_order && ret.length > 1) {
		// move every out-of-order mark backwards past marks of higher rank;
		// an element of rank 0 is never moved
		let prev_cc = unpack_cc(ret[0]);
		for (let i = 1; i < ret.length; i++) {
			const cc = unpack_cc(ret[i]);
			if (cc === 0 || prev_cc <= cc) {
				prev_cc = cc;
				continue;
			}
			let j = i-1;
			while (true) {
				const tmp = ret[j+1];
				ret[j+1] = ret[j];
				ret[j] = tmp;
				if (!j) break;
				prev_cc = unpack_cc(ret[--j]);
				if (prev_cc <= cc) break;
			}
			prev_cc = unpack_cc(ret[i]);
		}
	}
	return ret;
}
|
|||
|
|
|
|||
|
|
/**
 * Recompose the packed output of decomposed().
 *
 * `prev_cp` is the current starter (-1 until one is seen), `stack` holds marks
 * that followed it but did not compose, and `prev_cc` is the rank of the most
 * recent stacked mark; a mark whose rank is not higher than `prev_cc` is
 * stacked without attempting composition.
 *
 * @param {Iterable<number>} v - packed values (rank in the top byte, code point below)
 * @returns {number[]} plain code points
 */
function composed_from_decomposed(v) {
	const ret = [];
	const stack = [];
	let prev_cp = -1;
	let prev_cc = 0;
	for (const packed of v) {
		const cc = unpack_cc(packed);
		const cp = unpack_cp(packed);
		if (prev_cp === -1) {
			// no starter yet: leading marks pass straight through
			if (cc === 0) {
				prev_cp = cp;
			} else {
				ret.push(cp);
			}
		} else if (prev_cc > 0 && prev_cc >= cc) {
			if (cc === 0) {
				// new starter: flush the previous one together with its marks
				ret.push(prev_cp, ...stack);
				stack.length = 0;
				prev_cp = cp;
			} else {
				stack.push(cp);
			}
			prev_cc = cc;
		} else {
			const composed = compose_pair(prev_cp, cp);
			if (composed >= 0) {
				prev_cp = composed;
			} else if (prev_cc === 0 && cc === 0) {
				ret.push(prev_cp);
				prev_cp = cp;
			} else {
				stack.push(cp);
				prev_cc = cc;
			}
		}
	}
	if (prev_cp >= 0) {
		ret.push(prev_cp, ...stack);
	}
	return ret;
}
|
|||
|
|
|
|||
|
|
/**
 * Decompose code points: decomposed() with the rank bits stripped.
 * note: cps can be iterable
 * @param {Iterable<number>} cps
 * @returns {number[]}
 */
function nfd(cps) {
	const packed = decomposed(cps);
	return packed.map(unpack_cp);
}
|
|||
|
|
/**
 * Compose code points: decompose first, then recompose.
 * @param {Iterable<number>} cps
 * @returns {number[]}
 */
function nfc(cps) {
	const packed = decomposed(cps);
	return composed_from_decomposed(packed);
}
|
|||
|
|
|
|||
|
|
const HYPHEN = 0x2D; // "-"
const STOP = 0x2E; // "."
const STOP_CH = '.';
const FE0F = 0xFE0F; // followed in the emoji trie so a sequence matches with or without it
const UNIQUE_PH = 1; // WHOLE_MAP placeholder for a valid code point found in exactly one group
|
|||
|
|
|
|||
|
|
// 20230913: replace [...v] with Array_from(v) to avoid large spreads
|
|||
|
|
// Single-argument wrapper around Array.from (instead of Array.from.bind(Array)),
// so it can be handed to map()-style callers without picking up extra arguments.
const Array_from = (iterable) => {
	return Array.from(iterable);
};
|
|||
|
|
|
|||
|
|
/**
 * Test whether a group contains a code point in either of its sets.
 * 20230913: keep primary and secondary distinct instead of creating valid union
 * @param {{P: Set<number>, Q: Set<number>}} g - group with primary (P) and secondary (Q) sets
 * @param {number} cp
 * @returns {boolean}
 */
function group_has_cp(g, cp) {
	if (g.P.has(cp)) return true;
	return g.Q.has(cp);
}
|
|||
|
|
|
|||
|
|
/**
 * An array of code points that identifies itself as an emoji sequence.
 * Plain arrays have no `is_emoji` property, so reading it distinguishes the two.
 */
class Emoji extends Array {
	// free tagging system
	get is_emoji() {
		return true;
	}
}
|
|||
|
|
|
|||
|
|
// Validation tables, left undefined until init() assigns them on its first call.
//   Maps:   MAPPED, WHOLE_MAP, EMOJI_ROOT
//   Sets:   IGNORED, CM, NSM, ESCAPE, NFC_CHECK, WHOLE_VALID, VALID
//   Arrays: GROUPS (records of {N, P, Q, M, R}), EMOJI_LIST (sorted Emoji sequences)
let MAPPED, IGNORED, CM, NSM, ESCAPE, NFC_CHECK, GROUPS, WHOLE_VALID, WHOLE_MAP, VALID, EMOJI_LIST, EMOJI_ROOT;
|
|||
|
|
|
|||
|
|
/**
 * Decode COMPRESSED$1 and populate every module-level validation table
 * (MAPPED through EMOJI_ROOT). Returns immediately once MAPPED has been
 * assigned. The tables are read from the payload in a fixed order, so the
 * statements below must not be reordered.
 */
function init() {
	if (MAPPED) return;

	const r = read_compressed_payload(COMPRESSED$1);
	const read_sorted_array = () => read_sorted(r);
	const read_sorted_set = () => new Set(read_sorted_array());
	const set_add_many = (set, v) => v.forEach(x => set.add(x));

	MAPPED = new Map(read_mapped(r));
	IGNORED = read_sorted_set(); // ignored characters are not valid, so just read raw codepoints

	/*
	// direct include from payload is smaller than the decompression code
	const FENCED = new Map(read_array_while(() => {
		let cp = r();
		if (cp) return [cp, read_str(r())];
	}));
	*/
	// 20230217: we still need all CM for proper error formatting
	// but norm only needs NSM subset that are potentially-valid
	CM = read_sorted_array();
	NSM = new Set(read_sorted_array().map(i => CM[i])); // stored as indices into CM
	CM = new Set(CM);

	ESCAPE = read_sorted_set(); // characters that should not be printed
	NFC_CHECK = read_sorted_set(); // only needed to illustrate ens_tokenize() transformations

	const chunks = read_sorted_arrays(r);
	const unrestricted = r(); // number of leading groups that are unrestricted
	//const read_chunked = () => new Set(read_sorted_array().flatMap(i => chunks[i]).concat(read_sorted_array()));
	const read_chunked = () => {
		// 20230921: build set in parts, 2x faster
		const set = new Set();
		read_sorted_array().forEach(i => set_add_many(set, chunks[i]));
		set_add_many(set, read_sorted_array());
		return set;
	};
	GROUPS = read_array_while(i => {
		// minifier property mangling seems unsafe
		// so these are manually renamed to single chars
		let N = read_array_while(r).map(x => x+0x60); // name, stored as offsets from 0x60
		if (N.length) {
			const R = i >= unrestricted; // unrestricted then restricted
			N[0] -= 32; // capitalize
			N = str_from_cps(N);
			if (R) N=`Restricted[${N}]`;
			const P = read_chunked(); // primary
			const Q = read_chunked(); // secondary
			const M = !r(); // not-whitelisted, check for NSM
			// *** this code currently isn't needed ***
			/*
			let V = [...P, ...Q].sort((a, b) => a-b); // derive: sorted valid
			let M = r()-1; // number of combining mark
			if (M < 0) { // whitelisted
				M = new Map(read_array_while(() => {
					let i = r();
					if (i) return [V[i-1], read_array_while(() => {
						let v = read_array_while(r);
						if (v.length) return v.map(x => x-1);
					})];
				}));
			}*/
			return {N, P, Q, M, R};
		}
	});

	// decode compressed wholes
	WHOLE_VALID = read_sorted_set();
	WHOLE_MAP = new Map();
	const wholes = read_sorted_array().concat(Array_from(WHOLE_VALID)).sort((a, b) => a-b); // must be sorted
	wholes.forEach((cp, i) => {
		const d = r();
		// d = distance back to an earlier member of the same whole; 0 starts a new whole
		const w = wholes[i] = d ? wholes[i-d] : {V: [], M: new Map()};
		w.V.push(cp); // add to member set
		if (!WHOLE_VALID.has(cp)) {
			WHOLE_MAP.set(cp, w); // register with whole map
		}
	});

	// compute confusable-extent complements
	// usage: WHOLE_MAP.get(cp).M.get(cp) = complement set
	for (const {V, M} of new Set(WHOLE_MAP.values())) {
		// connect all groups that have each whole character
		const recs = [];
		for (const cp of V) {
			const gs = GROUPS.filter(g => group_has_cp(g, cp));
			let rec = recs.find(({G}) => gs.some(g => G.has(g)));
			if (!rec) {
				rec = {G: new Set(), V: []};
				recs.push(rec);
			}
			rec.V.push(cp);
			set_add_many(rec.G, gs);
		}
		// per character cache groups which are not a member of the extent
		const union = recs.flatMap(x => Array_from(x.G)); // all of the groups used by this whole
		for (const {G, V} of recs) {
			const complement = new Set(union.filter(g => !G.has(g))); // groups not covered by the extent
			for (const cp of V) {
				M.set(cp, complement); // this is the same reference
			}
		}
	}

	// compute valid set
	// 20230924: VALID was union but can be re-used
	VALID = new Set(); // exists in 1+ groups
	const multi = new Set(); // exists in 2+ groups
	const add_to_union = cp => VALID.has(cp) ? multi.add(cp) : VALID.add(cp);
	for (const g of GROUPS) {
		for (const cp of g.P) add_to_union(cp);
		for (const cp of g.Q) add_to_union(cp);
	}
	// dual purpose WHOLE_MAP: return placeholder if unique non-confusable
	for (const cp of VALID) {
		if (!WHOLE_MAP.has(cp) && !multi.has(cp)) {
			WHOLE_MAP.set(cp, UNIQUE_PH);
		}
	}
	// add all decomposed parts
	// see derive: "Valid is Closed (via Brute-force)"
	set_add_many(VALID, nfd(VALID));

	// decode emoji
	// 20230719: emoji are now fully-expanded to avoid quirk logic
	EMOJI_LIST = read_trie(r).map(v => Emoji.from(v)).sort(compare_arrays);
	EMOJI_ROOT = new Map(); // this has approx 7K nodes (2+ per emoji)
	for (const cps of EMOJI_LIST) {
		// 20230719: change to *slightly* stricter algorithm which disallows
		// insertion of misplaced FE0F in emoji sequences (matching ENSIP-15)
		// example: beautified [A B] (eg. flag emoji)
		//  before: allow: [A FE0F B], error: [A FE0F FE0F B]
		//   after: error: both
		// note: this code now matches ENSNormalize.{cs,java} logic
		let prev = [EMOJI_ROOT];
		for (const cp of cps) {
			const next = prev.map(node => {
				let child = node.get(cp);
				if (!child) {
					// should this be object?
					// (most have 1-2 items, few have many)
					// 20230719: no, v8 default map is 4?
					child = new Map();
					node.set(cp, child);
				}
				return child;
			});
			if (cp === FE0F) {
				// keep the nodes before FE0F as well, so the sequence also matches without it
				prev.push(...next); // less than 20 elements
			} else {
				prev = next;
			}
		}
		// tag every terminal node with the sequence it completes
		for (const x of prev) {
			x.V = cps;
		}
	}
}
|
|||
|
|
|
|||
|
|
/**
 * Describe a code point for an error message.
 * if escaped: {HEX}
 * else: "x" {HEX}
 * @param {number} cp
 * @returns {string}
 */
function quoted_cp(cp) {
	const hex = quote_cp(cp);
	if (should_escape(cp)) return hex;
	return `${bidi_qq(safe_str_from_cps([cp]))} ` + hex;
}
|
|||
|
|
|
|||
|
|
// 20230211: some messages can be mixed-directional and result in spillover
// use 200E after a quoted string to prevent the remainder of the string from
// acquiring the direction of the quote
// https://www.w3.org/International/questions/qa-bidi-unicode-controls#exceptions
/**
 * Wrap a string in double quotes followed by U+200E (LEFT-TO-RIGHT MARK).
 * @param {string} s
 * @returns {string}
 */
function bidi_qq(s) {
	return `"${s}"\u200E`; // strong LTR
}
|
|||
|
|
|
|||
|
|
/**
 * Reject labels whose 3rd and 4th code points are both hyphens (eg. "xn--").
 * @param {number[]} cps - label code points
 * @throws {Error} "invalid label extension" naming the first four characters
 */
function check_label_extension(cps) {
	if (cps.length < 4) return;
	if (cps[2] === HYPHEN && cps[3] === HYPHEN) {
		throw new Error(`invalid label extension: "${str_from_cps(cps.slice(0, 4))}"`); // this can only be ascii so cant be bidi
	}
}
|
|||
|
|
/**
 * Require that every underscore belongs to an unbroken run at the start of the label.
 * @param {number[]} cps - label code points
 * @throws {Error} when an underscore follows any other code point
 */
function check_leading_underscore(cps) {
	const UNDERSCORE = 0x5F;
	// walk backwards from the last underscore: everything before it must be an underscore
	let i = cps.lastIndexOf(UNDERSCORE);
	while (i > 0) {
		i--;
		if (cps[i] !== UNDERSCORE) {
			throw new Error('underscore allowed only at start');
		}
	}
}
|
|||
|
|
/**
 * Check that a fenced cp is not leading, trailing, or touching another fenced cp.
 * FENCED maps a fenced codepoint to the name used in the error message.
 * @param {number[]} cps label codepoints
 * @throws {Error} from error_placement(): "leading X", "X + Y" or "trailing X"
 */
function check_fenced(cps) {
	let prev = FENCED.get(cps[0]);
	if (prev) throw error_placement(`leading ${prev}`);
	const n = cps.length;
	let last = -1; // index just past the latest fenced cp; -1 prevents trailing from throwing
	for (let i = 1; i < n; i++) {
		const match = FENCED.get(cps[i]);
		if (!match) continue;
		// since cps[0] isn't fenced, cps[1] cannot throw
		if (last === i) throw error_placement(`${prev} + ${match}`);
		last = i + 1;
		prev = match;
	}
	if (last === n) throw error_placement(`trailing ${prev}`);
}
|
|||
|
|
|
|||
|
|
// create a safe to print string
// invisibles are escaped
// leading cm uses placeholder
// if cps exceed max, middle truncate with ellipsis
//   (max >> 1 codepoints are kept on each side; max < 2 keeps only the ellipsis)
// quoter(cp) => string, eg. 3000 => "{3000}"
// note: in html, you'd call this function then replace [<>&] with entities
/**
 * @param {number[]} cps codepoints to print
 * @param {number} [max=Infinity] truncate when cps.length exceeds this
 * @param {(cp: number) => string} [quoter=quote_cp] formatter for escaped codepoints
 * @returns {string}
 */
function safe_str_from_cps(cps, max = Infinity, quoter = quote_cp) {
	const buf = [];
	if (is_combining_mark(cps[0])) buf.push('◌');
	let shown = cps;
	if (cps.length > max) {
		const half = Math.max(0, max >> 1);
		// use an explicit start index for the tail: slice(-0) is slice(0) and would keep everything
		shown = [...cps.slice(0, half), 0x2026, ...cps.slice(cps.length - half)];
	}
	let prev = 0; // start of the pending run of printable codepoints
	const n = shown.length;
	for (let i = 0; i < n; i++) {
		const cp = shown[i];
		if (!should_escape(cp)) continue;
		buf.push(str_from_cps(shown.slice(prev, i)), quoter(cp));
		prev = i + 1;
	}
	buf.push(str_from_cps(shown.slice(prev, n)));
	return buf.join('');
}
|
|||
|
|
|
|||
|
|
// note: set(s) cannot be exposed because they can be modified
// note: Object.freeze() doesn't work
/**
 * Membership test against the CM set; calls init() before consulting it.
 * @param {number} cp codepoint
 * @returns {boolean}
 */
function is_combining_mark(cp) {
	init();
	return CM.has(cp);
}
|
|||
|
|
/**
 * Membership test against the ESCAPE set; calls init() before consulting it.
 * @param {number} cp codepoint
 * @returns {boolean}
 */
function should_escape(cp) {
	init();
	return ESCAPE.has(cp);
}
|
|||
|
|
|
|||
|
|
/**
 * Return all supported emoji as fully-qualified emoji,
 * ordered by length then lexicographic.
 * Each entry is a copy, so callers cannot modify the shared EMOJI_LIST entries.
 * @returns {number[][]}
 */
function ens_emoji() {
	init();
	return EMOJI_LIST.map(emoji => emoji.slice()); // emoji are exposed so copy
}
|
|||
|
|
|
|||
|
|
/**
 * Tokenize and re-join each STOP_CH-separated piece of a name fragment.
 * Only tokens_from_str() is applied per label; the label-level checks in split() are not run.
 * @param {string} frag
 * @param {boolean} [decompose] use nfd instead of nfc
 * @returns {string}
 * @throws {Error} whatever tokens_from_str() throws
 */
function ens_normalize_fragment(frag, decompose) {
	init();
	const nf = decompose ? nfd : nfc;
	const normalize_label = label => {
		const tokens = tokens_from_str(explode_cp(label), nf, filter_fe0f);
		return str_from_cps(tokens.flat());
	};
	return frag.split(STOP_CH).map(label => normalize_label(label)).join(STOP_CH);
}
|
|||
|
|
|
|||
|
|
/**
 * Normalize a name: split() with nfc and FE0F-stripped emoji, then flatten().
 * @param {string} name
 * @returns {string}
 * @throws {Error} the first label error, raised by flatten()
 */
function ens_normalize(name) {
	const labels = split(name, nfc, filter_fe0f);
	return flatten(labels);
}
|
|||
|
|
|
|||
|
|
/**
 * Like ens_normalize(), but emoji tokens are kept as-is (identity emoji filter)
 * and ξ (3BE) is replaced by Ξ (39E) in every label whose type is not 'Greek'.
 * @param {string} name
 * @returns {string}
 * @throws {Error} the first label error, raised by flatten()
 */
function ens_beautify(name) {
	const labels = split(name, nfc, x => x); // emoji not exposed
	for (const {type, output, error} of labels) {
		if (error) break; // flatten will throw

		// replace leading/trailing hyphen
		// 20230121: consider beautifying all or leading/trailing hyphen to unicode variant (2010)
		// not exactly the same in every font, but very similar: "-" vs "‐"
		// maybe this should replace all for visual consistency?
		// `node tools/reg-count.js regex ^-\{2,\}` => 592
		// (not implemented)
		// 20230123: WHATWG URL uses "CheckHyphens" false
		// https://url.spec.whatwg.org/#idna

		// update ethereum symbol
		// ξ => Ξ if not greek
		if (type !== 'Greek') array_replace(output, 0x3BE, 0x39E);

		// 20221213: prefixing 200E fixes bidi subdomain issue, but breaks invariant (200E is disallowed)
		// could be fixed with special case for: 2E (.) + 200E (LTR)
		// https://discuss.ens.domains/t/bidi-label-ordering-spoof/15824
		// (not implemented)
	}
	return flatten(labels);
}
|
|||
|
|
|
|||
|
|
/**
 * Replace every occurrence of `a` in `v` with `b`, in place.
 * @param {Array} v array to modify
 * @param {*} a value to find (matched by indexOf)
 * @param {*} b replacement
 */
function array_replace(v, a, b) {
	// resume the search just past each hit so a === b still terminates
	for (let i = v.indexOf(a); i >= 0; i = v.indexOf(a, i + 1)) {
		v[i] = b;
	}
}
|
|||
|
|
|
|||
|
|
/**
 * Expose split() with nfc.
 * @param {string} name
 * @param {boolean} [preserve_emoji] keep emoji tokens intact (copied) instead of stripping FE0F
 * @returns {object[]} one info object per label (see split)
 */
function ens_split(name, preserve_emoji) {
	const ef = preserve_emoji ? x => x.slice() : filter_fe0f; // emoji are exposed so copy
	return split(name, nfc, ef);
}
|
|||
|
|
|
|||
|
|
/**
 * Split a name on STOP_CH and process every label independently.
 * Errors are captured per label (info.error) instead of thrown; flatten() throws them.
 * @param {string} name
 * @param {(cps: number[]) => number[]} nf normalization applied to text runs
 * @param {(emoji: number[]) => number[]} ef filter applied to emoji tokens
 * @returns {object[]} per label: {input, offset, tokens?, output?, emoji?, type?, error?};
 *   fields assigned after the point of failure are absent
 */
function split(name, nf, ef) {
	if (!name) return []; // 20230719: empty name allowance
	init();
	let offset = 0;
	// https://unicode.org/reports/tr46/#Validity_Criteria
	// 4.) "The label must not contain a U+002E ( . ) FULL STOP."
	return name.split(STOP_CH).map(label => {
		const input = explode_cp(label);
		const info = {
			input,
			offset, // codepoint, not substring!
		};
		offset += input.length + 1; // + stop
		try {
			// 1.) "The label must be in Unicode Normalization Form NFC"
			const tokens = info.tokens = tokens_from_str(input, nf, ef);
			const token_count = tokens.length;
			if (!token_count) { // the label was effectively empty (could have had ignored characters)
				// 20230120: change to strict
				// https://discuss.ens.domains/t/ens-name-normalization-2nd/14564/59
				throw new Error(`empty label`);
			}
			const norm = info.output = tokens.flat();
			check_leading_underscore(norm);
			const emoji = info.emoji = token_count > 1 || tokens[0].is_emoji; // same as: tokens.some(x => x.is_emoji);
			let type;
			if (!emoji && norm.every(cp => cp < 0x80)) { // special case for ascii
				// 20230123: matches WHATWG, see note 3.3
				check_label_extension(norm); // only needed for ascii
				// cant have fenced
				// cant have cm
				// cant have wholes
				// see derive: "Fastpath ASCII"
				type = 'ASCII';
			} else {
				const chars = tokens.flatMap(x => x.is_emoji ? [] : x); // all of the nfc tokens concat together
				if (!chars.length) { // theres no text, just emoji
					type = 'Emoji';
				} else {
					// 5.) "The label must not begin with a combining mark, that is: General_Category=Mark."
					if (CM.has(norm[0])) throw error_placement('leading combining mark');
					for (let i = 1; i < token_count; i++) { // we've already checked the first token
						const cps = tokens[i];
						if (!cps.is_emoji && CM.has(cps[0])) { // every text token has emoji neighbors, eg. EtEEEtEt...
							// bidi_qq() not needed since emoji is LTR and cps is a CM
							throw error_placement(`emoji + combining mark: "${str_from_cps(tokens[i-1])} + ${safe_str_from_cps([cps[0]])}"`);
						}
					}
					check_fenced(norm);
					const unique = Array_from(new Set(chars));
					const [g] = determine_group(unique); // take the first match
					// see derive: "Matching Groups have Same CM Style"
					// alternative: could form a hybrid type: Latin/Japanese/...
					check_group(g, chars); // need text in order
					check_whole(g, unique); // only need unique text (order would be required for multiple-char confusables)
					type = g.N;
					// 20230121: consider exposing restricted flag
					// it's simpler to just check for 'Restricted'
					// or even better: type.endsWith(']')
				}
			}
			info.type = type;
		} catch (err) {
			info.error = err; // use full error object
		}
		return info;
	});
}
|
|||
|
|
|
|||
|
|
/**
 * Whole-script confusable check.
 * Returns early if any character is marked unique (UNIQUE_PH) or if the groups
 * confusable with the characters seen so far have an empty intersection.
 * Otherwise throws if some remaining confusable group also contains every
 * character that has no WHOLE_MAP entry.
 * @param {object} group the label's group (only its name N is used, for the message)
 * @param {number[]} unique distinct text codepoints of the label
 * @throws {Error} "whole-script confusable: A/B"
 */
function check_whole(group, unique) {
	let maker; // groups confusable with every confusable character seen so far
	const shared = []; // characters without a WHOLE_MAP entry
	for (const cp of unique) {
		const whole = WHOLE_MAP.get(cp);
		if (whole === UNIQUE_PH) return; // unique, non-confusable
		if (!whole) {
			shared.push(cp);
			continue;
		}
		const set = whole.M.get(cp); // groups which have a character that look-like this character
		maker = maker ? maker.filter(g => set.has(g)) : Array_from(set);
		if (!maker.length) return; // confusable intersection is empty
	}
	if (!maker) return;
	// we have 1+ confusable
	// check if any of the remaining groups
	// contain the shared characters too
	for (const g of maker) {
		if (shared.every(cp => group_has_cp(g, cp))) {
			throw new Error(`whole-script confusable: ${group.N}/${g.N}`);
		}
	}
}
|
|||
|
|
|
|||
|
|
/**
 * Narrow GROUPS to those containing every codepoint, one codepoint at a time.
 * Narrowing stops as soon as a single group remains, so the remaining
 * codepoints are not examined here.
 * assumption: unique is non-empty
 * @param {number[]} unique distinct text codepoints
 * @returns {object[]} list of matching groups (1 or more)
 * @throws {Error} error_disallowed(cp) if no group at all contains cp,
 *   otherwise error_group_member(first remaining group, cp)
 */
function determine_group(unique) {
	let groups = GROUPS;
	for (const cp of unique) {
		// note: we need to dodge CM that are whitelisted
		// but that code isn't currently necessary
		const gs = groups.filter(g => group_has_cp(g, cp));
		if (!gs.length) {
			if (GROUPS.some(g => group_has_cp(g, cp))) {
				// there is no group that contains all these characters
				// throw using the highest priority group that matched
				// https://www.unicode.org/reports/tr39/#mixed_script_confusables
				throw error_group_member(groups[0], cp);
			}
			// the character was composed of valid parts
			// but its NFC form is invalid
			// 20230716: change to more exact statement, see: ENSNormalize.{cs,java}
			// note: this doesn't have to be a composition
			// 20230720: change to full check
			throw error_disallowed(cp); // this should be rare
		}
		groups = gs;
		if (gs.length === 1) break; // there is only one group left
	}
	// there are at least 1 group(s) with all of these characters
	return groups;
}
|
|||
|
|
|
|||
|
|
/**
 * Join the output of every label with STOP_CH; throw on first error.
 * @param {object[]} split labels as returned by split()
 * @returns {string}
 * @throws {Error} the label's error message, prefixed with the (truncated, quoted)
 *   label input unless there is only a single label
 */
function flatten(split) {
	return split.map(({input, error, output}) => {
		if (!error) return str_from_cps(output);
		// don't print label again if just a single label
		const msg = error.message;
		// bidi_qq() only necessary if msg is digits
		throw new Error(split.length === 1 ? msg : `Invalid label ${bidi_qq(safe_str_from_cps(input, 63))}: ${msg}`);
	}).join(STOP_CH);
}
|
|||
|
|
|
|||
|
|
/**
 * Build (not throw) the error for a disallowed codepoint.
 * The codepoint is included in the message via quoted_cp().
 * @param {number} cp codepoint
 * @returns {Error}
 */
function error_disallowed(cp) {
	return new Error(`disallowed character: ${quoted_cp(cp)}`);
}
|
|||
|
|
/**
 * Build (not throw) the "illegal mixture" error for a codepoint that does not belong to group g.
 * If some group lists cp in its primary set (P), that group's name is prepended to the quoted codepoint.
 * @param {object} g the group the label was matched to
 * @param {number} cp offending codepoint
 * @returns {Error}
 */
function error_group_member(g, cp) {
	let quoted = quoted_cp(cp);
	const owner = GROUPS.find(other => other.P.has(cp)); // only check primary
	if (owner) {
		quoted = `${owner.N} ${quoted}`;
	}
	return new Error(`illegal mixture: ${g.N} + ${quoted}`);
}
|
|||
|
|
/**
 * Build (not throw) an "illegal placement" error.
 * @param {string} where description of the offending position
 * @returns {Error}
 */
function error_placement(where) {
	return new Error(`illegal placement: ${where}`);
}
|
|||
|
|
|
|||
|
|
/**
 * Check that every codepoint belongs to group g and, when g.M is set,
 * that runs of non-spacing marks in the decomposed text are acceptable.
 * assumption: cps.length > 0
 * assumption: cps[0] isn't a CM
 * assumption: the previous character isn't an emoji
 * @param {object} g group
 * @param {number[]} cps text codepoints, in order
 * @throws {Error} error_group_member(), "duplicate non-spacing marks" or "excessive non-spacing marks"
 */
function check_group(g, cps) {
	for (const cp of cps) {
		if (!group_has_cp(g, cp)) {
			// for whitelisted scripts, this will throw illegal mixture on invalid cm, eg. "e{300}{300}"
			// at the moment, it's unnecessary to introduce an extra error type
			// until there exists a whitelisted multi-character
			// there are 3 cases:
			// 1. illegal cm for wrong group => mixture error
			// 2. illegal cm for same group => cm error
			//       requires set of whitelist cm per group:
			//        eg. new Set([...g.P, ...g.Q].flatMap(nfc).filter(cp => CM.has(cp)))
			// 3. wrong group => mixture error
			throw error_group_member(g, cp);
		}
	}
	if (g.M) { // we need to check for NSM
		// 20230210: bugfix: using cps instead of decomposed h/t Carbon225
		// 20230217: switch to NSM counting (previously: counted all combining marks against a per-group limit)
		// https://www.unicode.org/reports/tr39/#Optional_Detection
		const decomposed = nfd(cps);
		for (let i = 1, e = decomposed.length; i < e; i++) { // see: assumption
			if (!NSM.has(decomposed[i])) continue;
			// [i, j) becomes the full run of consecutive NSM
			let j = i + 1;
			for (let cp; j < e && NSM.has(cp = decomposed[j]); j++) {
				// a. Forbid sequences of the same nonspacing mark.
				for (let k = i; k < j; k++) { // O(n^2) but n < 100
					if (decomposed[k] === cp) {
						throw new Error(`duplicate non-spacing marks: ${quoted_cp(cp)}`);
					}
				}
			}
			// parse to end so we have full nsm count
			// b. Forbid sequences of more than 4 nonspacing marks (gc=Mn or gc=Me).
			if (j - i > NSM_MAX) {
				// note: this slice starts with a base char or spacing-mark cm
				throw new Error(`excessive non-spacing marks: ${bidi_qq(safe_str_from_cps(decomposed.slice(i-1, j)))} (${j-i}/${NSM_MAX})`);
			}
			i = j; // decomposed[j] is not an NSM (or is past the end), so the loop's i++ may skip it
		}
	}
	// note: a per-group whitelist of combining mark sequences was prototyped here
	// but isn't currently needed
}
|
|||
|
|
|
|||
|
|
/**
 * given a list of codepoints
 * returns a list of lists: runs of text (passed through nf) and emoji (passed through ef)
 * eg. codepoints of "abc💩d" => [nf([61, 62, 63]), ef(Emoji[1F4A9, FE0F]), nf([64])]
 * Valid codepoints are kept, mapped codepoints are replaced by their mapping,
 * ignored codepoints are dropped. The input array is not modified.
 * 20230818: rename for 'process' name collision h/t Javarome
 * https://github.com/adraffy/ens-normalize.js/issues/23
 * @param {number[]} input codepoints
 * @param {(cps: number[]) => number[]} nf normalization applied to each text run
 * @param {(emoji: number[]) => number[]} ef filter applied to each emoji
 * @returns {number[][]}
 * @throws {Error} error_disallowed(cp) for a codepoint that is not valid, mapped or ignored
 */
function tokens_from_str(input, nf, ef) {
	const ret = [];
	let chars = []; // pending text run
	const stack = input.slice().reverse(); // flip so we can pop
	while (stack.length) {
		const emoji = consume_emoji_reversed(stack);
		if (emoji) {
			if (chars.length) {
				ret.push(nf(chars));
				chars = [];
			}
			ret.push(ef(emoji));
			continue;
		}
		const cp = stack.pop();
		if (VALID.has(cp)) {
			chars.push(cp);
			continue;
		}
		const cps = MAPPED.get(cp);
		if (cps) {
			chars.push(...cps); // less than 10 elements
		} else if (!IGNORED.has(cp)) {
			// 20230912: unicode 15.1 changed the order of processing such that
			// disallowed parts are only rejected after NFC
			// https://unicode.org/reports/tr46/#Validity_Criteria
			// this doesn't impact normalization as of today
			// technically, this error can be removed as the group logic will apply similar logic
			// however the error type might be less clear
			throw error_disallowed(cp);
		}
	}
	if (chars.length) {
		ret.push(nf(chars));
	}
	return ret;
}
|
|||
|
|
|
|||
|
|
/**
 * Return a copy of cps without any FE0F codepoints.
 * @param {number[]} cps
 * @returns {number[]}
 */
function filter_fe0f(cps) {
	return cps.filter(cp => cp !== FE0F);
}
|
|||
|
|
|
|||
|
|
/**
 * given array of codepoints (reversed, so the next codepoint is the last element)
 * returns the longest valid emoji sequence (or undefined if no match)
 * *MUTATES* the supplied array: the matched codepoints are removed from its end
 * disallows interleaved ignored characters
 * fills (optional) eaten array with matched codepoints
 * @param {number[]} cps reversed codepoints
 * @param {number[]} [eaten] receives the matched input codepoints in forward order
 * @returns {number[]|undefined} the V stored on the deepest matching trie node
 */
function consume_emoji_reversed(cps, eaten) {
	let node = EMOJI_ROOT;
	let emoji;
	for (let pos = cps.length; pos > 0; ) {
		node = node.get(cps[--pos]);
		if (!node) break;
		const {V} = node;
		if (!V) continue; // prefix of a longer emoji, keep walking
		// this is a valid emoji (so far)
		emoji = V;
		// cps was already truncated at the previous match, so this slice holds only the newly matched part
		if (eaten) eaten.push(...cps.slice(pos).reverse()); // (optional) copy input, used for ens_tokenize()
		cps.length = pos; // truncate
	}
	return emoji;
}
|
|||
|
|
|
|||
|
|
// ************************************************************
// tokenizer

// values of the `type` field on tokens produced by ens_tokenize()
const TY_VALID = 'valid';
const TY_MAPPED = 'mapped';
const TY_IGNORED = 'ignored';
const TY_DISALLOWED = 'disallowed';
const TY_EMOJI = 'emoji';
const TY_NFC = 'nfc';
const TY_STOP = 'stop';
|
|||
|
|
|
|||
|
|
/**
 * Tokenize a name without throwing on disallowed input.
 * Token shapes:
 *   {type: 'emoji', emoji, input, cps}   {type: 'stop', cp}
 *   {type: 'valid', cps}                 {type: 'ignored', cp}
 *   {type: 'mapped', cp, cps}            {type: 'disallowed', cp}
 *   {type: 'nfc', input, cps, tokens0, tokens}   (only when nf is enabled)
 * @param {string} name
 * @param {{nf?: boolean}} [options] nf: collapse unnormalized runs into a single 'nfc' token (default true)
 * @returns {object[]} tokens, with adjacent 'valid' tokens merged
 */
function ens_tokenize(name, {
	nf = true, // collapse unnormalized runs into a single token
} = {}) {
	init();
	const input = explode_cp(name).reverse(); // flip so we can pop
	let eaten = [];
	const tokens = [];
	while (input.length) {
		const emoji = consume_emoji_reversed(input, eaten);
		if (emoji) {
			tokens.push({
				type: TY_EMOJI,
				emoji: emoji.slice(), // copy emoji
				input: eaten,
				cps: filter_fe0f(emoji)
			});
			eaten = []; // reset buffer
			continue;
		}
		const cp = input.pop();
		if (cp === STOP) {
			tokens.push({type: TY_STOP, cp});
		} else if (VALID.has(cp)) {
			tokens.push({type: TY_VALID, cps: [cp]});
		} else if (IGNORED.has(cp)) {
			tokens.push({type: TY_IGNORED, cp});
		} else {
			const cps = MAPPED.get(cp);
			if (cps) {
				tokens.push({type: TY_MAPPED, cp, cps: cps.slice()});
			} else {
				tokens.push({type: TY_DISALLOWED, cp});
			}
		}
	}
	if (nf) {
		// start: index of the text token directly preceding the current one (ignored tokens allowed between), or -1
		for (let i = 0, start = -1; i < tokens.length; i++) {
			const token = tokens[i];
			if (is_valid_or_mapped(token.type)) {
				if (requires_check(token.cps)) { // normalization might be needed
					let end = i + 1;
					for (let pos = end; pos < tokens.length; pos++) { // find adjacent text
						const {type, cps} = tokens[pos];
						if (is_valid_or_mapped(type)) {
							if (!requires_check(cps)) break;
							end = pos + 1;
						} else if (type !== TY_IGNORED) { // || type !== TY_DISALLOWED) {
							break;
						}
					}
					if (start < 0) start = i;
					const slice = tokens.slice(start, end);
					const cps0 = slice.flatMap(x => is_valid_or_mapped(x.type) ? x.cps : []); // strip junk tokens
					const cps = nfc(cps0);
					if (compare_arrays(cps, cps0)) { // bundle into an nfc token
						tokens.splice(start, end - start, {
							type: TY_NFC,
							input: cps0, // there are 3 states: tokens0 ==(process)=> input ==(nfc)=> tokens/cps
							cps,
							tokens0: collapse_valid_tokens(slice),
							tokens: ens_tokenize(str_from_cps(cps), {nf: false})
						});
						i = start;
					} else {
						i = end - 1; // skip to end of slice
					}
					start = -1; // reset
				} else {
					start = i; // remember last
				}
			} else if (token.type !== TY_IGNORED) { // 20221024: is this correct?
				start = -1; // reset
			}
		}
	}
	return collapse_valid_tokens(tokens);
}
|
|||
|
|
|
|||
|
|
/**
 * @param {string} type token type
 * @returns {boolean} true for the two token types that carry text codepoints (TY_VALID, TY_MAPPED)
 */
function is_valid_or_mapped(type) {
	return type === TY_VALID || type === TY_MAPPED;
}
|
|||
|
|
|
|||
|
|
/**
 * @param {number[]} cps
 * @returns {boolean} true if any codepoint is in NFC_CHECK
 */
function requires_check(cps) {
	return cps.some(cp => NFC_CHECK.has(cp));
}
|
|||
|
|
|
|||
|
|
/**
 * Merge every run of adjacent 'valid' tokens into one token, in place.
 * A run of length 1 is also replaced, by a fresh token object with a copied cps array.
 * @param {object[]} tokens modified in place
 * @returns {object[]} the same array
 */
function collapse_valid_tokens(tokens) {
	for (let i = 0; i < tokens.length; i++) {
		if (tokens[i].type !== TY_VALID) continue;
		let j = i + 1;
		while (j < tokens.length && tokens[j].type === TY_VALID) j++;
		const run = tokens.slice(i, j);
		tokens.splice(i, j - i, {type: TY_VALID, cps: run.flatMap(x => x.cps)});
	}
	return tokens;
}
|
|||
|
|
|
|||
|
|
// public API (CommonJS)
exports.ens_beautify = ens_beautify;
exports.ens_emoji = ens_emoji;
exports.ens_normalize = ens_normalize;
exports.ens_normalize_fragment = ens_normalize_fragment;
exports.ens_split = ens_split;
exports.ens_tokenize = ens_tokenize;
exports.is_combining_mark = is_combining_mark;
exports.nfc = nfc;
exports.nfd = nfd;
exports.safe_str_from_cps = safe_str_from_cps;
exports.should_escape = should_escape;
|