diff options
Diffstat (limited to 'etc/charsets')
| -rw-r--r-- | etc/charsets/gb18030-2.awk | 73 | ||||
| -rw-r--r-- | etc/charsets/gb18030-4.awk | 108 |
2 files changed, 82 insertions, 99 deletions
diff --git a/etc/charsets/gb18030-2.awk b/etc/charsets/gb18030-2.awk index 5b461fdfaa0..94d0a9e410a 100644 --- a/etc/charsets/gb18030-2.awk +++ b/etc/charsets/gb18030-2.awk | |||
| @@ -11,6 +11,10 @@ BEGIN { | |||
| 11 | tohex["d"] = 13; | 11 | tohex["d"] = 13; |
| 12 | tohex["e"] = 14; | 12 | tohex["e"] = 14; |
| 13 | tohex["f"] = 15; | 13 | tohex["f"] = 15; |
| 14 | from_gb = 0; | ||
| 15 | to_gb = -1; | ||
| 16 | to_unicode = 0; | ||
| 17 | from_unicode = 0; | ||
| 14 | } | 18 | } |
| 15 | 19 | ||
| 16 | function decode_hex(str) { | 20 | function decode_hex(str) { |
| @@ -30,56 +34,47 @@ function decode_hex(str) { | |||
| 30 | function gb_to_index(gb) { | 34 | function gb_to_index(gb) { |
| 31 | b0 = int(gb / 256); | 35 | b0 = int(gb / 256); |
| 32 | b1 = gb % 256; | 36 | b1 = gb % 256; |
| 33 | idx = (((b0 - 129)) * 190 + b1 - 64); | 37 | idx = (((b0 - 129)) * 191 + b1 - 64); |
| 34 | if (b1 >= 128) | 38 | # if (b1 >= 128) |
| 35 | idx--; | 39 | # idx--; |
| 36 | return idx | 40 | return idx |
| 37 | } | 41 | } |
| 38 | 42 | ||
| 39 | function index_to_gb(idx) { | 43 | function index_to_gb(idx) { |
| 40 | b0 = int(idx / 190) + 129; | 44 | b0 = int(idx / 191) + 129; |
| 41 | b1 = (idx % 190) + 64; | 45 | b1 = (idx % 191) + 64; |
| 42 | if (b1 >= 127) | 46 | # if (b1 >= 127) |
| 43 | b1++; | 47 | # b1++; |
| 44 | return (b0 * 256 + b1); | 48 | return (b0 * 256 + b1); |
| 45 | } | 49 | } |
| 46 | function decode_gb(str) { | 50 | |
| 47 | b0 = decode_hex(substr(str, 3, 2)); | 51 | /^\#/ { |
| 48 | b1 = decode_hex(substr(str, 7, 2)); | 52 | print; |
| 49 | return (b0 * 256 + b1) | 53 | next; |
| 50 | } | 54 | } |
| 51 | 55 | ||
| 52 | /^<U[0-9A-F][0-9A-F][0-9A-F][0-9A-F]>/ { | 56 | { |
| 53 | if ($2 ~ /^\\x[0-9A-F][0-9A-F]\\x[0-9A-F][0-9A-F]$/) | 57 | gb = gb_to_index(decode_hex(substr($1, 3, 4))); |
| 58 | unicode = decode_hex(substr($2, 3, 4)); | ||
| 59 | if ((gb == to_gb + 1) && (unicode == to_unicode + 1)) | ||
| 60 | { | ||
| 61 | to_gb++; | ||
| 62 | to_unicode++; | ||
| 63 | } | ||
| 64 | else | ||
| 54 | { | 65 | { |
| 55 | unicode = decode_hex(substr($1, 3, 4)); | 66 | if (from_gb == to_gb) |
| 56 | gb = decode_gb($2); | 67 | printf "0x%04X 0x%04X\n", index_to_gb(from_gb), from_unicode; |
| 57 | idx = gb_to_index(gb); | 68 | else if (from_gb < to_gb) |
| 58 | gb_table[idx] = unicode; | 69 | printf "0x%04X-0x%04X 0x%04X\n", |
| 70 | index_to_gb(from_gb), index_to_gb(to_gb), from_unicode; | ||
| 71 | from_gb = to_gb = gb; | ||
| 72 | from_unicode = to_unicode = unicode; | ||
| 59 | } | 73 | } |
| 60 | } | 74 | } |
| 61 | 75 | ||
| 62 | END { | 76 | END { |
| 63 | last_idx = gb_to_index(decode_hex("FEFE")); | 77 | if (from_gb <= to_gb) |
| 64 | from_idx = 0; | 78 | printf "0x%04X-0x%04X 0x%04X\n", |
| 65 | from_unicode = gb_table[0]; | 79 | index_to_gb(from_gb), index_to_gb(to_gb), from_unicode; |
| 66 | for (i = 1; i <= last_idx; i++) | ||
| 67 | { | ||
| 68 | gb = index_to_gb(i); | ||
| 69 | unicode = gb_table[i]; | ||
| 70 | if (i - from_idx != unicode - from_unicode) | ||
| 71 | { | ||
| 72 | if (i - 1 == from_idx) | ||
| 73 | printf ("0x%04X 0x%04X\n", | ||
| 74 | index_to_gb(from_idx), from_unicode); | ||
| 75 | else | ||
| 76 | printf ("0x%04X-0x%04X 0x%04X\n", | ||
| 77 | index_to_gb(from_idx), index_to_gb(i - 1), from_unicode); | ||
| 78 | from_idx = i; | ||
| 79 | from_unicode=unicode; | ||
| 80 | } | ||
| 81 | } | ||
| 82 | if (i - from_idx != unicode - from_unicode) | ||
| 83 | printf ("0x%04X-0x%04X 0x%04X\n", | ||
| 84 | index_to_gb(from_idx), index_to_gb(i - 1), from_unicode); | ||
| 85 | } | 80 | } |
diff --git a/etc/charsets/gb18030-4.awk b/etc/charsets/gb18030-4.awk index 74780458687..9dac34bceff 100644 --- a/etc/charsets/gb18030-4.awk +++ b/etc/charsets/gb18030-4.awk | |||
| @@ -27,88 +27,76 @@ function decode_hex(str) { | |||
| 27 | return n; | 27 | return n; |
| 28 | } | 28 | } |
| 29 | 29 | ||
| 30 | function gb_to_index(b0,b1,b2,b3) { | 30 | function gb_to_index(gb) { |
| 31 | return ((((b0 - 129) * 10 + (b1 - 48)) * 126 + (b2 - 129)) * 10 + b3 - 48); | 31 | b0 = int(gb / 256); |
| 32 | b1 = gb % 256; | ||
| 33 | idx = (((b0 - 129)) * 191 + b1 - 64); | ||
| 34 | # if (b1 >= 127) | ||
| 35 | # idx--; | ||
| 36 | return idx | ||
| 32 | } | 37 | } |
| 33 | 38 | ||
| 34 | function index_to_gb(idx) { | 39 | function index_to_gb(idx) { |
| 35 | b3 = (idx % 10) + 48; | 40 | b3 = (idx % 10) + 48; |
| 36 | idx /= 10; | 41 | idx = int(idx / 10); |
| 37 | b2 = (idx % 126) + 129; | 42 | b2 = (idx % 126) + 129; |
| 38 | idx /= 126; | 43 | idx = int(idx / 126); |
| 39 | b1 = (idx % 10) + 48; | 44 | b1 = (idx % 10) + 48; |
| 40 | b0 = (idx / 10) + 129; | 45 | b0 = int(idx / 10) + 129; |
| 41 | return sprintf("%02X%02X%02X%02X", b0, b1, b2, b3); | 46 | return sprintf("%02X%02X%02X%02X", b0, b1, b2, b3); |
| 42 | } | 47 | } |
| 43 | 48 | ||
| 44 | function decode_gb(str) { | 49 | /^\#/ { |
| 45 | b0 = decode_hex(substr(str, 3, 2)); | 50 | print; |
| 46 | b1 = decode_hex(substr(str, 7, 2)); | 51 | next; |
| 47 | b2 = decode_hex(substr(str, 11, 2)); | ||
| 48 | b3 = decode_hex(substr(str, 15, 2)); | ||
| 49 | return gb_to_index(b0, b1, b2, b3); | ||
| 50 | } | 52 | } |
| 51 | 53 | ||
| 52 | function printline(from, to) { | 54 | /0x....-0x..../ { |
| 53 | fromgb = index_to_gb(from); | 55 | gb_from = gb_to_index(decode_hex(substr($1, 3, 4))); |
| 54 | fromuni = gbtable[from]; | 56 | gb_to = gb_to_index(decode_hex(substr($1, 10, 4))); |
| 55 | if (from == to) | 57 | unicode = decode_hex(substr($2, 3, 4)); |
| 56 | printf ("0x%s 0x%04X\n", fromgb, fromuni); | 58 | while (gb_from <= gb_to) |
| 57 | else | 59 | { |
| 58 | printf ("0x%s-0x%s 0x%04X\n", fromgb, index_to_gb(to), fromuni); | 60 | table[unicode++] = 1; |
| 61 | gb_from++; | ||
| 62 | } | ||
| 63 | next; | ||
| 59 | } | 64 | } |
| 60 | 65 | ||
| 61 | /^<U[0-9A-F][0-9A-F][0-9A-F][0-9A-F]>/ { | 66 | { |
| 62 | unicode = decode_hex(substr($1, 3, 4)); | 67 | gb = decode_hex(substr($1, 3, 4)); |
| 63 | if ($2 ~ /\\x8[1-4]\\x3[0-9]\\x[8-9A-F][0-9A-F]\\x3[0-9]/) | 68 | unicode = decode_hex(substr($2, 3, 4)); |
| 64 | unitable[unicode] = decode_gb($2); | 69 | table[unicode] = 1; |
| 65 | else | ||
| 66 | unitable[unicode] = -1; | ||
| 67 | } | 70 | } |
| 68 | 71 | ||
| 69 | END { | 72 | END { |
| 70 | lastgb = 0; | 73 | from_gb = -1; |
| 71 | surrogate_min = decode_hex("D800"); | 74 | to_gb = 0; |
| 72 | surrogate_max = decode_hex("DFFF"); | 75 | from_i = 0; |
| 73 | lastgb = unitable[128]; | 76 | table[65536] = 1; |
| 74 | gbtable[lastgb] = 128; | 77 | for (i = 128; i <= 65536; i++) |
| 75 | for (i = 129; i < 65536; i++) | ||
| 76 | { | 78 | { |
| 77 | if (unitable[i] == 0 && (i < surrogate_min || i > surrogate_max)) | 79 | if (table[i] == 0) |
| 78 | { | ||
| 79 | lastgb++; | ||
| 80 | gbtable[lastgb] = i; | ||
| 81 | unitable[i] = lastgb; | ||
| 82 | } | ||
| 83 | else if (unitable[i] > 0) | ||
| 84 | { | 80 | { |
| 85 | lastgb = unitable[i]; | 81 | if (i < 55296 || i >= 57344) |
| 86 | gbtable[lastgb] = i; | ||
| 87 | } | ||
| 88 | } | ||
| 89 | |||
| 90 | fromgb = lastgb = unitable[128]; | ||
| 91 | for (i = 129; i < 65536; i++) | ||
| 92 | { | ||
| 93 | if (unitable[i] > 0) | ||
| 94 | { | ||
| 95 | if (lastgb + 1 == unitable[i]) | ||
| 96 | { | ||
| 97 | lastgb++; | ||
| 98 | } | ||
| 99 | else | ||
| 100 | { | 82 | { |
| 101 | if (lastgb >= 0) | 83 | if (from_gb < 0) |
| 102 | printline(fromgb, lastgb); | 84 | { |
| 103 | fromgb = lastgb = unitable[i]; | 85 | from_gb = to_gb; |
| 86 | from_i = i; | ||
| 87 | } | ||
| 88 | to_gb++; | ||
| 104 | } | 89 | } |
| 105 | } | 90 | } |
| 106 | else # i.e. (unitable[i] < 0) | 91 | else if (from_gb >= 0) |
| 107 | { | 92 | { |
| 108 | if (lastgb >= 0) | 93 | if (from_gb + 1 == to_gb) |
| 109 | printline(fromgb, lastgb); | 94 | printf "0x%s\t\t0x%04X\n", |
| 110 | lastgb = -1; | 95 | index_to_gb(from_gb), from_i; |
| 96 | else | ||
| 97 | printf "0x%s-0x%s\t0x%04X\n", | ||
| 98 | index_to_gb(from_gb), index_to_gb(to_gb - 1), from_i; | ||
| 99 | from_gb = -1; | ||
| 111 | } | 100 | } |
| 112 | } | 101 | } |
| 113 | printline(fromgb, unitable[65535]); | ||
| 114 | } | 102 | } |