aboutsummaryrefslogtreecommitdiffstats
path: root/etc/charsets
diff options
context:
space:
mode:
Diffstat (limited to 'etc/charsets')
-rw-r--r--etc/charsets/gb18030-2.awk73
-rw-r--r--etc/charsets/gb18030-4.awk108
2 files changed, 82 insertions, 99 deletions
diff --git a/etc/charsets/gb18030-2.awk b/etc/charsets/gb18030-2.awk
index 5b461fdfaa0..94d0a9e410a 100644
--- a/etc/charsets/gb18030-2.awk
+++ b/etc/charsets/gb18030-2.awk
@@ -11,6 +11,10 @@ BEGIN {
11 tohex["d"] = 13; 11 tohex["d"] = 13;
12 tohex["e"] = 14; 12 tohex["e"] = 14;
13 tohex["f"] = 15; 13 tohex["f"] = 15;
14 from_gb = 0;
15 to_gb = -1;
16 to_unicode = 0;
17 from_unicode = 0;
14} 18}
15 19
16function decode_hex(str) { 20function decode_hex(str) {
@@ -30,56 +34,47 @@ function decode_hex(str) {
30function gb_to_index(gb) { 34function gb_to_index(gb) {
31 b0 = int(gb / 256); 35 b0 = int(gb / 256);
32 b1 = gb % 256; 36 b1 = gb % 256;
33 idx = (((b0 - 129)) * 190 + b1 - 64); 37 idx = (((b0 - 129)) * 191 + b1 - 64);
34 if (b1 >= 128) 38# if (b1 >= 128)
35 idx--; 39# idx--;
36 return idx 40 return idx
37} 41}
38 42
39function index_to_gb(idx) { 43function index_to_gb(idx) {
40 b0 = int(idx / 190) + 129; 44 b0 = int(idx / 191) + 129;
41 b1 = (idx % 190) + 64; 45 b1 = (idx % 191) + 64;
42 if (b1 >= 127) 46# if (b1 >= 127)
43 b1++; 47# b1++;
44 return (b0 * 256 + b1); 48 return (b0 * 256 + b1);
45} 49}
46function decode_gb(str) { 50
47 b0 = decode_hex(substr(str, 3, 2)); 51/^\#/ {
48 b1 = decode_hex(substr(str, 7, 2)); 52 print;
49 return (b0 * 256 + b1) 53 next;
50} 54}
51 55
52/^<U[0-9A-F][0-9A-F][0-9A-F][0-9A-F]>/ { 56{
53 if ($2 ~ /^\\x[0-9A-F][0-9A-F]\\x[0-9A-F][0-9A-F]$/) 57 gb = gb_to_index(decode_hex(substr($1, 3, 4)));
58 unicode = decode_hex(substr($2, 3, 4));
59 if ((gb == to_gb + 1) && (unicode == to_unicode + 1))
60 {
61 to_gb++;
62 to_unicode++;
63 }
64 else
54 { 65 {
55 unicode = decode_hex(substr($1, 3, 4)); 66 if (from_gb == to_gb)
56 gb = decode_gb($2); 67 printf "0x%04X 0x%04X\n", index_to_gb(from_gb), from_unicode;
57 idx = gb_to_index(gb); 68 else if (from_gb < to_gb)
58 gb_table[idx] = unicode; 69 printf "0x%04X-0x%04X 0x%04X\n",
70 index_to_gb(from_gb), index_to_gb(to_gb), from_unicode;
71 from_gb = to_gb = gb;
72 from_unicode = to_unicode = unicode;
59 } 73 }
60} 74}
61 75
62END { 76END {
63 last_idx = gb_to_index(decode_hex("FEFE")); 77 if (from_gb <= to_gb)
64 from_idx = 0; 78 printf "0x%04X-0x%04X 0x%04X\n",
65 from_unicode = gb_table[0]; 79 index_to_gb(from_gb), index_to_gb(to_gb), from_unicode;
66 for (i = 1; i <= last_idx; i++)
67 {
68 gb = index_to_gb(i);
69 unicode = gb_table[i];
70 if (i - from_idx != unicode - from_unicode)
71 {
72 if (i - 1 == from_idx)
73 printf ("0x%04X 0x%04X\n",
74 index_to_gb(from_idx), from_unicode);
75 else
76 printf ("0x%04X-0x%04X 0x%04X\n",
77 index_to_gb(from_idx), index_to_gb(i - 1), from_unicode);
78 from_idx = i;
79 from_unicode=unicode;
80 }
81 }
82 if (i - from_idx != unicode - from_unicode)
83 printf ("0x%04X-0x%04X 0x%04X\n",
84 index_to_gb(from_idx), index_to_gb(i - 1), from_unicode);
85} 80}
diff --git a/etc/charsets/gb18030-4.awk b/etc/charsets/gb18030-4.awk
index 74780458687..9dac34bceff 100644
--- a/etc/charsets/gb18030-4.awk
+++ b/etc/charsets/gb18030-4.awk
@@ -27,88 +27,76 @@ function decode_hex(str) {
27 return n; 27 return n;
28} 28}
29 29
30function gb_to_index(b0,b1,b2,b3) { 30function gb_to_index(gb) {
31 return ((((b0 - 129) * 10 + (b1 - 48)) * 126 + (b2 - 129)) * 10 + b3 - 48); 31 b0 = int(gb / 256);
32 b1 = gb % 256;
33 idx = (((b0 - 129)) * 191 + b1 - 64);
34# if (b1 >= 127)
35# idx--;
36 return idx
32} 37}
33 38
34function index_to_gb(idx) { 39function index_to_gb(idx) {
35 b3 = (idx % 10) + 48; 40 b3 = (idx % 10) + 48;
36 idx /= 10; 41 idx = int(idx / 10);
37 b2 = (idx % 126) + 129; 42 b2 = (idx % 126) + 129;
38 idx /= 126; 43 idx = int(idx / 126);
39 b1 = (idx % 10) + 48; 44 b1 = (idx % 10) + 48;
40 b0 = (idx / 10) + 129; 45 b0 = int(idx / 10) + 129;
41 return sprintf("%02X%02X%02X%02X", b0, b1, b2, b3); 46 return sprintf("%02X%02X%02X%02X", b0, b1, b2, b3);
42} 47}
43 48
44function decode_gb(str) { 49/^\#/ {
45 b0 = decode_hex(substr(str, 3, 2)); 50 print;
46 b1 = decode_hex(substr(str, 7, 2)); 51 next;
47 b2 = decode_hex(substr(str, 11, 2));
48 b3 = decode_hex(substr(str, 15, 2));
49 return gb_to_index(b0, b1, b2, b3);
50} 52}
51 53
52function printline(from, to) { 54/0x....-0x..../ {
53 fromgb = index_to_gb(from); 55 gb_from = gb_to_index(decode_hex(substr($1, 3, 4)));
54 fromuni = gbtable[from]; 56 gb_to = gb_to_index(decode_hex(substr($1, 10, 4)));
55 if (from == to) 57 unicode = decode_hex(substr($2, 3, 4));
56 printf ("0x%s 0x%04X\n", fromgb, fromuni); 58 while (gb_from <= gb_to)
57 else 59 {
58 printf ("0x%s-0x%s 0x%04X\n", fromgb, index_to_gb(to), fromuni); 60 table[unicode++] = 1;
61 gb_from++;
62 }
63 next;
59} 64}
60 65
61/^<U[0-9A-F][0-9A-F][0-9A-F][0-9A-F]>/ { 66{
62 unicode = decode_hex(substr($1, 3, 4)); 67 gb = decode_hex(substr($1, 3, 4));
63 if ($2 ~ /\\x8[1-4]\\x3[0-9]\\x[8-9A-F][0-9A-F]\\x3[0-9]/) 68 unicode = decode_hex(substr($2, 3, 4));
64 unitable[unicode] = decode_gb($2); 69 table[unicode] = 1;
65 else
66 unitable[unicode] = -1;
67} 70}
68 71
69END { 72END {
70 lastgb = 0; 73 from_gb = -1;
71 surrogate_min = decode_hex("D800"); 74 to_gb = 0;
72 surrogate_max = decode_hex("DFFF"); 75 from_i = 0;
73 lastgb = unitable[128]; 76 table[65536] = 1;
74 gbtable[lastgb] = 128; 77 for (i = 128; i <= 65536; i++)
75 for (i = 129; i < 65536; i++)
76 { 78 {
77 if (unitable[i] == 0 && (i < surrogate_min || i > surrogate_max)) 79 if (table[i] == 0)
78 {
79 lastgb++;
80 gbtable[lastgb] = i;
81 unitable[i] = lastgb;
82 }
83 else if (unitable[i] > 0)
84 { 80 {
85 lastgb = unitable[i]; 81 if (i < 55296 || i >= 57344)
86 gbtable[lastgb] = i;
87 }
88 }
89
90 fromgb = lastgb = unitable[128];
91 for (i = 129; i < 65536; i++)
92 {
93 if (unitable[i] > 0)
94 {
95 if (lastgb + 1 == unitable[i])
96 {
97 lastgb++;
98 }
99 else
100 { 82 {
101 if (lastgb >= 0) 83 if (from_gb < 0)
102 printline(fromgb, lastgb); 84 {
103 fromgb = lastgb = unitable[i]; 85 from_gb = to_gb;
86 from_i = i;
87 }
88 to_gb++;
104 } 89 }
105 } 90 }
106 else # i.e. (unitable[i] < 0) 91 else if (from_gb >= 0)
107 { 92 {
108 if (lastgb >= 0) 93 if (from_gb + 1 == to_gb)
109 printline(fromgb, lastgb); 94 printf "0x%s\t\t0x%04X\n",
110 lastgb = -1; 95 index_to_gb(from_gb), from_i;
96 else
97 printf "0x%s-0x%s\t0x%04X\n",
98 index_to_gb(from_gb), index_to_gb(to_gb - 1), from_i;
99 from_gb = -1;
111 } 100 }
112 } 101 }
113 printline(fromgb, unitable[65535]);
114} 102}