diff options
| author | Kenichi Handa | 2008-01-09 06:05:23 +0000 |
|---|---|---|
| committer | Kenichi Handa | 2008-01-09 06:05:23 +0000 |
| commit | 36a04480a59ed8a3dccdab99620b4d73cb67ab66 (patch) | |
| tree | 32cd30adeef5c6cf29ee84ad7b6963ee6369bc90 /src | |
| parent | ca8dfeda7c6edcfeacb95a2e2d3b5dcb3841886c (diff) | |
| download | emacs-36a04480a59ed8a3dccdab99620b4d73cb67ab66.tar.gz emacs-36a04480a59ed8a3dccdab99620b4d73cb67ab66.zip | |
(detect_coding_iso2022): New arg latin_extra_code_state. Allow Latin
extra codes only when *latin_extra_code_state is nonzero.
(detect_coding_mask): If there is a null byte, detect the encoding as
UTF-16 or binary. If there is a Latin extra code, detect the encoding
as ISO-2022 only when no other proper encoding is found.
Diffstat (limited to 'src')
| -rw-r--r-- | src/coding.c | 113 |
1 files changed, 89 insertions, 24 deletions
diff --git a/src/coding.c b/src/coding.c
index 0e8a75647a2..4e4147370cb 100644
--- a/src/coding.c
+++ b/src/coding.c
| @@ -1406,12 +1406,17 @@ enum iso_code_class_type iso_code_class[256]; | |||
| 1406 | CODING_CATEGORY_MASK_ISO_7_ELSE | 1406 | CODING_CATEGORY_MASK_ISO_7_ELSE |
| 1407 | CODING_CATEGORY_MASK_ISO_8_ELSE | 1407 | CODING_CATEGORY_MASK_ISO_8_ELSE |
| 1408 | are set. If a code which should never appear in ISO2022 is found, | 1408 | are set. If a code which should never appear in ISO2022 is found, |
| 1409 | returns 0. */ | 1409 | returns 0. |
| 1410 | |||
| 1411 | If *latin_extra_code_state is zero and Latin extra codes are found, | ||
| 1412 | set *latin_extra_code_state to 1 and return 0. If it is nonzero, | ||
| 1413 | accept Latin extra codes. */ | ||
| 1410 | 1414 | ||
| 1411 | static int | 1415 | static int |
| 1412 | detect_coding_iso2022 (src, src_end, multibytep) | 1416 | detect_coding_iso2022 (src, src_end, multibytep, latin_extra_code_state) |
| 1413 | unsigned char *src, *src_end; | 1417 | unsigned char *src, *src_end; |
| 1414 | int multibytep; | 1418 | int multibytep; |
| 1419 | int *latin_extra_code_state; | ||
| 1415 | { | 1420 | { |
| 1416 | int mask = CODING_CATEGORY_MASK_ISO; | 1421 | int mask = CODING_CATEGORY_MASK_ISO; |
| 1417 | int mask_found = 0; | 1422 | int mask_found = 0; |
| @@ -1574,6 +1579,11 @@ detect_coding_iso2022 (src, src_end, multibytep) | |||
| 1574 | if (VECTORP (Vlatin_extra_code_table) | 1579 | if (VECTORP (Vlatin_extra_code_table) |
| 1575 | && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c])) | 1580 | && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c])) |
| 1576 | { | 1581 | { |
| 1582 | if (! *latin_extra_code_state) | ||
| 1583 | { | ||
| 1584 | *latin_extra_code_state = 1; | ||
| 1585 | return 0; | ||
| 1586 | } | ||
| 1577 | if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags | 1587 | if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags |
| 1578 | & CODING_FLAG_ISO_LATIN_EXTRA) | 1588 | & CODING_FLAG_ISO_LATIN_EXTRA) |
| 1579 | newmask |= CODING_CATEGORY_MASK_ISO_8_1; | 1589 | newmask |= CODING_CATEGORY_MASK_ISO_8_1; |
| @@ -1600,6 +1610,11 @@ detect_coding_iso2022 (src, src_end, multibytep) | |||
| 1600 | { | 1610 | { |
| 1601 | int newmask = 0; | 1611 | int newmask = 0; |
| 1602 | 1612 | ||
| 1613 | if (! *latin_extra_code_state) | ||
| 1614 | { | ||
| 1615 | *latin_extra_code_state = 1; | ||
| 1616 | return 0; | ||
| 1617 | } | ||
| 1603 | if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags | 1618 | if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags |
| 1604 | & CODING_FLAG_ISO_LATIN_EXTRA) | 1619 | & CODING_FLAG_ISO_LATIN_EXTRA) |
| 1605 | newmask |= CODING_CATEGORY_MASK_ISO_8_1; | 1620 | newmask |= CODING_CATEGORY_MASK_ISO_8_1; |
| @@ -4127,6 +4142,8 @@ detect_coding_mask (source, src_bytes, priorities, skip, multibytep) | |||
| 4127 | unsigned char *src = source, *src_end = source + src_bytes; | 4142 | unsigned char *src = source, *src_end = source + src_bytes; |
| 4128 | unsigned int mask, utf16_examined_p, iso2022_examined_p; | 4143 | unsigned int mask, utf16_examined_p, iso2022_examined_p; |
| 4129 | int i; | 4144 | int i; |
| 4145 | int null_byte_found; | ||
| 4146 | int latin_extra_code_state = 1; | ||
| 4130 | 4147 | ||
| 4131 | /* At first, skip all ASCII characters and control characters except | 4148 | /* At first, skip all ASCII characters and control characters except |
| 4132 | for three ISO2022 specific control characters. */ | 4149 | for three ISO2022 specific control characters. */ |
| @@ -4135,21 +4152,32 @@ detect_coding_mask (source, src_bytes, priorities, skip, multibytep) | |||
| 4135 | ascii_skip_code[ISO_CODE_ESC] = 0; | 4152 | ascii_skip_code[ISO_CODE_ESC] = 0; |
| 4136 | 4153 | ||
| 4137 | label_loop_detect_coding: | 4154 | label_loop_detect_coding: |
| 4138 | while (src < src_end && ascii_skip_code[*src]) src++; | 4155 | null_byte_found = 0; |
| 4156 | while (src < src_end && ascii_skip_code[*src]) | ||
| 4157 | null_byte_found |= (! *src++); | ||
| 4158 | if (! null_byte_found) | ||
| 4159 | { | ||
| 4160 | unsigned char *p = src + 1; | ||
| 4161 | while (p < src_end) | ||
| 4162 | null_byte_found |= (! *p++); | ||
| 4163 | } | ||
| 4139 | *skip = src - source; | 4164 | *skip = src - source; |
| 4140 | 4165 | ||
| 4141 | if (src >= src_end) | 4166 | if (src >= src_end) |
| 4142 | /* We found nothing other than ASCII. There's nothing to do. */ | 4167 | /* We found nothing other than ASCII (and NULL byte). There's |
| 4168 | nothing to do. */ | ||
| 4143 | return 0; | 4169 | return 0; |
| 4144 | 4170 | ||
| 4145 | c = *src; | 4171 | c = *src; |
| 4146 | /* The text seems to be encoded in some multilingual coding system. | 4172 | /* The text seems to be encoded in some multilingual coding system. |
| 4147 | Now, try to find in which coding system the text is encoded. */ | 4173 | Now, try to find in which coding system the text is encoded. */ |
| 4148 | if (c < 0x80) | 4174 | if (! null_byte_found && c < 0x80) |
| 4149 | { | 4175 | { |
| 4150 | /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */ | 4176 | /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */ |
| 4151 | /* C is an ISO2022 specific control code of C0. */ | 4177 | /* C is an ISO2022 specific control code of C0. */ |
| 4152 | mask = detect_coding_iso2022 (src, src_end, multibytep); | 4178 | latin_extra_code_state = 1; |
| 4179 | mask = detect_coding_iso2022 (src, src_end, multibytep, | ||
| 4180 | &latin_extra_code_state); | ||
| 4153 | if (mask == 0) | 4181 | if (mask == 0) |
| 4154 | { | 4182 | { |
| 4155 | /* No valid ISO2022 code follows C. Try again. */ | 4183 | /* No valid ISO2022 code follows C. Try again. */ |
| @@ -4177,21 +4205,27 @@ detect_coding_mask (source, src_bytes, priorities, skip, multibytep) | |||
| 4177 | if (multibytep && c == LEADING_CODE_8_BIT_CONTROL) | 4205 | if (multibytep && c == LEADING_CODE_8_BIT_CONTROL) |
| 4178 | c = src[1] - 0x20; | 4206 | c = src[1] - 0x20; |
| 4179 | 4207 | ||
| 4180 | if (c < 0xA0) | 4208 | if (null_byte_found) |
| 4209 | { | ||
| 4210 | try = (CODING_CATEGORY_MASK_UTF_16_BE | ||
| 4211 | | CODING_CATEGORY_MASK_UTF_16_LE); | ||
| 4212 | } | ||
| 4213 | else if (c < 0xA0) | ||
| 4181 | { | 4214 | { |
| 4182 | /* C is the first byte of SJIS character code, | 4215 | /* C is the first byte of SJIS character code, |
| 4183 | or a leading-code of Emacs' internal format (emacs-mule), | 4216 | or a leading-code of Emacs' internal format (emacs-mule), |
| 4184 | or the first byte of UTF-16. */ | 4217 | or the first byte of UTF-16. */ |
| 4185 | try = (CODING_CATEGORY_MASK_SJIS | 4218 | try = (CODING_CATEGORY_MASK_SJIS |
| 4186 | | CODING_CATEGORY_MASK_EMACS_MULE | 4219 | | CODING_CATEGORY_MASK_EMACS_MULE |
| 4187 | | CODING_CATEGORY_MASK_UTF_16_BE | 4220 | | CODING_CATEGORY_MASK_UTF_16_BE |
| 4188 | | CODING_CATEGORY_MASK_UTF_16_LE); | 4221 | | CODING_CATEGORY_MASK_UTF_16_LE); |
| 4189 | 4222 | ||
| 4190 | /* Or, if C is a special latin extra code, | 4223 | /* Or, if C is a special latin extra code, |
| 4191 | or is an ISO2022 specific control code of C1 (SS2 or SS3), | 4224 | or is an ISO2022 specific control code of C1 (SS2 or SS3), |
| 4192 | or is an ISO2022 control-sequence-introducer (CSI), | 4225 | or is an ISO2022 control-sequence-introducer (CSI), |
| 4193 | we should also consider the possibility of ISO2022 codings. */ | 4226 | we should also consider the possibility of ISO2022 codings. */ |
| 4194 | if ((VECTORP (Vlatin_extra_code_table) | 4227 | if ((latin_extra_code_state |
| 4228 | && VECTORP (Vlatin_extra_code_table) | ||
| 4195 | && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c])) | 4229 | && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c])) |
| 4196 | || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3) | 4230 | || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3) |
| 4197 | || (c == ISO_CODE_CSI | 4231 | || (c == ISO_CODE_CSI |
| @@ -4201,7 +4235,7 @@ detect_coding_mask (source, src_bytes, priorities, skip, multibytep) | |||
| 4201 | && src + 1 < src_end | 4235 | && src + 1 < src_end |
| 4202 | && src[1] == ']'))))) | 4236 | && src[1] == ']'))))) |
| 4203 | try |= (CODING_CATEGORY_MASK_ISO_8_ELSE | 4237 | try |= (CODING_CATEGORY_MASK_ISO_8_ELSE |
| 4204 | | CODING_CATEGORY_MASK_ISO_8BIT); | 4238 | | CODING_CATEGORY_MASK_ISO_8BIT); |
| 4205 | } | 4239 | } |
| 4206 | else | 4240 | else |
| 4207 | /* C is a character of ISO2022 in graphic plane right, | 4241 | /* C is a character of ISO2022 in graphic plane right, |
| @@ -4209,29 +4243,36 @@ detect_coding_mask (source, src_bytes, priorities, skip, multibytep) | |||
| 4209 | or the first byte of BIG5's 2-byte code, | 4243 | or the first byte of BIG5's 2-byte code, |
| 4210 | or the first byte of UTF-8/16. */ | 4244 | or the first byte of UTF-8/16. */ |
| 4211 | try = (CODING_CATEGORY_MASK_ISO_8_ELSE | 4245 | try = (CODING_CATEGORY_MASK_ISO_8_ELSE |
| 4212 | | CODING_CATEGORY_MASK_ISO_8BIT | 4246 | | CODING_CATEGORY_MASK_ISO_8BIT |
| 4213 | | CODING_CATEGORY_MASK_SJIS | 4247 | | CODING_CATEGORY_MASK_SJIS |
| 4214 | | CODING_CATEGORY_MASK_BIG5 | 4248 | | CODING_CATEGORY_MASK_BIG5 |
| 4215 | | CODING_CATEGORY_MASK_UTF_8 | 4249 | | CODING_CATEGORY_MASK_UTF_8 |
| 4216 | | CODING_CATEGORY_MASK_UTF_16_BE | 4250 | | CODING_CATEGORY_MASK_UTF_16_BE |
| 4217 | | CODING_CATEGORY_MASK_UTF_16_LE); | 4251 | | CODING_CATEGORY_MASK_UTF_16_LE); |
| 4218 | 4252 | ||
| 4219 | /* Or, we may have to consider the possibility of CCL. */ | 4253 | /* Or, we may have to consider the possibility of CCL. */ |
| 4220 | if (coding_system_table[CODING_CATEGORY_IDX_CCL] | 4254 | if (! null_byte_found |
| 4255 | && coding_system_table[CODING_CATEGORY_IDX_CCL] | ||
| 4221 | && (coding_system_table[CODING_CATEGORY_IDX_CCL] | 4256 | && (coding_system_table[CODING_CATEGORY_IDX_CCL] |
| 4222 | ->spec.ccl.valid_codes)[c]) | 4257 | ->spec.ccl.valid_codes)[c]) |
| 4223 | try |= CODING_CATEGORY_MASK_CCL; | 4258 | try |= CODING_CATEGORY_MASK_CCL; |
| 4224 | 4259 | ||
| 4225 | mask = 0; | 4260 | mask = 0; |
| 4226 | utf16_examined_p = iso2022_examined_p = 0; | ||
| 4227 | if (priorities) | 4261 | if (priorities) |
| 4228 | { | 4262 | { |
| 4263 | /* At first try detection with Latin extra codes not-allowed. | ||
| 4264 | If no proper coding system is found because of Latin extra | ||
| 4265 | codes, try detection with Latin extra codes allowed. */ | ||
| 4266 | latin_extra_code_state = 0; | ||
| 4267 | label_retry: | ||
| 4268 | utf16_examined_p = iso2022_examined_p = 0; | ||
| 4229 | for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++) | 4269 | for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++) |
| 4230 | { | 4270 | { |
| 4231 | if (!iso2022_examined_p | 4271 | if (!iso2022_examined_p |
| 4232 | && (priorities[i] & try & CODING_CATEGORY_MASK_ISO)) | 4272 | && (priorities[i] & try & CODING_CATEGORY_MASK_ISO)) |
| 4233 | { | 4273 | { |
| 4234 | mask |= detect_coding_iso2022 (src, src_end, multibytep); | 4274 | mask |= detect_coding_iso2022 (src, src_end, multibytep, |
| 4275 | &latin_extra_code_state); | ||
| 4235 | iso2022_examined_p = 1; | 4276 | iso2022_examined_p = 1; |
| 4236 | } | 4277 | } |
| 4237 | else if (priorities[i] & try & CODING_CATEGORY_MASK_SJIS) | 4278 | else if (priorities[i] & try & CODING_CATEGORY_MASK_SJIS) |
| @@ -4252,16 +4293,40 @@ detect_coding_mask (source, src_bytes, priorities, skip, multibytep) | |||
| 4252 | else if (priorities[i] & try & CODING_CATEGORY_MASK_CCL) | 4293 | else if (priorities[i] & try & CODING_CATEGORY_MASK_CCL) |
| 4253 | mask |= detect_coding_ccl (src, src_end, multibytep); | 4294 | mask |= detect_coding_ccl (src, src_end, multibytep); |
| 4254 | else if (priorities[i] & CODING_CATEGORY_MASK_RAW_TEXT) | 4295 | else if (priorities[i] & CODING_CATEGORY_MASK_RAW_TEXT) |
| 4255 | mask |= CODING_CATEGORY_MASK_RAW_TEXT; | 4296 | { |
| 4297 | if (latin_extra_code_state == 1) | ||
| 4298 | { | ||
| 4299 | /* Detection of ISO-2022 based coding system | ||
| 4300 | failed because of Latin extra codes. Before | ||
| 4301 | falling back to raw-text, try again with | ||
| 4302 | Latin extra codes allowed. */ | ||
| 4303 | latin_extra_code_state = 2; | ||
| 4304 | try = (mask | CODING_CATEGORY_MASK_ISO_8_ELSE | ||
| 4305 | | CODING_CATEGORY_MASK_ISO_8BIT); | ||
| 4306 | goto label_retry; | ||
| 4307 | } | ||
| 4308 | mask |= CODING_CATEGORY_MASK_RAW_TEXT; | ||
| 4309 | } | ||
| 4256 | else if (priorities[i] & CODING_CATEGORY_MASK_BINARY) | 4310 | else if (priorities[i] & CODING_CATEGORY_MASK_BINARY) |
| 4257 | mask |= CODING_CATEGORY_MASK_BINARY; | 4311 | { |
| 4312 | if (latin_extra_code_state == 1) | ||
| 4313 | { | ||
| 4314 | /* See the above comment. */ | ||
| 4315 | latin_extra_code_state = 2; | ||
| 4316 | try = (mask | CODING_CATEGORY_MASK_ISO_8_ELSE | ||
| 4317 | | CODING_CATEGORY_MASK_ISO_8BIT); | ||
| 4318 | goto label_retry; | ||
| 4319 | } | ||
| 4320 | mask |= CODING_CATEGORY_MASK_BINARY; | ||
| 4321 | } | ||
| 4258 | if (mask & priorities[i]) | 4322 | if (mask & priorities[i]) |
| 4259 | return priorities[i]; | 4323 | return priorities[i]; |
| 4260 | } | 4324 | } |
| 4261 | return CODING_CATEGORY_MASK_RAW_TEXT; | 4325 | return CODING_CATEGORY_MASK_RAW_TEXT; |
| 4262 | } | 4326 | } |
| 4263 | if (try & CODING_CATEGORY_MASK_ISO) | 4327 | if (try & CODING_CATEGORY_MASK_ISO) |
| 4264 | mask |= detect_coding_iso2022 (src, src_end, multibytep); | 4328 | mask |= detect_coding_iso2022 (src, src_end, multibytep, |
| 4329 | &latin_extra_code_state); | ||
| 4265 | if (try & CODING_CATEGORY_MASK_SJIS) | 4330 | if (try & CODING_CATEGORY_MASK_SJIS) |
| 4266 | mask |= detect_coding_sjis (src, src_end, multibytep); | 4331 | mask |= detect_coding_sjis (src, src_end, multibytep); |
| 4267 | if (try & CODING_CATEGORY_MASK_BIG5) | 4332 | if (try & CODING_CATEGORY_MASK_BIG5) |