diff options
Diffstat (limited to 'src/coding.c')
| -rw-r--r-- | src/coding.c | 117 |
1 files changed, 93 insertions, 24 deletions
diff --git a/src/coding.c b/src/coding.c
index 716eb98a87f..dd201ae61d6 100644
--- a/src/coding.c
+++ b/src/coding.c
| @@ -1410,12 +1410,17 @@ enum iso_code_class_type iso_code_class[256]; | |||
| 1410 | CODING_CATEGORY_MASK_ISO_7_ELSE | 1410 | CODING_CATEGORY_MASK_ISO_7_ELSE |
| 1411 | CODING_CATEGORY_MASK_ISO_8_ELSE | 1411 | CODING_CATEGORY_MASK_ISO_8_ELSE |
| 1412 | are set. If a code which should never appear in ISO2022 is found, | 1412 | are set. If a code which should never appear in ISO2022 is found, |
| 1413 | returns 0. */ | 1413 | returns 0. |
| 1414 | |||
| 1415 | If *latin_extra_code_state is zero and Latin extra codes are found, | ||
| 1416 | set *latin_extra_code_state to 1 and return 0. If it is nonzero, | ||
| 1417 | accept Latin extra codes. */ | ||
| 1414 | 1418 | ||
| 1415 | static int | 1419 | static int |
| 1416 | detect_coding_iso2022 (src, src_end, multibytep) | 1420 | detect_coding_iso2022 (src, src_end, multibytep, latin_extra_code_state) |
| 1417 | unsigned char *src, *src_end; | 1421 | unsigned char *src, *src_end; |
| 1418 | int multibytep; | 1422 | int multibytep; |
| 1423 | int *latin_extra_code_state; | ||
| 1419 | { | 1424 | { |
| 1420 | int mask = CODING_CATEGORY_MASK_ISO; | 1425 | int mask = CODING_CATEGORY_MASK_ISO; |
| 1421 | int mask_found = 0; | 1426 | int mask_found = 0; |
| @@ -1578,6 +1583,11 @@ detect_coding_iso2022 (src, src_end, multibytep) | |||
| 1578 | if (VECTORP (Vlatin_extra_code_table) | 1583 | if (VECTORP (Vlatin_extra_code_table) |
| 1579 | && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c])) | 1584 | && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c])) |
| 1580 | { | 1585 | { |
| 1586 | if (! *latin_extra_code_state) | ||
| 1587 | { | ||
| 1588 | *latin_extra_code_state = 1; | ||
| 1589 | return 0; | ||
| 1590 | } | ||
| 1581 | if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags | 1591 | if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags |
| 1582 | & CODING_FLAG_ISO_LATIN_EXTRA) | 1592 | & CODING_FLAG_ISO_LATIN_EXTRA) |
| 1583 | newmask |= CODING_CATEGORY_MASK_ISO_8_1; | 1593 | newmask |= CODING_CATEGORY_MASK_ISO_8_1; |
| @@ -1604,6 +1614,11 @@ detect_coding_iso2022 (src, src_end, multibytep) | |||
| 1604 | { | 1614 | { |
| 1605 | int newmask = 0; | 1615 | int newmask = 0; |
| 1606 | 1616 | ||
| 1617 | if (! *latin_extra_code_state) | ||
| 1618 | { | ||
| 1619 | *latin_extra_code_state = 1; | ||
| 1620 | return 0; | ||
| 1621 | } | ||
| 1607 | if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags | 1622 | if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags |
| 1608 | & CODING_FLAG_ISO_LATIN_EXTRA) | 1623 | & CODING_FLAG_ISO_LATIN_EXTRA) |
| 1609 | newmask |= CODING_CATEGORY_MASK_ISO_8_1; | 1624 | newmask |= CODING_CATEGORY_MASK_ISO_8_1; |
| @@ -4131,6 +4146,8 @@ detect_coding_mask (source, src_bytes, priorities, skip, multibytep) | |||
| 4131 | unsigned char *src = source, *src_end = source + src_bytes; | 4146 | unsigned char *src = source, *src_end = source + src_bytes; |
| 4132 | unsigned int mask, utf16_examined_p, iso2022_examined_p; | 4147 | unsigned int mask, utf16_examined_p, iso2022_examined_p; |
| 4133 | int i; | 4148 | int i; |
| 4149 | int null_byte_found; | ||
| 4150 | int latin_extra_code_state = 1; | ||
| 4134 | 4151 | ||
| 4135 | /* At first, skip all ASCII characters and control characters except | 4152 | /* At first, skip all ASCII characters and control characters except |
| 4136 | for three ISO2022 specific control characters. */ | 4153 | for three ISO2022 specific control characters. */ |
| @@ -4139,21 +4156,36 @@ detect_coding_mask (source, src_bytes, priorities, skip, multibytep) | |||
| 4139 | ascii_skip_code[ISO_CODE_ESC] = 0; | 4156 | ascii_skip_code[ISO_CODE_ESC] = 0; |
| 4140 | 4157 | ||
| 4141 | label_loop_detect_coding: | 4158 | label_loop_detect_coding: |
| 4142 | while (src < src_end && ascii_skip_code[*src]) src++; | 4159 | null_byte_found = 0; |
| 4160 | /* We stop this loop before the last byte because it may be a NULL | ||
| 4161 | anchor byte. */ | ||
| 4162 | while (src < src_end - 1 && ascii_skip_code[*src]) | ||
| 4163 | null_byte_found |= (! *src++); | ||
| 4164 | if (ascii_skip_code[*src]) | ||
| 4165 | src++; | ||
| 4166 | else if (! null_byte_found) | ||
| 4167 | { | ||
| 4168 | unsigned char *p = src + 1; | ||
| 4169 | while (p < src_end - 1) | ||
| 4170 | null_byte_found |= (! *p++); | ||
| 4171 | } | ||
| 4143 | *skip = src - source; | 4172 | *skip = src - source; |
| 4144 | 4173 | ||
| 4145 | if (src >= src_end) | 4174 | if (src >= src_end) |
| 4146 | /* We found nothing other than ASCII. There's nothing to do. */ | 4175 | /* We found nothing other than ASCII (and NULL byte). There's |
| 4176 | nothing to do. */ | ||
| 4147 | return 0; | 4177 | return 0; |
| 4148 | 4178 | ||
| 4149 | c = *src; | 4179 | c = *src; |
| 4150 | /* The text seems to be encoded in some multilingual coding system. | 4180 | /* The text seems to be encoded in some multilingual coding system. |
| 4151 | Now, try to find in which coding system the text is encoded. */ | 4181 | Now, try to find in which coding system the text is encoded. */ |
| 4152 | if (c < 0x80) | 4182 | if (! null_byte_found && c < 0x80) |
| 4153 | { | 4183 | { |
| 4154 | /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */ | 4184 | /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */ |
| 4155 | /* C is an ISO2022 specific control code of C0. */ | 4185 | /* C is an ISO2022 specific control code of C0. */ |
| 4156 | mask = detect_coding_iso2022 (src, src_end, multibytep); | 4186 | latin_extra_code_state = 1; |
| 4187 | mask = detect_coding_iso2022 (src, src_end, multibytep, | ||
| 4188 | &latin_extra_code_state); | ||
| 4157 | if (mask == 0) | 4189 | if (mask == 0) |
| 4158 | { | 4190 | { |
| 4159 | /* No valid ISO2022 code follows C. Try again. */ | 4191 | /* No valid ISO2022 code follows C. Try again. */ |
| @@ -4181,21 +4213,27 @@ detect_coding_mask (source, src_bytes, priorities, skip, multibytep) | |||
| 4181 | if (multibytep && c == LEADING_CODE_8_BIT_CONTROL) | 4213 | if (multibytep && c == LEADING_CODE_8_BIT_CONTROL) |
| 4182 | c = src[1] - 0x20; | 4214 | c = src[1] - 0x20; |
| 4183 | 4215 | ||
| 4184 | if (c < 0xA0) | 4216 | if (null_byte_found) |
| 4217 | { | ||
| 4218 | try = (CODING_CATEGORY_MASK_UTF_16_BE | ||
| 4219 | | CODING_CATEGORY_MASK_UTF_16_LE); | ||
| 4220 | } | ||
| 4221 | else if (c < 0xA0) | ||
| 4185 | { | 4222 | { |
| 4186 | /* C is the first byte of SJIS character code, | 4223 | /* C is the first byte of SJIS character code, |
| 4187 | or a leading-code of Emacs' internal format (emacs-mule), | 4224 | or a leading-code of Emacs' internal format (emacs-mule), |
| 4188 | or the first byte of UTF-16. */ | 4225 | or the first byte of UTF-16. */ |
| 4189 | try = (CODING_CATEGORY_MASK_SJIS | 4226 | try = (CODING_CATEGORY_MASK_SJIS |
| 4190 | | CODING_CATEGORY_MASK_EMACS_MULE | 4227 | | CODING_CATEGORY_MASK_EMACS_MULE |
| 4191 | | CODING_CATEGORY_MASK_UTF_16_BE | 4228 | | CODING_CATEGORY_MASK_UTF_16_BE |
| 4192 | | CODING_CATEGORY_MASK_UTF_16_LE); | 4229 | | CODING_CATEGORY_MASK_UTF_16_LE); |
| 4193 | 4230 | ||
| 4194 | /* Or, if C is a special latin extra code, | 4231 | /* Or, if C is a special latin extra code, |
| 4195 | or is an ISO2022 specific control code of C1 (SS2 or SS3), | 4232 | or is an ISO2022 specific control code of C1 (SS2 or SS3), |
| 4196 | or is an ISO2022 control-sequence-introducer (CSI), | 4233 | or is an ISO2022 control-sequence-introducer (CSI), |
| 4197 | we should also consider the possibility of ISO2022 codings. */ | 4234 | we should also consider the possibility of ISO2022 codings. */ |
| 4198 | if ((VECTORP (Vlatin_extra_code_table) | 4235 | if ((latin_extra_code_state |
| 4236 | && VECTORP (Vlatin_extra_code_table) | ||
| 4199 | && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c])) | 4237 | && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c])) |
| 4200 | || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3) | 4238 | || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3) |
| 4201 | || (c == ISO_CODE_CSI | 4239 | || (c == ISO_CODE_CSI |
| @@ -4205,7 +4243,7 @@ detect_coding_mask (source, src_bytes, priorities, skip, multibytep) | |||
| 4205 | && src + 1 < src_end | 4243 | && src + 1 < src_end |
| 4206 | && src[1] == ']'))))) | 4244 | && src[1] == ']'))))) |
| 4207 | try |= (CODING_CATEGORY_MASK_ISO_8_ELSE | 4245 | try |= (CODING_CATEGORY_MASK_ISO_8_ELSE |
| 4208 | | CODING_CATEGORY_MASK_ISO_8BIT); | 4246 | | CODING_CATEGORY_MASK_ISO_8BIT); |
| 4209 | } | 4247 | } |
| 4210 | else | 4248 | else |
| 4211 | /* C is a character of ISO2022 in graphic plane right, | 4249 | /* C is a character of ISO2022 in graphic plane right, |
| @@ -4213,29 +4251,36 @@ detect_coding_mask (source, src_bytes, priorities, skip, multibytep) | |||
| 4213 | or the first byte of BIG5's 2-byte code, | 4251 | or the first byte of BIG5's 2-byte code, |
| 4214 | or the first byte of UTF-8/16. */ | 4252 | or the first byte of UTF-8/16. */ |
| 4215 | try = (CODING_CATEGORY_MASK_ISO_8_ELSE | 4253 | try = (CODING_CATEGORY_MASK_ISO_8_ELSE |
| 4216 | | CODING_CATEGORY_MASK_ISO_8BIT | 4254 | | CODING_CATEGORY_MASK_ISO_8BIT |
| 4217 | | CODING_CATEGORY_MASK_SJIS | 4255 | | CODING_CATEGORY_MASK_SJIS |
| 4218 | | CODING_CATEGORY_MASK_BIG5 | 4256 | | CODING_CATEGORY_MASK_BIG5 |
| 4219 | | CODING_CATEGORY_MASK_UTF_8 | 4257 | | CODING_CATEGORY_MASK_UTF_8 |
| 4220 | | CODING_CATEGORY_MASK_UTF_16_BE | 4258 | | CODING_CATEGORY_MASK_UTF_16_BE |
| 4221 | | CODING_CATEGORY_MASK_UTF_16_LE); | 4259 | | CODING_CATEGORY_MASK_UTF_16_LE); |
| 4222 | 4260 | ||
| 4223 | /* Or, we may have to consider the possibility of CCL. */ | 4261 | /* Or, we may have to consider the possibility of CCL. */ |
| 4224 | if (coding_system_table[CODING_CATEGORY_IDX_CCL] | 4262 | if (! null_byte_found |
| 4263 | && coding_system_table[CODING_CATEGORY_IDX_CCL] | ||
| 4225 | && (coding_system_table[CODING_CATEGORY_IDX_CCL] | 4264 | && (coding_system_table[CODING_CATEGORY_IDX_CCL] |
| 4226 | ->spec.ccl.valid_codes)[c]) | 4265 | ->spec.ccl.valid_codes)[c]) |
| 4227 | try |= CODING_CATEGORY_MASK_CCL; | 4266 | try |= CODING_CATEGORY_MASK_CCL; |
| 4228 | 4267 | ||
| 4229 | mask = 0; | 4268 | mask = 0; |
| 4230 | utf16_examined_p = iso2022_examined_p = 0; | ||
| 4231 | if (priorities) | 4269 | if (priorities) |
| 4232 | { | 4270 | { |
| 4271 | /* At first try detection with Latin extra codes not-allowed. | ||
| 4272 | If no proper coding system is found because of Latin extra | ||
| 4273 | codes, try detection with Latin extra codes allowed. */ | ||
| 4274 | latin_extra_code_state = 0; | ||
| 4275 | label_retry: | ||
| 4276 | utf16_examined_p = iso2022_examined_p = 0; | ||
| 4233 | for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++) | 4277 | for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++) |
| 4234 | { | 4278 | { |
| 4235 | if (!iso2022_examined_p | 4279 | if (!iso2022_examined_p |
| 4236 | && (priorities[i] & try & CODING_CATEGORY_MASK_ISO)) | 4280 | && (priorities[i] & try & CODING_CATEGORY_MASK_ISO)) |
| 4237 | { | 4281 | { |
| 4238 | mask |= detect_coding_iso2022 (src, src_end, multibytep); | 4282 | mask |= detect_coding_iso2022 (src, src_end, multibytep, |
| 4283 | &latin_extra_code_state); | ||
| 4239 | iso2022_examined_p = 1; | 4284 | iso2022_examined_p = 1; |
| 4240 | } | 4285 | } |
| 4241 | else if (priorities[i] & try & CODING_CATEGORY_MASK_SJIS) | 4286 | else if (priorities[i] & try & CODING_CATEGORY_MASK_SJIS) |
| @@ -4256,16 +4301,40 @@ detect_coding_mask (source, src_bytes, priorities, skip, multibytep) | |||
| 4256 | else if (priorities[i] & try & CODING_CATEGORY_MASK_CCL) | 4301 | else if (priorities[i] & try & CODING_CATEGORY_MASK_CCL) |
| 4257 | mask |= detect_coding_ccl (src, src_end, multibytep); | 4302 | mask |= detect_coding_ccl (src, src_end, multibytep); |
| 4258 | else if (priorities[i] & CODING_CATEGORY_MASK_RAW_TEXT) | 4303 | else if (priorities[i] & CODING_CATEGORY_MASK_RAW_TEXT) |
| 4259 | mask |= CODING_CATEGORY_MASK_RAW_TEXT; | 4304 | { |
| 4305 | if (latin_extra_code_state == 1) | ||
| 4306 | { | ||
| 4307 | /* Detection of ISO-2022 based coding system | ||
| 4308 | failed because of Latin extra codes. Before | ||
| 4309 | falling back to raw-text, try again with | ||
| 4310 | Latin extra codes allowed. */ | ||
| 4311 | latin_extra_code_state = 2; | ||
| 4312 | try = (mask | CODING_CATEGORY_MASK_ISO_8_ELSE | ||
| 4313 | | CODING_CATEGORY_MASK_ISO_8BIT); | ||
| 4314 | goto label_retry; | ||
| 4315 | } | ||
| 4316 | mask |= CODING_CATEGORY_MASK_RAW_TEXT; | ||
| 4317 | } | ||
| 4260 | else if (priorities[i] & CODING_CATEGORY_MASK_BINARY) | 4318 | else if (priorities[i] & CODING_CATEGORY_MASK_BINARY) |
| 4261 | mask |= CODING_CATEGORY_MASK_BINARY; | 4319 | { |
| 4320 | if (latin_extra_code_state == 1) | ||
| 4321 | { | ||
| 4322 | /* See the above comment. */ | ||
| 4323 | latin_extra_code_state = 2; | ||
| 4324 | try = (mask | CODING_CATEGORY_MASK_ISO_8_ELSE | ||
| 4325 | | CODING_CATEGORY_MASK_ISO_8BIT); | ||
| 4326 | goto label_retry; | ||
| 4327 | } | ||
| 4328 | mask |= CODING_CATEGORY_MASK_BINARY; | ||
| 4329 | } | ||
| 4262 | if (mask & priorities[i]) | 4330 | if (mask & priorities[i]) |
| 4263 | return priorities[i]; | 4331 | return priorities[i]; |
| 4264 | } | 4332 | } |
| 4265 | return CODING_CATEGORY_MASK_RAW_TEXT; | 4333 | return CODING_CATEGORY_MASK_RAW_TEXT; |
| 4266 | } | 4334 | } |
| 4267 | if (try & CODING_CATEGORY_MASK_ISO) | 4335 | if (try & CODING_CATEGORY_MASK_ISO) |
| 4268 | mask |= detect_coding_iso2022 (src, src_end, multibytep); | 4336 | mask |= detect_coding_iso2022 (src, src_end, multibytep, |
| 4337 | &latin_extra_code_state); | ||
| 4269 | if (try & CODING_CATEGORY_MASK_SJIS) | 4338 | if (try & CODING_CATEGORY_MASK_SJIS) |
| 4270 | mask |= detect_coding_sjis (src, src_end, multibytep); | 4339 | mask |= detect_coding_sjis (src, src_end, multibytep); |
| 4271 | if (try & CODING_CATEGORY_MASK_BIG5) | 4340 | if (try & CODING_CATEGORY_MASK_BIG5) |