about summary refs log tree commit diff stats
path: root/src/coding.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/coding.c')
-rw-r--r-- src/coding.c 117
1 files changed, 93 insertions, 24 deletions
diff --git a/src/coding.c b/src/coding.c
index 716eb98a87f..dd201ae61d6 100644
--- a/src/coding.c
+++ b/src/coding.c
@@ -1410,12 +1410,17 @@ enum iso_code_class_type iso_code_class[256];
1410 CODING_CATEGORY_MASK_ISO_7_ELSE 1410 CODING_CATEGORY_MASK_ISO_7_ELSE
1411 CODING_CATEGORY_MASK_ISO_8_ELSE 1411 CODING_CATEGORY_MASK_ISO_8_ELSE
1412 are set. If a code which should never appear in ISO2022 is found, 1412 are set. If a code which should never appear in ISO2022 is found,
1413 returns 0. */ 1413 returns 0.
1414
1415 If *latin_extra_code_state is zero and Latin extra codes are found,
1416 set *latin_extra_code_state to 1 and return 0. If it is nonzero,
1417 accept Latin extra codes. */
1414 1418
1415static int 1419static int
1416detect_coding_iso2022 (src, src_end, multibytep) 1420detect_coding_iso2022 (src, src_end, multibytep, latin_extra_code_state)
1417 unsigned char *src, *src_end; 1421 unsigned char *src, *src_end;
1418 int multibytep; 1422 int multibytep;
1423 int *latin_extra_code_state;
1419{ 1424{
1420 int mask = CODING_CATEGORY_MASK_ISO; 1425 int mask = CODING_CATEGORY_MASK_ISO;
1421 int mask_found = 0; 1426 int mask_found = 0;
@@ -1578,6 +1583,11 @@ detect_coding_iso2022 (src, src_end, multibytep)
1578 if (VECTORP (Vlatin_extra_code_table) 1583 if (VECTORP (Vlatin_extra_code_table)
1579 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c])) 1584 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
1580 { 1585 {
1586 if (! *latin_extra_code_state)
1587 {
1588 *latin_extra_code_state = 1;
1589 return 0;
1590 }
1581 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags 1591 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1582 & CODING_FLAG_ISO_LATIN_EXTRA) 1592 & CODING_FLAG_ISO_LATIN_EXTRA)
1583 newmask |= CODING_CATEGORY_MASK_ISO_8_1; 1593 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
@@ -1604,6 +1614,11 @@ detect_coding_iso2022 (src, src_end, multibytep)
1604 { 1614 {
1605 int newmask = 0; 1615 int newmask = 0;
1606 1616
1617 if (! *latin_extra_code_state)
1618 {
1619 *latin_extra_code_state = 1;
1620 return 0;
1621 }
1607 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags 1622 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1608 & CODING_FLAG_ISO_LATIN_EXTRA) 1623 & CODING_FLAG_ISO_LATIN_EXTRA)
1609 newmask |= CODING_CATEGORY_MASK_ISO_8_1; 1624 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
@@ -4131,6 +4146,8 @@ detect_coding_mask (source, src_bytes, priorities, skip, multibytep)
4131 unsigned char *src = source, *src_end = source + src_bytes; 4146 unsigned char *src = source, *src_end = source + src_bytes;
4132 unsigned int mask, utf16_examined_p, iso2022_examined_p; 4147 unsigned int mask, utf16_examined_p, iso2022_examined_p;
4133 int i; 4148 int i;
4149 int null_byte_found;
4150 int latin_extra_code_state = 1;
4134 4151
4135 /* At first, skip all ASCII characters and control characters except 4152 /* At first, skip all ASCII characters and control characters except
4136 for three ISO2022 specific control characters. */ 4153 for three ISO2022 specific control characters. */
@@ -4139,21 +4156,36 @@ detect_coding_mask (source, src_bytes, priorities, skip, multibytep)
4139 ascii_skip_code[ISO_CODE_ESC] = 0; 4156 ascii_skip_code[ISO_CODE_ESC] = 0;
4140 4157
4141 label_loop_detect_coding: 4158 label_loop_detect_coding:
4142 while (src < src_end && ascii_skip_code[*src]) src++; 4159 null_byte_found = 0;
4160 /* We stop this loop before the last byte because it may be a NULL
4161 anchor byte. */
4162 while (src < src_end - 1 && ascii_skip_code[*src])
4163 null_byte_found |= (! *src++);
4164 if (ascii_skip_code[*src])
4165 src++;
4166 else if (! null_byte_found)
4167 {
4168 unsigned char *p = src + 1;
4169 while (p < src_end - 1)
4170 null_byte_found |= (! *p++);
4171 }
4143 *skip = src - source; 4172 *skip = src - source;
4144 4173
4145 if (src >= src_end) 4174 if (src >= src_end)
4146 /* We found nothing other than ASCII. There's nothing to do. */ 4175 /* We found nothing other than ASCII (and NULL byte). There's
4176 nothing to do. */
4147 return 0; 4177 return 0;
4148 4178
4149 c = *src; 4179 c = *src;
4150 /* The text seems to be encoded in some multilingual coding system. 4180 /* The text seems to be encoded in some multilingual coding system.
4151 Now, try to find in which coding system the text is encoded. */ 4181 Now, try to find in which coding system the text is encoded. */
4152 if (c < 0x80) 4182 if (! null_byte_found && c < 0x80)
4153 { 4183 {
4154 /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */ 4184 /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
4155 /* C is an ISO2022 specific control code of C0. */ 4185 /* C is an ISO2022 specific control code of C0. */
4156 mask = detect_coding_iso2022 (src, src_end, multibytep); 4186 latin_extra_code_state = 1;
4187 mask = detect_coding_iso2022 (src, src_end, multibytep,
4188 &latin_extra_code_state);
4157 if (mask == 0) 4189 if (mask == 0)
4158 { 4190 {
4159 /* No valid ISO2022 code follows C. Try again. */ 4191 /* No valid ISO2022 code follows C. Try again. */
@@ -4181,21 +4213,27 @@ detect_coding_mask (source, src_bytes, priorities, skip, multibytep)
4181 if (multibytep && c == LEADING_CODE_8_BIT_CONTROL) 4213 if (multibytep && c == LEADING_CODE_8_BIT_CONTROL)
4182 c = src[1] - 0x20; 4214 c = src[1] - 0x20;
4183 4215
4184 if (c < 0xA0) 4216 if (null_byte_found)
4217 {
4218 try = (CODING_CATEGORY_MASK_UTF_16_BE
4219 | CODING_CATEGORY_MASK_UTF_16_LE);
4220 }
4221 else if (c < 0xA0)
4185 { 4222 {
4186 /* C is the first byte of SJIS character code, 4223 /* C is the first byte of SJIS character code,
4187 or a leading-code of Emacs' internal format (emacs-mule), 4224 or a leading-code of Emacs' internal format (emacs-mule),
4188 or the first byte of UTF-16. */ 4225 or the first byte of UTF-16. */
4189 try = (CODING_CATEGORY_MASK_SJIS 4226 try = (CODING_CATEGORY_MASK_SJIS
4190 | CODING_CATEGORY_MASK_EMACS_MULE 4227 | CODING_CATEGORY_MASK_EMACS_MULE
4191 | CODING_CATEGORY_MASK_UTF_16_BE 4228 | CODING_CATEGORY_MASK_UTF_16_BE
4192 | CODING_CATEGORY_MASK_UTF_16_LE); 4229 | CODING_CATEGORY_MASK_UTF_16_LE);
4193 4230
4194 /* Or, if C is a special latin extra code, 4231 /* Or, if C is a special latin extra code,
4195 or is an ISO2022 specific control code of C1 (SS2 or SS3), 4232 or is an ISO2022 specific control code of C1 (SS2 or SS3),
4196 or is an ISO2022 control-sequence-introducer (CSI), 4233 or is an ISO2022 control-sequence-introducer (CSI),
4197 we should also consider the possibility of ISO2022 codings. */ 4234 we should also consider the possibility of ISO2022 codings. */
4198 if ((VECTORP (Vlatin_extra_code_table) 4235 if ((latin_extra_code_state
4236 && VECTORP (Vlatin_extra_code_table)
4199 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c])) 4237 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
4200 || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3) 4238 || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
4201 || (c == ISO_CODE_CSI 4239 || (c == ISO_CODE_CSI
@@ -4205,7 +4243,7 @@ detect_coding_mask (source, src_bytes, priorities, skip, multibytep)
4205 && src + 1 < src_end 4243 && src + 1 < src_end
4206 && src[1] == ']'))))) 4244 && src[1] == ']')))))
4207 try |= (CODING_CATEGORY_MASK_ISO_8_ELSE 4245 try |= (CODING_CATEGORY_MASK_ISO_8_ELSE
4208 | CODING_CATEGORY_MASK_ISO_8BIT); 4246 | CODING_CATEGORY_MASK_ISO_8BIT);
4209 } 4247 }
4210 else 4248 else
4211 /* C is a character of ISO2022 in graphic plane right, 4249 /* C is a character of ISO2022 in graphic plane right,
@@ -4213,29 +4251,36 @@ detect_coding_mask (source, src_bytes, priorities, skip, multibytep)
4213 or the first byte of BIG5's 2-byte code, 4251 or the first byte of BIG5's 2-byte code,
4214 or the first byte of UTF-8/16. */ 4252 or the first byte of UTF-8/16. */
4215 try = (CODING_CATEGORY_MASK_ISO_8_ELSE 4253 try = (CODING_CATEGORY_MASK_ISO_8_ELSE
4216 | CODING_CATEGORY_MASK_ISO_8BIT 4254 | CODING_CATEGORY_MASK_ISO_8BIT
4217 | CODING_CATEGORY_MASK_SJIS 4255 | CODING_CATEGORY_MASK_SJIS
4218 | CODING_CATEGORY_MASK_BIG5 4256 | CODING_CATEGORY_MASK_BIG5
4219 | CODING_CATEGORY_MASK_UTF_8 4257 | CODING_CATEGORY_MASK_UTF_8
4220 | CODING_CATEGORY_MASK_UTF_16_BE 4258 | CODING_CATEGORY_MASK_UTF_16_BE
4221 | CODING_CATEGORY_MASK_UTF_16_LE); 4259 | CODING_CATEGORY_MASK_UTF_16_LE);
4222 4260
4223 /* Or, we may have to consider the possibility of CCL. */ 4261 /* Or, we may have to consider the possibility of CCL. */
4224 if (coding_system_table[CODING_CATEGORY_IDX_CCL] 4262 if (! null_byte_found
4263 && coding_system_table[CODING_CATEGORY_IDX_CCL]
4225 && (coding_system_table[CODING_CATEGORY_IDX_CCL] 4264 && (coding_system_table[CODING_CATEGORY_IDX_CCL]
4226 ->spec.ccl.valid_codes)[c]) 4265 ->spec.ccl.valid_codes)[c])
4227 try |= CODING_CATEGORY_MASK_CCL; 4266 try |= CODING_CATEGORY_MASK_CCL;
4228 4267
4229 mask = 0; 4268 mask = 0;
4230 utf16_examined_p = iso2022_examined_p = 0;
4231 if (priorities) 4269 if (priorities)
4232 { 4270 {
4271 /* At first try detection with Latin extra codes not-allowed.
4272 If no proper coding system is found because of Latin extra
4273 codes, try detection with Latin extra codes allowed. */
4274 latin_extra_code_state = 0;
4275 label_retry:
4276 utf16_examined_p = iso2022_examined_p = 0;
4233 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++) 4277 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
4234 { 4278 {
4235 if (!iso2022_examined_p 4279 if (!iso2022_examined_p
4236 && (priorities[i] & try & CODING_CATEGORY_MASK_ISO)) 4280 && (priorities[i] & try & CODING_CATEGORY_MASK_ISO))
4237 { 4281 {
4238 mask |= detect_coding_iso2022 (src, src_end, multibytep); 4282 mask |= detect_coding_iso2022 (src, src_end, multibytep,
4283 &latin_extra_code_state);
4239 iso2022_examined_p = 1; 4284 iso2022_examined_p = 1;
4240 } 4285 }
4241 else if (priorities[i] & try & CODING_CATEGORY_MASK_SJIS) 4286 else if (priorities[i] & try & CODING_CATEGORY_MASK_SJIS)
@@ -4256,16 +4301,40 @@ detect_coding_mask (source, src_bytes, priorities, skip, multibytep)
4256 else if (priorities[i] & try & CODING_CATEGORY_MASK_CCL) 4301 else if (priorities[i] & try & CODING_CATEGORY_MASK_CCL)
4257 mask |= detect_coding_ccl (src, src_end, multibytep); 4302 mask |= detect_coding_ccl (src, src_end, multibytep);
4258 else if (priorities[i] & CODING_CATEGORY_MASK_RAW_TEXT) 4303 else if (priorities[i] & CODING_CATEGORY_MASK_RAW_TEXT)
4259 mask |= CODING_CATEGORY_MASK_RAW_TEXT; 4304 {
4305 if (latin_extra_code_state == 1)
4306 {
4307 /* Detection of ISO-2022 based coding system
4308 failed because of Latin extra codes. Before
4309 falling back to raw-text, try again with
4310 Latin extra codes allowed. */
4311 latin_extra_code_state = 2;
4312 try = (mask | CODING_CATEGORY_MASK_ISO_8_ELSE
4313 | CODING_CATEGORY_MASK_ISO_8BIT);
4314 goto label_retry;
4315 }
4316 mask |= CODING_CATEGORY_MASK_RAW_TEXT;
4317 }
4260 else if (priorities[i] & CODING_CATEGORY_MASK_BINARY) 4318 else if (priorities[i] & CODING_CATEGORY_MASK_BINARY)
4261 mask |= CODING_CATEGORY_MASK_BINARY; 4319 {
4320 if (latin_extra_code_state == 1)
4321 {
4322 /* See the above comment. */
4323 latin_extra_code_state = 2;
4324 try = (mask | CODING_CATEGORY_MASK_ISO_8_ELSE
4325 | CODING_CATEGORY_MASK_ISO_8BIT);
4326 goto label_retry;
4327 }
4328 mask |= CODING_CATEGORY_MASK_BINARY;
4329 }
4262 if (mask & priorities[i]) 4330 if (mask & priorities[i])
4263 return priorities[i]; 4331 return priorities[i];
4264 } 4332 }
4265 return CODING_CATEGORY_MASK_RAW_TEXT; 4333 return CODING_CATEGORY_MASK_RAW_TEXT;
4266 } 4334 }
4267 if (try & CODING_CATEGORY_MASK_ISO) 4335 if (try & CODING_CATEGORY_MASK_ISO)
4268 mask |= detect_coding_iso2022 (src, src_end, multibytep); 4336 mask |= detect_coding_iso2022 (src, src_end, multibytep,
4337 &latin_extra_code_state);
4269 if (try & CODING_CATEGORY_MASK_SJIS) 4338 if (try & CODING_CATEGORY_MASK_SJIS)
4270 mask |= detect_coding_sjis (src, src_end, multibytep); 4339 mask |= detect_coding_sjis (src, src_end, multibytep);
4271 if (try & CODING_CATEGORY_MASK_BIG5) 4340 if (try & CODING_CATEGORY_MASK_BIG5)