diff options
Diffstat (limited to 'src')
| -rw-r--r-- | src/coding.c | 198 |
1 file changed, 116 insertions, 82 deletions
diff --git a/src/coding.c b/src/coding.c index 5c3299b6b56..68855c05609 100644 --- a/src/coding.c +++ b/src/coding.c | |||
| @@ -958,52 +958,49 @@ detect_coding_iso2022 (src, src_end) | |||
| 958 | } \ | 958 | } \ |
| 959 | } while (0) | 959 | } while (0) |
| 960 | 960 | ||
| 961 | /* Check if the current composing sequence contains only valid codes. | 961 | /* Return 0 if there's a valid composing sequence starting at SRC and |
| 962 | If the composing sequence doesn't end before SRC_END, return -1. | 962 | ending before SRC_END, else return -1. */ |
| 963 | Else, if it contains only valid codes, return 0. | ||
| 964 | Else return the length of the composing sequence. */ | ||
| 965 | 963 | ||
| 966 | int | 964 | int |
| 967 | check_composing_code (coding, src, src_end) | 965 | check_composing_code (coding, src, src_end) |
| 968 | struct coding_system *coding; | 966 | struct coding_system *coding; |
| 969 | unsigned char *src, *src_end; | 967 | unsigned char *src, *src_end; |
| 970 | { | 968 | { |
| 971 | unsigned char *src_start = src; | ||
| 972 | int invalid_code_found = 0; | ||
| 973 | int charset, c, c1, dim; | 969 | int charset, c, c1, dim; |
| 974 | 970 | ||
| 975 | while (src < src_end) | 971 | while (src < src_end) |
| 976 | { | 972 | { |
| 977 | if (*src++ != ISO_CODE_ESC) continue; | 973 | c = *src++; |
| 978 | if (src >= src_end) break; | 974 | if (c >= 0x20) |
| 979 | if ((c = *src++) == '1') /* end of composition */ | 975 | continue; |
| 980 | return (invalid_code_found ? src - src_start : 0); | 976 | if (c != ISO_CODE_ESC || src >= src_end) |
| 981 | if (src + 2 >= src_end) break; | 977 | return -1; |
| 982 | if (!coding->flags & CODING_FLAG_ISO_DESIGNATION) | 978 | c = *src++; |
| 983 | invalid_code_found = 1; | 979 | if (c == '1') /* end of composition */ |
| 984 | else | 980 | return 0; |
| 981 | if (src + 2 >= src_end | ||
| 982 | || !coding->flags & CODING_FLAG_ISO_DESIGNATION) | ||
| 983 | return -1; | ||
| 984 | |||
| 985 | dim = (c == '$'); | ||
| 986 | if (dim == 1) | ||
| 987 | c = (*src >= '@' && *src <= 'B') ? '(' : *src++; | ||
| 988 | if (c >= '(' && c <= '/') | ||
| 985 | { | 989 | { |
| 986 | dim = 0; | 990 | c1 = *src++; |
| 987 | if (c == '$') | 991 | if ((c1 < ' ' || c1 >= 0x80) |
| 988 | { | 992 | || (charset = iso_charset_table[dim][c >= ','][c1]) < 0 |
| 989 | dim = 1; | 993 | || ! coding->safe_charsets[charset] |
| 990 | c = (*src >= '@' && *src <= 'B') ? '(' : *src++; | 994 | || (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) |
| 991 | } | 995 | == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)) |
| 992 | if (c >= '(' && c <= '/') | 996 | return -1; |
| 993 | { | ||
| 994 | c1 = *src++; | ||
| 995 | if ((c1 < ' ' || c1 >= 0x80) | ||
| 996 | || (charset = iso_charset_table[dim][c >= ','][c1]) < 0 | ||
| 997 | || ! coding->safe_charsets[charset] | ||
| 998 | || (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) | ||
| 999 | == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)) | ||
| 1000 | invalid_code_found = 1; | ||
| 1001 | } | ||
| 1002 | else | ||
| 1003 | invalid_code_found = 1; | ||
| 1004 | } | 997 | } |
| 998 | else | ||
| 999 | return -1; | ||
| 1005 | } | 1000 | } |
| 1006 | return (invalid_code_found ? src - src_start : -1); | 1001 | |
| 1002 | /* We have not found the sequence "ESC 1". */ | ||
| 1003 | return -1; | ||
| 1007 | } | 1004 | } |
| 1008 | 1005 | ||
| 1009 | /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */ | 1006 | /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */ |
| @@ -1183,7 +1180,7 @@ decode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes) | |||
| 1183 | ONE_MORE_BYTE (c1); | 1180 | ONE_MORE_BYTE (c1); |
| 1184 | if (c1 >= '@' && c1 <= 'B') | 1181 | if (c1 >= '@' && c1 <= 'B') |
| 1185 | { /* designation of JISX0208.1978, GB2312.1980, | 1182 | { /* designation of JISX0208.1978, GB2312.1980, |
| 1186 | or JISX0208.1980 */ | 1183 | or JISX0208.1980 */ |
| 1187 | DECODE_DESIGNATION (0, 2, 94, c1); | 1184 | DECODE_DESIGNATION (0, 2, 94, c1); |
| 1188 | } | 1185 | } |
| 1189 | else if (c1 >= 0x28 && c1 <= 0x2B) | 1186 | else if (c1 >= 0x28 && c1 <= 0x2B) |
| @@ -1237,41 +1234,32 @@ decode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes) | |||
| 1237 | case '0': case '2': /* start composing */ | 1234 | case '0': case '2': /* start composing */ |
| 1238 | /* Before processing composing, we must be sure that all | 1235 | /* Before processing composing, we must be sure that all |
| 1239 | characters being composed are supported by CODING. | 1236 | characters being composed are supported by CODING. |
| 1240 | If not, we must give up composing and insert the | 1237 | If not, we must give up composing. */ |
| 1241 | bunch of codes for composing as is without decoding. */ | 1238 | if (check_composing_code (coding, src, src_end) == 0) |
| 1242 | { | 1239 | { |
| 1243 | int result1; | 1240 | /* We are looking at a valid composition sequence. */ |
| 1244 | 1241 | coding->composing = (c1 == '0' | |
| 1245 | result1 = check_composing_code (coding, src, src_end); | 1242 | ? COMPOSING_NO_RULE_HEAD |
| 1246 | if (result1 == 0) | 1243 | : COMPOSING_WITH_RULE_HEAD); |
| 1247 | { | 1244 | coding->composed_chars = 0; |
| 1248 | coding->composing = (c1 == '0' | 1245 | } |
| 1249 | ? COMPOSING_NO_RULE_HEAD | 1246 | else |
| 1250 | : COMPOSING_WITH_RULE_HEAD); | 1247 | { |
| 1251 | coding->composed_chars = 0; | 1248 | *dst++ = ISO_CODE_ESC; |
| 1252 | } | 1249 | *dst++ = c1; |
| 1253 | else if (result1 > 0) | 1250 | coding->produced_char += 2; |
| 1254 | { | 1251 | } |
| 1255 | if (result1 + 2 < (dst_bytes ? dst_end : src_base) - dst) | ||
| 1256 | { | ||
| 1257 | bcopy (src_base, dst, result1 + 2); | ||
| 1258 | src += result1; | ||
| 1259 | dst += result1 + 2; | ||
| 1260 | coding->produced_char += result1 + 2; | ||
| 1261 | coding->fake_multibyte = 1; | ||
| 1262 | } | ||
| 1263 | else | ||
| 1264 | { | ||
| 1265 | result = CODING_FINISH_INSUFFICIENT_DST; | ||
| 1266 | goto label_end_of_loop_2; | ||
| 1267 | } | ||
| 1268 | } | ||
| 1269 | else | ||
| 1270 | goto label_end_of_loop; | ||
| 1271 | } | ||
| 1272 | break; | 1252 | break; |
| 1273 | 1253 | ||
| 1274 | case '1': /* end composing */ | 1254 | case '1': /* end composing */ |
| 1255 | if (!coding->composing) | ||
| 1256 | { | ||
| 1257 | *dst++ = ISO_CODE_ESC; | ||
| 1258 | *dst++ = c1; | ||
| 1259 | coding->produced_char += 2; | ||
| 1260 | break; | ||
| 1261 | } | ||
| 1262 | |||
| 1275 | if (coding->composed_chars > 0) | 1263 | if (coding->composed_chars > 0) |
| 1276 | { | 1264 | { |
| 1277 | if (coding->composed_chars == 1) | 1265 | if (coding->composed_chars == 1) |
| @@ -2002,6 +1990,11 @@ encode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes) | |||
| 2002 | ENCODE_RESET_PLANE_AND_REGISTER; | 1990 | ENCODE_RESET_PLANE_AND_REGISTER; |
| 2003 | if (COMPOSING_P (coding->composing)) | 1991 | if (COMPOSING_P (coding->composing)) |
| 2004 | ENCODE_COMPOSITION_END; | 1992 | ENCODE_COMPOSITION_END; |
| 1993 | if (result == CODING_FINISH_INSUFFICIENT_SRC) | ||
| 1994 | { | ||
| 1995 | while (src < src_end && dst < dst_end) | ||
| 1996 | *dst++ = *src++; | ||
| 1997 | } | ||
| 2005 | } | 1998 | } |
| 2006 | coding->consumed = src - source; | 1999 | coding->consumed = src - source; |
| 2007 | coding->produced = coding->produced_char = dst - destination; | 2000 | coding->produced = coding->produced_char = dst - destination; |
| @@ -2876,8 +2869,6 @@ setup_coding_system (coding_system, coding) | |||
| 2876 | 2869 | ||
| 2877 | /* Initialize remaining fields. */ | 2870 | /* Initialize remaining fields. */ |
| 2878 | coding->composing = 0; | 2871 | coding->composing = 0; |
| 2879 | coding->translation_table_for_decode = Qnil; | ||
| 2880 | coding->translation_table_for_encode = Qnil; | ||
| 2881 | 2872 | ||
| 2882 | /* Get values of coding system properties: | 2873 | /* Get values of coding system properties: |
| 2883 | `post-read-conversion', `pre-write-conversion', | 2874 | `post-read-conversion', `pre-write-conversion', |
| @@ -3862,6 +3853,7 @@ shrink_decoding_region (beg, end, coding, str) | |||
| 3862 | { | 3853 | { |
| 3863 | unsigned char *begp_orig, *begp, *endp_orig, *endp, c; | 3854 | unsigned char *begp_orig, *begp, *endp_orig, *endp, c; |
| 3864 | int eol_conversion; | 3855 | int eol_conversion; |
| 3856 | Lisp_Object translation_table; | ||
| 3865 | 3857 | ||
| 3866 | if (coding->type == coding_type_ccl | 3858 | if (coding->type == coding_type_ccl |
| 3867 | || coding->type == coding_type_undecided | 3859 | || coding->type == coding_type_undecided |
| @@ -3877,6 +3869,21 @@ shrink_decoding_region (beg, end, coding, str) | |||
| 3877 | return; | 3869 | return; |
| 3878 | } | 3870 | } |
| 3879 | 3871 | ||
| 3872 | translation_table = coding->translation_table_for_decode; | ||
| 3873 | if (NILP (translation_table) && !NILP (Venable_character_translation)) | ||
| 3874 | translation_table = Vstandard_translation_table_for_decode; | ||
| 3875 | if (CHAR_TABLE_P (translation_table)) | ||
| 3876 | { | ||
| 3877 | int i; | ||
| 3878 | for (i = 0; i < 128; i++) | ||
| 3879 | if (!NILP (CHAR_TABLE_REF (translation_table, i))) | ||
| 3880 | break; | ||
| 3881 | if (i < 128) | ||
| 3882 | /* Some ASCII characters need to be translated, so we give up | ||
| 3883 | shrinking. */ | ||
| 3884 | return; | ||
| 3885 | } | ||
| 3886 | |||
| 3880 | eol_conversion = (coding->eol_type != CODING_EOL_LF); | 3887 | eol_conversion = (coding->eol_type != CODING_EOL_LF); |
| 3881 | 3888 | ||
| 3882 | if ((! eol_conversion) && (coding->heading_ascii >= 0)) | 3889 | if ((! eol_conversion) && (coding->heading_ascii >= 0)) |
| @@ -4022,6 +4029,7 @@ shrink_encoding_region (beg, end, coding, str) | |||
| 4022 | { | 4029 | { |
| 4023 | unsigned char *begp_orig, *begp, *endp_orig, *endp; | 4030 | unsigned char *begp_orig, *begp, *endp_orig, *endp; |
| 4024 | int eol_conversion; | 4031 | int eol_conversion; |
| 4032 | Lisp_Object translation_table; | ||
| 4025 | 4033 | ||
| 4026 | if (coding->type == coding_type_ccl) | 4034 | if (coding->type == coding_type_ccl) |
| 4027 | /* We can't skip any data. */ | 4035 | /* We can't skip any data. */ |
| @@ -4033,6 +4041,21 @@ shrink_encoding_region (beg, end, coding, str) | |||
| 4033 | return; | 4041 | return; |
| 4034 | } | 4042 | } |
| 4035 | 4043 | ||
| 4044 | translation_table = coding->translation_table_for_encode; | ||
| 4045 | if (NILP (translation_table) && !NILP (Venable_character_translation)) | ||
| 4046 | translation_table = Vstandard_translation_table_for_encode; | ||
| 4047 | if (CHAR_TABLE_P (translation_table)) | ||
| 4048 | { | ||
| 4049 | int i; | ||
| 4050 | for (i = 0; i < 128; i++) | ||
| 4051 | if (!NILP (CHAR_TABLE_REF (translation_table, i))) | ||
| 4052 | break; | ||
| 4053 | if (i < 128) | ||
| 4054 | /* Some ASCII characters need to be translated, so we give up | ||
| 4055 | shrinking. */ | ||
| 4056 | return; | ||
| 4057 | } | ||
| 4058 | |||
| 4036 | if (str) | 4059 | if (str) |
| 4037 | { | 4060 | { |
| 4038 | begp_orig = begp = str + *beg; | 4061 | begp_orig = begp = str + *beg; |
| @@ -4097,6 +4120,20 @@ shrink_encoding_region (beg, end, coding, str) | |||
| 4097 | return; | 4120 | return; |
| 4098 | } | 4121 | } |
| 4099 | 4122 | ||
| 4123 | /* As shrinking conversion region requires some overhead, we don't try | ||
| 4124 | shrinking if the length of conversion region is less than this | ||
| 4125 | value. */ | ||
| 4126 | static int shrink_conversion_region_threshhold = 1024; | ||
| 4127 | |||
| 4128 | #define SHRINK_CONVERSION_REGION(beg, end, coding, str, encodep) \ | ||
| 4129 | do { \ | ||
| 4130 | if (*(end) - *(beg) > shrink_conversion_region_threshhold) \ | ||
| 4131 | { \ | ||
| 4132 | if (encodep) shrink_encoding_region (beg, end, coding, str); \ | ||
| 4133 | else shrink_decoding_region (beg, end, coding, str); \ | ||
| 4134 | } \ | ||
| 4135 | } while (0) | ||
| 4136 | |||
| 4100 | /* Decode (if ENCODEP is zero) or encode (if ENCODEP is nonzero) the | 4137 | /* Decode (if ENCODEP is zero) or encode (if ENCODEP is nonzero) the |
| 4101 | text from FROM to TO (byte positions are FROM_BYTE and TO_BYTE) by | 4138 | text from FROM to TO (byte positions are FROM_BYTE and TO_BYTE) by |
| 4102 | coding system CODING, and return the status code of code conversion | 4139 | coding system CODING, and return the status code of code conversion |
| @@ -4240,10 +4277,7 @@ code_convert_region (from, from_byte, to, to_byte, coding, encodep, replace) | |||
| 4240 | 4277 | ||
| 4241 | if (from < GPT && GPT < to) | 4278 | if (from < GPT && GPT < to) |
| 4242 | move_gap_both (from, from_byte); | 4279 | move_gap_both (from, from_byte); |
| 4243 | if (encodep) | 4280 | SHRINK_CONVERSION_REGION (&from_byte, &to_byte, coding, NULL, encodep); |
| 4244 | shrink_encoding_region (&from_byte, &to_byte, coding, NULL); | ||
| 4245 | else | ||
| 4246 | shrink_decoding_region (&from_byte, &to_byte, coding, NULL); | ||
| 4247 | if (from_byte == to_byte | 4281 | if (from_byte == to_byte |
| 4248 | && ! (coding->mode & CODING_MODE_LAST_BLOCK | 4282 | && ! (coding->mode & CODING_MODE_LAST_BLOCK |
| 4249 | && CODING_REQUIRE_FLUSHING (coding))) | 4283 | && CODING_REQUIRE_FLUSHING (coding))) |
| @@ -4264,6 +4298,11 @@ code_convert_region (from, from_byte, to, to_byte, coding, encodep, replace) | |||
| 4264 | len -= total_skip; len_byte -= total_skip; | 4298 | len -= total_skip; len_byte -= total_skip; |
| 4265 | } | 4299 | } |
| 4266 | 4300 | ||
| 4301 | /* The code conversion routine can not preserve text properties for | ||
| 4302 | now. So, we must remove all text properties in the region. */ | ||
| 4303 | if (replace) | ||
| 4304 | Fset_text_properties (make_number (from), make_number (to), Qnil, Qnil); | ||
| 4305 | |||
| 4267 | /* For conversion, we must put the gap before the text in addition to | 4306 | /* For conversion, we must put the gap before the text in addition to |
| 4268 | making the gap larger for efficient decoding. The required gap | 4307 | making the gap larger for efficient decoding. The required gap |
| 4269 | size starts from 2000 which is the magic number used in make_gap. | 4308 | size starts from 2000 which is the magic number used in make_gap. |
| @@ -4439,8 +4478,9 @@ code_convert_region (from, from_byte, to, to_byte, coding, encodep, replace) | |||
| 4439 | if (src - dst > 0) *dst = 0; /* Put an anchor. */ | 4478 | if (src - dst > 0) *dst = 0; /* Put an anchor. */ |
| 4440 | 4479 | ||
| 4441 | if (multibyte | 4480 | if (multibyte |
| 4442 | && (fake_multibyte | 4481 | && (encodep |
| 4443 | || !encodep && (to - from) != (to_byte - from_byte))) | 4482 | || fake_multibyte |
| 4483 | || (to - from) != (to_byte - from_byte))) | ||
| 4444 | inserted = multibyte_chars_in_text (GPT_ADDR, inserted_byte); | 4484 | inserted = multibyte_chars_in_text (GPT_ADDR, inserted_byte); |
| 4445 | 4485 | ||
| 4446 | /* If we have shrunk the conversion area, adjust it now. */ | 4486 | /* If we have shrunk the conversion area, adjust it now. */ |
| @@ -4562,10 +4602,8 @@ code_convert_string (str, coding, encodep, nocopy) | |||
| 4562 | else | 4602 | else |
| 4563 | { | 4603 | { |
| 4564 | /* Try to skip the leading and trailing ASCII characters. */ | 4604 | /* Try to skip the leading and trailing ASCII characters. */ |
| 4565 | if (encodep) | 4605 | SHRINK_CONVERSION_REGION (&from, &to_byte, coding, XSTRING (str)->data, |
| 4566 | shrink_encoding_region (&from, &to_byte, coding, XSTRING (str)->data); | 4606 | encodep); |
| 4567 | else | ||
| 4568 | shrink_decoding_region (&from, &to_byte, coding, XSTRING (str)->data); | ||
| 4569 | } | 4607 | } |
| 4570 | if (from == to_byte) | 4608 | if (from == to_byte) |
| 4571 | return (nocopy ? str : Fcopy_sequence (str)); | 4609 | return (nocopy ? str : Fcopy_sequence (str)); |
| @@ -4814,10 +4852,6 @@ code_convert_region1 (start, end, coding_system, encodep) | |||
| 4814 | if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0) | 4852 | if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0) |
| 4815 | error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data); | 4853 | error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data); |
| 4816 | 4854 | ||
| 4817 | /* The code conversion routine can not preserve text properties for | ||
| 4818 | now. So, we must remove all text properties in the region. */ | ||
| 4819 | Fset_text_properties (start, end, Qnil, Qnil); | ||
| 4820 | |||
| 4821 | coding.mode |= CODING_MODE_LAST_BLOCK; | 4855 | coding.mode |= CODING_MODE_LAST_BLOCK; |
| 4822 | code_convert_region (from, CHAR_TO_BYTE (from), to, CHAR_TO_BYTE (to), | 4856 | code_convert_region (from, CHAR_TO_BYTE (from), to, CHAR_TO_BYTE (to), |
| 4823 | &coding, encodep, 1); | 4857 | &coding, encodep, 1); |