aboutsummaryrefslogtreecommitdiffstats
path: root/src/coding.c
diff options
context:
space:
mode:
author: Kenichi Handa, 2008-03-02 08:05:14 +0000
committer: Kenichi Handa, 2008-03-02 08:05:14 +0000
commit: 119852e72b941a90312b943c2edd467a53f599d7 (patch)
tree: 1848639590b80df41e39d7e25ed2e6b2f2a75daf /src/coding.c
parent: 0caa490be5402b0e1028d24f954f59a30c70460a (diff)
downloademacs-119852e72b941a90312b943c2edd467a53f599d7.tar.gz
emacs-119852e72b941a90312b943c2edd467a53f599d7.zip
(decode_coding_utf_8): When eol-type of CODING is
`dos', don't decode '\r' if that is the last in the source. (decode_coding_utf_16, decode_coding_emacs_mule) (decode_coding_iso_2022, decode_coding_sjis, decode_coding_big5) (decode_coding_raw_text, decode_coding_charset): Likewise. (produce_chars): Don't decode EOL here. Use EMACS_INT.
Diffstat (limited to 'src/coding.c')
-rw-r--r-- src/coding.c 204
1 files changed, 119 insertions, 85 deletions
diff --git a/src/coding.c b/src/coding.c
index a43b5cb8187..525764d693d 100644
--- a/src/coding.c
+++ b/src/coding.c
@@ -955,6 +955,11 @@ record_conversion_result (struct coding_system *coding,
955 } while (0) 955 } while (0)
956 956
957 957
958/* If no more than BYTES bytes of room remain at dst, allocate more memory
959 for coding->destination and update dst and dst_end. We don't have
960 to take care of coding->source which will be relocated. It is
961 handled by calling coding_set_source in encode_coding. */
962
958#define ASSURE_DESTINATION(bytes) \ 963#define ASSURE_DESTINATION(bytes) \
959 do { \ 964 do { \
960 if (dst + (bytes) >= dst_end) \ 965 if (dst + (bytes) >= dst_end) \
@@ -1225,6 +1230,8 @@ decode_coding_utf_8 (coding)
1225 int consumed_chars = 0, consumed_chars_base; 1230 int consumed_chars = 0, consumed_chars_base;
1226 int multibytep = coding->src_multibyte; 1231 int multibytep = coding->src_multibyte;
1227 Lisp_Object attr, charset_list; 1232 Lisp_Object attr, charset_list;
1233 int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1234 int byte_after_cr = -1;
1228 1235
1229 CODING_GET_INFO (coding, attr, charset_list); 1236 CODING_GET_INFO (coding, attr, charset_list);
1230 1237
@@ -1238,13 +1245,18 @@ decode_coding_utf_8 (coding)
1238 if (charbuf >= charbuf_end) 1245 if (charbuf >= charbuf_end)
1239 break; 1246 break;
1240 1247
1241 ONE_MORE_BYTE (c1); 1248 if (byte_after_cr >= 0)
1249 c1 = byte_after_cr, byte_after_cr = -1;
1250 else
1251 ONE_MORE_BYTE (c1);
1242 if (c1 < 0) 1252 if (c1 < 0)
1243 { 1253 {
1244 c = - c1; 1254 c = - c1;
1245 } 1255 }
1246 else if (UTF_8_1_OCTET_P(c1)) 1256 else if (UTF_8_1_OCTET_P(c1))
1247 { 1257 {
1258 if (eol_crlf && c1 == '\r')
1259 ONE_MORE_BYTE (byte_after_cr);
1248 c = c1; 1260 c = c1;
1249 } 1261 }
1250 else 1262 else
@@ -1458,6 +1470,8 @@ decode_coding_utf_16 (coding)
1458 enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding); 1470 enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1459 int surrogate = CODING_UTF_16_SURROGATE (coding); 1471 int surrogate = CODING_UTF_16_SURROGATE (coding);
1460 Lisp_Object attr, charset_list; 1472 Lisp_Object attr, charset_list;
1473 int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1474 int byte_after_cr1 = -1, byte_after_cr2 = -1;
1461 1475
1462 CODING_GET_INFO (coding, attr, charset_list); 1476 CODING_GET_INFO (coding, attr, charset_list);
1463 1477
@@ -1497,13 +1511,19 @@ decode_coding_utf_16 (coding)
1497 if (charbuf + 2 >= charbuf_end) 1511 if (charbuf + 2 >= charbuf_end)
1498 break; 1512 break;
1499 1513
1500 ONE_MORE_BYTE (c1); 1514 if (byte_after_cr1 >= 0)
1515 c1 = byte_after_cr1, byte_after_cr1 = -1;
1516 else
1517 ONE_MORE_BYTE (c1);
1501 if (c1 < 0) 1518 if (c1 < 0)
1502 { 1519 {
1503 *charbuf++ = -c1; 1520 *charbuf++ = -c1;
1504 continue; 1521 continue;
1505 } 1522 }
1506 ONE_MORE_BYTE (c2); 1523 if (byte_after_cr2 >= 0)
1524 c2 = byte_after_cr2, byte_after_cr2 = -1;
1525 else
1526 ONE_MORE_BYTE (c2);
1507 if (c2 < 0) 1527 if (c2 < 0)
1508 { 1528 {
1509 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1); 1529 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
@@ -1512,6 +1532,7 @@ decode_coding_utf_16 (coding)
1512 } 1532 }
1513 c = (endian == utf_16_big_endian 1533 c = (endian == utf_16_big_endian
1514 ? ((c1 << 8) | c2) : ((c2 << 8) | c1)); 1534 ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1535
1515 if (surrogate) 1536 if (surrogate)
1516 { 1537 {
1517 if (! UTF_16_LOW_SURROGATE_P (c)) 1538 if (! UTF_16_LOW_SURROGATE_P (c))
@@ -1540,7 +1561,14 @@ decode_coding_utf_16 (coding)
1540 if (UTF_16_HIGH_SURROGATE_P (c)) 1561 if (UTF_16_HIGH_SURROGATE_P (c))
1541 CODING_UTF_16_SURROGATE (coding) = surrogate = c; 1562 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1542 else 1563 else
1543 *charbuf++ = c; 1564 {
1565 if (eol_crlf && c == '\r')
1566 {
1567 ONE_MORE_BYTE (byte_after_cr1);
1568 ONE_MORE_BYTE (byte_after_cr2);
1569 }
1570 *charbuf++ = c;
1571 }
1544 } 1572 }
1545 } 1573 }
1546 1574
@@ -2072,6 +2100,8 @@ decode_coding_emacs_mule (coding)
2072 int char_offset = coding->produced_char; 2100 int char_offset = coding->produced_char;
2073 int last_offset = char_offset; 2101 int last_offset = char_offset;
2074 int last_id = charset_ascii; 2102 int last_id = charset_ascii;
2103 int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
2104 int byte_after_cr = -1;
2075 2105
2076 CODING_GET_INFO (coding, attrs, charset_list); 2106 CODING_GET_INFO (coding, attrs, charset_list);
2077 2107
@@ -2085,7 +2115,10 @@ decode_coding_emacs_mule (coding)
2085 if (charbuf >= charbuf_end) 2115 if (charbuf >= charbuf_end)
2086 break; 2116 break;
2087 2117
2088 ONE_MORE_BYTE (c); 2118 if (byte_after_cr >= 0)
2119 c = byte_after_cr, byte_after_cr = -1;
2120 else
2121 ONE_MORE_BYTE (c);
2089 if (c < 0) 2122 if (c < 0)
2090 { 2123 {
2091 *charbuf++ = -c; 2124 *charbuf++ = -c;
@@ -2093,6 +2126,8 @@ decode_coding_emacs_mule (coding)
2093 } 2126 }
2094 else if (c < 0x80) 2127 else if (c < 0x80)
2095 { 2128 {
2129 if (eol_crlf && c == '\r')
2130 ONE_MORE_BYTE (byte_after_cr);
2096 *charbuf++ = c; 2131 *charbuf++ = c;
2097 char_offset++; 2132 char_offset++;
2098 } 2133 }
@@ -2945,6 +2980,8 @@ decode_coding_iso_2022 (coding)
2945 int char_offset = coding->produced_char; 2980 int char_offset = coding->produced_char;
2946 int last_offset = char_offset; 2981 int last_offset = char_offset;
2947 int last_id = charset_ascii; 2982 int last_id = charset_ascii;
2983 int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
2984 int byte_after_cr = -1;
2948 2985
2949 CODING_GET_INFO (coding, attrs, charset_list); 2986 CODING_GET_INFO (coding, attrs, charset_list);
2950 setup_iso_safe_charsets (attrs); 2987 setup_iso_safe_charsets (attrs);
@@ -2962,7 +2999,10 @@ decode_coding_iso_2022 (coding)
2962 if (charbuf >= charbuf_end) 2999 if (charbuf >= charbuf_end)
2963 break; 3000 break;
2964 3001
2965 ONE_MORE_BYTE (c1); 3002 if (byte_after_cr >= 0)
3003 c1 = byte_after_cr, byte_after_cr = -1;
3004 else
3005 ONE_MORE_BYTE (c1);
2966 if (c1 < 0) 3006 if (c1 < 0)
2967 goto invalid_code; 3007 goto invalid_code;
2968 3008
@@ -3021,6 +3061,8 @@ decode_coding_iso_2022 (coding)
3021 break; 3061 break;
3022 3062
3023 case ISO_control_0: 3063 case ISO_control_0:
3064 if (eol_crlf && c1 == '\r')
3065 ONE_MORE_BYTE (byte_after_cr);
3024 MAYBE_FINISH_COMPOSITION (); 3066 MAYBE_FINISH_COMPOSITION ();
3025 charset = CHARSET_FROM_ID (charset_ascii); 3067 charset = CHARSET_FROM_ID (charset_ascii);
3026 break; 3068 break;
@@ -4091,6 +4133,8 @@ decode_coding_sjis (coding)
4091 int char_offset = coding->produced_char; 4133 int char_offset = coding->produced_char;
4092 int last_offset = char_offset; 4134 int last_offset = char_offset;
4093 int last_id = charset_ascii; 4135 int last_id = charset_ascii;
4136 int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4137 int byte_after_cr = -1;
4094 4138
4095 CODING_GET_INFO (coding, attrs, charset_list); 4139 CODING_GET_INFO (coding, attrs, charset_list);
4096 4140
@@ -4111,11 +4155,18 @@ decode_coding_sjis (coding)
4111 if (charbuf >= charbuf_end) 4155 if (charbuf >= charbuf_end)
4112 break; 4156 break;
4113 4157
4114 ONE_MORE_BYTE (c); 4158 if (byte_after_cr >= 0)
4159 c = byte_after_cr, byte_after_cr = -1;
4160 else
4161 ONE_MORE_BYTE (c);
4115 if (c < 0) 4162 if (c < 0)
4116 goto invalid_code; 4163 goto invalid_code;
4117 if (c < 0x80) 4164 if (c < 0x80)
4118 charset = charset_roman; 4165 {
4166 if (eol_crlf && c == '\r')
4167 ONE_MORE_BYTE (byte_after_cr);
4168 charset = charset_roman;
4169 }
4119 else if (c == 0x80 || c == 0xA0) 4170 else if (c == 0x80 || c == 0xA0)
4120 goto invalid_code; 4171 goto invalid_code;
4121 else if (c >= 0xA1 && c <= 0xDF) 4172 else if (c >= 0xA1 && c <= 0xDF)
@@ -4193,6 +4244,8 @@ decode_coding_big5 (coding)
4193 int char_offset = coding->produced_char; 4244 int char_offset = coding->produced_char;
4194 int last_offset = char_offset; 4245 int last_offset = char_offset;
4195 int last_id = charset_ascii; 4246 int last_id = charset_ascii;
4247 int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4248 int byte_after_cr = -1;
4196 4249
4197 CODING_GET_INFO (coding, attrs, charset_list); 4250 CODING_GET_INFO (coding, attrs, charset_list);
4198 val = charset_list; 4251 val = charset_list;
@@ -4210,12 +4263,19 @@ decode_coding_big5 (coding)
4210 if (charbuf >= charbuf_end) 4263 if (charbuf >= charbuf_end)
4211 break; 4264 break;
4212 4265
4213 ONE_MORE_BYTE (c); 4266 if (byte_after_cr >= 0)
4267 c = byte_after_cr, byte_after_cr = -1;
4268 else
4269 ONE_MORE_BYTE (c);
4214 4270
4215 if (c < 0) 4271 if (c < 0)
4216 goto invalid_code; 4272 goto invalid_code;
4217 if (c < 0x80) 4273 if (c < 0x80)
4218 charset = charset_roman; 4274 {
4275 if (eol_crlf && c == '\r')
4276 ONE_MORE_BYTE (byte_after_cr);
4277 charset = charset_roman;
4278 }
4219 else 4279 else
4220 { 4280 {
4221 /* BIG5 -> Big5 */ 4281 /* BIG5 -> Big5 */
@@ -4632,10 +4692,19 @@ static void
4632decode_coding_raw_text (coding) 4692decode_coding_raw_text (coding)
4633 struct coding_system *coding; 4693 struct coding_system *coding;
4634{ 4694{
4695 int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4696
4635 coding->chars_at_source = 1; 4697 coding->chars_at_source = 1;
4636 coding->consumed_char = 0; 4698 coding->consumed_char = coding->src_chars;
4637 coding->consumed = 0; 4699 coding->consumed = coding->src_bytes;
4638 record_conversion_result (coding, CODING_RESULT_SUCCESS); 4700 if (eol_crlf && coding->source[coding->src_bytes - 1] == '\r')
4701 {
4702 coding->consumed_char--;
4703 coding->consumed--;
4704 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
4705 }
4706 else
4707 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4639} 4708}
4640 4709
4641static int 4710static int
@@ -4829,6 +4898,8 @@ decode_coding_charset (coding)
4829 int char_offset = coding->produced_char; 4898 int char_offset = coding->produced_char;
4830 int last_offset = char_offset; 4899 int last_offset = char_offset;
4831 int last_id = charset_ascii; 4900 int last_id = charset_ascii;
4901 int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4902 int byte_after_cr = -1;
4832 4903
4833 CODING_GET_INFO (coding, attrs, charset_list); 4904 CODING_GET_INFO (coding, attrs, charset_list);
4834 valids = AREF (attrs, coding_attr_charset_valids); 4905 valids = AREF (attrs, coding_attr_charset_valids);
@@ -4848,7 +4919,17 @@ decode_coding_charset (coding)
4848 if (charbuf >= charbuf_end) 4919 if (charbuf >= charbuf_end)
4849 break; 4920 break;
4850 4921
4851 ONE_MORE_BYTE (c); 4922 if (byte_after_cr >= 0)
4923 {
4924 c = byte_after_cr;
4925 byte_after_cr = -1;
4926 }
4927 else
4928 {
4929 ONE_MORE_BYTE (c);
4930 if (eol_crlf && c == '\r')
4931 ONE_MORE_BYTE (byte_after_cr);
4932 }
4852 if (c < 0) 4933 if (c < 0)
4853 goto invalid_code; 4934 goto invalid_code;
4854 code = c; 4935 code = c;
@@ -5880,13 +5961,13 @@ produce_chars (coding, translation_table, last_block)
5880{ 5961{
5881 unsigned char *dst = coding->destination + coding->produced; 5962 unsigned char *dst = coding->destination + coding->produced;
5882 unsigned char *dst_end = coding->destination + coding->dst_bytes; 5963 unsigned char *dst_end = coding->destination + coding->dst_bytes;
5883 int produced; 5964 EMACS_INT produced;
5884 int produced_chars = 0; 5965 EMACS_INT produced_chars = 0;
5885 int carryover = 0; 5966 int carryover = 0;
5886 5967
5887 if (! coding->chars_at_source) 5968 if (! coding->chars_at_source)
5888 { 5969 {
5889 /* Characters are in coding->charbuf. */ 5970 /* Source characters are in coding->charbuf. */
5890 int *buf = coding->charbuf; 5971 int *buf = coding->charbuf;
5891 int *buf_end = buf + coding->charbuf_used; 5972 int *buf_end = buf + coding->charbuf_used;
5892 5973
@@ -5945,18 +6026,16 @@ produce_chars (coding, translation_table, last_block)
5945 } 6026 }
5946 else 6027 else
5947 { 6028 {
6029 /* Source characters are at coding->source. */
5948 const unsigned char *src = coding->source; 6030 const unsigned char *src = coding->source;
5949 const unsigned char *src_end = src + coding->src_bytes; 6031 const unsigned char *src_end = src + coding->consumed;
5950 Lisp_Object eol_type;
5951
5952 eol_type = CODING_ID_EOL_TYPE (coding->id);
5953 6032
5954 if (coding->src_multibyte != coding->dst_multibyte) 6033 if (coding->src_multibyte != coding->dst_multibyte)
5955 { 6034 {
5956 if (coding->src_multibyte) 6035 if (coding->src_multibyte)
5957 { 6036 {
5958 int multibytep = 1; 6037 int multibytep = 1;
5959 int consumed_chars; 6038 EMACS_INT consumed_chars;
5960 6039
5961 while (1) 6040 while (1)
5962 { 6041 {
@@ -5964,37 +6043,21 @@ produce_chars (coding, translation_table, last_block)
5964 int c; 6043 int c;
5965 6044
5966 ONE_MORE_BYTE (c); 6045 ONE_MORE_BYTE (c);
5967 if (c == '\r') 6046 if (dst == dst_end)
5968 { 6047 {
5969 if (EQ (eol_type, Qdos)) 6048 if (EQ (coding->src_object, coding->dst_object))
6049 dst_end = (unsigned char *) src;
6050 if (dst == dst_end)
5970 { 6051 {
5971 if (src == src_end) 6052 EMACS_INT offset = src - coding->source;
5972 { 6053
5973 record_conversion_result 6054 dst = alloc_destination (coding, src_end - src + 1,
5974 (coding, CODING_RESULT_INSUFFICIENT_SRC); 6055 dst);
5975 goto no_more_source; 6056 dst_end = coding->destination + coding->dst_bytes;
5976 } 6057 coding_set_source (coding);
5977 if (*src == '\n') 6058 src = coding->source + offset;
5978 c = *src++; 6059 src_end = coding->source + coding->src_bytes;
5979 } 6060 }
5980 else if (EQ (eol_type, Qmac))
5981 c = '\n';
5982 }
5983 if (dst == dst_end)
5984 {
5985 coding->consumed = src - coding->source;
5986
5987 if (EQ (coding->src_object, coding->dst_object))
5988 dst_end = (unsigned char *) src;
5989 if (dst == dst_end)
5990 {
5991 dst = alloc_destination (coding, src_end - src + 1,
5992 dst);
5993 dst_end = coding->destination + coding->dst_bytes;
5994 coding_set_source (coding);
5995 src = coding->source + coding->consumed;
5996 src_end = coding->source + coding->src_bytes;
5997 }
5998 } 6061 }
5999 *dst++ = c; 6062 *dst++ = c;
6000 produced_chars++; 6063 produced_chars++;
@@ -6008,30 +6071,19 @@ produce_chars (coding, translation_table, last_block)
6008 int multibytep = 1; 6071 int multibytep = 1;
6009 int c = *src++; 6072 int c = *src++;
6010 6073
6011 if (c == '\r')
6012 {
6013 if (EQ (eol_type, Qdos))
6014 {
6015 if (src < src_end
6016 && *src == '\n')
6017 c = *src++;
6018 }
6019 else if (EQ (eol_type, Qmac))
6020 c = '\n';
6021 }
6022 if (dst >= dst_end - 1) 6074 if (dst >= dst_end - 1)
6023 { 6075 {
6024 coding->consumed = src - coding->source;
6025
6026 if (EQ (coding->src_object, coding->dst_object)) 6076 if (EQ (coding->src_object, coding->dst_object))
6027 dst_end = (unsigned char *) src; 6077 dst_end = (unsigned char *) src;
6028 if (dst >= dst_end - 1) 6078 if (dst >= dst_end - 1)
6029 { 6079 {
6080 EMACS_INT offset = src - coding->source;
6081
6030 dst = alloc_destination (coding, src_end - src + 2, 6082 dst = alloc_destination (coding, src_end - src + 2,
6031 dst); 6083 dst);
6032 dst_end = coding->destination + coding->dst_bytes; 6084 dst_end = coding->destination + coding->dst_bytes;
6033 coding_set_source (coding); 6085 coding_set_source (coding);
6034 src = coding->source + coding->consumed; 6086 src = coding->source + offset;
6035 src_end = coding->source + coding->src_bytes; 6087 src_end = coding->source + coding->src_bytes;
6036 } 6088 }
6037 } 6089 }
@@ -6042,7 +6094,7 @@ produce_chars (coding, translation_table, last_block)
6042 { 6094 {
6043 if (!EQ (coding->src_object, coding->dst_object)) 6095 if (!EQ (coding->src_object, coding->dst_object))
6044 { 6096 {
6045 int require = coding->src_bytes - coding->dst_bytes; 6097 EMACS_INT require = coding->src_bytes - coding->dst_bytes;
6046 6098
6047 if (require > 0) 6099 if (require > 0)
6048 { 6100 {
@@ -6054,28 +6106,10 @@ produce_chars (coding, translation_table, last_block)
6054 src_end = coding->source + coding->src_bytes; 6106 src_end = coding->source + coding->src_bytes;
6055 } 6107 }
6056 } 6108 }
6057 produced_chars = coding->src_chars; 6109 produced_chars = coding->consumed_char;
6058 while (src < src_end) 6110 while (src < src_end)
6059 { 6111 *dst++ = *src++;
6060 int c = *src++;
6061
6062 if (c == '\r')
6063 {
6064 if (EQ (eol_type, Qdos))
6065 {
6066 if (src < src_end
6067 && *src == '\n')
6068 c = *src++;
6069 produced_chars--;
6070 }
6071 else if (EQ (eol_type, Qmac))
6072 c = '\n';
6073 }
6074 *dst++ = c;
6075 }
6076 } 6112 }
6077 coding->consumed = coding->src_bytes;
6078 coding->consumed_char = coding->src_chars;
6079 } 6113 }
6080 6114
6081 produced = dst - (coding->destination + coding->produced); 6115 produced = dst - (coding->destination + coding->produced);