aboutsummaryrefslogtreecommitdiffstats
path: root/src/coding.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/coding.c')
-rw-r--r--src/coding.c724
1 files changed, 549 insertions, 175 deletions
diff --git a/src/coding.c b/src/coding.c
index 94a2d9fea80..42fd81b6322 100644
--- a/src/coding.c
+++ b/src/coding.c
@@ -1,5 +1,5 @@
1/* Coding system handler (conversion, detection, etc). 1/* Coding system handler (conversion, detection, etc).
2 Copyright (C) 2001-2012 Free Software Foundation, Inc. 2 Copyright (C) 2001-2013 Free Software Foundation, Inc.
3 Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 3 Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
4 2005, 2006, 2007, 2008, 2009, 2010, 2011 4 2005, 2006, 2007, 2008, 2009, 2010, 2011
5 National Institute of Advanced Industrial Science and Technology (AIST) 5 National Institute of Advanced Industrial Science and Technology (AIST)
@@ -285,7 +285,10 @@ encode_coding_XXX (struct coding_system *coding)
285 285
286#include <config.h> 286#include <config.h>
287#include <stdio.h> 287#include <stdio.h>
288#include <setjmp.h> 288
289#ifdef HAVE_WCHAR_H
290#include <wchar.h>
291#endif /* HAVE_WCHAR_H */
289 292
290#include "lisp.h" 293#include "lisp.h"
291#include "character.h" 294#include "character.h"
@@ -303,6 +306,7 @@ Lisp_Object Vcoding_system_hash_table;
303static Lisp_Object Qcoding_system, Qeol_type; 306static Lisp_Object Qcoding_system, Qeol_type;
304static Lisp_Object Qcoding_aliases; 307static Lisp_Object Qcoding_aliases;
305Lisp_Object Qunix, Qdos; 308Lisp_Object Qunix, Qdos;
309static Lisp_Object Qmac;
306Lisp_Object Qbuffer_file_coding_system; 310Lisp_Object Qbuffer_file_coding_system;
307static Lisp_Object Qpost_read_conversion, Qpre_write_conversion; 311static Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
308static Lisp_Object Qdefault_char; 312static Lisp_Object Qdefault_char;
@@ -322,8 +326,7 @@ Lisp_Object Qcall_process, Qcall_process_region;
322Lisp_Object Qstart_process, Qopen_network_stream; 326Lisp_Object Qstart_process, Qopen_network_stream;
323static Lisp_Object Qtarget_idx; 327static Lisp_Object Qtarget_idx;
324 328
325static Lisp_Object Qinsufficient_source, Qinconsistent_eol, Qinvalid_source; 329static Lisp_Object Qinsufficient_source, Qinvalid_source, Qinterrupted;
326static Lisp_Object Qinterrupted, Qinsufficient_memory;
327 330
328/* If a symbol has this property, evaluate the value to define the 331/* If a symbol has this property, evaluate the value to define the
329 symbol as a coding system. */ 332 symbol as a coding system. */
@@ -344,6 +347,10 @@ Lisp_Object Qcoding_system_p, Qcoding_system_error;
344Lisp_Object Qemacs_mule, Qraw_text; 347Lisp_Object Qemacs_mule, Qraw_text;
345Lisp_Object Qutf_8_emacs; 348Lisp_Object Qutf_8_emacs;
346 349
350#if defined (WINDOWSNT) || defined (CYGWIN)
351static Lisp_Object Qutf_16le;
352#endif
353
347/* Coding-systems are handed between Emacs Lisp programs and C internal 354/* Coding-systems are handed between Emacs Lisp programs and C internal
348 routines by the following three variables. */ 355 routines by the following three variables. */
349/* Coding system to be used to encode text for terminal display when 356/* Coding system to be used to encode text for terminal display when
@@ -416,7 +423,7 @@ enum iso_code_class_type
416 ISO_shift_out, /* ISO_CODE_SO (0x0E) */ 423 ISO_shift_out, /* ISO_CODE_SO (0x0E) */
417 ISO_shift_in, /* ISO_CODE_SI (0x0F) */ 424 ISO_shift_in, /* ISO_CODE_SI (0x0F) */
418 ISO_single_shift_2_7, /* ISO_CODE_SS2_7 (0x19) */ 425 ISO_single_shift_2_7, /* ISO_CODE_SS2_7 (0x19) */
419 ISO_escape, /* ISO_CODE_SO (0x1B) */ 426 ISO_escape, /* ISO_CODE_ESC (0x1B) */
420 ISO_control_1, /* Control codes in the range 427 ISO_control_1, /* Control codes in the range
421 0x80..0x9F, except for the 428 0x80..0x9F, except for the
422 following 3 codes. */ 429 following 3 codes. */
@@ -816,18 +823,12 @@ record_conversion_result (struct coding_system *coding,
816 case CODING_RESULT_INSUFFICIENT_SRC: 823 case CODING_RESULT_INSUFFICIENT_SRC:
817 Vlast_code_conversion_error = Qinsufficient_source; 824 Vlast_code_conversion_error = Qinsufficient_source;
818 break; 825 break;
819 case CODING_RESULT_INCONSISTENT_EOL:
820 Vlast_code_conversion_error = Qinconsistent_eol;
821 break;
822 case CODING_RESULT_INVALID_SRC: 826 case CODING_RESULT_INVALID_SRC:
823 Vlast_code_conversion_error = Qinvalid_source; 827 Vlast_code_conversion_error = Qinvalid_source;
824 break; 828 break;
825 case CODING_RESULT_INTERRUPT: 829 case CODING_RESULT_INTERRUPT:
826 Vlast_code_conversion_error = Qinterrupted; 830 Vlast_code_conversion_error = Qinterrupted;
827 break; 831 break;
828 case CODING_RESULT_INSUFFICIENT_MEM:
829 Vlast_code_conversion_error = Qinsufficient_memory;
830 break;
831 case CODING_RESULT_INSUFFICIENT_DST: 832 case CODING_RESULT_INSUFFICIENT_DST:
832 /* Don't record this error in Vlast_code_conversion_error 833 /* Don't record this error in Vlast_code_conversion_error
833 because it happens just temporarily and is resolved when the 834 because it happens just temporarily and is resolved when the
@@ -921,65 +922,18 @@ record_conversion_result (struct coding_system *coding,
921 922
922 923
923/* Store multibyte form of the character C in P, and advance P to the 924/* Store multibyte form of the character C in P, and advance P to the
924 end of the multibyte form. This is like CHAR_STRING_ADVANCE but it 925 end of the multibyte form. This used to be like CHAR_STRING_ADVANCE
925 never calls MAYBE_UNIFY_CHAR. */ 926 without ever calling MAYBE_UNIFY_CHAR, but nowadays we don't call
926 927 MAYBE_UNIFY_CHAR in CHAR_STRING_ADVANCE. */
927#define CHAR_STRING_ADVANCE_NO_UNIFY(c, p) \
928 do { \
929 if ((c) <= MAX_1_BYTE_CHAR) \
930 *(p)++ = (c); \
931 else if ((c) <= MAX_2_BYTE_CHAR) \
932 *(p)++ = (0xC0 | ((c) >> 6)), \
933 *(p)++ = (0x80 | ((c) & 0x3F)); \
934 else if ((c) <= MAX_3_BYTE_CHAR) \
935 *(p)++ = (0xE0 | ((c) >> 12)), \
936 *(p)++ = (0x80 | (((c) >> 6) & 0x3F)), \
937 *(p)++ = (0x80 | ((c) & 0x3F)); \
938 else if ((c) <= MAX_4_BYTE_CHAR) \
939 *(p)++ = (0xF0 | (c >> 18)), \
940 *(p)++ = (0x80 | ((c >> 12) & 0x3F)), \
941 *(p)++ = (0x80 | ((c >> 6) & 0x3F)), \
942 *(p)++ = (0x80 | (c & 0x3F)); \
943 else if ((c) <= MAX_5_BYTE_CHAR) \
944 *(p)++ = 0xF8, \
945 *(p)++ = (0x80 | ((c >> 18) & 0x0F)), \
946 *(p)++ = (0x80 | ((c >> 12) & 0x3F)), \
947 *(p)++ = (0x80 | ((c >> 6) & 0x3F)), \
948 *(p)++ = (0x80 | (c & 0x3F)); \
949 else \
950 (p) += BYTE8_STRING ((c) - 0x3FFF80, p); \
951 } while (0)
952 928
929#define CHAR_STRING_ADVANCE_NO_UNIFY(c, p) CHAR_STRING_ADVANCE(c, p)
953 930
954/* Return the character code of character whose multibyte form is at 931/* Return the character code of character whose multibyte form is at
955 P, and advance P to the end of the multibyte form. This is like 932 P, and advance P to the end of the multibyte form. This used to be
956 STRING_CHAR_ADVANCE, but it never calls MAYBE_UNIFY_CHAR. */ 933 like STRING_CHAR_ADVANCE without ever calling MAYBE_UNIFY_CHAR, but
957 934 nowadays STRING_CHAR_ADVANCE doesn't call MAYBE_UNIFY_CHAR. */
958#define STRING_CHAR_ADVANCE_NO_UNIFY(p) \
959 (!((p)[0] & 0x80) \
960 ? *(p)++ \
961 : ! ((p)[0] & 0x20) \
962 ? ((p) += 2, \
963 ((((p)[-2] & 0x1F) << 6) \
964 | ((p)[-1] & 0x3F) \
965 | ((unsigned char) ((p)[-2]) < 0xC2 ? 0x3FFF80 : 0))) \
966 : ! ((p)[0] & 0x10) \
967 ? ((p) += 3, \
968 ((((p)[-3] & 0x0F) << 12) \
969 | (((p)[-2] & 0x3F) << 6) \
970 | ((p)[-1] & 0x3F))) \
971 : ! ((p)[0] & 0x08) \
972 ? ((p) += 4, \
973 ((((p)[-4] & 0xF) << 18) \
974 | (((p)[-3] & 0x3F) << 12) \
975 | (((p)[-2] & 0x3F) << 6) \
976 | ((p)[-1] & 0x3F))) \
977 : ((p) += 5, \
978 ((((p)[-4] & 0x3F) << 18) \
979 | (((p)[-3] & 0x3F) << 12) \
980 | (((p)[-2] & 0x3F) << 6) \
981 | ((p)[-1] & 0x3F))))
982 935
936#define STRING_CHAR_ADVANCE_NO_UNIFY(p) STRING_CHAR_ADVANCE(p)
983 937
984/* Set coding->source from coding->src_object. */ 938/* Set coding->source from coding->src_object. */
985 939
@@ -1092,14 +1046,7 @@ coding_alloc_by_making_gap (struct coding_system *coding,
1092 GPT -= gap_head_used, GPT_BYTE -= gap_head_used; 1046 GPT -= gap_head_used, GPT_BYTE -= gap_head_used;
1093 } 1047 }
1094 else 1048 else
1095 { 1049 make_gap_1 (XBUFFER (coding->dst_object), bytes);
1096 Lisp_Object this_buffer;
1097
1098 this_buffer = Fcurrent_buffer ();
1099 set_buffer_internal (XBUFFER (coding->dst_object));
1100 make_gap (bytes);
1101 set_buffer_internal (XBUFFER (this_buffer));
1102 }
1103} 1050}
1104 1051
1105 1052
@@ -1178,6 +1125,14 @@ alloc_destination (struct coding_system *coding, ptrdiff_t nbytes,
1178 *buf++ = id; \ 1125 *buf++ = id; \
1179 } while (0) 1126 } while (0)
1180 1127
1128
1129/* Bitmasks for coding->eol_seen. */
1130
1131#define EOL_SEEN_NONE 0
1132#define EOL_SEEN_LF 1
1133#define EOL_SEEN_CR 2
1134#define EOL_SEEN_CRLF 4
1135
1181 1136
1182/*** 2. Emacs' internal format (emacs-utf-8) ***/ 1137/*** 2. Emacs' internal format (emacs-utf-8) ***/
1183 1138
@@ -1200,6 +1155,9 @@ alloc_destination (struct coding_system *coding, ptrdiff_t nbytes,
1200#define UTF_8_BOM_2 0xBB 1155#define UTF_8_BOM_2 0xBB
1201#define UTF_8_BOM_3 0xBF 1156#define UTF_8_BOM_3 0xBF
1202 1157
1158/* Unlike the other detect_coding_XXX functions, this function counts the
1159 number of characters and checks the EOL format. */
1160
1203static bool 1161static bool
1204detect_coding_utf_8 (struct coding_system *coding, 1162detect_coding_utf_8 (struct coding_system *coding,
1205 struct coding_detection_info *detect_info) 1163 struct coding_detection_info *detect_info)
@@ -1209,11 +1167,23 @@ detect_coding_utf_8 (struct coding_system *coding,
1209 bool multibytep = coding->src_multibyte; 1167 bool multibytep = coding->src_multibyte;
1210 ptrdiff_t consumed_chars = 0; 1168 ptrdiff_t consumed_chars = 0;
1211 bool bom_found = 0; 1169 bool bom_found = 0;
1212 bool found = 0; 1170 int nchars = coding->head_ascii;
1171 int eol_seen = coding->eol_seen;
1213 1172
1214 detect_info->checked |= CATEGORY_MASK_UTF_8; 1173 detect_info->checked |= CATEGORY_MASK_UTF_8;
1215 /* A coding system of this category is always ASCII compatible. */ 1174 /* A coding system of this category is always ASCII compatible. */
1216 src += coding->head_ascii; 1175 src += nchars;
1176
1177 if (src == coding->source /* BOM should be at the head. */
1178 && src + 3 < src_end /* BOM is 3-byte long. */
1179 && src[0] == UTF_8_BOM_1
1180 && src[1] == UTF_8_BOM_2
1181 && src[2] == UTF_8_BOM_3)
1182 {
1183 bom_found = 1;
1184 src += 3;
1185 nchars++;
1186 }
1217 1187
1218 while (1) 1188 while (1)
1219 { 1189 {
@@ -1222,13 +1192,29 @@ detect_coding_utf_8 (struct coding_system *coding,
1222 src_base = src; 1192 src_base = src;
1223 ONE_MORE_BYTE (c); 1193 ONE_MORE_BYTE (c);
1224 if (c < 0 || UTF_8_1_OCTET_P (c)) 1194 if (c < 0 || UTF_8_1_OCTET_P (c))
1225 continue; 1195 {
1196 nchars++;
1197 if (c == '\r')
1198 {
1199 if (src < src_end && *src == '\n')
1200 {
1201 eol_seen |= EOL_SEEN_CRLF;
1202 src++;
1203 nchars++;
1204 }
1205 else
1206 eol_seen |= EOL_SEEN_CR;
1207 }
1208 else if (c == '\n')
1209 eol_seen |= EOL_SEEN_LF;
1210 continue;
1211 }
1226 ONE_MORE_BYTE (c1); 1212 ONE_MORE_BYTE (c1);
1227 if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1)) 1213 if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
1228 break; 1214 break;
1229 if (UTF_8_2_OCTET_LEADING_P (c)) 1215 if (UTF_8_2_OCTET_LEADING_P (c))
1230 { 1216 {
1231 found = 1; 1217 nchars++;
1232 continue; 1218 continue;
1233 } 1219 }
1234 ONE_MORE_BYTE (c2); 1220 ONE_MORE_BYTE (c2);
@@ -1236,10 +1222,7 @@ detect_coding_utf_8 (struct coding_system *coding,
1236 break; 1222 break;
1237 if (UTF_8_3_OCTET_LEADING_P (c)) 1223 if (UTF_8_3_OCTET_LEADING_P (c))
1238 { 1224 {
1239 found = 1; 1225 nchars++;
1240 if (src_base == coding->source
1241 && c == UTF_8_BOM_1 && c1 == UTF_8_BOM_2 && c2 == UTF_8_BOM_3)
1242 bom_found = 1;
1243 continue; 1226 continue;
1244 } 1227 }
1245 ONE_MORE_BYTE (c3); 1228 ONE_MORE_BYTE (c3);
@@ -1247,7 +1230,7 @@ detect_coding_utf_8 (struct coding_system *coding,
1247 break; 1230 break;
1248 if (UTF_8_4_OCTET_LEADING_P (c)) 1231 if (UTF_8_4_OCTET_LEADING_P (c))
1249 { 1232 {
1250 found = 1; 1233 nchars++;
1251 continue; 1234 continue;
1252 } 1235 }
1253 ONE_MORE_BYTE (c4); 1236 ONE_MORE_BYTE (c4);
@@ -1255,7 +1238,7 @@ detect_coding_utf_8 (struct coding_system *coding,
1255 break; 1238 break;
1256 if (UTF_8_5_OCTET_LEADING_P (c)) 1239 if (UTF_8_5_OCTET_LEADING_P (c))
1257 { 1240 {
1258 found = 1; 1241 nchars++;
1259 continue; 1242 continue;
1260 } 1243 }
1261 break; 1244 break;
@@ -1272,14 +1255,17 @@ detect_coding_utf_8 (struct coding_system *coding,
1272 if (bom_found) 1255 if (bom_found)
1273 { 1256 {
1274 /* The first character 0xFFFE doesn't necessarily mean a BOM. */ 1257 /* The first character 0xFFFE doesn't necessarily mean a BOM. */
1275 detect_info->found |= CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG; 1258 detect_info->found |= CATEGORY_MASK_UTF_8_AUTO | CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
1276 } 1259 }
1277 else 1260 else
1278 { 1261 {
1279 detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG; 1262 detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
1280 if (found) 1263 if (nchars < src_end - coding->source)
1281 detect_info->found |= CATEGORY_MASK_UTF_8_NOSIG; 1264 /* The found characters are less than source bytes, which
1265 means that we found valid non-ASCII characters. */
1266 detect_info->found |= CATEGORY_MASK_UTF_8_AUTO | CATEGORY_MASK_UTF_8_NOSIG;
1282 } 1267 }
1268 coding->detected_utf8_chars = nchars;
1283 return 1; 1269 return 1;
1284} 1270}
1285 1271
@@ -3107,20 +3093,7 @@ detect_coding_iso_2022 (struct coding_system *coding,
3107 } 3093 }
3108 if (single_shifting) 3094 if (single_shifting)
3109 break; 3095 break;
3110 check_extra_latin: 3096 goto check_extra_latin;
3111 if (! VECTORP (Vlatin_extra_code_table)
3112 || NILP (AREF (Vlatin_extra_code_table, c)))
3113 {
3114 rejected = CATEGORY_MASK_ISO;
3115 break;
3116 }
3117 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3118 & CODING_ISO_FLAG_LATIN_EXTRA)
3119 found |= CATEGORY_MASK_ISO_8_1;
3120 else
3121 rejected |= CATEGORY_MASK_ISO_8_1;
3122 rejected |= CATEGORY_MASK_ISO_8_2;
3123 break;
3124 3097
3125 default: 3098 default:
3126 if (c < 0) 3099 if (c < 0)
@@ -3171,6 +3144,20 @@ detect_coding_iso_2022 (struct coding_system *coding,
3171 } 3144 }
3172 break; 3145 break;
3173 } 3146 }
3147 check_extra_latin:
3148 if (! VECTORP (Vlatin_extra_code_table)
3149 || NILP (AREF (Vlatin_extra_code_table, c)))
3150 {
3151 rejected = CATEGORY_MASK_ISO;
3152 break;
3153 }
3154 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3155 & CODING_ISO_FLAG_LATIN_EXTRA)
3156 found |= CATEGORY_MASK_ISO_8_1;
3157 else
3158 rejected |= CATEGORY_MASK_ISO_8_1;
3159 rejected |= CATEGORY_MASK_ISO_8_2;
3160 break;
3174 } 3161 }
3175 } 3162 }
3176 detect_info->rejected |= CATEGORY_MASK_ISO; 3163 detect_info->rejected |= CATEGORY_MASK_ISO;
@@ -3939,6 +3926,14 @@ decode_coding_iso_2022 (struct coding_system *coding)
3939 *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c); 3926 *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3940 char_offset++; 3927 char_offset++;
3941 coding->errors++; 3928 coding->errors++;
3929 /* Reset the invocation and designation status to the safest
3930 one; i.e. designate ASCII to the graphic register 0, and
3931 invoke that register to the graphic plane 0. This typically
3932 helps the case that a designation sequence for ASCII "ESC (
3933 B" is somehow broken (e.g. broken by a newline). */
3934 CODING_ISO_INVOCATION (coding, 0) = 0;
3935 CODING_ISO_DESIGNATION (coding, 0) = charset_ascii;
3936 charset_id_0 = charset_ascii;
3942 continue; 3937 continue;
3943 3938
3944 break_loop: 3939 break_loop:
@@ -5107,6 +5102,7 @@ decode_coding_ccl (struct coding_system *coding)
5107 while (1) 5102 while (1)
5108 { 5103 {
5109 const unsigned char *p = src; 5104 const unsigned char *p = src;
5105 ptrdiff_t offset;
5110 int i = 0; 5106 int i = 0;
5111 5107
5112 if (multibytep) 5108 if (multibytep)
@@ -5124,8 +5120,17 @@ decode_coding_ccl (struct coding_system *coding)
5124 5120
5125 if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK) 5121 if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
5126 ccl->last_block = 1; 5122 ccl->last_block = 1;
5123 /* As ccl_driver calls DECODE_CHAR, buffer may be relocated. */
5124 charset_map_loaded = 0;
5127 ccl_driver (ccl, source_charbuf, charbuf, i, charbuf_end - charbuf, 5125 ccl_driver (ccl, source_charbuf, charbuf, i, charbuf_end - charbuf,
5128 charset_list); 5126 charset_list);
5127 if (charset_map_loaded
5128 && (offset = coding_change_source (coding)))
5129 {
5130 p += offset;
5131 src += offset;
5132 src_end += offset;
5133 }
5129 charbuf += ccl->produced; 5134 charbuf += ccl->produced;
5130 if (multibytep) 5135 if (multibytep)
5131 src += source_byteidx[ccl->consumed]; 5136 src += source_byteidx[ccl->consumed];
@@ -5178,8 +5183,15 @@ encode_coding_ccl (struct coding_system *coding)
5178 5183
5179 do 5184 do
5180 { 5185 {
5186 ptrdiff_t offset;
5187
5188 /* As ccl_driver calls DECODE_CHAR, buffer may be relocated. */
5189 charset_map_loaded = 0;
5181 ccl_driver (ccl, charbuf, destination_charbuf, 5190 ccl_driver (ccl, charbuf, destination_charbuf,
5182 charbuf_end - charbuf, 1024, charset_list); 5191 charbuf_end - charbuf, 1024, charset_list);
5192 if (charset_map_loaded
5193 && (offset = coding_change_destination (coding)))
5194 dst += offset;
5183 if (multibytep) 5195 if (multibytep)
5184 { 5196 {
5185 ASSURE_DESTINATION (ccl->produced * 2); 5197 ASSURE_DESTINATION (ccl->produced * 2);
@@ -5649,7 +5661,6 @@ setup_coding_system (Lisp_Object coding_system, struct coding_system *coding)
5649 eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id); 5661 eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
5650 5662
5651 coding->mode = 0; 5663 coding->mode = 0;
5652 coding->head_ascii = -1;
5653 if (VECTORP (eol_type)) 5664 if (VECTORP (eol_type))
5654 coding->common_flags = (CODING_REQUIRE_DECODING_MASK 5665 coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5655 | CODING_REQUIRE_DETECTION_MASK); 5666 | CODING_REQUIRE_DETECTION_MASK);
@@ -6101,10 +6112,181 @@ complement_process_encoding_system (Lisp_Object coding_system)
6101 6112
6102*/ 6113*/
6103 6114
6104#define EOL_SEEN_NONE 0 6115static Lisp_Object adjust_coding_eol_type (struct coding_system *coding,
6105#define EOL_SEEN_LF 1 6116 int eol_seen);
6106#define EOL_SEEN_CR 2 6117
6107#define EOL_SEEN_CRLF 4 6118
6119/* Return the number of ASCII characters at the head of the source.
6120 By side effects, set coding->head_ascii and update
6121 coding->eol_seen. The value of coding->eol_seen is "logical or" of
6122 EOL_SEEN_LF, EOL_SEEN_CR, and EOL_SEEN_CRLF, but the value is
6123 reliable only when all the source bytes are ASCII. */
6124
6125static int
6126check_ascii (struct coding_system *coding)
6127{
6128 const unsigned char *src, *end;
6129 Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id);
6130 int eol_seen = coding->eol_seen;
6131
6132 coding_set_source (coding);
6133 src = coding->source;
6134 end = src + coding->src_bytes;
6135
6136 if (inhibit_eol_conversion
6137 || SYMBOLP (eol_type))
6138 {
6139 /* We don't have to check EOL format. */
6140 while (src < end && !( *src & 0x80))
6141 {
6142 if (*src++ == '\n')
6143 eol_seen |= EOL_SEEN_LF;
6144 }
6145 }
6146 else
6147 {
6148 end--; /* We look ahead one byte for "CR LF". */
6149 while (src < end)
6150 {
6151 int c = *src;
6152
6153 if (c & 0x80)
6154 break;
6155 src++;
6156 if (c == '\r')
6157 {
6158 if (*src == '\n')
6159 {
6160 eol_seen |= EOL_SEEN_CRLF;
6161 src++;
6162 }
6163 else
6164 eol_seen |= EOL_SEEN_CR;
6165 }
6166 else if (c == '\n')
6167 eol_seen |= EOL_SEEN_LF;
6168 }
6169 if (src == end)
6170 {
6171 int c = *src;
6172
6173 /* All bytes but the last one C are ASCII. */
6174 if (! (c & 0x80))
6175 {
6176 if (c == '\r')
6177 eol_seen |= EOL_SEEN_CR;
6178 else if (c == '\n')
6179 eol_seen |= EOL_SEEN_LF;
6180 src++;
6181 }
6182 }
6183 }
6184 coding->head_ascii = src - coding->source;
6185 coding->eol_seen = eol_seen;
6186 return (coding->head_ascii);
6187}
6188
6189
6190/* Return the number of characters at the source if all the bytes are
6191 valid UTF-8 (of Unicode range). Otherwise, return -1. By side
6192 effects, update coding->eol_seen. The value of coding->eol_seen is
6193 "logical or" of EOL_SEEN_LF, EOL_SEEN_CR, and EOL_SEEN_CRLF, but
6194 the value is reliable only when all the source bytes are valid
6195 UTF-8. */
6196
6197static int
6198check_utf_8 (struct coding_system *coding)
6199{
6200 const unsigned char *src, *end;
6201 int eol_seen;
6202 int nchars = coding->head_ascii;
6203
6204 if (coding->head_ascii < 0)
6205 check_ascii (coding);
6206 else
6207 coding_set_source (coding);
6208 src = coding->source + coding->head_ascii;
6209 /* We look ahead one byte for CR LF. */
6210 end = coding->source + coding->src_bytes - 1;
6211 eol_seen = coding->eol_seen;
6212 while (src < end)
6213 {
6214 int c = *src;
6215
6216 if (UTF_8_1_OCTET_P (*src))
6217 {
6218 src++;
6219 if (c < 0x20)
6220 {
6221 if (c == '\r')
6222 {
6223 if (*src == '\n')
6224 {
6225 eol_seen |= EOL_SEEN_CRLF;
6226 src++;
6227 nchars++;
6228 }
6229 else
6230 eol_seen |= EOL_SEEN_CR;
6231 }
6232 else if (c == '\n')
6233 eol_seen |= EOL_SEEN_LF;
6234 }
6235 }
6236 else if (UTF_8_2_OCTET_LEADING_P (c))
6237 {
6238 if (c < 0xC2 /* overlong sequence */
6239 || src + 1 >= end
6240 || ! UTF_8_EXTRA_OCTET_P (src[1]))
6241 return -1;
6242 src += 2;
6243 }
6244 else if (UTF_8_3_OCTET_LEADING_P (c))
6245 {
6246 if (src + 2 >= end
6247 || ! (UTF_8_EXTRA_OCTET_P (src[1])
6248 && UTF_8_EXTRA_OCTET_P (src[2])))
6249 return -1;
6250 c = (((c & 0xF) << 12)
6251 | ((src[1] & 0x3F) << 6) | (src[2] & 0x3F));
6252 if (c < 0x800 /* overlong sequence */
6253 || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
6254 return -1;
6255 src += 3;
6256 }
6257 else if (UTF_8_4_OCTET_LEADING_P (c))
6258 {
6259 if (src + 3 >= end
6260 || ! (UTF_8_EXTRA_OCTET_P (src[1])
6261 && UTF_8_EXTRA_OCTET_P (src[2])
6262 && UTF_8_EXTRA_OCTET_P (src[3])))
6263 return -1;
6264 c = (((c & 0x7) << 18) | ((src[1] & 0x3F) << 12)
6265 | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F));
6266 if (c < 0x10000 /* overlong sequence */
6267 || c >= 0x110000) /* non-Unicode character */
6268 return -1;
6269 src += 4;
6270 }
6271 else
6272 return -1;
6273 nchars++;
6274 }
6275
6276 if (src == end)
6277 {
6278 if (! UTF_8_1_OCTET_P (*src))
6279 return -1;
6280 nchars++;
6281 if (*src == '\r')
6282 eol_seen |= EOL_SEEN_CR;
6283 else if (*src == '\n')
6284 eol_seen |= EOL_SEEN_LF;
6285 }
6286 coding->eol_seen = eol_seen;
6287 return nchars;
6288}
6289
6108 6290
6109/* Detect how end-of-line of a text of length SRC_BYTES pointed by 6291/* Detect how end-of-line of a text of length SRC_BYTES pointed by
6110 SOURCE is encoded. If CATEGORY is one of 6292 SOURCE is encoded. If CATEGORY is one of
@@ -6217,6 +6399,9 @@ adjust_coding_eol_type (struct coding_system *coding, int eol_seen)
6217 Lisp_Object eol_type; 6399 Lisp_Object eol_type;
6218 6400
6219 eol_type = CODING_ID_EOL_TYPE (coding->id); 6401 eol_type = CODING_ID_EOL_TYPE (coding->id);
6402 if (! VECTORP (eol_type))
6403 /* Already adjusted. */
6404 return eol_type;
6220 if (eol_seen & EOL_SEEN_LF) 6405 if (eol_seen & EOL_SEEN_LF)
6221 { 6406 {
6222 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0)); 6407 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
@@ -6244,14 +6429,16 @@ detect_coding (struct coding_system *coding)
6244{ 6429{
6245 const unsigned char *src, *src_end; 6430 const unsigned char *src, *src_end;
6246 unsigned int saved_mode = coding->mode; 6431 unsigned int saved_mode = coding->mode;
6432 Lisp_Object found = Qnil;
6433 Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id);
6247 6434
6248 coding->consumed = coding->consumed_char = 0; 6435 coding->consumed = coding->consumed_char = 0;
6249 coding->produced = coding->produced_char = 0; 6436 coding->produced = coding->produced_char = 0;
6250 coding_set_source (coding); 6437 coding_set_source (coding);
6251 6438
6252 src_end = coding->source + coding->src_bytes; 6439 src_end = coding->source + coding->src_bytes;
6253 coding->head_ascii = 0;
6254 6440
6441 coding->eol_seen = EOL_SEEN_NONE;
6255 /* If we have not yet decided the text encoding type, detect it 6442 /* If we have not yet decided the text encoding type, detect it
6256 now. */ 6443 now. */
6257 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided)) 6444 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
@@ -6260,6 +6447,7 @@ detect_coding (struct coding_system *coding)
6260 struct coding_detection_info detect_info; 6447 struct coding_detection_info detect_info;
6261 bool null_byte_found = 0, eight_bit_found = 0; 6448 bool null_byte_found = 0, eight_bit_found = 0;
6262 6449
6450 coding->head_ascii = 0;
6263 detect_info.checked = detect_info.found = detect_info.rejected = 0; 6451 detect_info.checked = detect_info.found = detect_info.rejected = 0;
6264 for (src = coding->source; src < src_end; src++) 6452 for (src = coding->source; src < src_end; src++)
6265 { 6453 {
@@ -6298,6 +6486,27 @@ detect_coding (struct coding_system *coding)
6298 if (eight_bit_found) 6486 if (eight_bit_found)
6299 break; 6487 break;
6300 } 6488 }
6489 else if (! disable_ascii_optimization
6490 && ! inhibit_eol_conversion)
6491 {
6492 if (c == '\r')
6493 {
6494 if (src < src_end && src[1] == '\n')
6495 {
6496 coding->eol_seen |= EOL_SEEN_CRLF;
6497 src++;
6498 if (! eight_bit_found)
6499 coding->head_ascii++;
6500 }
6501 else
6502 coding->eol_seen |= EOL_SEEN_CR;
6503 }
6504 else if (c == '\n')
6505 {
6506 coding->eol_seen |= EOL_SEEN_LF;
6507 }
6508 }
6509
6301 if (! eight_bit_found) 6510 if (! eight_bit_found)
6302 coding->head_ascii++; 6511 coding->head_ascii++;
6303 } 6512 }
@@ -6332,6 +6541,9 @@ detect_coding (struct coding_system *coding)
6332 { 6541 {
6333 category = coding_priorities[i]; 6542 category = coding_priorities[i];
6334 this = coding_categories + category; 6543 this = coding_categories + category;
6544 /* Some of this->detector (e.g. detect_coding_sjis)
6545 require this information. */
6546 coding->id = this->id;
6335 if (this->id < 0) 6547 if (this->id < 0)
6336 { 6548 {
6337 /* No coding system of this category is defined. */ 6549 /* No coding system of this category is defined. */
@@ -6346,32 +6558,58 @@ detect_coding (struct coding_system *coding)
6346 } 6558 }
6347 else if ((*(this->detector)) (coding, &detect_info) 6559 else if ((*(this->detector)) (coding, &detect_info)
6348 && detect_info.found & (1 << category)) 6560 && detect_info.found & (1 << category))
6349 { 6561 break;
6350 if (category == coding_category_utf_16_auto)
6351 {
6352 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6353 category = coding_category_utf_16_le;
6354 else
6355 category = coding_category_utf_16_be;
6356 }
6357 break;
6358 }
6359 } 6562 }
6360 } 6563 }
6361 6564
6362 if (i < coding_category_raw_text) 6565 if (i < coding_category_raw_text)
6363 setup_coding_system (CODING_ID_NAME (this->id), coding); 6566 {
6567 if (category == coding_category_utf_8_auto)
6568 {
6569 Lisp_Object coding_systems;
6570
6571 coding_systems = AREF (CODING_ID_ATTRS (this->id),
6572 coding_attr_utf_bom);
6573 if (CONSP (coding_systems))
6574 {
6575 if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6576 found = XCAR (coding_systems);
6577 else
6578 found = XCDR (coding_systems);
6579 }
6580 else
6581 found = CODING_ID_NAME (this->id);
6582 }
6583 else if (category == coding_category_utf_16_auto)
6584 {
6585 Lisp_Object coding_systems;
6586
6587 coding_systems = AREF (CODING_ID_ATTRS (this->id),
6588 coding_attr_utf_bom);
6589 if (CONSP (coding_systems))
6590 {
6591 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6592 found = XCAR (coding_systems);
6593 else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6594 found = XCDR (coding_systems);
6595 }
6596 else
6597 found = CODING_ID_NAME (this->id);
6598 }
6599 else
6600 found = CODING_ID_NAME (this->id);
6601 }
6364 else if (null_byte_found) 6602 else if (null_byte_found)
6365 setup_coding_system (Qno_conversion, coding); 6603 found = Qno_conversion;
6366 else if ((detect_info.rejected & CATEGORY_MASK_ANY) 6604 else if ((detect_info.rejected & CATEGORY_MASK_ANY)
6367 == CATEGORY_MASK_ANY) 6605 == CATEGORY_MASK_ANY)
6368 setup_coding_system (Qraw_text, coding); 6606 found = Qraw_text;
6369 else if (detect_info.rejected) 6607 else if (detect_info.rejected)
6370 for (i = 0; i < coding_category_raw_text; i++) 6608 for (i = 0; i < coding_category_raw_text; i++)
6371 if (! (detect_info.rejected & (1 << coding_priorities[i]))) 6609 if (! (detect_info.rejected & (1 << coding_priorities[i])))
6372 { 6610 {
6373 this = coding_categories + coding_priorities[i]; 6611 this = coding_categories + coding_priorities[i];
6374 setup_coding_system (CODING_ID_NAME (this->id), coding); 6612 found = CODING_ID_NAME (this->id);
6375 break; 6613 break;
6376 } 6614 }
6377 } 6615 }
@@ -6385,14 +6623,21 @@ detect_coding (struct coding_system *coding)
6385 coding_systems 6623 coding_systems
6386 = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom); 6624 = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6387 detect_info.found = detect_info.rejected = 0; 6625 detect_info.found = detect_info.rejected = 0;
6388 coding->head_ascii = 0; 6626 if (check_ascii (coding) == coding->src_bytes)
6389 if (CONSP (coding_systems)
6390 && detect_coding_utf_8 (coding, &detect_info))
6391 { 6627 {
6392 if (detect_info.found & CATEGORY_MASK_UTF_8_SIG) 6628 if (CONSP (coding_systems))
6393 setup_coding_system (XCAR (coding_systems), coding); 6629 found = XCDR (coding_systems);
6394 else 6630 }
6395 setup_coding_system (XCDR (coding_systems), coding); 6631 else
6632 {
6633 if (CONSP (coding_systems)
6634 && detect_coding_utf_8 (coding, &detect_info))
6635 {
6636 if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6637 found = XCAR (coding_systems);
6638 else
6639 found = XCDR (coding_systems);
6640 }
6396 } 6641 }
6397 } 6642 }
6398 else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id))) 6643 else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
@@ -6409,11 +6654,24 @@ detect_coding (struct coding_system *coding)
6409 && detect_coding_utf_16 (coding, &detect_info)) 6654 && detect_coding_utf_16 (coding, &detect_info))
6410 { 6655 {
6411 if (detect_info.found & CATEGORY_MASK_UTF_16_LE) 6656 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6412 setup_coding_system (XCAR (coding_systems), coding); 6657 found = XCAR (coding_systems);
6413 else if (detect_info.found & CATEGORY_MASK_UTF_16_BE) 6658 else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6414 setup_coding_system (XCDR (coding_systems), coding); 6659 found = XCDR (coding_systems);
6415 } 6660 }
6416 } 6661 }
6662
6663 if (! NILP (found))
6664 {
6665 int specified_eol = (VECTORP (eol_type) ? EOL_SEEN_NONE
6666 : EQ (eol_type, Qdos) ? EOL_SEEN_CRLF
6667 : EQ (eol_type, Qmac) ? EOL_SEEN_CR
6668 : EOL_SEEN_LF);
6669
6670 setup_coding_system (found, coding);
6671 if (specified_eol != EOL_SEEN_NONE)
6672 adjust_coding_eol_type (coding, specified_eol);
6673 }
6674
6417 coding->mode = saved_mode; 6675 coding->mode = saved_mode;
6418} 6676}
6419 6677
@@ -6842,7 +7100,7 @@ produce_chars (struct coding_system *coding, Lisp_Object translation_table,
6842 7100
6843 produced = dst - (coding->destination + coding->produced); 7101 produced = dst - (coding->destination + coding->produced);
6844 if (BUFFERP (coding->dst_object) && produced_chars > 0) 7102 if (BUFFERP (coding->dst_object) && produced_chars > 0)
6845 insert_from_gap (produced_chars, produced); 7103 insert_from_gap (produced_chars, produced, 0);
6846 coding->produced += produced; 7104 coding->produced += produced;
6847 coding->produced_char += produced_chars; 7105 coding->produced_char += produced_chars;
6848 return carryover; 7106 return carryover;
@@ -6853,7 +7111,7 @@ produce_chars (struct coding_system *coding, Lisp_Object translation_table,
6853 [ -LENGTH ANNOTATION_MASK NCHARS NBYTES METHOD [ COMPONENTS... ] ] 7111 [ -LENGTH ANNOTATION_MASK NCHARS NBYTES METHOD [ COMPONENTS... ] ]
6854 */ 7112 */
6855 7113
6856static inline void 7114static void
6857produce_composition (struct coding_system *coding, int *charbuf, ptrdiff_t pos) 7115produce_composition (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
6858{ 7116{
6859 int len; 7117 int len;
@@ -6897,7 +7155,7 @@ produce_composition (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
6897 [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ] 7155 [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
6898 */ 7156 */
6899 7157
6900static inline void 7158static void
6901produce_charset (struct coding_system *coding, int *charbuf, ptrdiff_t pos) 7159produce_charset (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
6902{ 7160{
6903 ptrdiff_t from = pos - charbuf[2]; 7161 ptrdiff_t from = pos - charbuf[2];
@@ -6913,22 +7171,8 @@ produce_charset (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
6913 7171
/* Set up CODING's character work buffer (coding->charbuf and
   coding->charbuf_size).  In this side-by-side view the left-hand text
   is the old definition and the right-hand text the new one.

   Old form: try alloca with a size that starts at CHARBUF_SIZE and is
   halved while it stays above 1024; if no buffer is obtained, record
   CODING_RESULT_INSUFFICIENT_MEM and return from the enclosing
   function.

   New form: a single SAFE_ALLOCA of CHARBUF_SIZE ints, with
   charbuf_size always CHARBUF_SIZE.  The decode_coding and
   encode_coding hunks of this diff add USE_SAFE_ALLOCA and
   SAFE_FREE (), presumably because they expand this macro -- confirm
   at the call sites.  */
6914#define ALLOC_CONVERSION_WORK_AREA(coding) \ 7172#define ALLOC_CONVERSION_WORK_AREA(coding) \
6915 do { \ 7173 do { \
6916 int size = CHARBUF_SIZE; \ 7174 coding->charbuf = SAFE_ALLOCA (CHARBUF_SIZE * sizeof (int)); \
6917 \ 7175 coding->charbuf_size = CHARBUF_SIZE; \
6918 coding->charbuf = NULL; \
6919 while (size > 1024) \
6920 { \
6921 coding->charbuf = alloca (sizeof (int) * size); \
6922 if (coding->charbuf) \
6923 break; \
6924 size >>= 1; \
6925 } \
6926 if (! coding->charbuf) \
6927 { \
6928 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_MEM); \
6929 return; \
6930 } \
6931 coding->charbuf_size = size; \
6932 } while (0) 7176 } while (0)
6933 7177
6934 7178
@@ -6997,6 +7241,8 @@ decode_coding (struct coding_system *coding)
6997 int carryover; 7241 int carryover;
6998 int i; 7242 int i;
6999 7243
7244 USE_SAFE_ALLOCA;
7245
7000 if (BUFFERP (coding->src_object) 7246 if (BUFFERP (coding->src_object)
7001 && coding->src_pos > 0 7247 && coding->src_pos > 0
7002 && coding->src_pos < GPT 7248 && coding->src_pos < GPT
@@ -7119,6 +7365,8 @@ decode_coding (struct coding_system *coding)
7119 bset_undo_list (current_buffer, undo_list); 7365 bset_undo_list (current_buffer, undo_list);
7120 record_insert (coding->dst_pos, coding->produced_char); 7366 record_insert (coding->dst_pos, coding->produced_char);
7121 } 7367 }
7368
7369 SAFE_FREE ();
7122} 7370}
7123 7371
7124 7372
@@ -7132,7 +7380,7 @@ decode_coding (struct coding_system *coding)
7132 position of a composition after POS (if any) or to LIMIT, and 7380 position of a composition after POS (if any) or to LIMIT, and
7133 return BUF. */ 7381 return BUF. */
7134 7382
7135static inline int * 7383static int *
7136handle_composition_annotation (ptrdiff_t pos, ptrdiff_t limit, 7384handle_composition_annotation (ptrdiff_t pos, ptrdiff_t limit,
7137 struct coding_system *coding, int *buf, 7385 struct coding_system *coding, int *buf,
7138 ptrdiff_t *stop) 7386 ptrdiff_t *stop)
@@ -7215,7 +7463,7 @@ handle_composition_annotation (ptrdiff_t pos, ptrdiff_t limit,
7215 If the property value is nil, set *STOP to the position where the 7463 If the property value is nil, set *STOP to the position where the
7216 property value is non-nil (limiting by LIMIT), and return BUF. */ 7464 property value is non-nil (limiting by LIMIT), and return BUF. */
7217 7465
7218static inline int * 7466static int *
7219handle_charset_annotation (ptrdiff_t pos, ptrdiff_t limit, 7467handle_charset_annotation (ptrdiff_t pos, ptrdiff_t limit,
7220 struct coding_system *coding, int *buf, 7468 struct coding_system *coding, int *buf,
7221 ptrdiff_t *stop) 7469 ptrdiff_t *stop)
@@ -7402,6 +7650,8 @@ encode_coding (struct coding_system *coding)
7402 int max_lookup; 7650 int max_lookup;
7403 struct ccl_spec cclspec; 7651 struct ccl_spec cclspec;
7404 7652
7653 USE_SAFE_ALLOCA;
7654
7405 attrs = CODING_ID_ATTRS (coding->id); 7655 attrs = CODING_ID_ATTRS (coding->id);
7406 if (coding->encoder == encode_coding_raw_text) 7656 if (coding->encoder == encode_coding_raw_text)
7407 translation_table = Qnil, max_lookup = 0; 7657 translation_table = Qnil, max_lookup = 0;
@@ -7435,7 +7685,9 @@ encode_coding (struct coding_system *coding)
7435 } while (coding->consumed_char < coding->src_chars); 7685 } while (coding->consumed_char < coding->src_chars);
7436 7686
7437 if (BUFFERP (coding->dst_object) && coding->produced_char > 0) 7687 if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
7438 insert_from_gap (coding->produced_char, coding->produced); 7688 insert_from_gap (coding->produced_char, coding->produced, 0);
7689
7690 SAFE_FREE ();
7439} 7691}
7440 7692
7441 7693
@@ -7529,8 +7781,6 @@ decode_coding_gap (struct coding_system *coding,
7529 ptrdiff_t count = SPECPDL_INDEX (); 7781 ptrdiff_t count = SPECPDL_INDEX ();
7530 Lisp_Object attrs; 7782 Lisp_Object attrs;
7531 7783
7532 code_conversion_save (0, 0);
7533
7534 coding->src_object = Fcurrent_buffer (); 7784 coding->src_object = Fcurrent_buffer ();
7535 coding->src_chars = chars; 7785 coding->src_chars = chars;
7536 coding->src_bytes = bytes; 7786 coding->src_bytes = bytes;
@@ -7542,15 +7792,95 @@ decode_coding_gap (struct coding_system *coding,
7542 coding->dst_pos_byte = PT_BYTE; 7792 coding->dst_pos_byte = PT_BYTE;
7543 coding->dst_multibyte = ! NILP (BVAR (current_buffer, enable_multibyte_characters)); 7793 coding->dst_multibyte = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7544 7794
7795 coding->head_ascii = -1;
7796 coding->detected_utf8_chars = -1;
7797 coding->eol_seen = EOL_SEEN_NONE;
7545 if (CODING_REQUIRE_DETECTION (coding)) 7798 if (CODING_REQUIRE_DETECTION (coding))
7546 detect_coding (coding); 7799 detect_coding (coding);
7800 attrs = CODING_ID_ATTRS (coding->id);
7801 if (! disable_ascii_optimization
7802 && ! coding->src_multibyte
7803 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
7804 && NILP (CODING_ATTR_POST_READ (attrs))
7805 && NILP (get_translation_table (attrs, 0, NULL)))
7806 {
7807 chars = coding->head_ascii;
7808 if (chars < 0)
7809 chars = check_ascii (coding);
7810 if (chars != bytes)
7811 {
7812 /* There exists a non-ASCII byte. */
7813 if (EQ (CODING_ATTR_TYPE (attrs), Qutf_8))
7814 {
7815 if (coding->detected_utf8_chars >= 0)
7816 chars = coding->detected_utf8_chars;
7817 else
7818 chars = check_utf_8 (coding);
7819 if (CODING_UTF_8_BOM (coding) != utf_without_bom
7820 && coding->head_ascii == 0
7821 && coding->source[0] == UTF_8_BOM_1
7822 && coding->source[1] == UTF_8_BOM_2
7823 && coding->source[2] == UTF_8_BOM_3)
7824 {
7825 chars--;
7826 bytes -= 3;
7827 coding->src_bytes -= 3;
7828 }
7829 }
7830 else
7831 chars = -1;
7832 }
7833 if (chars >= 0)
7834 {
7835 Lisp_Object eol_type;
7836
7837 eol_type = CODING_ID_EOL_TYPE (coding->id);
7838 if (VECTORP (eol_type))
7839 {
7840 if (coding->eol_seen != EOL_SEEN_NONE)
7841 eol_type = adjust_coding_eol_type (coding, coding->eol_seen);
7842 }
7843 if (EQ (eol_type, Qmac))
7844 {
7845 unsigned char *src_end = GAP_END_ADDR;
7846 unsigned char *src = src_end - coding->src_bytes;
7847
7848 while (src < src_end)
7849 {
7850 if (*src++ == '\r')
7851 src[-1] = '\n';
7852 }
7853 }
7854 else if (EQ (eol_type, Qdos))
7855 {
7856 unsigned char *src = GAP_END_ADDR;
7857 unsigned char *src_beg = src - coding->src_bytes;
7858 unsigned char *dst = src;
7859 ptrdiff_t diff;
7860
7861 while (src_beg < src)
7862 {
7863 *--dst = *--src;
7864 if (*src == '\n' && src > src_beg && src[-1] == '\r')
7865 src--;
7866 }
7867 diff = dst - src;
7868 bytes -= diff;
7869 chars -= diff;
7870 }
7871 coding->produced = bytes;
7872 coding->produced_char = chars;
7873 insert_from_gap (chars, bytes, 1);
7874 return;
7875 }
7876 }
7877 code_conversion_save (0, 0);
7547 7878
7548 coding->mode |= CODING_MODE_LAST_BLOCK; 7879 coding->mode |= CODING_MODE_LAST_BLOCK;
7549 current_buffer->text->inhibit_shrinking = 1; 7880 current_buffer->text->inhibit_shrinking = 1;
7550 decode_coding (coding); 7881 decode_coding (coding);
7551 current_buffer->text->inhibit_shrinking = 0; 7882 current_buffer->text->inhibit_shrinking = 0;
7552 7883
7553 attrs = CODING_ID_ATTRS (coding->id);
7554 if (! NILP (CODING_ATTR_POST_READ (attrs))) 7884 if (! NILP (CODING_ATTR_POST_READ (attrs)))
7555 { 7885 {
7556 ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE; 7886 ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
@@ -7724,14 +8054,8 @@ decode_coding_object (struct coding_system *coding,
7724 set_buffer_internal (XBUFFER (coding->dst_object)); 8054 set_buffer_internal (XBUFFER (coding->dst_object));
7725 if (dst_bytes < coding->produced) 8055 if (dst_bytes < coding->produced)
7726 { 8056 {
8057 eassert (coding->produced > 0);
7727 destination = xrealloc (destination, coding->produced); 8058 destination = xrealloc (destination, coding->produced);
7728 if (! destination)
7729 {
7730 record_conversion_result (coding,
7731 CODING_RESULT_INSUFFICIENT_MEM);
7732 unbind_to (count, Qnil);
7733 return;
7734 }
7735 if (BEGV < GPT && GPT < BEGV + coding->produced_char) 8059 if (BEGV < GPT && GPT < BEGV + coding->produced_char)
7736 move_gap_both (BEGV, BEGV_BYTE); 8060 move_gap_both (BEGV, BEGV_BYTE);
7737 memcpy (destination, BEGV_ADDR, coding->produced); 8061 memcpy (destination, BEGV_ADDR, coding->produced);
@@ -7999,6 +8323,50 @@ preferred_coding_system (void)
7999 return CODING_ID_NAME (id); 8323 return CODING_ID_NAME (id);
8000} 8324}
8001 8325
8326#if defined (WINDOWSNT) || defined (CYGWIN)
8327
/* Convert STR, whose bytes are taken to be UTF-16LE text, with the
   coding system Qutf_16le and return what
   code_convert_string_norecord produces.  STR must be a string
   (enforced by CHECK_STRING).  The final argument 0 presumably selects
   decoding, since to_unicode passes 1 for the opposite direction --
   confirm against code_convert_string_norecord.  */
8328Lisp_Object
8329from_unicode (Lisp_Object str)
8330{
8331 CHECK_STRING (str);
/* A unibyte string with an odd byte count cannot be made of whole
   two-byte units, so its last byte is dropped (end index -1) before
   conversion.  from_unicode_buffer passes a length of
   1 + sizeof (wchar_t) * wcslen (...), which is odd whenever
   sizeof (wchar_t) is even.  Note that `&' binds tighter than `&&',
   so the test is (!multibyte) && (SBYTES & 1).  */
8332 if (!STRING_MULTIBYTE (str) &&
8333 SBYTES (str) & 1)
8334 {
8335 str = Fsubstring (str, make_number (0), make_number (-1));
8336 }
8337
8338 return code_convert_string_norecord (str, Qutf_16le, 0);
8339}
8340
/* Convert the zero-terminated wide-character string WSTR to a Lisp
   string: build a unibyte string from WSTR's bytes and hand it to
   from_unicode.  WSTR must not be NULL, since wcslen is called on it
   unconditionally.  */
8341Lisp_Object
8342from_unicode_buffer (const wchar_t* wstr)
8343{
8344 return from_unicode (
8345 make_unibyte_string (
8346 (char*) wstr,
8347 /* we get one of the two final 0 bytes for free. */
/* The byte count covers every wide character before the terminator
   plus one byte of the terminator itself.  */
8348 1 + sizeof (wchar_t) * wcslen (wstr)));
8349}
8350
/* Convert STR with the coding system Qutf_16le (final argument 1;
   presumably "encode" -- confirm against
   code_convert_string_norecord), store in *BUF a Lisp string holding
   the converted bytes followed by one extra zero byte, and return
   WCSDATA of that string.

   The returned pointer is derived from *BUF, so presumably the caller
   must keep *BUF reachable for as long as the pointer is used --
   verify against WCSDATA and the callers.  */
8351wchar_t *
8352to_unicode (Lisp_Object str, Lisp_Object *buf)
8353{
8354 *buf = code_convert_string_norecord (str, Qutf_16le, 1);
8355 /* We need to make another copy (in addition to the one made by
8356 code_convert_string_norecord) to ensure that the final string is
8357 _doubly_ zero terminated --- that is, that the string is
8358 terminated by two zero bytes and one utf-16le null character.
8359 Because strings are already terminated with a single zero byte,
8360 we just add one additional zero. */
/* STR is reused from here on as the new, one-byte-longer string; the
   caller's original argument is no longer needed.  */
8361 str = make_uninit_string (SBYTES (*buf) + 1);
8362 memcpy (SDATA (str), SDATA (*buf), SBYTES (*buf));
8363 SDATA (str) [SBYTES (*buf)] = '\0';
8364 *buf = str;
8365 return WCSDATA (*buf);
8366}
8367
8368#endif /* WINDOWSNT || CYGWIN */
8369
8002 8370
8003#ifdef emacs 8371#ifdef emacs
8004/*** 8. Emacs Lisp library functions ***/ 8372/*** 8. Emacs Lisp library functions ***/
@@ -8416,9 +8784,6 @@ highest priority. */)
8416 ptrdiff_t from, to; 8784 ptrdiff_t from, to;
8417 ptrdiff_t from_byte, to_byte; 8785 ptrdiff_t from_byte, to_byte;
8418 8786
8419 CHECK_NUMBER_COERCE_MARKER (start);
8420 CHECK_NUMBER_COERCE_MARKER (end);
8421
8422 validate_region (&start, &end); 8787 validate_region (&start, &end);
8423 from = XINT (start), to = XINT (end); 8788 from = XINT (start), to = XINT (end);
8424 from_byte = CHAR_TO_BYTE (from); 8789 from_byte = CHAR_TO_BYTE (from);
@@ -8460,7 +8825,7 @@ highest priority. */)
8460} 8825}
8461 8826
8462 8827
8463static inline bool 8828static bool
8464char_encodable_p (int c, Lisp_Object attrs) 8829char_encodable_p (int c, Lisp_Object attrs)
8465{ 8830{
8466 Lisp_Object tail; 8831 Lisp_Object tail;
@@ -8862,8 +9227,6 @@ code_convert_region (Lisp_Object start, Lisp_Object end,
8862 ptrdiff_t from, from_byte, to, to_byte; 9227 ptrdiff_t from, from_byte, to, to_byte;
8863 Lisp_Object src_object; 9228 Lisp_Object src_object;
8864 9229
8865 CHECK_NUMBER_COERCE_MARKER (start);
8866 CHECK_NUMBER_COERCE_MARKER (end);
8867 if (NILP (coding_system)) 9230 if (NILP (coding_system))
8868 coding_system = Qno_conversion; 9231 coding_system = Qno_conversion;
8869 else 9232 else
@@ -9483,7 +9846,7 @@ make_subsidiaries (Lisp_Object base)
9483 int i; 9846 int i;
9484 9847
9485 memcpy (buf, SDATA (SYMBOL_NAME (base)), base_name_len); 9848 memcpy (buf, SDATA (SYMBOL_NAME (base)), base_name_len);
9486 subsidiaries = Fmake_vector (make_number (3), Qnil); 9849 subsidiaries = make_uninit_vector (3);
9487 for (i = 0; i < 3; i++) 9850 for (i = 0; i < 3; i++)
9488 { 9851 {
9489 strcpy (buf + base_name_len, suffixes[i]); 9852 strcpy (buf + base_name_len, suffixes[i]);
@@ -9783,7 +10146,7 @@ usage: (define-coding-system-internal ...) */)
9783 CHECK_VECTOR (initial); 10146 CHECK_VECTOR (initial);
9784 for (i = 0; i < 4; i++) 10147 for (i = 0; i < 4; i++)
9785 { 10148 {
9786 val = Faref (initial, make_number (i)); 10149 val = AREF (initial, i);
9787 if (! NILP (val)) 10150 if (! NILP (val))
9788 { 10151 {
9789 struct charset *charset; 10152 struct charset *charset;
@@ -9988,7 +10351,8 @@ usage: (define-coding-system-internal ...) */)
9988 this_name = AREF (eol_type, i); 10351 this_name = AREF (eol_type, i);
9989 this_aliases = Fcons (this_name, Qnil); 10352 this_aliases = Fcons (this_name, Qnil);
9990 this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac); 10353 this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
9991 this_spec = Fmake_vector (make_number (3), attrs); 10354 this_spec = make_uninit_vector (3);
10355 ASET (this_spec, 0, attrs);
9992 ASET (this_spec, 1, this_aliases); 10356 ASET (this_spec, 1, this_aliases);
9993 ASET (this_spec, 2, this_eol_type); 10357 ASET (this_spec, 2, this_eol_type);
9994 Fputhash (this_name, this_spec, Vcoding_system_hash_table); 10358 Fputhash (this_name, this_spec, Vcoding_system_hash_table);
@@ -10001,7 +10365,8 @@ usage: (define-coding-system-internal ...) */)
10001 } 10365 }
10002 } 10366 }
10003 10367
10004 spec_vec = Fmake_vector (make_number (3), attrs); 10368 spec_vec = make_uninit_vector (3);
10369 ASET (spec_vec, 0, attrs);
10005 ASET (spec_vec, 1, aliases); 10370 ASET (spec_vec, 1, aliases);
10006 ASET (spec_vec, 2, eol_type); 10371 ASET (spec_vec, 2, eol_type);
10007 10372
@@ -10298,6 +10663,7 @@ syms_of_coding (void)
10298 DEFSYM (Qeol_type, "eol-type"); 10663 DEFSYM (Qeol_type, "eol-type");
10299 DEFSYM (Qunix, "unix"); 10664 DEFSYM (Qunix, "unix");
10300 DEFSYM (Qdos, "dos"); 10665 DEFSYM (Qdos, "dos");
10666 DEFSYM (Qmac, "mac");
10301 10667
10302 DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system"); 10668 DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
10303 DEFSYM (Qpost_read_conversion, "post-read-conversion"); 10669 DEFSYM (Qpost_read_conversion, "post-read-conversion");
@@ -10312,6 +10678,11 @@ syms_of_coding (void)
10312 DEFSYM (Qutf_8, "utf-8"); 10678 DEFSYM (Qutf_8, "utf-8");
10313 DEFSYM (Qutf_8_emacs, "utf-8-emacs"); 10679 DEFSYM (Qutf_8_emacs, "utf-8-emacs");
10314 10680
10681#if defined (WINDOWSNT) || defined (CYGWIN)
10682 /* No, not utf-16-le: that one has a BOM. */
10683 DEFSYM (Qutf_16le, "utf-16le");
10684#endif
10685
10315 DEFSYM (Qutf_16, "utf-16"); 10686 DEFSYM (Qutf_16, "utf-16");
10316 DEFSYM (Qbig, "big"); 10687 DEFSYM (Qbig, "big");
10317 DEFSYM (Qlittle, "little"); 10688 DEFSYM (Qlittle, "little");
@@ -10400,10 +10771,8 @@ syms_of_coding (void)
10400 intern_c_string ("coding-category-undecided")); 10771 intern_c_string ("coding-category-undecided"));
10401 10772
10402 DEFSYM (Qinsufficient_source, "insufficient-source"); 10773 DEFSYM (Qinsufficient_source, "insufficient-source");
10403 DEFSYM (Qinconsistent_eol, "inconsistent-eol");
10404 DEFSYM (Qinvalid_source, "invalid-source"); 10774 DEFSYM (Qinvalid_source, "invalid-source");
10405 DEFSYM (Qinterrupted, "interrupted"); 10775 DEFSYM (Qinterrupted, "interrupted");
10406 DEFSYM (Qinsufficient_memory, "insufficient-memory");
10407 DEFSYM (Qcoding_system_define_form, "coding-system-define-form"); 10776 DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
10408 10777
10409 defsubr (&Scoding_system_p); 10778 defsubr (&Scoding_system_p);
@@ -10704,7 +11073,7 @@ reading if you suppress escape sequence detection.
10704 11073
10705The other way to read escape sequences in a file without decoding is 11074The other way to read escape sequences in a file without decoding is
10706to explicitly specify some coding system that doesn't use ISO-2022 11075to explicitly specify some coding system that doesn't use ISO-2022
10707escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument]. */); 11076escape sequence (e.g., `latin-1') on reading by \\[universal-coding-system-argument]. */);
10708 inhibit_iso_escape_detection = 0; 11077 inhibit_iso_escape_detection = 0;
10709 11078
10710 DEFVAR_BOOL ("inhibit-null-byte-detection", 11079 DEFVAR_BOOL ("inhibit-null-byte-detection",
@@ -10720,6 +11089,11 @@ from GNU Find and GNU Grep. Emacs will then ignore the null bytes and
10720decode text as usual. */); 11089decode text as usual. */);
10721 inhibit_null_byte_detection = 0; 11090 inhibit_null_byte_detection = 0;
10722 11091
11092 DEFVAR_BOOL ("disable-ascii-optimization", disable_ascii_optimization,
11093 doc: /* If non-nil, Emacs does not optimize code decoder for ASCII files.
11094Internal use only. Removed after the experimental optimizer gets stable. */);
11095 disable_ascii_optimization = 0;
11096
10723 DEFVAR_LISP ("translation-table-for-input", Vtranslation_table_for_input, 11097 DEFVAR_LISP ("translation-table-for-input", Vtranslation_table_for_input,
10724 doc: /* Char table for translating self-inserting characters. 11098 doc: /* Char table for translating self-inserting characters.
10725This is applied to the result of input methods, not their input. 11099This is applied to the result of input methods, not their input.