aboutsummaryrefslogtreecommitdiffstats
path: root/src/coding.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/coding.c')
-rw-r--r--src/coding.c118
1 files changed, 60 insertions, 58 deletions
diff --git a/src/coding.c b/src/coding.c
index bcc603a2c63..5a182792a0e 100644
--- a/src/coding.c
+++ b/src/coding.c
@@ -23,7 +23,7 @@ Boston, MA 02111-1307, USA. */
23/*** TABLE OF CONTENTS *** 23/*** TABLE OF CONTENTS ***
24 24
25 1. Preamble 25 1. Preamble
26 2. Emacs' internal format handlers 26 2. Emacs' internal format (emacs-mule) handlers
27 3. ISO2022 handlers 27 3. ISO2022 handlers
28 4. Shift-JIS and BIG5 handlers 28 4. Shift-JIS and BIG5 handlers
29 5. End-of-line handlers 29 5. End-of-line handlers
@@ -38,10 +38,11 @@ Boston, MA 02111-1307, USA. */
38 Coding system is an encoding mechanism of one or more character 38 Coding system is an encoding mechanism of one or more character
39 sets. Here's a list of coding systems which Emacs can handle. When 39 sets. Here's a list of coding systems which Emacs can handle. When
40 we say "decode", it means converting some other coding system to 40 we say "decode", it means converting some other coding system to
41 Emacs' internal format, and when we say "encode", it means 41 Emacs' internal format (emacs-mule), and when we say "encode",
42 converting Emacs' internal format to some other coding system. 42 it means converting the coding system emacs-mule to some other
43 coding system.
43 44
44 0. Emacs' internal format 45 0. Emacs' internal format (emacs-mule)
45 46
46 Emacs itself holds a multi-lingual character in a buffer and a string 47 Emacs itself holds a multi-lingual character in a buffer and a string
47 in a special format. Details are described in the section 2. 48 in a special format. Details are described in the section 2.
@@ -106,7 +107,7 @@ Boston, MA 02111-1307, USA. */
106 template of these functions. */ 107 template of these functions. */
107#if 0 108#if 0
108int 109int
109detect_coding_internal (src, src_end) 110detect_coding_emacs_mule (src, src_end)
110 unsigned char *src, *src_end; 111 unsigned char *src, *src_end;
111{ 112{
112 ... 113 ...
@@ -116,11 +117,11 @@ detect_coding_internal (src, src_end)
116/*** GENERAL NOTES on `decode_coding_XXX ()' functions *** 117/*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
117 118
118 These functions decode SRC_BYTES length text at SOURCE encoded in 119 These functions decode SRC_BYTES length text at SOURCE encoded in
119 CODING to Emacs' internal format. The resulting text goes to a 120 CODING to Emacs' internal format (emacs-mule). The resulting text
120 place pointed by DESTINATION, the length of which should not exceed 121 goes to a place pointed by DESTINATION, the length of which should
121 DST_BYTES. The bytes actually processed is returned as *CONSUMED. 122 not exceed DST_BYTES. The bytes actually processed is returned as
122 The return value is the length of the decoded text. Below is a 123 *CONSUMED. The return value is the length of the decoded text.
123 template of these functions. */ 124 Below is a template of these functions. */
124#if 0 125#if 0
125decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed) 126decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed)
126 struct coding_system *coding; 127 struct coding_system *coding;
@@ -134,12 +135,12 @@ decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed)
134 135
135/*** GENERAL NOTES on `encode_coding_XXX ()' functions *** 136/*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
136 137
137 These functions encode SRC_BYTES length text at SOURCE of Emacs 138 These functions encode SRC_BYTES length text at SOURCE of Emacs'
138 internal format to CODING. The resulting text goes to a place 139 internal format (emacs-mule) to CODING. The resulting text goes to
139 pointed by DESTINATION, the length of which should not exceed 140 a place pointed by DESTINATION, the length of which should not
140 DST_BYTES. The bytes actually processed is returned as *CONSUMED. 141 exceed DST_BYTES. The bytes actually processed is returned as
141 The return value is the length of the encoded text. Below is a 142 *CONSUMED. The return value is the length of the encoded text.
142 template of these functions. */ 143 Below is a template of these functions. */
143#if 0 144#if 0
144encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed) 145encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed)
145 struct coding_system *coding; 146 struct coding_system *coding;
@@ -293,7 +294,7 @@ Lisp_Object coding_category_table[CODING_CATEGORY_IDX_MAX];
293 294
294/* Table of names of symbol for each coding-category. */ 295/* Table of names of symbol for each coding-category. */
295char *coding_category_name[CODING_CATEGORY_IDX_MAX] = { 296char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
296 "coding-category-internal", 297 "coding-category-emacs-mule",
297 "coding-category-sjis", 298 "coding-category-sjis",
298 "coding-category-iso-7", 299 "coding-category-iso-7",
299 "coding-category-iso-8-1", 300 "coding-category-iso-8-1",
@@ -317,7 +318,7 @@ Lisp_Object Qcharacter_unification_table;
317Lisp_Object Vcharset_revision_alist; 318Lisp_Object Vcharset_revision_alist;
318 319
319 320
320/*** 2. Emacs internal format handlers ***/ 321/*** 2. Emacs internal format (emacs-mule) handlers ***/
321 322
322/* Emacs' internal format for encoding multiple character sets is a 323/* Emacs' internal format for encoding multiple character sets is a
323 kind of multi-byte encoding, i.e. encoding a character by a sequence 324 kind of multi-byte encoding, i.e. encoding a character by a sequence
@@ -364,10 +365,10 @@ enum emacs_code_class_type emacs_code_class[256];
364 365
365/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions". 366/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
366 Check if a text is encoded in Emacs' internal format. If it is, 367 Check if a text is encoded in Emacs' internal format. If it is,
367 return CODING_CATEGORY_MASK_INTERNAL, else return 0. */ 368 return CODING_CATEGORY_MASK_EMACS_MULE, else return 0. */
368 369
369int 370int
370detect_coding_internal (src, src_end) 371detect_coding_emacs_mule (src, src_end)
371 unsigned char *src, *src_end; 372 unsigned char *src, *src_end;
372{ 373{
373 unsigned char c; 374 unsigned char c;
@@ -423,7 +424,7 @@ detect_coding_internal (src, src_end)
423 break; 424 break;
424 } 425 }
425 } 426 }
426 return CODING_CATEGORY_MASK_INTERNAL; 427 return CODING_CATEGORY_MASK_EMACS_MULE;
427} 428}
428 429
429 430
@@ -1457,7 +1458,7 @@ encode_coding_iso2022 (coding, source, destination,
1457 coding->spec.iso2022.current_designation, 1458 coding->spec.iso2022.current_designation,
1458 sizeof coding->spec.iso2022.initial_designation); 1459 sizeof coding->spec.iso2022.initial_designation);
1459 if (coding->eol_type == CODING_EOL_LF 1460 if (coding->eol_type == CODING_EOL_LF
1460 || coding->eol_type == CODING_EOL_AUTOMATIC) 1461 || coding->eol_type == CODING_EOL_UNDECIDED)
1461 *dst++ = ISO_CODE_LF; 1462 *dst++ = ISO_CODE_LF;
1462 else if (coding->eol_type == CODING_EOL_CRLF) 1463 else if (coding->eol_type == CODING_EOL_CRLF)
1463 *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF; 1464 *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
@@ -1814,7 +1815,7 @@ encode_coding_sjis_big5 (coding, source, destination,
1814 1815
1815 case EMACS_linefeed_code: 1816 case EMACS_linefeed_code:
1816 if (coding->eol_type == CODING_EOL_LF 1817 if (coding->eol_type == CODING_EOL_LF
1817 || coding->eol_type == CODING_EOL_AUTOMATIC) 1818 || coding->eol_type == CODING_EOL_UNDECIDED)
1818 *dst++ = '\n'; 1819 *dst++ = '\n';
1819 else if (coding->eol_type == CODING_EOL_CRLF) 1820 else if (coding->eol_type == CODING_EOL_CRLF)
1820 *dst++ = '\r', *dst++ = '\n'; 1821 *dst++ = '\r', *dst++ = '\n';
@@ -1970,7 +1971,7 @@ encode_eol (coding, source, destination, src_bytes, dst_bytes, consumed)
1970 switch (coding->eol_type) 1971 switch (coding->eol_type)
1971 { 1972 {
1972 case CODING_EOL_LF: 1973 case CODING_EOL_LF:
1973 case CODING_EOL_AUTOMATIC: 1974 case CODING_EOL_UNDECIDED:
1974 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes; 1975 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
1975 bcopy (source, destination, produced); 1976 bcopy (source, destination, produced);
1976 if (coding->selective) 1977 if (coding->selective)
@@ -2036,13 +2037,14 @@ encode_eol (coding, source, destination, src_bytes, dst_bytes, consumed)
2036 `element[0]' contains information to be set in `coding->type'. The 2037 `element[0]' contains information to be set in `coding->type'. The
2037 value and its meaning is as follows: 2038 value and its meaning is as follows:
2038 2039
2039 0 -- coding_system_internal 2040 0 -- coding_type_emacs_mule
2040 1 -- coding_system_sjis 2041 1 -- coding_type_sjis
2041 2 -- coding_system_iso2022 2042 2 -- coding_type_iso2022
2042 3 -- coding_system_big5 2043 3 -- coding_type_big5
2043 4 -- coding_system_ccl 2044 4 -- coding_type_ccl encoder/decoder written in CCL
2044 nil -- coding_system_no_conversion 2045 nil -- coding_type_no_conversion
2045 t -- coding_system_automatic 2046 t -- coding_type_undecided (automatic conversion on decoding,
2047 no-conversion on encoding)
2046 2048
2047 `element[4]' contains information to be set in `coding->flags' and 2049 `element[4]' contains information to be set in `coding->flags' and
2048 `coding->spec'. The meaning varies by `coding->type'. 2050 `coding->spec'. The meaning varies by `coding->type'.
@@ -2127,7 +2129,7 @@ setup_coding_system (coding_system, coding)
2127 goto label_invalid_coding_system; 2129 goto label_invalid_coding_system;
2128 2130
2129 if (VECTORP (eol_type)) 2131 if (VECTORP (eol_type))
2130 coding->eol_type = CODING_EOL_AUTOMATIC; 2132 coding->eol_type = CODING_EOL_UNDECIDED;
2131 else if (XFASTINT (eol_type) == 1) 2133 else if (XFASTINT (eol_type) == 1)
2132 coding->eol_type = CODING_EOL_CRLF; 2134 coding->eol_type = CODING_EOL_CRLF;
2133 else if (XFASTINT (eol_type) == 2) 2135 else if (XFASTINT (eol_type) == 2)
@@ -2139,7 +2141,7 @@ setup_coding_system (coding_system, coding)
2139 switch (XFASTINT (type)) 2141 switch (XFASTINT (type))
2140 { 2142 {
2141 case 0: 2143 case 0:
2142 coding->type = coding_type_internal; 2144 coding->type = coding_type_emacs_mule;
2143 break; 2145 break;
2144 2146
2145 case 1: 2147 case 1:
@@ -2309,7 +2311,7 @@ setup_coding_system (coding_system, coding)
2309 2311
2310 default: 2312 default:
2311 if (EQ (type, Qt)) 2313 if (EQ (type, Qt))
2312 coding->type = coding_type_automatic; 2314 coding->type = coding_type_undecided;
2313 else 2315 else
2314 coding->type = coding_type_no_conversion; 2316 coding->type = coding_type_no_conversion;
2315 break; 2317 break;
@@ -2330,11 +2332,11 @@ setup_coding_system (coding_system, coding)
2330 because they use the same range of codes. So, at first, coding 2332 because they use the same range of codes. So, at first, coding
2331 systems are categorized into 7, those are: 2333 systems are categorized into 7, those are:
2332 2334
2333 o coding-category-internal 2335 o coding-category-emacs-mule
2334 2336
2335 The category for a coding system which has the same code range 2337 The category for a coding system which has the same code range
2336 as Emacs' internal format. Assigned the coding-system (Lisp 2338 as Emacs' internal format. Assigned the coding-system (Lisp
2337 symbol) `internal' by default. 2339 symbol) `emacs-mule' by default.
2338 2340
2339 o coding-category-sjis 2341 o coding-category-sjis
2340 2342
@@ -2439,13 +2441,13 @@ detect_coding_mask (src, src_bytes)
2439 or a leading code of Emacs. */ 2441 or a leading code of Emacs. */
2440 mask = (detect_coding_iso2022 (src, src_end) 2442 mask = (detect_coding_iso2022 (src, src_end)
2441 | detect_coding_sjis (src, src_end) 2443 | detect_coding_sjis (src, src_end)
2442 | detect_coding_internal (src, src_end)); 2444 | detect_coding_emacs_mule (src, src_end));
2443 2445
2444 else if (c < 0xA0) 2446 else if (c < 0xA0)
2445 /* C is the first byte of SJIS character code, 2447 /* C is the first byte of SJIS character code,
2446 or a leading-code of Emacs. */ 2448 or a leading-code of Emacs. */
2447 mask = (detect_coding_sjis (src, src_end) 2449 mask = (detect_coding_sjis (src, src_end)
2448 | detect_coding_internal (src, src_end)); 2450 | detect_coding_emacs_mule (src, src_end));
2449 2451
2450 else 2452 else
2451 /* C is a character of ISO2022 in graphic plane right, 2453 /* C is a character of ISO2022 in graphic plane right,
@@ -2511,7 +2513,7 @@ detect_coding (coding, src, src_bytes)
2511 2513
2512/* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC 2514/* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
2513 is encoded. Return one of CODING_EOL_LF, CODING_EOL_CRLF, 2515 is encoded. Return one of CODING_EOL_LF, CODING_EOL_CRLF,
2514 CODING_EOL_CR, and CODING_EOL_AUTOMATIC. */ 2516 CODING_EOL_CR, and CODING_EOL_UNDECIDED. */
2515 2517
2516int 2518int
2517detect_eol_type (src, src_bytes) 2519detect_eol_type (src, src_bytes)
@@ -2534,7 +2536,7 @@ detect_eol_type (src, src_bytes)
2534 return CODING_EOL_CR; 2536 return CODING_EOL_CR;
2535 } 2537 }
2536 } 2538 }
2537 return CODING_EOL_AUTOMATIC; 2539 return CODING_EOL_UNDECIDED;
2538} 2540}
2539 2541
2540/* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC 2542/* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
@@ -2550,7 +2552,7 @@ detect_eol (coding, src, src_bytes)
2550 Lisp_Object val; 2552 Lisp_Object val;
2551 int eol_type = detect_eol_type (src, src_bytes); 2553 int eol_type = detect_eol_type (src, src_bytes);
2552 2554
2553 if (eol_type == CODING_EOL_AUTOMATIC) 2555 if (eol_type == CODING_EOL_UNDECIDED)
2554 /* We found no end-of-line in the source text. */ 2556 /* We found no end-of-line in the source text. */
2555 return; 2557 return;
2556 2558
@@ -2578,10 +2580,10 @@ decode_coding (coding, source, destination, src_bytes, dst_bytes, consumed)
2578 return 0; 2580 return 0;
2579 } 2581 }
2580 2582
2581 if (coding->type == coding_type_automatic) 2583 if (coding->type == coding_type_undecided)
2582 detect_coding (coding, source, src_bytes); 2584 detect_coding (coding, source, src_bytes);
2583 2585
2584 if (coding->eol_type == CODING_EOL_AUTOMATIC) 2586 if (coding->eol_type == CODING_EOL_UNDECIDED)
2585 detect_eol (coding, source, src_bytes); 2587 detect_eol (coding, source, src_bytes);
2586 2588
2587 coding->carryover_size = 0; 2589 coding->carryover_size = 0;
@@ -2594,10 +2596,10 @@ decode_coding (coding, source, destination, src_bytes, dst_bytes, consumed)
2594 *consumed = produced; 2596 *consumed = produced;
2595 break; 2597 break;
2596 2598
2597 case coding_type_internal: 2599 case coding_type_emacs_mule:
2598 case coding_type_automatic: 2600 case coding_type_undecided:
2599 if (coding->eol_type == CODING_EOL_LF 2601 if (coding->eol_type == CODING_EOL_LF
2600 || coding->eol_type == CODING_EOL_AUTOMATIC) 2602 || coding->eol_type == CODING_EOL_UNDECIDED)
2601 goto label_no_conversion; 2603 goto label_no_conversion;
2602 produced = decode_eol (coding, source, destination, 2604 produced = decode_eol (coding, source, destination,
2603 src_bytes, dst_bytes, consumed); 2605 src_bytes, dst_bytes, consumed);
@@ -2659,10 +2661,10 @@ encode_coding (coding, source, destination, src_bytes, dst_bytes, consumed)
2659 *consumed = produced; 2661 *consumed = produced;
2660 break; 2662 break;
2661 2663
2662 case coding_type_internal: 2664 case coding_type_emacs_mule:
2663 case coding_type_automatic: 2665 case coding_type_undecided:
2664 if (coding->eol_type == CODING_EOL_LF 2666 if (coding->eol_type == CODING_EOL_LF
2665 || coding->eol_type == CODING_EOL_AUTOMATIC) 2667 || coding->eol_type == CODING_EOL_UNDECIDED)
2666 goto label_no_conversion; 2668 goto label_no_conversion;
2667 produced = encode_eol (coding, source, destination, 2669 produced = encode_eol (coding, source, destination,
2668 src_bytes, dst_bytes, consumed); 2670 src_bytes, dst_bytes, consumed);
@@ -2835,7 +2837,7 @@ DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
2835 2, 2, 0, 2837 2, 2, 0,
2836 "Detect coding-system of the text in the region between START and END.\n\ 2838 "Detect coding-system of the text in the region between START and END.\n\
2837Return a list of possible coding-systems ordered by priority.\n\ 2839Return a list of possible coding-systems ordered by priority.\n\
2838If only ASCII characters are found, it returns `automatic-conversion'\n\ 2840If only ASCII characters are found, it returns `undecided'\n\
2839 or its subsidiary coding-system according to a detected end-of-line format.") 2841 or its subsidiary coding-system according to a detected end-of-line format.")
2840 (b, e) 2842 (b, e)
2841 Lisp_Object b, e; 2843 Lisp_Object b, e;
@@ -2853,8 +2855,8 @@ If only ASCII characters are found, it returns `automatic-conversion'\n\
2853 2855
2854 if (coding_mask == CODING_CATEGORY_MASK_ANY) 2856 if (coding_mask == CODING_CATEGORY_MASK_ANY)
2855 { 2857 {
2856 val = intern ("automatic-conversion"); 2858 val = intern ("undecided");
2857 if (eol_type != CODING_EOL_AUTOMATIC) 2859 if (eol_type != CODING_EOL_UNDECIDED)
2858 { 2860 {
2859 Lisp_Object val2 = Fget (val, Qeol_type); 2861 Lisp_Object val2 = Fget (val, Qeol_type);
2860 if (VECTORP (val2)) 2862 if (VECTORP (val2))
@@ -2884,7 +2886,7 @@ If only ASCII characters are found, it returns `automatic-conversion'\n\
2884 val = Qnil; 2886 val = Qnil;
2885 for (; !NILP (val2); val2 = XCONS (val2)->cdr) 2887 for (; !NILP (val2); val2 = XCONS (val2)->cdr)
2886 { 2888 {
2887 if (eol_type == CODING_EOL_AUTOMATIC) 2889 if (eol_type == CODING_EOL_UNDECIDED)
2888 val = Fcons (XCONS (val2)->car, val); 2890 val = Fcons (XCONS (val2)->car, val);
2889 else 2891 else
2890 { 2892 {
@@ -2914,7 +2916,7 @@ shrink_conversion_area (begp, endp, coding, encodep)
2914 register unsigned char *beg_addr = *begp, *end_addr = *endp; 2916 register unsigned char *beg_addr = *begp, *end_addr = *endp;
2915 2917
2916 if (coding->eol_type != CODING_EOL_LF 2918 if (coding->eol_type != CODING_EOL_LF
2917 && coding->eol_type != CODING_EOL_AUTOMATIC) 2919 && coding->eol_type != CODING_EOL_UNDECIDED)
2918 /* Since we anyway have to convert end-of-line format, it is not 2920 /* Since we anyway have to convert end-of-line format, it is not
2919 worth skipping at most 100 bytes or so. */ 2921 worth skipping at most 100 bytes or so. */
2920 return; 2922 return;
@@ -2924,8 +2926,8 @@ shrink_conversion_area (begp, endp, coding, encodep)
2924 switch (coding->type) 2926 switch (coding->type)
2925 { 2927 {
2926 case coding_type_no_conversion: 2928 case coding_type_no_conversion:
2927 case coding_type_internal: 2929 case coding_type_emacs_mule:
2928 case coding_type_automatic: 2930 case coding_type_undecided:
2929 /* We need no conversion. */ 2931 /* We need no conversion. */
2930 *begp = *endp; 2932 *begp = *endp;
2931 return; 2933 return;
@@ -2962,7 +2964,7 @@ shrink_conversion_area (begp, endp, coding, encodep)
2962 /* We need no conversion. */ 2964 /* We need no conversion. */
2963 *begp = *endp; 2965 *begp = *endp;
2964 return; 2966 return;
2965 case coding_type_internal: 2967 case coding_type_emacs_mule:
2966 if (coding->eol_type == CODING_EOL_LF) 2968 if (coding->eol_type == CODING_EOL_LF)
2967 { 2969 {
2968 /* We need no conversion. */ 2970 /* We need no conversion. */
@@ -3461,7 +3463,7 @@ init_coding_once ()
3461{ 3463{
3462 int i; 3464 int i;
3463 3465
3464 /* Emacs internal format specific initialize routine. */ 3466 /* Emacs' internal format specific initialize routine. */
3465 for (i = 0; i <= 0x20; i++) 3467 for (i = 0; i <= 0x20; i++)
3466 emacs_code_class[i] = EMACS_control_code; 3468 emacs_code_class[i] = EMACS_control_code;
3467 emacs_code_class[0x0A] = EMACS_linefeed_code; 3469 emacs_code_class[0x0A] = EMACS_linefeed_code;