aboutsummaryrefslogtreecommitdiffstats
path: root/src/coding.c
diff options
context:
space:
mode:
authorKenichi Handa1997-05-10 03:37:01 +0000
committerKenichi Handa1997-05-10 03:37:01 +0000
commitbdd9fb4867851e75fd60a4fe0100dc719a09c049 (patch)
treef2e9db7c535ba7cc954a134263ad74ca786d0ed6 /src/coding.c
parentceb5851081f93fe19e70072bb971cbd868f17e6f (diff)
downloademacs-bdd9fb4867851e75fd60a4fe0100dc719a09c049.tar.gz
emacs-bdd9fb4867851e75fd60a4fe0100dc719a09c049.zip
(Valternate_charset_table): The variable deleted.
(Venable_character_unification, Vstandard_character_unification_table_for_read, Vstandard_character_unification_table_for_write, Qcharacter_unification_table): New variables. (syms_of_coding): Initialize and declare them. (DECODE_ISO_CHARACTER): Modified to handle a character unification table instead of Valternate_charset_table. (DECODE_DESIGNATION): Delete handling of Valternate_charset_table. (decode_coding_iso2022): Handle a character unification table. (ENCODE_ISO_CHARACTER): New macro. (encode_designation_at_bol): Handle a character unification table. Do not return -1 even if end-of-line is not in the current run. (encode_coding_iso2022): Handle a character unification table. Call macro ENCODE_ISO_CHARACTER instead of calling ENCODE_ISO_CHARACTER_DIMENSION1 and ENCODE_ISO_CHARACTER_DIMENSION2 directly. Check the size of carryover before copying it to destination. (setup_coding_system): Initialize the member character_unification_table of the struct coding system to Qnil. (Fset_keyboard_coding_system): Doc string augmented.
Diffstat (limited to 'src/coding.c')
-rw-r--r--src/coding.c247
1 files changed, 152 insertions, 95 deletions
diff --git a/src/coding.c b/src/coding.c
index faae05d9136..bcc603a2c63 100644
--- a/src/coding.c
+++ b/src/coding.c
@@ -303,8 +303,15 @@ char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
303 "coding-category-binary" 303 "coding-category-binary"
304}; 304};
305 305
306/* Alist of charsets vs the alternate charsets. */ 306/* Flag to tell if we look up unification table on character code
307Lisp_Object Valternate_charset_table; 307 conversion. */
308Lisp_Object Venable_character_unification;
309/* Standard unification table to look up on reading (decoding). */
310Lisp_Object Vstandard_character_unification_table_for_read;
311/* Standard unification table to look up on writing (encoding). */
312Lisp_Object Vstandard_character_unification_table_for_write;
313
314Lisp_Object Qcharacter_unification_table;
308 315
309/* Alist of charsets vs revision number. */ 316/* Alist of charsets vs revision number. */
310Lisp_Object Vcharset_revision_alist; 317Lisp_Object Vcharset_revision_alist;
@@ -650,44 +657,46 @@ detect_coding_iso2022 (src, src_end)
650} 657}
651 658
652/* Decode a character of which charset is CHARSET and the 1st position 659/* Decode a character of which charset is CHARSET and the 1st position
653 code is C1. If dimension of CHARSET 2, the 2nd position code is 660 code is C1. If dimension of CHARSET is 2, the 2nd position code is
654 fetched from SRC and set to C2. If CHARSET is negative, it means 661 fetched from SRC and set to C2. If CHARSET is negative, it means
655 that we are decoding ill formed text, and what we can do is just to 662 that we are decoding ill formed text, and what we can do is just to
656 read C1 as is. */ 663 read C1 as is. */
657 664
658#define DECODE_ISO_CHARACTER(charset, c1) \ 665#define DECODE_ISO_CHARACTER(charset, c1) \
659 do { \ 666 do { \
660 if ((charset) >= 0 && CHARSET_DIMENSION (charset) == 2) \ 667 int c_alt, charset_alt = (charset); \
661 ONE_MORE_BYTE (c2); \ 668 if (COMPOSING_HEAD_P (coding->composing)) \
662 if (COMPOSING_HEAD_P (coding->composing)) \ 669 { \
663 { \ 670 *dst++ = LEADING_CODE_COMPOSITION; \
664 *dst++ = LEADING_CODE_COMPOSITION; \ 671 if (COMPOSING_WITH_RULE_P (coding->composing)) \
665 if (COMPOSING_WITH_RULE_P (coding->composing)) \ 672 /* To tell composition rules are embeded. */ \
666 /* To tell composition rules are embeded. */ \ 673 *dst++ = 0xFF; \
667 *dst++ = 0xFF; \ 674 coding->composing += 2; \
668 coding->composing += 2; \ 675 } \
669 } \ 676 if ((charset) >= 0) \
670 if ((charset) < 0) \ 677 { \
671 *dst++ = c1; \ 678 if (CHARSET_DIMENSION (charset) == 2) \
672 else if ((charset) == CHARSET_ASCII) \ 679 ONE_MORE_BYTE (c2); \
673 DECODE_CHARACTER_ASCII (c1); \ 680 if (!NILP (unification_table) \
674 else if (CHARSET_DIMENSION (charset) == 1) \ 681 && ((c_alt = unify_char (unification_table, \
675 DECODE_CHARACTER_DIMENSION1 (charset, c1); \ 682 -1, (charset), c1, c2)) >= 0)) \
676 else \ 683 SPLIT_CHAR (c_alt, charset_alt, c1, c2); \
677 DECODE_CHARACTER_DIMENSION2 (charset, c1, c2); \ 684 } \
678 if (COMPOSING_WITH_RULE_P (coding->composing)) \ 685 if (charset_alt == CHARSET_ASCII || charset_alt < 0) \
679 /* To tell a composition rule follows. */ \ 686 DECODE_CHARACTER_ASCII (c1); \
680 coding->composing = COMPOSING_WITH_RULE_RULE; \ 687 else if (CHARSET_DIMENSION (charset_alt) == 1) \
688 DECODE_CHARACTER_DIMENSION1 (charset_alt, c1); \
689 else \
690 DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2); \
691 if (COMPOSING_WITH_RULE_P (coding->composing)) \
692 /* To tell a composition rule follows. */ \
693 coding->composing = COMPOSING_WITH_RULE_RULE; \
681 } while (0) 694 } while (0)
682 695
683/* Set designation state into CODING. */ 696/* Set designation state into CODING. */
684#define DECODE_DESIGNATION(reg, dimension, chars, final_char) \ 697#define DECODE_DESIGNATION(reg, dimension, chars, final_char) \
685 do { \ 698 do { \
686 int charset = ISO_CHARSET_TABLE (dimension, chars, final_char); \ 699 int charset = ISO_CHARSET_TABLE (dimension, chars, final_char); \
687 Lisp_Object temp \
688 = Fassq (CHARSET_SYMBOL (charset), Valternate_charset_table); \
689 if (! NILP (temp)) \
690 charset = get_charset_id (XCONS (temp)->cdr); \
691 if (charset >= 0) \ 700 if (charset >= 0) \
692 { \ 701 { \
693 if (coding->direction == 1 \ 702 if (coding->direction == 1 \
@@ -719,6 +728,10 @@ decode_coding_iso2022 (coding, source, destination,
719 /* Charsets invoked to graphic plane 0 and 1 respectively. */ 728 /* Charsets invoked to graphic plane 0 and 1 respectively. */
720 int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0); 729 int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
721 int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1); 730 int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
731 Lisp_Object unification_table = coding->character_unification_table;
732
733 if (!NILP (Venable_character_unification) && NILP (unification_table))
734 unification_table = Vstandard_character_unification_table_for_read;
722 735
723 while (src < src_end && dst < adjusted_dst_end) 736 while (src < src_end && dst < adjusted_dst_end)
724 { 737 {
@@ -728,7 +741,7 @@ decode_coding_iso2022 (coding, source, destination,
728 ONE_MORE_BYTE or TWO_MORE_BYTES). In that case, SRC is reset 741 ONE_MORE_BYTE or TWO_MORE_BYTES). In that case, SRC is reset
729 to SRC_BASE before exiting. */ 742 to SRC_BASE before exiting. */
730 unsigned char *src_base = src; 743 unsigned char *src_base = src;
731 unsigned char c1 = *src++, c2, cmprule; 744 int c1 = *src++, c2;
732 745
733 switch (iso_code_class [c1]) 746 switch (iso_code_class [c1])
734 { 747 {
@@ -1167,6 +1180,21 @@ decode_coding_iso2022 (coding, source, destination,
1167 dst = encode_invocation_designation (charset, coding, dst); \ 1180 dst = encode_invocation_designation (charset, coding, dst); \
1168 } while (1) 1181 } while (1)
1169 1182
/* Encode the character given by CHARSET and position codes C1, C2 by
   dispatching to ENCODE_ISO_CHARACTER_DIMENSION1 or
   ENCODE_ISO_CHARACTER_DIMENSION2 according to the dimension of the
   charset finally chosen.

   The macro refers to a variable `unification_table' that must be in
   scope at the point of use.  When that table is non-nil, the character
   is first looked up with unify_char.  A non-negative result is taken
   as the replacement character and handed to SPLIT_CHAR together with
   charset_alt, C1 and C2, so C1 and C2 must be lvalues.  A negative
   result, or a nil table, leaves CHARSET, C1 and C2 as given.

   Callers pass a dummy for C2 when the character has one position
   code.

   The comparison against zero must be `>= 0': testing `< 0' would
   split the negative "no translation" value and would skip every real
   translation.  */
#define ENCODE_ISO_CHARACTER(charset, c1, c2) \
  do { \
    int c_alt, charset_alt; \
    if (!NILP (unification_table) \
        && ((c_alt = unify_char (unification_table, -1, (charset), c1, c2)) \
            >= 0)) \
      SPLIT_CHAR (c_alt, charset_alt, c1, c2); \
    else \
      charset_alt = (charset); \
    if (CHARSET_DIMENSION (charset_alt) == 1) \
      ENCODE_ISO_CHARACTER_DIMENSION1 (charset_alt, c1); \
    else \
      ENCODE_ISO_CHARACTER_DIMENSION2 (charset_alt, c1, c2); \
  } while (0)
1197
1170/* Produce designation and invocation codes at a place pointed by DST 1198/* Produce designation and invocation codes at a place pointed by DST
1171 to use CHARSET. The element `spec.iso2022' of *CODING is updated. 1199 to use CHARSET. The element `spec.iso2022' of *CODING is updated.
1172 Return new DST. */ 1200 Return new DST. */
@@ -1266,48 +1294,57 @@ encode_invocation_designation (charset, coding, dst)
1266 (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg), reg, coding); \ 1294 (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg), reg, coding); \
1267 } while (0) 1295 } while (0)
1268 1296
1269int 1297/* Produce designation sequences of charsets in the line started from
1270encode_designation_at_bol (coding, src, src_end, dstp) 1298 *SRC to a place pointed by DSTP.
1299
1300 If the current block ends before any end-of-line, we may fail to
1301 find all the necessary *designations. */
1302encode_designation_at_bol (coding, table, src, src_end, dstp)
1271 struct coding_system *coding; 1303 struct coding_system *coding;
1304 Lisp_Object table;
1272 unsigned char *src, *src_end, **dstp; 1305 unsigned char *src, *src_end, **dstp;
1273{ 1306{
1274 int charset, reg, r[4]; 1307 int charset, c, found = 0, reg;
1275 unsigned char *dst = *dstp, c; 1308 /* Table of charsets to be designated to each graphic register. */
1276 for (reg = 0; reg < 4; reg++) r[reg] = -1; 1309 int r[4];
1277 while (src < src_end && (c = *src++) != '\n') 1310 unsigned char *dst = *dstp;
1311
1312 for (reg = 0; reg < 4; reg++)
1313 r[reg] = -1;
1314
1315 while (src < src_end && *src != '\n' && found < 4)
1278 { 1316 {
1279 switch (emacs_code_class[c]) 1317 int bytes = BYTES_BY_CHAR_HEAD (*src);
1318
1319 if (NILP (table))
1320 charset = CHARSET_AT (src);
1321 else
1280 { 1322 {
1281 case EMACS_ascii_code: 1323 int c_alt, c1, c2;
1282 charset = CHARSET_ASCII; 1324
1283 break; 1325 SPLIT_STRING(src, bytes, charset, c1, c2);
1284 case EMACS_leading_code_2: 1326 if ((c_alt = unify_char (table, -1, charset, c1, c2)) >= 0)
1285 if (++src >= src_end) continue; 1327 charset = CHAR_CHARSET (c_alt);
1286 charset = c;
1287 break;
1288 case EMACS_leading_code_3:
1289 if ((src += 2) >= src_end) continue;
1290 charset = (c < LEADING_CODE_PRIVATE_11 ? c : *(src - 2));
1291 break;
1292 case EMACS_leading_code_4:
1293 if ((src += 3) >= src_end) continue;
1294 charset = *(src - 3);
1295 break;
1296 default:
1297 continue;
1298 } 1328 }
1329
1299 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset); 1330 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1300 if (r[reg] < 0 1331 if (r[reg] < 0)
1301 && CODING_SPEC_ISO_DESIGNATION (coding, reg) != charset) 1332 {
1302 r[reg] = charset; 1333 found++;
1334 r[reg] = charset;
1335 }
1336
1337 src += bytes;
1338 }
1339
1340 if (found)
1341 {
1342 for (reg = 0; reg < 4; reg++)
1343 if (r[reg] >= 0
1344 && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
1345 ENCODE_DESIGNATION (r[reg], reg, coding);
1346 *dstp = dst;
1303 } 1347 }
1304 if (c != '\n' && !coding->last_block)
1305 return -1;
1306 for (reg = 0; reg < 4; reg++)
1307 if (r[reg] >= 0)
1308 ENCODE_DESIGNATION (r[reg], reg, coding);
1309 *dstp = dst;
1310 return 0;
1311} 1348}
1312 1349
1313/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */ 1350/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
@@ -1328,6 +1365,10 @@ encode_coding_iso2022 (coding, source, destination,
1328 from DST_END to assure overflow checking is necessary only at the 1365 from DST_END to assure overflow checking is necessary only at the
1329 head of loop. */ 1366 head of loop. */
1330 unsigned char *adjusted_dst_end = dst_end - 19; 1367 unsigned char *adjusted_dst_end = dst_end - 19;
1368 Lisp_Object unification_table = coding->character_unification_table;
1369
1370 if (!NILP (Venable_character_unification) && NILP (unification_table))
1371 unification_table = Vstandard_character_unification_table_for_write;
1331 1372
1332 while (src < src_end && dst < adjusted_dst_end) 1373 while (src < src_end && dst < adjusted_dst_end)
1333 { 1374 {
@@ -1337,18 +1378,14 @@ encode_coding_iso2022 (coding, source, destination,
1337 TWO_MORE_BYTES, and THREE_MORE_BYTES). In that case, SRC is 1378 TWO_MORE_BYTES, and THREE_MORE_BYTES). In that case, SRC is
1338 reset to SRC_BASE before exiting. */ 1379 reset to SRC_BASE before exiting. */
1339 unsigned char *src_base = src; 1380 unsigned char *src_base = src;
1340 unsigned char c1, c2, c3, c4; 1381 int charset, c1, c2, c3, c4;
1341 int charset;
1342 1382
1343 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL 1383 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
1344 && CODING_SPEC_ISO_BOL (coding)) 1384 && CODING_SPEC_ISO_BOL (coding))
1345 { 1385 {
1346 /* We have to produce destination sequences now. */ 1386 /* We have to produce designation sequences if any now. */
1347 if (encode_designation_at_bol (coding, src, src_end, &dst) < 0) 1387 encode_designation_at_bol (coding, unification_table,
1348 /* We can't find end of line in the current block. Let's 1388 src, src_end, &dst);
1349 repeat encoding starting from the current position
1350 pointed by SRC. */
1351 break;
1352 CODING_SPEC_ISO_BOL (coding) = 0; 1389 CODING_SPEC_ISO_BOL (coding) = 0;
1353 } 1390 }
1354 1391
@@ -1393,7 +1430,7 @@ encode_coding_iso2022 (coding, source, destination,
1393 switch (emacs_code_class[c1]) 1430 switch (emacs_code_class[c1])
1394 { 1431 {
1395 case EMACS_ascii_code: 1432 case EMACS_ascii_code:
1396 ENCODE_ISO_CHARACTER_DIMENSION1 (CHARSET_ASCII, c1); 1433 ENCODE_ISO_CHARACTER (CHARSET_ASCII, c1, /* dummy */ c2);
1397 break; 1434 break;
1398 1435
1399 case EMACS_control_code: 1436 case EMACS_control_code:
@@ -1431,20 +1468,20 @@ encode_coding_iso2022 (coding, source, destination,
1431 1468
1432 case EMACS_leading_code_2: 1469 case EMACS_leading_code_2:
1433 ONE_MORE_BYTE (c2); 1470 ONE_MORE_BYTE (c2);
1434 ENCODE_ISO_CHARACTER_DIMENSION1 (c1, c2); 1471 ENCODE_ISO_CHARACTER (c1, c2, /* dummy */ c3);
1435 break; 1472 break;
1436 1473
1437 case EMACS_leading_code_3: 1474 case EMACS_leading_code_3:
1438 TWO_MORE_BYTES (c2, c3); 1475 TWO_MORE_BYTES (c2, c3);
1439 if (c1 < LEADING_CODE_PRIVATE_11) 1476 if (c1 < LEADING_CODE_PRIVATE_11)
1440 ENCODE_ISO_CHARACTER_DIMENSION2 (c1, c2, c3); 1477 ENCODE_ISO_CHARACTER (c1, c2, c3);
1441 else 1478 else
1442 ENCODE_ISO_CHARACTER_DIMENSION1 (c2, c3); 1479 ENCODE_ISO_CHARACTER (c2, c3, /* dummy */ c4);
1443 break; 1480 break;
1444 1481
1445 case EMACS_leading_code_4: 1482 case EMACS_leading_code_4:
1446 THREE_MORE_BYTES (c2, c3, c4); 1483 THREE_MORE_BYTES (c2, c3, c4);
1447 ENCODE_ISO_CHARACTER_DIMENSION2 (c2, c3, c4); 1484 ENCODE_ISO_CHARACTER (c2, c3, c4);
1448 break; 1485 break;
1449 1486
1450 case EMACS_leading_code_composition: 1487 case EMACS_leading_code_composition:
@@ -1472,20 +1509,21 @@ encode_coding_iso2022 (coding, source, destination,
1472 label_end_of_loop: 1509 label_end_of_loop:
1473 coding->carryover_size = src - src_base; 1510 coding->carryover_size = src - src_base;
1474 bcopy (src_base, coding->carryover, coding->carryover_size); 1511 bcopy (src_base, coding->carryover, coding->carryover_size);
1475 src = src_base;
1476 break; 1512 break;
1477 } 1513 }
1478 1514
1479 /* If this is the last block of the text to be encoded, we must 1515 /* If this is the last block of the text to be encoded, we must
1480 reset the state of graphic planes and registers to initial one. 1516 reset graphic planes and registers to the initial state. */
1481 In addition, we had better just flush out all remaining codes in 1517 if (src >= src_end && coding->last_block)
1482 the text although they are not valid characters. */
1483 if (coding->last_block)
1484 { 1518 {
1485 ENCODE_RESET_PLANE_AND_REGISTER; 1519 ENCODE_RESET_PLANE_AND_REGISTER;
1486 bcopy(src, dst, src_end - src); 1520 if (coding->carryover_size > 0
1487 dst += (src_end - src); 1521 && coding->carryover_size < (dst_end - dst))
1488 src = src_end; 1522 {
1523 bcopy (coding->carryover, dst, coding->carryover_size);
1524 dst += coding->carryover_size;
1525 coding->carryover_size = 0;
1526 }
1489 } 1527 }
1490 *consumed = src - source; 1528 *consumed = src - source;
1491 return dst - destination; 1529 return dst - destination;
@@ -2063,6 +2101,9 @@ setup_coding_system (coding_system, coding)
2063 coding->direction = 0; 2101 coding->direction = 0;
2064 coding->carryover_size = 0; 2102 coding->carryover_size = 0;
2065 coding->post_read_conversion = coding->pre_write_conversion = Qnil; 2103 coding->post_read_conversion = coding->pre_write_conversion = Qnil;
2104 /* We have not yet implemented a way to specify unification table in
2105 a coding system. */
2106 coding->character_unification_table = Qnil;
2066 2107
2067 Vlast_coding_system_used = coding->symbol = coding_system; 2108 Vlast_coding_system_used = coding->symbol = coding_system;
2068 eol_type = Qnil; 2109 eol_type = Qnil;
@@ -3316,10 +3357,13 @@ DEFUN ("terminal-coding-system",
3316} 3357}
3317 3358
3318DEFUN ("set-keyboard-coding-system", 3359DEFUN ("set-keyboard-coding-system",
3319 Fset_keyboard_coding_system, Sset_keyboard_coding_system, 1, 1, 3360 Fset_keyboard_coding_system, Sset_keyboard_coding_system, 1, 1, 0,
3320 "zCoding-system for keyboard input: ", 3361 "Set coding-system of codes sent from terminal keyboard to CODING-SYSTEM.\n\
3321 "Set coding-system of what is sent from terminal keyboard to CODING-SYSTEM.\n\ 3362In Encoded-kbd minor mode, user inputs are decoded\n\
3322All inputs from terminal are decoded from this coding-system.") 3363accoding to CODING-SYSTEM.\n\
3364Do not call this function directly, but use the command\n\
3365encoded-kbd-set-coding-system to activate Encoded-kbd mode\n\
3366with a specific coding system.")
3323 (coding_system) 3367 (coding_system)
3324 Lisp_Object coding_system; 3368 Lisp_Object coding_system;
3325{ 3369{
@@ -3529,6 +3573,11 @@ syms_of_coding ()
3529 } 3573 }
3530 } 3574 }
3531 3575
3576 Qcharacter_unification_table = intern ("character-unification-table");
3577 staticpro (&Qcharacter_unification_table);
3578 Fput (Qcharacter_unification_table, Qchar_table_extra_slots,
3579 make_number (0));
3580
3532 defsubr (&Scoding_system_vector); 3581 defsubr (&Scoding_system_vector);
3533 defsubr (&Scoding_system_p); 3582 defsubr (&Scoding_system_p);
3534 defsubr (&Sread_coding_system); 3583 defsubr (&Sread_coding_system);
@@ -3613,11 +3662,19 @@ See the documentation of `find-coding-system' for more detail.");
3613 "Mnemonic character indicating end-of-line format is not yet decided."); 3662 "Mnemonic character indicating end-of-line format is not yet decided.");
3614 eol_mnemonic_undecided = '-'; 3663 eol_mnemonic_undecided = '-';
3615 3664
3616 DEFVAR_LISP ("alternate-charset-table", &Valternate_charset_table, 3665 DEFVAR_LISP ("enable-character-unification", &Venable_character_unification,
3617 "Alist of charsets vs the alternate charsets.\n\ 3666 "Non-nil means ISO 2022 encoder/decoder do character unification.");
3618While decoding, if a charset (car part of an element) is found,\n\ 3667 Venable_character_unification = Qt;
3619decode it as the alternate charset (cdr part of the element)."); 3668
3620 Valternate_charset_table = Qnil; 3669 DEFVAR_LISP ("standard-character-unification-table-for-read",
3670 &Vstandard_character_unification_table_for_read,
3671 "Table for unifying characters when reading.");
3672 Vstandard_character_unification_table_for_read = Qnil;
3673
3674 DEFVAR_LISP ("standard-character-unification-table-for-write",
3675 &Vstandard_character_unification_table_for_write,
3676 "Table for unifying characters when writing.");
3677 Vstandard_character_unification_table_for_write = Qnil;
3621 3678
3622 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist, 3679 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
3623 "Alist of charsets vs revision numbers.\n\ 3680 "Alist of charsets vs revision numbers.\n\