aboutsummaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
authorKarl Heuer1997-02-20 07:02:49 +0000
committerKarl Heuer1997-02-20 07:02:49 +0000
commit4ed4686978bd18292e2bb7b87a7b0e0407ecb3b1 (patch)
tree860ad83f81c8c630fe7051e3c5379ca8a9658f69 /src
parentadb572fb93ddfee88f9c5e9681434517fd241232 (diff)
downloademacs-4ed4686978bd18292e2bb7b87a7b0e0407ecb3b1.tar.gz
emacs-4ed4686978bd18292e2bb7b87a7b0e0407ecb3b1.zip
Initial revision
Diffstat (limited to 'src')
-rw-r--r--src/category.c665
-rw-r--r--src/category.h130
-rw-r--r--src/ccl.c1140
-rw-r--r--src/ccl.h53
-rw-r--r--src/charset.c1452
-rw-r--r--src/charset.h649
-rw-r--r--src/coding.c3520
-rw-r--r--src/coding.h409
-rw-r--r--src/fontset.c819
-rw-r--r--src/fontset.h201
10 files changed, 9038 insertions, 0 deletions
diff --git a/src/category.c b/src/category.c
new file mode 100644
index 00000000000..8bdaee9e5af
--- /dev/null
+++ b/src/category.c
@@ -0,0 +1,665 @@
1/* GNU Emacs routines to deal with category tables.
2 Ver.1.0
3
4 Copyright (C) 1995 Free Software Foundation, Inc.
5 Copyright (C) 1995 Electrotechnical Laboratory, JAPAN.
6
7This file is part of GNU Emacs.
8
9GNU Emacs is free software; you can redistribute it and/or modify
10it under the terms of the GNU General Public License as published by
11the Free Software Foundation; either version 2, or (at your option)
12any later version.
13
14GNU Emacs is distributed in the hope that it will be useful,
15but WITHOUT ANY WARRANTY; without even the implied warranty of
16MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17GNU General Public License for more details.
18
19You should have received a copy of the GNU General Public License
20along with GNU Emacs; see the file COPYING. If not, write to
21the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. */
22
23
24/* Here we handle three objects: category, category set, and category
25 table. Read comments in the file category.h to understand them. */
26
27#include <config.h>
28#include <ctype.h>
29#include "lisp.h"
30#include "buffer.h"
31#include "charset.h"
32#include "category.h"
33
34/* The version number of the latest category table. Each category
35 table has a unique version number. It is assigned a new number
36 also when it is modified. When a regular expression is compiled
37 into the struct re_pattern_buffer, the version number of the
38 category table (of the current buffer) at that moment is also
39 embedded in the structure.
40
41 For the moment, we are not using this feature. */
42static int category_table_version;
43
44Lisp_Object Qcategory_table, Qcategoryp, Qcategorysetp, Qcategory_table_p;
45
46/* Variables to determine word boundary. */
47Lisp_Object Vword_combining_categories, Vword_separating_categories;
48
49/* Temporary internal variable used in macro CHAR_HAS_CATEGORY. */
50Lisp_Object _temp_category_set;
51
52
53/* Category set staff. */
54
55DEFUN ("make-category-set", Fmake_category_set, Smake_category_set, 1, 1, 0,
56 "Return a newly created category-set which contains CATEGORIES.\n\
57CATEGORIES is a string of category mnemonics.")
58 (categories)
59 Lisp_Object categories;
60{
61 Lisp_Object val;
62 int len;
63
64 CHECK_STRING (categories, 0);
65 val = MAKE_CATEGORY_SET;
66
67 len = XSTRING (categories)->size;
68 while (--len >= 0)
69 {
70 Lisp_Object category = make_number (XSTRING (categories)->data[len]);
71
72 CHECK_CATEGORY (category, 0);
73 SET_CATEGORY_SET (val, category, Qt);
74 }
75 return val;
76}
77
78
79/* Category staff. */
80
81Lisp_Object check_category_table ();
82
83DEFUN ("define-category", Fdefine_category, Sdefine_category, 2, 3, 0,
84 "Define CHAR as a category which is described by DOCSTRING.\n\
85CHAR should be a visible letter of ` ' thru `~'.\n\
86DOCSTRING is a documentation string of the category.\n\
87The category is defined only in category table TABLE, which defaults to\n\
88 the current buffer's category table.")
89 (category, docstring, table)
90 Lisp_Object category, docstring, table;
91{
92 CHECK_CATEGORY (category, 0);
93 CHECK_STRING (docstring, 1);
94 table = check_category_table (table);
95
96 if (!NILP (CATEGORY_DOCSTRING (table, XFASTINT (category))))
97 error ("Category `%c' is already defined", XFASTINT (category));
98 CATEGORY_DOCSTRING (table, XFASTINT (category)) = docstring;
99
100 return Qnil;
101}
102
103DEFUN ("category-docstring", Fcategory_docstring, Scategory_docstring, 1, 2, 0,
104 "Return a documentation string of CATEGORY.\n\
105Optional second arg specifies CATEGORY-TABLE,\n\
106 which defaults to the current buffer's category table.")
107 (category, table)
108 Lisp_Object category, table;
109{
110 Lisp_Object doc;
111
112 CHECK_CATEGORY (category, 0);
113 table = check_category_table (table);
114
115 return CATEGORY_DOCSTRING (table, XFASTINT (category));
116}
117
118DEFUN ("get-unused-category", Fget_unused_category, Sget_unused_category,
119 0, 1, 0,
120 "Return a category which is not yet defined.\n\
121If total number of categories has reached the limit (95), return nil.\n\
122Optional argument specifies CATEGORY-TABLE,\n\
123 which defaults to the current buffer's category table.")
124 (table)
125 Lisp_Object table;
126{
127 int i;
128 Lisp_Object docstring_vector;
129
130 table = check_category_table (table);
131
132 for (i = ' '; i <= '~'; i++)
133 if (NILP (CATEGORY_DOCSTRING (table, i)))
134 return make_number (i);
135
136 return Qnil;
137}
138
139
140/* Category-table staff. */
141
142DEFUN ("category-table-p", Fcategory_table_p, Scategory_table_p, 1, 1, 0,
143 "Return t if ARG is a category table.")
144 (arg)
145 Lisp_Object arg;
146{
147 if (CHAR_TABLE_P (arg)
148 && EQ (XCHAR_TABLE (arg)->purpose, Qcategory_table)
149 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (arg)) == 2)
150 return Qt;
151 return Qnil;
152}
153
154/* If TABLE is nil, return the current category table. If TABLE is
155 not nil, check the validity of TABLE as a category table. If
156 valid, return TABLE itself, but if not valid, signal an error of
157 wrong-type-argument. */
158
159Lisp_Object
160check_category_table (table)
161 Lisp_Object table;
162{
163 register Lisp_Object tem;
164 if (NILP (table))
165 return current_buffer->category_table;
166 while (tem = Fcategory_table_p (table), NILP (tem))
167 table = wrong_type_argument (Qcategory_table_p, table);
168 return table;
169}
170
171DEFUN ("category-table", Fcategory_table, Scategory_table, 0, 0, 0,
172 "Return the current category table.\n\
173This is the one specified by the current buffer.")
174 ()
175{
176 return current_buffer->category_table;
177}
178
179DEFUN ("standard-category-table", Fstandard_category_table,
180 Sstandard_category_table, 0, 0, 0,
181 "Return the standard category table.\n\
182This is the one used for new buffers.")
183 ()
184{
185 return Vstandard_category_table;
186}
187
188/* Return a copy of category table TABLE. We can't simply use the
189 function copy-sequence because no contents should be shared between
190 the original and the copy.
191
192 If TOP is 1, we at first copy the tree structure of the table. */
193
194Lisp_Object
195copy_category_table (table, top)
196 Lisp_Object table;
197{
198 int i;
199
200 if (top)
201 table = Fcopy_sequence (table);
202 else if (!NILP (XCHAR_TABLE (table)->defalt))
203 XCHAR_TABLE (table)->defalt
204 = Fcopy_sequence (XCHAR_TABLE (table)->defalt);
205
206 for (i = 0; i < CHAR_TABLE_ORDINARY_SLOTS; i++)
207 {
208 Lisp_Object idx = make_number (i);
209 Lisp_Object val = Faref (table, idx);
210
211 if (NILP (val)) /* Do nothing because we can share nil. */
212 ;
213 else if (CATEGORY_SET_P (val))
214 Faset (table, idx, Fcopy_sequence (val));
215 else if (CHAR_TABLE_P (val))
216 Faset (table, idx, copy_category_table (val, 0));
217 else /* Invalid contents. */
218 Faset (table, idx, Qnil);
219 }
220
221 return table;
222}
223
224DEFUN ("copy-category-table", Fcopy_category_table, Scopy_category_table,
225 0, 1, 0,
226 "Construct a new category table and return it.\n\
227It is a copy of the TABLE, which defaults to the standard category table.")
228 (table)
229 Lisp_Object table;
230{
231 if (!NILP (table))
232 check_category_table (table);
233 else
234 table = Vstandard_category_table;
235
236 return copy_category_table (table, 1);
237}
238
239DEFUN ("set-category-table", Fset_category_table, Sset_category_table, 1, 1, 0,
240 "Select a new category table for the current buffer.\n\
241One argument, a category table.")
242 (table)
243 Lisp_Object table;
244{
245 table = check_category_table (table);
246 current_buffer->category_table = table;
247 /* Indicate that this buffer now has a specified category table. */
248 current_buffer->local_var_flags
249 |= XFASTINT (buffer_local_flags.category_table);
250 return table;
251}
252
253
254DEFUN ("char-category-set", Fchar_category_set, Schar_category_set, 1, 1, 0,
255 "Return a category set of CHAR.")
256 (ch)
257 Lisp_Object ch;
258{
259 Lisp_Object val;
260 int charset;
261 unsigned char c1, c2;
262
263 CHECK_NUMBER (ch, 0);
264 return CATEGORY_SET (XFASTINT (ch));
265}
266
267DEFUN ("category-set-mnemonics", Fcategory_set_mnemonics,
268 Scategory_set_mnemonics, 1, 1, 0,
269 "Return a string of mnemonics of all categories in CATEGORY-SET.")
270 (category_set)
271 Lisp_Object category_set;
272{
273 int i, j;
274 char str[96];
275
276 CHECK_CATEGORY_SET (category_set, 0);
277
278 j = 0;
279 for (i = 32; i < 127; i++)
280 if (CATEGORY_MEMBER (i, category_set))
281 str[j++] = i;
282 str[j] = '\0';
283
284 return build_string (str);
285}
286
287/* Modify all category sets stored under category table TABLE so that
288 they contain (SET_VALUE is t) or don't contain (SET_VALUE is nil)
289 CATEGORY. */
290
291void
292modify_lower_category_set (table, category, set_value)
293 Lisp_Object table, category, set_value;
294{
295 Lisp_Object val;
296 int i;
297
298 if (NILP (XCHAR_TABLE (table)->defalt))
299 {
300 val = MAKE_CATEGORY_SET;
301 SET_CATEGORY_SET (val, category, set_value);
302 XCHAR_TABLE (table)->defalt = val;
303 }
304
305 for (i = 32; i < CHAR_TABLE_ORDINARY_SLOTS; i++)
306 {
307 val = XCHAR_TABLE (table)->contents[i];
308
309 if (CATEGORY_SET_P (val))
310 SET_CATEGORY_SET (val, category, set_value);
311 else if (CHAR_TABLE_P (val))
312 modify_lower_category_set (val, category, set_value);
313 }
314}
315
316void
317set_category_set (category_set, category, val)
318 Lisp_Object category_set, category, val;
319{
320 do {
321 int idx = XINT (category) / 8;
322 unsigned char bits = 1 << (XINT (category) % 8);
323
324 if (NILP (val))
325 XCATEGORY_SET (category_set)->data[idx] &= ~bits;
326 else
327 XCATEGORY_SET (category_set)->data[idx] |= bits;
328 } while (0);
329}
330
331DEFUN ("modify-category-entry", Fmodify_category_entry,
332 Smodify_category_entry, 2, 4, 0,
333 "Modify the category set of CHAR by adding CATEGORY to it.\n\
334The category is changed only for table TABLE, which defaults to\n\
335 the current buffer's category table.\n\
336If optional forth argument RESET is non NIL,\n\
337 CATEGORY is deleted from the category set instead of being added.")
338 (ch, category, table, reset)
339 Lisp_Object ch, category, table, reset;
340{
341 int c, charset, c1, c2;
342 Lisp_Object set_value; /* Actual value to be set in category sets. */
343 Lisp_Object val, category_set;
344
345 CHECK_NUMBER (ch, 0);
346 c = XINT (ch);
347 CHECK_CATEGORY (category, 1);
348 table = check_category_table (table);
349
350 if (NILP (CATEGORY_DOCSTRING (table, XFASTINT (category))))
351 error ("Undefined category: %c", XFASTINT (category));
352
353 set_value = NILP (reset) ? Qt : Qnil;
354
355 if (SINGLE_BYTE_CHAR_P (c))
356 {
357 val = XCHAR_TABLE (table)->contents[c];
358 if (!CATEGORY_SET_P (val))
359 XCHAR_TABLE (table)->contents[c] = (val = MAKE_CATEGORY_SET);
360 SET_CATEGORY_SET (val, category, set_value);
361 return Qnil;
362 }
363
364 if (COMPOSITE_CHAR_P (c))
365 c = cmpchar_component (c, 0);
366 SPLIT_NON_ASCII_CHAR (c, charset, c1, c2);
367
368 /* The top level table. */
369 val = XCHAR_TABLE (table)->contents[charset];
370 if (NILP (val))
371 {
372 category_set = MAKE_CATEGORY_SET;
373 XCHAR_TABLE (table)->contents[charset] = category_set;
374 }
375 else if (CATEGORY_SET_P (val))
376 category_set = val;
377
378 if (!c1)
379 {
380 /* Only a charset is specified. */
381 if (CHAR_TABLE_P (val))
382 /* All characters in CHARSET should be the same as for CATEGORY. */
383 modify_lower_category_set (val, category, set_value);
384 else
385 SET_CATEGORY_SET (category_set, category, set_value);
386 return Qnil;
387 }
388
389 /* The second level table. */
390 if (!CHAR_TABLE_P (val))
391 {
392 val = Fmake_char_table (Qnil, Qnil);
393 XCHAR_TABLE (table)->contents[charset] = val;
394 /* We must set default category set of CHARSET in `defalt' slot. */
395 XCHAR_TABLE (val)->defalt = category_set;
396 }
397 table = val;
398
399 val = XCHAR_TABLE (table)->contents[c1];
400 if (NILP (val))
401 {
402 category_set = Fcopy_sequence (XCHAR_TABLE (table)->defalt);
403 XCHAR_TABLE (table)->contents[c1] = category_set;
404 }
405 else if (CATEGORY_SET_P (val))
406 category_set = val;
407
408 if (!c2)
409 {
410 if (CHAR_TABLE_P (val))
411 /* All characters in C1 group of CHARSET should be the same as
412 for CATEGORY. */
413 modify_lower_category_set (val, category, set_value);
414 else
415 SET_CATEGORY_SET (category_set, category, set_value);
416 return Qnil;
417 }
418
419 /* The third (bottom) level table. */
420 if (!CHAR_TABLE_P (val))
421 {
422 val = Fmake_char_table (Qnil, Qnil);
423 XCHAR_TABLE (table)->contents[c1] = val;
424 /* We must set default category set of CHARSET and C1 in
425 `defalt' slot. */
426 XCHAR_TABLE (val)->defalt = category_set;
427 }
428 table = val;
429
430 val = XCHAR_TABLE (table)->contents[c2];
431 if (NILP (val))
432 {
433 category_set = Fcopy_sequence (XCHAR_TABLE (table)->defalt);
434 XCHAR_TABLE (table)->contents[c2] = category_set;
435 }
436 else if (CATEGORY_SET_P (val))
437 category_set = val;
438 else
439 /* This should never happen. */
440 error ("Invalid category table");
441
442 SET_CATEGORY_SET (category_set, category, set_value);
443
444 return Qnil;
445}
446
447/* Dump category table to buffer in human-readable format */
448
449static void
450describe_category (value)
451 Lisp_Object value;
452{
453 Lisp_Object mnemonics;
454
455 Findent_to (make_number (16), make_number (1));
456
457 if (NILP (value))
458 {
459 insert_string ("default\n");
460 return;
461 }
462
463 if (!CATEGORY_SET_P (value))
464 {
465 insert_string ("invalid\n");
466 return;
467 }
468
469 mnemonics = Fcategory_set_mnemonics (value);
470 insert_from_string (mnemonics, 0, XSTRING (mnemonics)->size, 0);
471 insert_string ("\n");
472 return;
473}
474
475static Lisp_Object
476describe_category_1 (vector)
477 Lisp_Object vector;
478{
479 struct buffer *old = current_buffer;
480 set_buffer_internal (XBUFFER (Vstandard_output));
481 describe_vector (vector, Qnil, describe_category, 0, Qnil, Qnil);
482 {
483 int i;
484 Lisp_Object docs = XCHAR_TABLE (vector)->extras[0];
485 Lisp_Object elt;
486
487 if (!VECTORP (docs) || XVECTOR (docs)->size != 95)
488 {
489 insert_string ("Invalid first extra slot in this char table\n");
490 return Qnil;
491 }
492
493 insert_string ("Meanings of mnemonice characters are:\n");
494 for (i = 0; i < 95; i++)
495 {
496 elt = XVECTOR (docs)->contents[i];
497 if (NILP (elt))
498 continue;
499
500 insert_char (i + 32);
501 insert (": ", 2);
502 insert_from_string (elt, 0, XSTRING (elt)->size, 0);
503 insert ("\n", 1);
504 }
505 }
506
507 while (! NILP (XCHAR_TABLE (vector)->parent))
508 {
509 vector = XCHAR_TABLE (vector)->parent;
510 insert_string ("\nThe parent category table is:");
511 describe_vector (vector, Qnil, describe_category, 0, Qnil, Qnil);
512 }
513
514 call0 (intern ("help-mode"));
515 set_buffer_internal (old);
516 return Qnil;
517}
518
519DEFUN ("describe-category", Fdescribe_category, Sdescribe_category, 0, 0, "",
520 "Describe the category specifications in the category table.\n\
521The descriptions are inserted in a buffer, which is then displayed.")
522 ()
523{
524 internal_with_output_to_temp_buffer
525 ("*Help*", describe_category_1, current_buffer->category_table);
526
527 return Qnil;
528}
529
530/* Return 1 if there is a word boundary between two word-constituent
531 characters C1 and C2 if they appear in this order, else return 0.
532 Use the macro WORD_BOUNDARY_P instead of calling this function
533 directly. */
534
535int
536word_boundary_p (c1, c2)
537 int c1, c2;
538{
539 Lisp_Object category_set1, category_set2;
540 Lisp_Object tail;
541 int default_result;
542
543 if (CHAR_CHARSET (c1) == CHAR_CHARSET (c2))
544 {
545 tail = Vword_separating_categories;
546 default_result = 0;
547 }
548 else
549 {
550 tail = Vword_combining_categories;
551 default_result = 1;
552 }
553
554 category_set1 = CATEGORY_SET (c1);
555 if (NILP (category_set1))
556 return default_result;
557 category_set2 = CATEGORY_SET (c2);
558 if (NILP (category_set2))
559 return default_result;
560
561 for (; CONSP (tail); tail = XCONS (tail)->cdr)
562 {
563 Lisp_Object elt = XCONS(tail)->car;
564
565 if (CONSP (elt)
566 && CATEGORYP (XCONS (elt)->car)
567 && CATEGORYP (XCONS (elt)->cdr)
568 && CATEGORY_MEMBER (XCONS (elt)->car, category_set1)
569 && CATEGORY_MEMBER (XCONS (elt)->cdr, category_set2))
570 return !default_result;
571 }
572 return default_result;
573}
574
575
576init_category_once ()
577{
578 /* This has to be done here, before we call Fmake_char_table. */
579 Qcategory_table = intern ("category-table");
580 staticpro (&Qcategory_table);
581
582 /* Intern this now in case it isn't already done.
583 Setting this variable twice is harmless.
584 But don't staticpro it here--that is done in alloc.c. */
585 Qchar_table_extra_slots = intern ("char-table-extra-slots");
586
587 /* Now we are ready to set up this property, so we can
588 create category tables. */
589 Fput (Qcategory_table, Qchar_table_extra_slots, make_number (2));
590
591 Vstandard_category_table = Fmake_char_table (Qcategory_table, Qnil);
592 /* Set a category set which contains nothing to the default. */
593 XCHAR_TABLE (Vstandard_category_table)->defalt = MAKE_CATEGORY_SET;
594 Fset_char_table_extra_slot (Vstandard_category_table, 0,
595 Fmake_vector (make_number (95), Qnil));
596}
597
598syms_of_category ()
599{
600 Qcategoryp = intern ("categoryp");
601 staticpro (&Qcategoryp);
602 Qcategorysetp = intern ("categorysetp");
603 staticpro (&Qcategorysetp);
604 Qcategory_table_p = intern ("category-table-p");
605 staticpro (&Qcategory_table_p);
606
607 DEFVAR_LISP ("word-combining-categories", &Vword_combining_categories,
608 "List of pair (cons) of categories to determine word boundary.\n\
609\n\
610Emacs treats a sequence of word constituent characters as a single\n\
611word (i.e. finds no word boundary between them) iff they belongs to\n\
612the same charset. But, exceptions are allowed in the following cases.\n\
613\n\
614(1) The case that characters are in different charsets is controlled\n\
615by the variable `word-combining-categories'.\n\
616\n\
617Emacs finds no word boundary between characters of different charsets\n\
618if they have categories matching some element of this list.\n\
619\n\
620More precisely, if an element of this list is a cons of category CAT1\n\
621and CAT2, and a multibyte character C1 which has CAT1 is followed by\n\
622C2 which has CAT2, there's no word boundary between C1 and C2.\n\
623\n\
624For instance, to tell that ASCII characters and Latin-1 characters can\n\
625form a single word, the element `(?l . ?l)' should be in this list\n\
626because both characters have the category `l' (Latin characters).\n\
627\n\
628(2) The case that character are in the same charset is controlled by\n\
629the variable `word-separating-categories'.\n\
630\n\
631Emacs find a word boundary between characters of the same charset\n\
632if they have categories matching some element of this list.\n\
633\n\
634More precisely, if an element of this list is a cons of category CAT1\n\
635and CAT2, and a multibyte character C1 which has CAT1 is followed by\n\
636C2 which has CAT2, there's a word boundary between C1 and C2.\n\
637\n\
638For instance, to tell that there's a word boundary between Japanese\n\
639Hiragana and Japanese Kanji (both are in the same charset), the\n\
640element `(?H . ?C) should be in this list.");
641
642 Vword_combining_categories = Qnil;
643
644 DEFVAR_LISP ("word-separating-categories", &Vword_separating_categories,
645 "List of pair (cons) of categories to determine word boundary.\n\
646See the documentation of the variable `word-combining-categories'.");
647
648 Vword_separating_categories = Qnil;
649
650 defsubr (&Smake_category_set);
651 defsubr (&Sdefine_category);
652 defsubr (&Scategory_docstring);
653 defsubr (&Sget_unused_category);
654 defsubr (&Scategory_table_p);
655 defsubr (&Scategory_table);
656 defsubr (&Sstandard_category_table);
657 defsubr (&Scopy_category_table);
658 defsubr (&Sset_category_table);
659 defsubr (&Schar_category_set);
660 defsubr (&Scategory_set_mnemonics);
661 defsubr (&Smodify_category_entry);
662 defsubr (&Sdescribe_category);
663
664 category_table_version = 0;
665}
diff --git a/src/category.h b/src/category.h
new file mode 100644
index 00000000000..975e82b52f2
--- /dev/null
+++ b/src/category.h
@@ -0,0 +1,130 @@
1/* Declarations having to do with Emacs category tables.
2 Ver.1.0
3
4 Copyright (C) 1995 Free Software Foundation, Inc.
5 Copyright (C) 1995 Electrotechnical Laboratory, JAPAN.
6
7This file is part of GNU Emacs.
8
9GNU Emacs is free software; you can redistribute it and/or modify
10it under the terms of the GNU General Public License as published by
11the Free Software Foundation; either version 2, or (at your option)
12any later version.
13
14GNU Emacs is distributed in the hope that it will be useful,
15but WITHOUT ANY WARRANTY; without even the implied warranty of
16MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17GNU General Public License for more details.
18
19You should have received a copy of the GNU General Public License
20along with GNU Emacs; see the file COPYING. If not, write to
21the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. */
22
23
24/* We introduce here three types of object: category, category set,
25 and category table.
26
27 A category is like syntax but differs in the following points:
28
29 o A category is represented by a mnemonic character of the range
30 ` '(32)..`~'(126) (printable ASCII characters).
31
32 o A category is not exclusive, i.e. a character has multiple
33 categories (category set). Of course, there's a case that a
34 category set is empty, i.e. the character has no category.
35
36 o In addition to the predefined categories, a user can define new
37 categories. Total number of categories is limited to 95.
38
39 A category set is a set of categories represented by Lisp
40 bool-vector of length 128 (only elements of 31th through 125th
41 are used).
42
43 A category table is like syntax-table, represented by a Lisp
44 char-table. The contents are category sets or nil. It has two
45 extra slots. for a vector of doc string of each category and a
46 version number.
47
48 The first extra slot is a vector of doc strings of categories, the
49 length is 95. The Nth element corresponding to the category N+32.
50
51 The second extra slot is a version number of the category table.
52 But, for the moment, we are not using this slot. */
53
54#define CATEGORYP(x) \
55 (INTEGERP ((x)) && XFASTINT ((x)) >= 0x20 && XFASTINT ((x)) <= 0x7E)
56
57#define CHECK_CATEGORY(x, i) \
58 do { \
59 if (!CATEGORYP ((x))) x = wrong_type_argument (Qcategoryp, (x)); \
60 } while (0)
61
62#define XCATEGORY_SET XBOOL_VECTOR
63
64#define CATEGORY_SET_P(x) \
65 (BOOL_VECTOR_P ((x)) && (EMACS_INT) (XBOOL_VECTOR ((x))->size) == 128)
66
67/* Return a new empty category set. */
68#define MAKE_CATEGORY_SET (Fmake_bool_vector (make_number (128), Qnil))
69
70/* Make CATEGORY_SET includes (if VAL is t) or excludes (if VAL is
71 nil) CATEGORY. */
72#define SET_CATEGORY_SET(category_set, category, val) \
73 (Faset (category_set, category, val))
74
75#define CHECK_CATEGORY_SET(x, i) \
76 do { \
77 if (!CATEGORY_SET_P ((x))) x = wrong_type_argument (Qcategorysetp, (x)); \
78 } while (0)
79
80/* Return 1 if CATEGORY_SET contains CATEGORY, else return 0.
81 The faster version of `!NILP (Faref (category_set, category))'. */
82#define CATEGORY_MEMBER(category, category_set) \
83 (!NILP (category_set) \
84 && (XCATEGORY_SET (category_set)->data[XFASTINT (category) / 8] \
85 & (1 << (XFASTINT (category) % 8))))
86
87/* Temporary internal variable used in macro CHAR_HAS_CATEGORY. */
88extern Lisp_Object _temp_category_set;
89
90/* Return 1 if category set of CH contains CATEGORY, elt return 0. */
91#define CHAR_HAS_CATEGORY(ch, category) \
92 (_temp_category_set = CATEGORY_SET (ch), \
93 CATEGORY_MEMBER (category, _temp_category_set))
94
95/* The standard category table is stored where it will automatically
96 be used in all new buffers. */
97#define Vstandard_category_table buffer_defaults.category_table
98
99/* Return the category set of character C in the current category table. */
100#ifdef __GNUC__
101#define CATEGORY_SET(c) \
102 ({ Lisp_Object table = current_buffer->category_table; \
103 Lisp_Object temp; \
104 if (c < CHAR_TABLE_ORDINARY_SLOTS) \
105 while (NILP (temp = XCHAR_TABLE (table)->contents[c]) \
106 && NILP (temp = XCHAR_TABLE (table)->defalt)) \
107 table = XCHAR_TABLE (table)->parent; \
108 else \
109 temp = Faref (table, c); \
110 temp; })
111#else
112#define CATEGORY_SET(c) Faref (current_buffer->category_table, c)
113#endif
114
115/* Return the doc string of CATEGORY in category table TABLE. */
116#define CATEGORY_DOCSTRING(table, category) \
117 XVECTOR (Fchar_table_extra_slot (table, 0))->contents[(category) - ' ']
118
119/* Return the version number of category table TABLE. Not used for
120 the moment. */
121#define CATEGORY_TABLE_VERSION (table) \
122 Fchar_table_extra_slot (table, 1)
123
124/* Return 1 if there is a word boundary between two word-constituent
125 characters C1 and C2 if they appear in this order, else return 0.
126 There is no word boundary between two word-constituent ASCII
127 characters. */
128#define WORD_BOUNDARY_P(c1, c2) \
129 (!(SINGLE_BYTE_CHAR_P (c1) && SINGLE_BYTE_CHAR_P (c2)) \
130 && word_boundary_p (c1, c2))
diff --git a/src/ccl.c b/src/ccl.c
new file mode 100644
index 00000000000..11c1ae500d6
--- /dev/null
+++ b/src/ccl.c
@@ -0,0 +1,1140 @@
1/* CCL (Code Conversion Language) interpreter.
2 Ver.1.0
3
4 Copyright (C) 1995 Free Software Foundation, Inc.
5 Copyright (C) 1995 Electrotechnical Laboratory, JAPAN.
6
7 This program is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2, or (at your option)
10 any later version.
11
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with this program; if not, write to the Free Software
19 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
20
21#include <stdio.h>
22
23#ifdef emacs
24
25#include <config.h>
26#include "lisp.h"
27#include "charset.h"
28#include "ccl.h"
29#include "coding.h"
30
31#else /* not emacs */
32
33#include "mulelib.h"
34
35#endif /* not emacs */
36
37/* Alist of fontname patterns vs corresponding CCL program. */
38Lisp_Object Vfont_ccl_encoder_alist;
39
40/* Vector of CCL program names vs corresponding program data. */
41Lisp_Object Vccl_program_table;
42
43/* CCL (Code Conversion Language) is a simple language which has
44 operations on one input buffer, one output buffer, and 7 registers.
45 The syntax of CCL is described in `ccl.el'. Emacs Lisp function
46 `ccl-compile' compiles a CCL program and produces a CCL code which
47 is a vector of integers. The structure of this vector is as
48 follows: The 1st element: buffer-magnification, a factor for the
49 size of output buffer compared with the size of input buffer. The
50 2nd element: address of CCL code to be executed when encountered
51 with end of input stream. The 3rd and the remaining elements: CCL
52 codes. */
53
54/* Header of CCL compiled code */
55#define CCL_HEADER_BUF_MAG 0
56#define CCL_HEADER_EOF 1
57#define CCL_HEADER_MAIN 2
58
59/* CCL code is a sequence of 28-bit non-negative integers (i.e. the
60 MSB is always 0), each contains CCL command and/or arguments in the
61 following format:
62
63 |----------------- integer (28-bit) ------------------|
64 |------- 17-bit ------|- 3-bit --|- 3-bit --|- 5-bit -|
65 |--constant argument--|-register-|-register-|-command-|
66 ccccccccccccccccc RRR rrr XXXXX
67 or
68 |------- relative address -------|-register-|-command-|
69 cccccccccccccccccccc rrr XXXXX
70 or
71 |------------- constant or other args ----------------|
72 cccccccccccccccccccccccccccc
73
74 where, `cc...c' is a non-negative integer indicating constant value
75 (the left most `c' is always 0) or an absolute jump address, `RRR'
76 and `rrr' are CCL register number, `XXXXX' is one of the following
77 CCL commands. */
78
79/* CCL commands
80
81 Each comment fields shows one or more lines for command syntax and
82 the following lines for semantics of the command. In semantics, IC
83 stands for Instruction Counter. */
84
85#define CCL_SetRegister 0x00 /* Set register a register value:
86 1:00000000000000000RRRrrrXXXXX
87 ------------------------------
88 reg[rrr] = reg[RRR];
89 */
90
91#define CCL_SetShortConst 0x01 /* Set register a short constant value:
92 1:CCCCCCCCCCCCCCCCCCCCrrrXXXXX
93 ------------------------------
94 reg[rrr] = CCCCCCCCCCCCCCCCCCC;
95 */
96
97#define CCL_SetConst 0x02 /* Set register a constant value:
98 1:00000000000000000000rrrXXXXX
99 2:CONSTANT
100 ------------------------------
101 reg[rrr] = CONSTANT;
102 IC++;
103 */
104
105#define CCL_SetArray 0x03 /* Set register an element of array:
106 1:CCCCCCCCCCCCCCCCCRRRrrrXXXXX
107 2:ELEMENT[0]
108 3:ELEMENT[1]
109 ...
110 ------------------------------
111 if (0 <= reg[RRR] < CC..C)
112 reg[rrr] = ELEMENT[reg[RRR]];
113 IC += CC..C;
114 */
115
116#define CCL_Jump 0x04 /* Jump:
117 1:A--D--D--R--E--S--S-000XXXXX
118 ------------------------------
119 IC += ADDRESS;
120 */
121
122/* Note: If CC..C is greater than 0, the second code is omitted. */
123
124#define CCL_JumpCond 0x05 /* Jump conditional:
125 1:A--D--D--R--E--S--S-rrrXXXXX
126 ------------------------------
127 if (!reg[rrr])
128 IC += ADDRESS;
129 */
130
131
132#define CCL_WriteRegisterJump 0x06 /* Write register and jump:
133 1:A--D--D--R--E--S--S-rrrXXXXX
134 ------------------------------
135 write (reg[rrr]);
136 IC += ADDRESS;
137 */
138
139#define CCL_WriteRegisterReadJump 0x07 /* Write register, read, and jump:
140 1:A--D--D--R--E--S--S-rrrXXXXX
141 2:A--D--D--R--E--S--S-rrrYYYYY
142 -----------------------------
143 write (reg[rrr]);
144 IC++;
145 read (reg[rrr]);
146 IC += ADDRESS;
147 */
148/* Note: If read is suspended, the resumed execution starts from the
149 second code (YYYYY == CCL_ReadJump). */
150
151#define CCL_WriteConstJump 0x08 /* Write constant and jump:
152 1:A--D--D--R--E--S--S-000XXXXX
153 2:CONST
154 ------------------------------
155 write (CONST);
156 IC += ADDRESS;
157 */
158
159#define CCL_WriteConstReadJump 0x09 /* Write constant, read, and jump:
160 1:A--D--D--R--E--S--S-rrrXXXXX
161 2:CONST
162 3:A--D--D--R--E--S--S-rrrYYYYY
163 -----------------------------
164 write (CONST);
165 IC += 2;
166 read (reg[rrr]);
167 IC += ADDRESS;
168 */
169/* Note: If read is suspended, the resumed execution starts from the
170 second code (YYYYY == CCL_ReadJump). */
171
172#define CCL_WriteStringJump 0x0A /* Write string and jump:
173 1:A--D--D--R--E--S--S-000XXXXX
174 2:LENGTH
175 3:0000STRIN[0]STRIN[1]STRIN[2]
176 ...
177 ------------------------------
178 write_string (STRING, LENGTH);
179 IC += ADDRESS;
180 */
181
182#define CCL_WriteArrayReadJump 0x0B /* Write an array element, read, and jump:
183 1:A--D--D--R--E--S--S-rrrXXXXX
184 2:LENGTH
185 3:ELEMENET[0]
186 4:ELEMENET[1]
187 ...
188 N:A--D--D--R--E--S--S-rrrYYYYY
189 ------------------------------
190 if (0 <= reg[rrr] < LENGTH)
191 write (ELEMENT[reg[rrr]]);
192 IC += LENGTH + 2; (... pointing at N+1)
193 read (reg[rrr]);
194 IC += ADDRESS;
195 */
196/* Note: If read is suspended, the resumed execution starts from the
197 Mth code (YYYYY == CCL_ReadJump). */
198
199#define CCL_ReadJump 0x0C /* Read and jump:
200 1:A--D--D--R--E--S--S-rrrYYYYY
201 -----------------------------
202 read (reg[rrr]);
203 IC += ADDRESS;
204 */
205
206#define CCL_Branch 0x0D /* Jump by branch table:
207 1:CCCCCCCCCCCCCCCCCCCCrrrXXXXX
208 2:A--D--D--R--E-S-S[0]000XXXXX
209 3:A--D--D--R--E-S-S[1]000XXXXX
210 ...
211 ------------------------------
212 if (0 <= reg[rrr] < CC..C)
213 IC += ADDRESS[reg[rrr]];
214 else
215 IC += ADDRESS[CC..C];
216 */
217
218#define CCL_ReadRegister 0x0E /* Read bytes into registers:
219 1:CCCCCCCCCCCCCCCCCCCCrrrXXXXX
220 2:CCCCCCCCCCCCCCCCCCCCrrrXXXXX
221 ...
222 ------------------------------
223 while (CCC--)
224 read (reg[rrr]);
225 */
226
227#define CCL_WriteExprConst 0x0F /* write result of expression:
228 1:00000OPERATION000RRR000XXXXX
229 2:CONSTANT
230 ------------------------------
231 write (reg[RRR] OPERATION CONSTANT);
232 IC++;
233 */
234
235/* Note: If the Nth read is suspended, the resumed execution starts
236 from the Nth code. */
237
238#define CCL_ReadBranch 0x10 /* Read one byte into a register,
239 and jump by branch table:
240 1:CCCCCCCCCCCCCCCCCCCCrrrXXXXX
241 2:A--D--D--R--E-S-S[0]000XXXXX
242 3:A--D--D--R--E-S-S[1]000XXXXX
243 ...
244 ------------------------------
245 read (read[rrr]);
246 if (0 <= reg[rrr] < CC..C)
247 IC += ADDRESS[reg[rrr]];
248 else
249 IC += ADDRESS[CC..C];
250 */
251
252#define CCL_WriteRegister 0x11 /* Write registers:
253 1:CCCCCCCCCCCCCCCCCCCrrrXXXXX
254 2:CCCCCCCCCCCCCCCCCCCrrrXXXXX
255 ...
256 ------------------------------
257 while (CCC--)
258 write (reg[rrr]);
259 ...
260 */
261
262/* Note: If the Nth write is suspended, the resumed execution
263 starts from the Nth code. */
264
265#define CCL_WriteExprRegister 0x12 /* Write result of expression
266 1:00000OPERATIONRrrRRR000XXXXX
267 ------------------------------
268 write (reg[RRR] OPERATION reg[Rrr]);
269 */
270
271#define CCL_Call 0x13 /* Write a constant:
272 1:CCCCCCCCCCCCCCCCCCCC000XXXXX
273 ------------------------------
274 call (CC..C)
275 */
276
277#define CCL_WriteConstString 0x14 /* Write a constant or a string:
278 1:CCCCCCCCCCCCCCCCCCCCrrrXXXXX
279 [2:0000STRIN[0]STRIN[1]STRIN[2]]
280 [...]
281 -----------------------------
282 if (!rrr)
283 write (CC..C)
284 else
285 write_string (STRING, CC..C);
286 IC += (CC..C + 2) / 3;
287 */
288
289#define CCL_WriteArray 0x15 /* Write an element of array:
290 1:CCCCCCCCCCCCCCCCCCCCrrrXXXXX
291 2:ELEMENT[0]
292 3:ELEMENT[1]
293 ...
294 ------------------------------
295 if (0 <= reg[rrr] < CC..C)
296 write (ELEMENT[reg[rrr]]);
297 IC += CC..C;
298 */
299
300#define CCL_End 0x16 /* Terminate:
301 1:00000000000000000000000XXXXX
302 ------------------------------
303 terminate ();
304 */
305
306/* The following two codes execute an assignment arithmetic/logical
307 operation. The form of the operation is like REG OP= OPERAND. */
308
309#define CCL_ExprSelfConst 0x17 /* REG OP= constant:
310 1:00000OPERATION000000rrrXXXXX
311 2:CONSTANT
312 ------------------------------
313 reg[rrr] OPERATION= CONSTANT;
314 */
315
316#define CCL_ExprSelfReg 0x18 /* REG1 OP= REG2:
317 1:00000OPERATION000RRRrrrXXXXX
318 ------------------------------
319 reg[rrr] OPERATION= reg[RRR];
320 */
321
322/* The following codes execute an arithmetic/logical operation. The
323 form of the operation is like REG_X = REG_Y OP OPERAND2. */
324
325#define CCL_SetExprConst 0x19 /* REG_X = REG_Y OP constant:
326 1:00000OPERATION000RRRrrrXXXXX
327 2:CONSTANT
328 ------------------------------
329 reg[rrr] = reg[RRR] OPERATION CONSTANT;
330 IC++;
331 */
332
333#define CCL_SetExprReg 0x1A /* REG1 = REG2 OP REG3:
334 1:00000OPERATIONRrrRRRrrrXXXXX
335 ------------------------------
336 reg[rrr] = reg[RRR] OPERATION reg[Rrr];
337 */
338
339#define CCL_JumpCondExprConst 0x1B /* Jump conditional according to
340 an operation on constant:
341 1:A--D--D--R--E--S--S-rrrXXXXX
342 2:OPERATION
343 3:CONSTANT
344 -----------------------------
345 reg[7] = reg[rrr] OPERATION CONSTANT;
346 if (!(reg[7]))
347 IC += ADDRESS;
348 else
349 IC += 2
350 */
351
352#define CCL_JumpCondExprReg 0x1C /* Jump conditional according to
353 an operation on register:
354 1:A--D--D--R--E--S--S-rrrXXXXX
355 2:OPERATION
356 3:RRR
357 -----------------------------
358 reg[7] = reg[rrr] OPERATION reg[RRR];
359 if (!reg[7])
360 IC += ADDRESS;
361 else
362 IC += 2;
363 */
364
365#define CCL_ReadJumpCondExprConst 0x1D /* Read and jump conditional according
366 to an operation on constant:
367 1:A--D--D--R--E--S--S-rrrXXXXX
368 2:OPERATION
369 3:CONSTANT
370 -----------------------------
371 read (reg[rrr]);
372 reg[7] = reg[rrr] OPERATION CONSTANT;
373 if (!reg[7])
374 IC += ADDRESS;
375 else
376 IC += 2;
377 */
378
379#define CCL_ReadJumpCondExprReg 0x1E /* Read and jump conditional according
380 to an operation on register:
381 1:A--D--D--R--E--S--S-rrrXXXXX
382 2:OPERATION
383 3:RRR
384 -----------------------------
385 read (reg[rrr]);
386 reg[7] = reg[rrr] OPERATION reg[RRR];
387 if (!reg[7])
388 IC += ADDRESS;
389 else
390 IC += 2;
391 */
392
393#define CCL_Extention 0x1F /* Extended CCL code
394 1:ExtendedCOMMNDRrrRRRrrrXXXXX
395 2:ARGUEMENT
396 3:...
397 ------------------------------
398 extended_command (rrr,RRR,Rrr,ARGS)
399 */
400
401
402/* CCL arithmetic/logical operators. */
403#define CCL_PLUS 0x00 /* X = Y + Z */
404#define CCL_MINUS 0x01 /* X = Y - Z */
405#define CCL_MUL 0x02 /* X = Y * Z */
406#define CCL_DIV 0x03 /* X = Y / Z */
407#define CCL_MOD 0x04 /* X = Y % Z */
408#define CCL_AND 0x05 /* X = Y & Z */
409#define CCL_OR 0x06 /* X = Y | Z */
410#define CCL_XOR 0x07 /* X = Y ^ Z */
411#define CCL_LSH 0x08 /* X = Y << Z */
412#define CCL_RSH 0x09 /* X = Y >> Z */
413#define CCL_LSH8 0x0A /* X = (Y << 8) | Z */
414#define CCL_RSH8 0x0B /* X = Y >> 8, r[7] = Y & 0xFF */
415#define CCL_DIVMOD 0x0C /* X = Y / Z, r[7] = Y % Z */
416#define CCL_LS 0x10 /* X = (X < Y) */
417#define CCL_GT 0x11 /* X = (X > Y) */
418#define CCL_EQ 0x12 /* X = (X == Y) */
419#define CCL_LE 0x13 /* X = (X <= Y) */
420#define CCL_GE 0x14 /* X = (X >= Y) */
421#define CCL_NE 0x15 /* X = (X != Y) */
422
423#define CCL_ENCODE_SJIS 0x16 /* X = HIGHER_BYTE (SJIS (Y, Z))
424 r[7] = LOWER_BYTE (SJIS (Y, Z) */
425#define CCL_DECODE_SJIS 0x17 /* X = HIGHER_BYTE (DE-SJIS (Y, Z))
426 r[7] = LOWER_BYTE (DE-SJIS (Y, Z)) */
427
428/* Macros for exit status of CCL program. */
429#define CCL_STAT_SUCCESS 0 /* Terminated successfully. */
430#define CCL_STAT_SUSPEND 1 /* Terminated because of empty input
431 buffer or full output buffer. */
432#define CCL_STAT_INVALID_CMD 2 /* Terminated because of invalid
433 command. */
434#define CCL_STAT_QUIT 3 /* Terminated because of quit. */
435
436/* Terminate CCL program successfully. */
437#define CCL_SUCCESS \
438 do { \
439 ccl->status = CCL_STAT_SUCCESS; \
440 ccl->ic = CCL_HEADER_MAIN; \
441 goto ccl_finish; \
442 } while (0)
443
444/* Suspend CCL program because of reading from empty input buffer or
445 writing to full output buffer. When this program is resumed, the
446 same I/O command is executed. */
447#define CCL_SUSPEND \
448 do { \
449 ic--; \
450 ccl->status = CCL_STAT_SUSPEND; \
451 goto ccl_finish; \
452 } while (0)
453
454/* Terminate CCL program because of invalid command. Should not occur
455 in the normal case. */
456#define CCL_INVALID_CMD \
457 do { \
458 ccl->status = CCL_STAT_INVALID_CMD; \
459 goto ccl_error_handler; \
460 } while (0)
461
462/* Encode one character CH to multibyte form and write to the current
463 output buffer. If CH is negative, write one byte -CH. */
464#define CCL_WRITE_CHAR(ch) \
465 do { \
466 if (!dst) \
467 CCL_INVALID_CMD; \
468 else \
469 { \
470 unsigned char work[4], *str; \
471 int len = CHAR_STRING (ch, work, str); \
472 if (dst + len <= dst_end) \
473 { \
474 bcopy (str, dst, len); \
475 dst += len; \
476 } \
477 else \
478 CCL_SUSPEND; \
479 } \
480 } while (0)
481
482/* Write a string at ccl_prog[IC] of length LEN to the current output
483 buffer. */
484#define CCL_WRITE_STRING(len) \
485 do { \
486 if (!dst) \
487 CCL_INVALID_CMD; \
488 else if (dst + len <= dst_end) \
489 for (i = 0; i < len; i++) \
490 *dst++ = ((XFASTINT (ccl_prog[ic + (i / 3)])) \
491 >> ((2 - (i % 3)) * 8)) & 0xFF; \
492 else \
493 CCL_SUSPEND; \
494 } while (0)
495
496/* Read one byte from the current input buffer into Rth register. */
497#define CCL_READ_CHAR(r) \
498 do { \
499 if (!src) \
500 CCL_INVALID_CMD; \
501 else if (src < src_end) \
502 r = *src++; \
503 else if (ccl->last_block) \
504 { \
505 ic = ccl->eof_ic; \
506 goto ccl_finish; \
507 } \
508 else \
509 CCL_SUSPEND; \
510 } while (0)
511
512
513/* Execute CCL code on SRC_BYTES length text at SOURCE. The resulting
514 text goes to a place pointed by DESTINATION, the length of which
515 should not exceed DST_BYTES. The bytes actually processed is
516 returned as *CONSUMED. The return value is the length of the
517 resulting text. As a side effect, the contents of CCL registers
518 are updated. If SOURCE or DESTINATION is NULL, only operations on
519 registers are permitted. */
520
521#ifdef CCL_DEBUG
522#define CCL_DEBUG_BACKTRACE_LEN 256
523int ccl_backtrace_table[CCL_BACKTRACE_TABLE];
524int ccl_backtrace_idx;
525#endif
526
527struct ccl_prog_stack
528 {
529 int *ccl_prog; /* Pointer to an array of CCL code. */
530 int ic; /* Instruction Counter. */
531 };
532
533ccl_driver (ccl, source, destination, src_bytes, dst_bytes, consumed)
534 struct ccl_program *ccl;
535 unsigned char *source, *destination;
536 int src_bytes, dst_bytes;
537 int *consumed;
538{
539 register int *reg = ccl->reg;
540 register int ic = ccl->ic;
541 register int code, field1, field2;
542 register int *ccl_prog = ccl->prog;
543 unsigned char *src = source, *src_end = src + src_bytes;
544 unsigned char *dst = destination, *dst_end = dst + dst_bytes;
545 int jump_address;
546 int i, j, op;
547 int stack_idx = 0;
548 /* For the moment, we only support depth 256 of stack. */
549 struct ccl_prog_stack ccl_prog_stack_struct[256];
550
551 if (ic >= ccl->eof_ic)
552 ic = CCL_HEADER_MAIN;
553
554#ifdef CCL_DEBUG
555 ccl_backtrace_idx = 0;
556#endif
557
558 for (;;)
559 {
560#ifdef CCL_DEBUG
561 ccl_backtrace_table[ccl_backtrace_idx++] = ic;
562 if (ccl_backtrace_idx >= CCL_DEBUG_BACKTRACE_LEN)
563 ccl_backtrace_idx = 0;
564 ccl_backtrace_table[ccl_backtrace_idx] = 0;
565#endif
566
567 if (!NILP (Vquit_flag) && NILP (Vinhibit_quit))
568 {
569 /* We can't just signal Qquit, instead break the loop as if
570 the whole data is processed. Don't reset Vquit_flag, it
571 must be handled later at a safer place. */
572 if (consumed)
573 src = source + src_bytes;
574 ccl->status = CCL_STAT_QUIT;
575 break;
576 }
577
578 code = XINT (ccl_prog[ic]); ic++;
579 field1 = code >> 8;
580 field2 = (code & 0xFF) >> 5;
581
582#define rrr field2
583#define RRR (field1 & 7)
584#define Rrr ((field1 >> 3) & 7)
585#define ADDR field1
586
587 switch (code & 0x1F)
588 {
589 case CCL_SetRegister: /* 00000000000000000RRRrrrXXXXX */
590 reg[rrr] = reg[RRR];
591 break;
592
593 case CCL_SetShortConst: /* CCCCCCCCCCCCCCCCCCCCrrrXXXXX */
594 reg[rrr] = field1;
595 break;
596
597 case CCL_SetConst: /* 00000000000000000000rrrXXXXX */
598 reg[rrr] = XINT (ccl_prog[ic]);
599 ic++;
600 break;
601
602 case CCL_SetArray: /* CCCCCCCCCCCCCCCCCCCCRRRrrrXXXXX */
603 i = reg[RRR];
604 j = field1 >> 3;
605 if ((unsigned int) i < j)
606 reg[rrr] = XINT (ccl_prog[ic + i]);
607 ic += j;
608 break;
609
610 case CCL_Jump: /* A--D--D--R--E--S--S-000XXXXX */
611 ic += ADDR;
612 break;
613
614 case CCL_JumpCond: /* A--D--D--R--E--S--S-rrrXXXXX */
615 if (!reg[rrr])
616 ic += ADDR;
617 break;
618
619 case CCL_WriteRegisterJump: /* A--D--D--R--E--S--S-rrrXXXXX */
620 i = reg[rrr];
621 CCL_WRITE_CHAR (i);
622 ic += ADDR;
623 break;
624
625 case CCL_WriteRegisterReadJump: /* A--D--D--R--E--S--S-rrrXXXXX */
626 i = reg[rrr];
627 CCL_WRITE_CHAR (i);
628 ic++;
629 CCL_READ_CHAR (reg[rrr]);
630 ic += ADDR - 1;
631 break;
632
633 case CCL_WriteConstJump: /* A--D--D--R--E--S--S-000XXXXX */
634 i = XINT (ccl_prog[ic]);
635 CCL_WRITE_CHAR (i);
636 ic += ADDR;
637 break;
638
639 case CCL_WriteConstReadJump: /* A--D--D--R--E--S--S-rrrXXXXX */
640 i = XINT (ccl_prog[ic]);
641 CCL_WRITE_CHAR (i);
642 ic++;
643 CCL_READ_CHAR (reg[rrr]);
644 ic += ADDR - 1;
645 break;
646
647 case CCL_WriteStringJump: /* A--D--D--R--E--S--S-000XXXXX */
648 j = XINT (ccl_prog[ic]);
649 ic++;
650 CCL_WRITE_STRING (j);
651 ic += ADDR - 1;
652 break;
653
654 case CCL_WriteArrayReadJump: /* A--D--D--R--E--S--S-rrrXXXXX */
655 i = reg[rrr];
656 j = ccl_prog[ic++];
657 if ((unsigned int) i < j)
658 {
659 i = XINT (ccl_prog[ic + i]);
660 CCL_WRITE_CHAR (i);
661 }
662 ic += j + 1;
663 CCL_READ_CHAR (reg[rrr]);
664 ic += ADDR - (j + 2);
665 break;
666
667 case CCL_ReadJump: /* A--D--D--R--E--S--S-rrrYYYYY */
668 CCL_READ_CHAR (reg[rrr]);
669 ic += ADDR;
670 break;
671
672 case CCL_ReadBranch: /* CCCCCCCCCCCCCCCCCCCCrrrXXXXX */
673 CCL_READ_CHAR (reg[rrr]);
674 /* fall through ... */
675 case CCL_Branch: /* CCCCCCCCCCCCCCCCCCCCrrrXXXXX */
676 if ((unsigned int) reg[rrr] < field1)
677 ic += XINT (ccl_prog[ic + reg[rrr]]);
678 else
679 ic += XINT (ccl_prog[ic + field1]);
680 break;
681
682 case CCL_ReadRegister: /* CCCCCCCCCCCCCCCCCCCCrrXXXXX */
683 while (1)
684 {
685 CCL_READ_CHAR (reg[rrr]);
686 if (!field1) break;
687 code = XINT (ccl_prog[ic]); ic++;
688 field1 = code >> 8;
689 field2 = (code & 0xFF) >> 5;
690 }
691 break;
692
693 case CCL_WriteExprConst: /* 1:00000OPERATION000RRR000XXXXX */
694 rrr = 7;
695 i = reg[RRR];
696 j = XINT (ccl_prog[ic]);
697 op = field1 >> 6;
698 ic++;
699 goto ccl_set_expr;
700
701 case CCL_WriteRegister: /* CCCCCCCCCCCCCCCCCCCrrrXXXXX */
702 while (1)
703 {
704 i = reg[rrr];
705 CCL_WRITE_CHAR (i);
706 if (!field1) break;
707 code = XINT (ccl_prog[ic]); ic++;
708 field1 = code >> 8;
709 field2 = (code & 0xFF) >> 5;
710 }
711 break;
712
713 case CCL_WriteExprRegister: /* 1:00000OPERATIONRrrRRR000XXXXX */
714 rrr = 7;
715 i = reg[RRR];
716 j = reg[Rrr];
717 op = field1 >> 6;
718 goto ccl_set_expr;
719
720 case CCL_Call: /* CCCCCCCCCCCCCCCCCCCC000XXXXX */
721 {
722 Lisp_Object slot;
723
724 if (stack_idx >= 256
725 || field1 < 0
726 || field1 >= XVECTOR (Vccl_program_table)->size
727 || (slot = XVECTOR (Vccl_program_table)->contents[field1],
728 !CONSP (slot))
729 || !VECTORP (XCONS (slot)->cdr))
730 {
731 if (stack_idx > 0)
732 {
733 ccl_prog = ccl_prog_stack_struct[0].ccl_prog;
734 ic = ccl_prog_stack_struct[0].ic;
735 }
736 CCL_INVALID_CMD;
737 }
738
739 ccl_prog_stack_struct[stack_idx].ccl_prog = ccl_prog;
740 ccl_prog_stack_struct[stack_idx].ic = ic;
741 stack_idx++;
742 ccl_prog = XVECTOR (XCONS (slot)->cdr)->contents;
743 ic = CCL_HEADER_MAIN;
744 }
745 break;
746
747 case CCL_WriteConstString: /* CCCCCCCCCCCCCCCCCCCCrrrXXXXX */
748 if (!rrr)
749 CCL_WRITE_CHAR (field1);
750 else
751 {
752 CCL_WRITE_STRING (field1);
753 ic += (field1 + 2) / 3;
754 }
755 break;
756
757 case CCL_WriteArray: /* CCCCCCCCCCCCCCCCCCCCrrrXXXXX */
758 i = reg[rrr];
759 if ((unsigned int) i < field1)
760 {
761 j = XINT (ccl_prog[ic + i]);
762 CCL_WRITE_CHAR (j);
763 }
764 ic += field1;
765 break;
766
767 case CCL_End: /* 0000000000000000000000XXXXX */
768 if (stack_idx-- > 0)
769 {
770 ccl_prog = ccl_prog_stack_struct[stack_idx].ccl_prog;
771 ic = ccl_prog_stack_struct[stack_idx].ic;
772 break;
773 }
774 CCL_SUCCESS;
775
776 case CCL_ExprSelfConst: /* 00000OPERATION000000rrrXXXXX */
777 i = XINT (ccl_prog[ic]);
778 ic++;
779 op = field1 >> 6;
780 goto ccl_expr_self;
781
782 case CCL_ExprSelfReg: /* 00000OPERATION000RRRrrrXXXXX */
783 i = reg[RRR];
784 op = field1 >> 6;
785
786 ccl_expr_self:
787 switch (op)
788 {
789 case CCL_PLUS: reg[rrr] += i; break;
790 case CCL_MINUS: reg[rrr] -= i; break;
791 case CCL_MUL: reg[rrr] *= i; break;
792 case CCL_DIV: reg[rrr] /= i; break;
793 case CCL_MOD: reg[rrr] %= i; break;
794 case CCL_AND: reg[rrr] &= i; break;
795 case CCL_OR: reg[rrr] |= i; break;
796 case CCL_XOR: reg[rrr] ^= i; break;
797 case CCL_LSH: reg[rrr] <<= i; break;
798 case CCL_RSH: reg[rrr] >>= i; break;
799 case CCL_LSH8: reg[rrr] <<= 8; reg[rrr] |= i; break;
800 case CCL_RSH8: reg[7] = reg[rrr] & 0xFF; reg[rrr] >>= 8; break;
801 case CCL_DIVMOD: reg[7] = reg[rrr] % i; reg[rrr] /= i; break;
802 case CCL_LS: reg[rrr] = reg[rrr] < i; break;
803 case CCL_GT: reg[rrr] = reg[rrr] > i; break;
804 case CCL_EQ: reg[rrr] = reg[rrr] == i; break;
805 case CCL_LE: reg[rrr] = reg[rrr] <= i; break;
806 case CCL_GE: reg[rrr] = reg[rrr] >= i; break;
807 case CCL_NE: reg[rrr] = reg[rrr] != i; break;
808 default: CCL_INVALID_CMD;
809 }
810 break;
811
812 case CCL_SetExprConst: /* 00000OPERATION000RRRrrrXXXXX */
813 i = reg[RRR];
814 j = XINT (ccl_prog[ic]);
815 op = field1 >> 6;
816 jump_address = ++ic;
817 goto ccl_set_expr;
818
819 case CCL_SetExprReg: /* 00000OPERATIONRrrRRRrrrXXXXX */
820 i = reg[RRR];
821 j = reg[Rrr];
822 op = field1 >> 6;
823 jump_address = ic;
824 goto ccl_set_expr;
825
826 case CCL_ReadJumpCondExprConst: /* A--D--D--R--E--S--S-rrrXXXXX */
827 CCL_READ_CHAR (reg[rrr]);
828 case CCL_JumpCondExprConst: /* A--D--D--R--E--S--S-rrrXXXXX */
829 i = reg[rrr];
830 op = XINT (ccl_prog[ic]);
831 jump_address = ic++ + ADDR;
832 j = XINT (ccl_prog[ic]);
833 ic++;
834 rrr = 7;
835 goto ccl_set_expr;
836
837 case CCL_ReadJumpCondExprReg: /* A--D--D--R--E--S--S-rrrXXXXX */
838 CCL_READ_CHAR (reg[rrr]);
839 case CCL_JumpCondExprReg:
840 i = reg[rrr];
841 op = XINT (ccl_prog[ic]);
842 jump_address = ic++ + ADDR;
843 j = reg[XINT (ccl_prog[ic])];
844 ic++;
845 rrr = 7;
846
847 ccl_set_expr:
848 switch (op)
849 {
850 case CCL_PLUS: reg[rrr] = i + j; break;
851 case CCL_MINUS: reg[rrr] = i - j; break;
852 case CCL_MUL: reg[rrr] = i * j; break;
853 case CCL_DIV: reg[rrr] = i / j; break;
854 case CCL_MOD: reg[rrr] = i % j; break;
855 case CCL_AND: reg[rrr] = i & j; break;
856 case CCL_OR: reg[rrr] = i | j; break;
857 case CCL_XOR: reg[rrr] = i ^ j;; break;
858 case CCL_LSH: reg[rrr] = i << j; break;
859 case CCL_RSH: reg[rrr] = i >> j; break;
860 case CCL_LSH8: reg[rrr] = (i << 8) | j; break;
861 case CCL_RSH8: reg[rrr] = i >> 8; reg[7] = i & 0xFF; break;
862 case CCL_DIVMOD: reg[rrr] = i / j; reg[7] = i % j; break;
863 case CCL_LS: reg[rrr] = i < j; break;
864 case CCL_GT: reg[rrr] = i > j; break;
865 case CCL_EQ: reg[rrr] = i == j; break;
866 case CCL_LE: reg[rrr] = i <= j; break;
867 case CCL_GE: reg[rrr] = i >= j; break;
868 case CCL_NE: reg[rrr] = i != j; break;
869 case CCL_ENCODE_SJIS: ENCODE_SJIS (i, j, reg[rrr], reg[7]); break;
870 case CCL_DECODE_SJIS: DECODE_SJIS (i, j, reg[rrr], reg[7]); break;
871 default: CCL_INVALID_CMD;
872 }
873 code &= 0x1F;
874 if (code == CCL_WriteExprConst || code == CCL_WriteExprRegister)
875 {
876 i = reg[rrr];
877 CCL_WRITE_CHAR (i);
878 }
879 else if (!reg[rrr])
880 ic = jump_address;
881 break;
882
883 default:
884 CCL_INVALID_CMD;
885 }
886 }
887
888 ccl_error_handler:
889 if (destination)
890 {
891 /* We can insert an error message only if DESTINATION is
892 specified and we still have a room to store the message
893 there. */
894 char msg[256];
895 int msglen;
896
897 switch (ccl->status)
898 {
899 case CCL_STAT_INVALID_CMD:
900 sprintf(msg, "\nCCL: Invalid command %x (ccl_code = %x) at %d.",
901 code & 0x1F, code, ic);
902#ifdef CCL_DEBUG
903 {
904 int i = ccl_backtrace_idx - 1;
905 int j;
906
907 msglen = strlen (msg);
908 if (dst + msglen <= dst_end)
909 {
910 bcopy (msg, dst, msglen);
911 dst += msglen;
912 }
913
914 for (j = 0; j < CCL_DEBUG_BACKTRACE_LEN; j++, i--)
915 {
916 if (i < 0) i = CCL_DEBUG_BACKTRACE_LEN - 1;
917 if (ccl_backtrace_table[i] == 0)
918 break;
919 sprintf(msg, " %d", ccl_backtrace_table[i]);
920 msglen = strlen (msg);
921 if (dst + msglen > dst_end)
922 break;
923 bcopy (msg, dst, msglen);
924 dst += msglen;
925 }
926 }
927 goto ccl_finish;
928#endif
929
930 case CCL_STAT_QUIT:
931 sprintf(msg, "\nCCL: Quited.");
932 break;
933
934 default:
935 sprintf(msg, "\nCCL: Unknown error type (%d).", ccl->status);
936 }
937
938 msglen = strlen (msg);
939 if (dst + msglen <= dst_end)
940 {
941 bcopy (msg, dst, msglen);
942 dst += msglen;
943 }
944 }
945
946 ccl_finish:
947 ccl->ic = ic;
948 if (consumed) *consumed = src - source;
949 return dst - destination;
950}
951
952/* Setup fields of the structure pointed by CCL appropriately for the
953 execution of compiled CCL code in VEC (vector of integer). */
954setup_ccl_program (ccl, vec)
955 struct ccl_program *ccl;
956 Lisp_Object vec;
957{
958 int i;
959
960 ccl->size = XVECTOR (vec)->size;
961 ccl->prog = XVECTOR (vec)->contents;
962 ccl->ic = CCL_HEADER_MAIN;
963 ccl->eof_ic = XINT (XVECTOR (vec)->contents[CCL_HEADER_EOF]);
964 ccl->buf_magnification = XINT (XVECTOR (vec)->contents[CCL_HEADER_BUF_MAG]);
965 for (i = 0; i < 8; i++)
966 ccl->reg[i] = 0;
967 ccl->last_block = 0;
968 ccl->status = 0;
969}
970
971#ifdef emacs
972
973DEFUN ("ccl-execute", Fccl_execute, Sccl_execute, 2, 2, 0,
974 "Execute CCL-PROGRAM with registers initialized by REGISTERS.\n\
975CCL-PROGRAM is a compiled code generated by `ccl-compile',\n\
976 no I/O commands should appear in the CCL program.\n\
977REGISTERS is a vector of [R0 R1 ... R7] where RN is an initial value\n\
978 of Nth register.\n\
979As side effect, each element of REGISTER holds the value of\n\
980 corresponding register after the execution.")
981 (ccl_prog, reg)
982 Lisp_Object ccl_prog, reg;
983{
984 struct ccl_program ccl;
985 int i;
986
987 CHECK_VECTOR (ccl_prog, 0);
988 CHECK_VECTOR (reg, 1);
989 if (XVECTOR (reg)->size != 8)
990 error ("Invalid length of vector REGISTERS");
991
992 setup_ccl_program (&ccl, ccl_prog);
993 for (i = 0; i < 8; i++)
994 ccl.reg[i] = (INTEGERP (XVECTOR (reg)->contents[i])
995 ? XINT (XVECTOR (reg)->contents[i])
996 : 0);
997
998 ccl_driver (&ccl, (char *)0, (char *)0, 0, 0, (int *)0);
999 QUIT;
1000 if (ccl.status != CCL_STAT_SUCCESS)
1001 error ("Error in CCL program at %dth code", ccl.ic);
1002
1003 for (i = 0; i < 8; i++)
1004 XSETINT (XVECTOR (reg)->contents[i], ccl.reg[i]);
1005 return Qnil;
1006}
1007
1008DEFUN ("ccl-execute-on-string", Fccl_execute_on_string, Sccl_execute_on_string,
1009 3, 3, 0,
1010 "Execute CCL-PROGRAM with initial STATUS on STRING.\n\
1011CCL-PROGRAM is a compiled code generated by `ccl-compile'.\n\
1012Read buffer is set to STRING, and write buffer is allocated automatically.\n\
1013STATUS is a vector of [R0 R1 ... R7 IC], where\n\
1014 R0..R7 are initial values of corresponding registers,\n\
1015 IC is the instruction counter specifying from where to start the program.\n\
1016If R0..R7 are nil, they are initialized to 0.\n\
1017If IC is nil, it is initialized to head of the CCL program.\n\
1018Returns the contents of write buffer as a string,\n\
1019 and as side effect, STATUS is updated.")
1020 (ccl_prog, status, str)
1021 Lisp_Object ccl_prog, status, str;
1022{
1023 Lisp_Object val;
1024 struct ccl_program ccl;
1025 int i, produced;
1026 int outbufsize;
1027 char *outbuf;
1028 struct gcpro gcpro1, gcpro2, gcpro3;
1029
1030 CHECK_VECTOR (ccl_prog, 0);
1031 CHECK_VECTOR (status, 1);
1032 if (XVECTOR (status)->size != 9)
1033 error ("Invalid length of vector STATUS");
1034 CHECK_STRING (str, 2);
1035 GCPRO3 (ccl_prog, status, str);
1036
1037 setup_ccl_program (&ccl, ccl_prog);
1038 for (i = 0; i < 8; i++)
1039 {
1040 if (NILP (XVECTOR (status)->contents[i]))
1041 XSETINT (XVECTOR (status)->contents[i], 0);
1042 if (INTEGERP (XVECTOR (status)->contents[i]))
1043 ccl.reg[i] = XINT (XVECTOR (status)->contents[i]);
1044 }
1045 if (INTEGERP (XVECTOR (status)->contents[i]))
1046 {
1047 i = XFASTINT (XVECTOR (status)->contents[8]);
1048 if (ccl.ic < i && i < ccl.size)
1049 ccl.ic = i;
1050 }
1051 outbufsize = XSTRING (str)->size * ccl.buf_magnification + 256;
1052 outbuf = (char *) xmalloc (outbufsize);
1053 if (!outbuf)
1054 error ("Not enough memory");
1055 ccl.last_block = 1;
1056 produced = ccl_driver (&ccl, XSTRING (str)->data, outbuf,
1057 XSTRING (str)->size, outbufsize, (int *)0);
1058 for (i = 0; i < 8; i++)
1059 XSET (XVECTOR (status)->contents[i], Lisp_Int, ccl.reg[i]);
1060 XSETINT (XVECTOR (status)->contents[8], ccl.ic);
1061 UNGCPRO;
1062
1063 val = make_string (outbuf, produced);
1064 free (outbuf);
1065 QUIT;
1066 if (ccl.status != CCL_STAT_SUCCESS
1067 && ccl.status != CCL_STAT_SUSPEND)
1068 error ("Error in CCL program at %dth code", ccl.ic);
1069
1070 return val;
1071}
1072
1073DEFUN ("register-ccl-program", Fregister_ccl_program, Sregister_ccl_program,
1074 2, 2, 0,
1075 "Register CCL program PROGRAM of NAME in `ccl-program-table'.
1076PROGRAM should be a compiled code of CCL program, or nil.
1077Return index number of the registered CCL program.")
1078 (name, ccl_prog)
1079 Lisp_Object name, ccl_prog;
1080{
1081 int len = XVECTOR (Vccl_program_table)->size;
1082 int i, idx;
1083
1084 CHECK_SYMBOL (name, 0);
1085 if (!NILP (ccl_prog))
1086 CHECK_VECTOR (ccl_prog, 1);
1087
1088 for (i = 0; i < len; i++)
1089 {
1090 Lisp_Object slot = XVECTOR (Vccl_program_table)->contents[i];
1091
1092 if (!CONSP (slot))
1093 break;
1094
1095 if (EQ (name, XCONS (slot)->car))
1096 {
1097 XCONS (slot)->cdr = ccl_prog;
1098 return make_number (i);
1099 }
1100 }
1101
1102 if (i == len)
1103 {
1104 Lisp_Object new_table = Fmake_vector (len * 2, Qnil);
1105 int j;
1106
1107 for (j = 0; j < len; j++)
1108 XVECTOR (new_table)->contents[j]
1109 = XVECTOR (Vccl_program_table)->contents[j];
1110 Vccl_program_table = new_table;
1111 }
1112
1113 XVECTOR (Vccl_program_table)->contents[i] = Fcons (name, ccl_prog);
1114 return make_number (i);
1115}
1116
1117syms_of_ccl ()
1118{
1119 staticpro (&Vccl_program_table);
1120 Vccl_program_table = Fmake_vector (32, Qnil);
1121
1122 DEFVAR_LISP ("font-ccl-encoder-alist", &Vfont_ccl_encoder_alist,
1123 "Alist of fontname patterns vs corresponding CCL program.\n\
1124Each element looks like (REGEXP . CCL-CODE),\n\
1125 where CCL-CODE is a compiled CCL program.\n\
1126When a font whose name matches REGEXP is used for displaying a character,\n\
1127 CCL-CODE is executed to calculate the code point in the font\n\
1128 from the charset number and position code(s) of the character which are set\n\
1129 in CCL registers R0, R1, and R2 before the execution.\n\
1130The code point in the font is set in CCL registers R1 and R2\n\
1131 when the execution terminated.\n\
1132If the font is single-byte font, the register R2 is not used.");
1133 Vfont_ccl_encoder_alist = Qnil;
1134
1135 defsubr (&Sccl_execute);
1136 defsubr (&Sccl_execute_on_string);
1137 defsubr (&Sregister_ccl_program);
1138}
1139
1140#endif /* emacs */
diff --git a/src/ccl.h b/src/ccl.h
new file mode 100644
index 00000000000..ebda0cc1595
--- /dev/null
+++ b/src/ccl.h
@@ -0,0 +1,53 @@
1/* Header for CCL (Code Conversion Language) interpreter.
2
3 Copyright (C) 1995 Free Software Foundation, Inc.
4 Copyright (C) 1995 Electrotechnical Laboratory, JAPAN.
5
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
9 any later version.
10
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, write to the Free Software
18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
19
20#ifndef _CCL_H
21#define _CCL_H
22
23/* Structure to hold information about running CCL code. Read
24 comments in the file ccl.c for the detail of each field. */
25struct ccl_program {
26 int size; /* Size of the compiled code. */
27 Lisp_Object *prog; /* Pointer into the compiled code. */
28 int ic; /* Instruction Counter (index for PROG). */
29 int eof_ic; /* Instruction Counter for end-of-file
30 processing code. */
31 int reg[8]; /* CCL registers, reg[7] is used for
32 condition flag of relational
33 operations. */
34 int last_block; /* Set to 1 while processing the last
35 block. */
36 int status; /* Exit status of the CCL program. */
37 int buf_magnification; /* Output buffer magnification. How
38 many times bigger the output buffer
39 should be than the input buffer. */
40};
41
42/* This data type is used for the spec field of the structure
43 coding_system. */
44
45struct ccl_spec {
46 struct ccl_program decoder;
47 struct ccl_program encoder;
48};
49
50/* Alist of fontname patterns vs corresponding CCL program. */
51extern Lisp_Object Vfont_ccl_encoder_alist;
52
53#endif /* _CCL_H */
diff --git a/src/charset.c b/src/charset.c
new file mode 100644
index 00000000000..b962f346f22
--- /dev/null
+++ b/src/charset.c
@@ -0,0 +1,1452 @@
1/* Multilingual characters handler.
2 Ver.1.0
3
4 Copyright (C) 1995 Free Software Foundation, Inc.
5 Copyright (C) 1995 Electrotechnical Laboratory, JAPAN.
6
7 This program is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2, or (at your option)
10 any later version.
11
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with this program; if not, write to the Free Software
19 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
20
21/* At first, see the document in `charset.h' to understand the code in
22 this file. */
23
24#include <stdio.h>
25
26#ifdef emacs
27
28#include <sys/types.h>
29#include <config.h>
30#include "lisp.h"
31#include "buffer.h"
32#include "charset.h"
33#include "coding.h"
34
35#else /* not emacs */
36
37#include "mulelib.h"
38
39#endif /* emacs */
40
41Lisp_Object Qcharset, Qascii, Qcomposition;
42
43/* Declaration of special leading-codes. */
44int leading_code_composition; /* for composite characters */
45int leading_code_private_11; /* for private DIMENSION1 of 1-column */
46int leading_code_private_12; /* for private DIMENSION1 of 2-column */
47int leading_code_private_21; /* for private DIMENSION2 of 1-column */
48int leading_code_private_22; /* for private DIMENSION2 of 2-column */
49
50/* Declaration of special charsets. */
51int charset_ascii; /* ASCII */
52int charset_composition; /* for a composite character */
53int charset_latin_iso8859_1; /* ISO8859-1 (Latin-1) */
54int charset_jisx0208_1978; /* JISX0208.1978 (Japanese Kanji old set) */
55int charset_jisx0208; /* JISX0208.1983 (Japanese Kanji) */
56int charset_katakana_jisx0201; /* JISX0201.Kana (Japanese Katakana) */
57int charset_latin_jisx0201; /* JISX0201.Roman (Japanese Roman) */
58int charset_big5_1; /* Big5 Level 1 (Chinese Traditional) */
59int charset_big5_2; /* Big5 Level 2 (Chinese Traditional) */
60
61Lisp_Object Qcharset_table;
62
63/* A char-table containing information of each character set. */
64Lisp_Object Vcharset_table;
65
66/* A vector of charset symbol indexed by charset-id. This is used
67 only for returning charset symbol from C functions. */
68Lisp_Object Vcharset_symbol_table;
69
70/* A list of charset symbols ever defined. */
71Lisp_Object Vcharset_list;
72
73/* Tables used by macros BYTES_BY_CHAR_HEAD and WIDTH_BY_CHAR_HEAD. */
74int bytes_by_char_head[256];
75int width_by_char_head[256];
76
77/* Mapping table from ISO2022's charset (specified by DIMENSION,
78 CHARS, and FINAL-CHAR) to Emacs' charset. */
79int iso_charset_table[2][2][128];
80
81/* Variables used locally in the macro FETCH_MULTIBYTE_CHAR. */
82unsigned char *_fetch_multibyte_char_p;
83int _fetch_multibyte_char_len;
84
85/* Set STR a pointer to the multi-byte form of the character C. If C
86 is not a composite character, the multi-byte form is set in WORKBUF
87 and STR points WORKBUF. The caller should allocate at least 4-byte
88 area at WORKBUF in advance. Returns the length of the multi-byte
89 form.
90
91 Use macro `CHAR_STRING (C, WORKBUF, STR)' instead of calling this
92 function directly if C can be an ASCII character. */
93
94int
95non_ascii_char_to_string (c, workbuf, str)
96 int c;
97 unsigned char *workbuf, **str;
98{
99 int charset;
100 unsigned char c1, c2;
101
102 if (COMPOSITE_CHAR_P (c))
103 {
104 int cmpchar_id = COMPOSITE_CHAR_ID (c);
105
106 if (cmpchar_id < n_cmpchars)
107 {
108 *str = cmpchar_table[cmpchar_id]->data;
109 return cmpchar_table[cmpchar_id]->len;
110 }
111 else
112 {
113 *str = workbuf;
114 return 0;
115 }
116 }
117
118 SPLIT_NON_ASCII_CHAR (c, charset, c1, c2);
119
120 *str = workbuf;
121 *workbuf++ = CHARSET_LEADING_CODE_BASE (charset);
122 if (*workbuf = CHARSET_LEADING_CODE_EXT (charset))
123 workbuf++;
124 *workbuf++ = c1 | 0x80;
125 if (c2)
126 *workbuf++ = c2 | 0x80;
127
128 return (workbuf - *str);
129}
130
131/* Return a non-ASCII character of which multi-byte form is at STR of
132 length LEN. If ACTUAL_LEN is not NULL, the actual length of the
133 character is set to the address ACTUAL_LEN.
134
135 Use macro `STRING_CHAR (STR, LEN)' instead of calling this function
136 directly if STR can hold an ASCII character. */
137
138string_to_non_ascii_char (str, len, actual_len)
139 unsigned char *str;
140 int len, *actual_len;
141{
142 int charset;
143 unsigned char c1, c2;
144 register int c;
145
146 if (SPLIT_STRING (str, len, charset, c1, c2) == CHARSET_ASCII)
147 {
148 if (actual_len)
149 *actual_len = 1;
150 return (int) *str;
151 }
152
153 c = MAKE_NON_ASCII_CHAR (charset, c1, c2);
154
155 if (actual_len)
156 *actual_len = (charset == CHARSET_COMPOSITION
157 ? cmpchar_table[COMPOSITE_CHAR_ID (c)]->len
158 : BYTES_BY_CHAR_HEAD (*str));
159 return c;
160}
161
162/* Return the length of the multi-byte form at string STR of length LEN. */
163int
164multibyte_form_length (str, len)
165 unsigned char *str;
166 int len;
167{
168 int charset;
169 unsigned char c1, c2;
170 register int c;
171
172 if (SPLIT_STRING (str, len, charset, c1, c2) == CHARSET_ASCII)
173 return 1;
174
175 return (charset == CHARSET_COMPOSITION
176 ? cmpchar_table[(c1 << 7) | c2]->len
177 : BYTES_BY_CHAR_HEAD (*str));
178}
179
180/* Check if string STR of length LEN contains valid multi-byte form of
181 a character. If valid, charset and position codes of the character
182 is set at *CHARSET, *C1, and *C2, and return 0. If not valid,
183 return -1. This should be used only in the macro SPLIT_STRING
184 which checks range of STR in advance. */
185
186split_non_ascii_string (str, len, charset, c1, c2)
187 register unsigned char *str, *c1, *c2;
188 register int len, *charset;
189{
190 register unsigned int cs = *str++;
191
192 if (cs == LEADING_CODE_COMPOSITION)
193 {
194 int cmpchar_id = str_cmpchar_id (str - 1, len);
195
196 if (cmpchar_id < 0)
197 return -1;
198 *charset = cs, *c1 = cmpchar_id >> 7, *c2 = cmpchar_id & 0x7F;
199 }
200 else if ((cs < LEADING_CODE_PRIVATE_11 || (cs = *str++) >= 0xA0)
201 && CHARSET_DEFINED_P (cs))
202 {
203 *charset = cs;
204 if (*str < 0xA0)
205 return -1;
206 *c1 = (*str++) & 0x7F;
207 if (CHARSET_DIMENSION (cs) == 2)
208 {
209 if (*str < 0xA0)
210 return -1;
211 *c2 = (*str++) & 0x7F;
212 }
213 }
214 else
215 return -1;
216 return 0;
217}
218
219/* Update the table Vcharset_table with the given arguments (see the
220 document of `define-charset' for the meaning of each argument).
221 Several other table contents are also updated. The caller should
222 check the validity of CHARSET-ID and the remaining arguments in
223 advance. */
224
225void
226update_charset_table (charset_id, dimension, chars, width, direction,
227 iso_final_char, iso_graphic_plane,
228 short_name, long_name, description)
229 Lisp_Object charset_id, dimension, chars, width, direction;
230 Lisp_Object iso_final_char, iso_graphic_plane;
231 Lisp_Object short_name, long_name, description;
232{
233 int charset = XINT (charset_id);
234 int bytes;
235 unsigned char leading_code_base, leading_code_ext;
236
237 if (NILP (Faref (Vcharset_table, charset_id)))
238 Faset (Vcharset_table, charset_id,
239 Fmake_vector (make_number (CHARSET_MAX_IDX), Qnil));
240
241 /* Get byte length of multibyte form, base leading-code, and
242 extended leading-code of the charset. See the comment under the
243 title "GENERAL NOTE on CHARACTER SET (CHARSET)" in charset.h. */
244 bytes = XINT (dimension);
245 if (charset < MIN_CHARSET_PRIVATE_DIMENSION1)
246 {
247 /* Official charset, it doesn't have an extended leading-code. */
248 if (charset != CHARSET_ASCII)
249 bytes += 1; /* For a base leading-code. */
250 leading_code_base = charset;
251 leading_code_ext = 0;
252 }
253 else
254 {
255 /* Private charset. */
256 bytes += 2; /* For base and extended leading-codes. */
257 leading_code_base
258 = (charset < LEADING_CODE_EXT_12
259 ? LEADING_CODE_PRIVATE_11
260 : (charset < LEADING_CODE_EXT_21
261 ? LEADING_CODE_PRIVATE_12
262 : (charset < LEADING_CODE_EXT_22
263 ? LEADING_CODE_PRIVATE_21
264 : LEADING_CODE_PRIVATE_22)));
265 leading_code_ext = charset;
266 }
267
268 CHARSET_TABLE_INFO (charset, CHARSET_ID_IDX) = charset_id;
269 CHARSET_TABLE_INFO (charset, CHARSET_BYTES_IDX) = make_number (bytes);
270 CHARSET_TABLE_INFO (charset, CHARSET_DIMENSION_IDX) = dimension;
271 CHARSET_TABLE_INFO (charset, CHARSET_CHARS_IDX) = chars;
272 CHARSET_TABLE_INFO (charset, CHARSET_WIDTH_IDX) = width;
273 CHARSET_TABLE_INFO (charset, CHARSET_DIRECTION_IDX) = direction;
274 CHARSET_TABLE_INFO (charset, CHARSET_LEADING_CODE_BASE_IDX)
275 = make_number (leading_code_base);
276 CHARSET_TABLE_INFO (charset, CHARSET_LEADING_CODE_EXT_IDX)
277 = make_number (leading_code_ext);
278 CHARSET_TABLE_INFO (charset, CHARSET_ISO_FINAL_CHAR_IDX) = iso_final_char;
279 CHARSET_TABLE_INFO (charset, CHARSET_ISO_GRAPHIC_PLANE_IDX)
280 = iso_graphic_plane;
281 CHARSET_TABLE_INFO (charset, CHARSET_SHORT_NAME_IDX) = short_name;
282 CHARSET_TABLE_INFO (charset, CHARSET_LONG_NAME_IDX) = long_name;
283 CHARSET_TABLE_INFO (charset, CHARSET_DESCRIPTION_IDX) = description;
284 CHARSET_TABLE_INFO (charset, CHARSET_PLIST_IDX) = Qnil;
285
286 {
287 /* If we have already defined a charset which has the same
288 DIMENSION, CHARS and ISO-FINAL-CHAR but the different
289 DIRECTION, we must update the entry REVERSE-CHARSET of both
290 charsets. If there's no such charset, the value of the entry
291 is set to nil. */
292 int i;
293
294 for (i = 0; i < MAX_CHARSET; i++)
295 if (!NILP (CHARSET_TABLE_ENTRY (i)))
296 {
297 if (CHARSET_DIMENSION (i) == XINT (dimension)
298 && CHARSET_CHARS (i) == XINT (chars)
299 && CHARSET_ISO_FINAL_CHAR (i) == XINT (iso_final_char)
300 && CHARSET_DIRECTION (i) != XINT (direction))
301 {
302 CHARSET_TABLE_INFO (charset, CHARSET_REVERSE_CHARSET_IDX)
303 = make_number (i);
304 CHARSET_TABLE_INFO (i, CHARSET_REVERSE_CHARSET_IDX) = charset_id;
305 break;
306 }
307 }
308 if (i >= MAX_CHARSET)
309 /* No such a charset. */
310 CHARSET_TABLE_INFO (charset, CHARSET_REVERSE_CHARSET_IDX)
311 = make_number (-1);
312 }
313
314 if (charset != CHARSET_ASCII
315 && charset < MIN_CHARSET_PRIVATE_DIMENSION1)
316 {
317 /* Update tables bytes_by_char_head and width_by_char_head. */
318 bytes_by_char_head[leading_code_base] = bytes;
319 width_by_char_head[leading_code_base] = XINT (width);
320
321 /* Update table emacs_code_class. */
322 emacs_code_class[charset] = (bytes == 2
323 ? EMACS_leading_code_2
324 : (bytes == 3
325 ? EMACS_leading_code_3
326 : EMACS_leading_code_4));
327 }
328
329 /* Update table iso_charset_table. */
330 if (ISO_CHARSET_TABLE (dimension, chars, iso_final_char) < 0)
331 ISO_CHARSET_TABLE (dimension, chars, iso_final_char) = charset;
332}
333
334#ifdef emacs
335
336/* Return charset id of CHARSET_SYMBOL, or return -1 if CHARSET_SYMBOL
337 is invalid. */
338int
339get_charset_id (charset_symbol)
340 Lisp_Object charset_symbol;
341{
342 Lisp_Object val;
343 int charset;
344
345 return ((SYMBOLP (charset_symbol)
346 && (val = Fget (charset_symbol, Qcharset), VECTORP (val))
347 && (charset = XINT (XVECTOR (val)->contents[CHARSET_ID_IDX]),
348 CHARSET_VALID_P (charset)))
349 ? charset : -1);
350}
351
352/* Return an identification number for a new private charset of
353 DIMENSION and WIDTH. If there's no more room for the new charset,
354 return 0. */
355Lisp_Object
356get_new_private_charset_id (dimension, width)
357 int dimension, width;
358{
359 int charset, from, to;
360
361 if (dimension == 1)
362 {
363 if (width == 1)
364 from = LEADING_CODE_EXT_11, to = LEADING_CODE_EXT_12;
365 else
366 from = LEADING_CODE_EXT_12, to = LEADING_CODE_EXT_21;
367 }
368 else
369 {
370 if (width == 1)
371 from = LEADING_CODE_EXT_21, to = LEADING_CODE_EXT_22;
372 else
373 from = LEADING_CODE_EXT_22, to = LEADING_CODE_EXT_MAX - 1;
374 }
375
376 for (charset = from; charset < to; charset++)
377 if (!CHARSET_DEFINED_P (charset)) break;
378
379 return make_number (charset < to ? charset : 0);
380}
381
382DEFUN ("define-charset", Fdefine_charset, Sdefine_charset, 3, 3, 0,
383 "Define CHARSET-ID as the identification number of CHARSET with INFO-VECTOR.\n\
384If CHARSET-ID is nil, it is set automatically, which means CHARSET is\n\
385 treated as a private charset.\n\
386INFO-VECTOR is a vector of the format:\n\
387 [DIMENSION CHARS WIDTH DIRECTION ISO-FINAL-CHAR ISO-GRAPHIC-PLANE\n\
388 SHORT-NAME LONG-NAME DESCRIPTION]\n\
389The meanings of each elements is as follows:\n\
390DIMENSION (integer) is the number of bytes to represent a character: 1 or 2.\n\
391CHARS (integer) is the number of characters in a dimension: 94 or 96.\n\
392WIDTH (integer) is the number of columns a character in the charset\n\
393occupies on the screen: one of 0, 1, and 2.\n\
394\n\
395DIRECTION (integer) is the rendering direction of characters in the\n\
396charset when rendering. If 0, render from right to left, else\n\
397render from left to right.\n\
398\n\
399ISO-FINAL-CHAR (character) is the final character of the\n\
400corresponding ISO 2022 charset.\n\
401\n\
402ISO-GRAPHIC-PLANE (integer) is the graphic plane to be invoked\n\
403while encoding to variants of ISO 2022 coding system, one of the\n\
404following: 0/graphic-plane-left(GL), 1/graphic-plane-right(GR).\n\
405\n\
406SHORT-NAME (string) is the short name to refer to the charset.\n\
407\n\
408LONG-NAME (string) is the long name to refer to the charset.\n\
409\n\
410DESCRIPTION (string) is the description string of the charset.")
411 (charset_id, charset_symbol, info_vector)
412 Lisp_Object charset_id, charset_symbol, info_vector;
413{
414 Lisp_Object *vec;
415
416 if (!NILP (charset_id))
417 CHECK_NUMBER (charset_id, 0);
418 CHECK_SYMBOL (charset_symbol, 1);
419 CHECK_VECTOR (info_vector, 2);
420
421 if (! NILP (charset_id))
422 {
423 if (! CHARSET_VALID_P (XINT (charset_id)))
424 error ("Invalid CHARSET: %d", XINT (charset_id));
425 else if (CHARSET_DEFINED_P (XINT (charset_id)))
426 error ("Already defined charset: %d", XINT (charset_id));
427 }
428
429 vec = XVECTOR (info_vector)->contents;
430 if (XVECTOR (info_vector)->size != 9
431 || !INTEGERP (vec[0]) || !(XINT (vec[0]) == 1 || XINT (vec[0]) == 2)
432 || !INTEGERP (vec[1]) || !(XINT (vec[1]) == 94 || XINT (vec[1]) == 96)
433 || !INTEGERP (vec[2]) || !(XINT (vec[2]) == 1 || XINT (vec[2]) == 2)
434 || !INTEGERP (vec[3]) || !(XINT (vec[3]) == 0 || XINT (vec[3]) == 1)
435 || !INTEGERP (vec[4]) || !(XINT (vec[4]) >= '0' && XINT (vec[4]) <= '~')
436 || !INTEGERP (vec[5]) || !(XINT (vec[5]) == 0 || XINT (vec[5]) == 1)
437 || !STRINGP (vec[6])
438 || !STRINGP (vec[7])
439 || !STRINGP (vec[8]))
440 error ("Invalid info-vector argument for defining charset %s",
441 XSYMBOL (charset_symbol)->name->data);
442
443 if (NILP (charset_id))
444 {
445 charset_id = get_new_private_charset_id (XINT (vec[0]), XINT (vec[2]));
446 if (XINT (charset_id) == 0)
447 error ("There's no room for a new private charset %s",
448 XSYMBOL (charset_symbol)->name->data);
449 }
450
451 update_charset_table (charset_id, vec[0], vec[1], vec[2], vec[3],
452 vec[4], vec[5], vec[6], vec[7], vec[8]);
453 Fput (charset_symbol, Qcharset, Faref (Vcharset_table, charset_id));
454 CHARSET_SYMBOL (XINT (charset_id)) = charset_symbol;
455 Vcharset_list = Fcons (charset_symbol, Vcharset_list);
456 return Qnil;
457}
458
459DEFUN ("declare-equiv-charset", Fdeclare_equiv_charset, Sdeclare_equiv_charset,
460 4, 4, 0,
461 "Declare a charset of DIMENSION, CHARS, FINAL-CHAR is the same as CHARSET.\n\
462CHARSET should be defined by `defined-charset' in advance.")
463 (dimension, chars, final_char, charset_symbol)
464 Lisp_Object dimension, chars, final_char, charset_symbol;
465{
466 int charset;
467
468 CHECK_NUMBER (dimension, 0);
469 CHECK_NUMBER (chars, 1);
470 CHECK_NUMBER (final_char, 2);
471 CHECK_SYMBOL (charset_symbol, 3);
472
473 if (XINT (dimension) != 1 && XINT (dimension) != 2)
474 error ("Invalid DIMENSION %d, it should be 1 or 2", XINT (dimension));
475 if (XINT (chars) != 94 && XINT (chars) != 96)
476 error ("Invalid CHARS %d, it should be 94 or 96", XINT (chars));
477 if (XINT (final_char) < '0' || XFASTINT (final_char) > '~')
478 error ("Invalid FINAL-CHAR %c, it should be `0'..`~'", XINT (chars));
479 if ((charset = get_charset_id (charset_symbol)) < 0)
480 error ("Invalid charset %s", XSYMBOL (charset_symbol)->name->data);
481
482 ISO_CHARSET_TABLE (dimension, chars, final_char) = charset;
483 return Qnil;
484}
485
486/* Return number of different charsets in STR of length LEN. In
487 addition, for each found charset N, CHARSETS[N] is set 1. The
488 caller should allocate CHARSETS (MAX_CHARSET bytes) in advance. */
489
490int
491find_charset_in_str (str, len, charsets)
492 unsigned char *str, *charsets;
493 int len;
494{
495 int num = 0;
496
497 while (len > 0)
498 {
499 int bytes = BYTES_BY_CHAR_HEAD (*str);
500 int charset = CHARSET_AT (str);
501
502 if (!charsets[charset])
503 {
504 charsets[charset] = 1;
505 num += 1;
506 }
507 str += bytes;
508 len -= bytes;
509 }
510 return num;
511}
512
513DEFUN ("find-charset-region", Ffind_charset_region, Sfind_charset_region,
514 2, 2, 0,
515 "Return a list of charsets in the region between BEG and END.\n\
516BEG and END are buffer positions.")
517 (beg, end)
518 Lisp_Object beg, end;
519{
520 char charsets[MAX_CHARSET];
521 int from, to, stop, i;
522 Lisp_Object val;
523
524 validate_region (&beg, &end);
525 from = XFASTINT (beg);
526 stop = to = XFASTINT (end);
527 if (from < GPT && GPT < to)
528 stop = GPT;
529 bzero (charsets, MAX_CHARSET);
530 while (1)
531 {
532 find_charset_in_str (POS_ADDR (from), stop - from, charsets);
533 if (stop < to)
534 from = stop, stop = to;
535 else
536 break;
537 }
538 val = Qnil;
539 for (i = MAX_CHARSET - 1; i >= 0; i--)
540 if (charsets[i])
541 val = Fcons (CHARSET_SYMBOL (i), val);
542 return val;
543}
544
545DEFUN ("find-charset-string", Ffind_charset_string, Sfind_charset_string,
546 1, 1, 0,
547 "Return a list of charsets in STR.")
548 (str)
549 Lisp_Object str;
550{
551 char charsets[MAX_CHARSET];
552 int i;
553 Lisp_Object val;
554
555 CHECK_STRING (str, 0);
556 bzero (charsets, MAX_CHARSET);
557 find_charset_in_str (XSTRING (str)->data, XSTRING (str)->size, charsets);
558 val = Qnil;
559 for (i = MAX_CHARSET - 1; i >= 0; i--)
560 if (charsets[i])
561 val = Fcons (CHARSET_SYMBOL (i), val);
562 return val;
563}
564
565DEFUN ("make-char-internal", Fmake_char_internal, Smake_char_internal, 1, 3, 0,
566 "Return a character of CHARSET and position-codes CODE1 and CODE2.\n\
567CODE1 and CODE2 are optional, but if you don't supply\n\
568 sufficient position-codes, return a generic character which stands for\n\
569all characters or group of characters in the character sets.\n\
570A generic character can be an argument of `modify-syntax-entry' and\n\
571`modify-category-entry'.")
572 (charset, code1, code2)
573 Lisp_Object charset, code1, code2;
574{
575 CHECK_NUMBER (charset, 0);
576
577 if (NILP (code1))
578 XSETFASTINT (code1, 0);
579 else
580 CHECK_NUMBER (code1, 1);
581 if (NILP (code2))
582 XSETFASTINT (code2, 0);
583 else
584 CHECK_NUMBER (code2, 2);
585
586 if (!CHARSET_DEFINED_P (XINT (charset)))
587 error ("Invalid charset: %d", XINT (charset));
588
589 return make_number (MAKE_CHAR (XINT (charset), XINT (code1), XINT (code2)));
590}
591
592DEFUN ("split-char", Fsplit_char, Ssplit_char, 1, 1, 0,
593 "Return list of charset and one or two position-codes of CHAR.")
594 (ch)
595 Lisp_Object ch;
596{
597 Lisp_Object val;
598 int charset;
599 unsigned char c1, c2;
600
601 CHECK_NUMBER (ch, 0);
602 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
603 return ((charset == CHARSET_COMPOSITION || CHARSET_DIMENSION (charset) == 2)
604 ? Fcons (CHARSET_SYMBOL (charset),
605 Fcons (make_number (c1), Fcons (make_number (c2), Qnil)))
606 : Fcons (CHARSET_SYMBOL (charset), Fcons (make_number (c1), Qnil)));
607}
608
609DEFUN ("char-charset", Fchar_charset, Schar_charset, 1, 1, 0,
610 "Return charset of CHAR.")
611 (ch)
612 Lisp_Object ch;
613{
614 CHECK_NUMBER (ch, 0);
615
616 return CHARSET_SYMBOL (CHAR_CHARSET (XINT (ch)));
617}
618
619DEFUN ("iso-charset", Fiso_charset, Siso_charset, 3, 3, 0,
620 "Return charset of ISO's specification DIMENSION, CHARS, and FINAL-CHAR.")
621 (dimension, chars, final_char)
622 Lisp_Object dimension, chars, final_char;
623{
624 int charset;
625
626 CHECK_NUMBER (dimension, 0);
627 CHECK_NUMBER (chars, 1);
628 CHECK_NUMBER (final_char, 2);
629
630 if ((charset = ISO_CHARSET_TABLE (dimension, chars, final_char)) < 0)
631 return Qnil;
632 return CHARSET_SYMBOL (charset);
633}
634
635DEFUN ("char-bytes", Fchar_bytes, Schar_bytes, 1, 1, 0,
636 "Return byte length of multi-byte form of CHAR.")
637 (ch)
638 Lisp_Object ch;
639{
640 Lisp_Object val;
641 int bytes;
642
643 CHECK_NUMBER (ch, 0);
644 if (COMPOSITE_CHAR_P (XFASTINT (ch)))
645 {
646 unsigned int id = COMPOSITE_CHAR_ID (XFASTINT (ch));
647
648 bytes = (id < n_cmpchars ? cmpchar_table[id]->len : 1);
649 }
650 else
651 {
652 int charset = CHAR_CHARSET (XFASTINT (ch));
653
654 bytes = CHARSET_DEFINED_P (charset) ? CHARSET_BYTES (charset) : 1;
655 }
656
657 XSETFASTINT (val, bytes);
658 return val;
659}
660
661/* Return the width of character of which multi-byte form starts with
662 C. The width is measured by how many columns occupied on the
663 screen when displayed in the current buffer. */
664
665#define ONE_BYTE_CHAR_WIDTH(c) \
666 (c < 0x20 \
667 ? (c == '\t' \
668 ? current_buffer->tab_width \
669 : (c == '\n' ? 0 : (NILP (current_buffer->ctl_arrow) ? 4 : 2))) \
670 : (c < 0x7f \
671 ? 1 \
672 : (c == 0x7F \
673 ? (NILP (current_buffer->ctl_arrow) ? 4 : 2) \
674 : ((! NILP (current_buffer->enable_multibyte_characters) \
675 && BASE_LEADING_CODE_P (c)) \
676 ? WIDTH_BY_CHAR_HEAD (c) \
677 : 4)))) \
678
679
680DEFUN ("char-width", Fchar_width, Schar_width, 1, 1, 0,
681 "Return width of CHAR when displayed in the current buffer.\n\
682The width is measured by how many columns it occupies on the screen.")
683 (ch)
684 Lisp_Object ch;
685{
686 Lisp_Object val;
687 int c;
688
689 CHECK_NUMBER (ch, 0);
690
691 c = XFASTINT (ch);
692 if (SINGLE_BYTE_CHAR_P (c))
693 XSETFASTINT (val, ONE_BYTE_CHAR_WIDTH (c));
694 else if (COMPOSITE_CHAR_P (c))
695 {
696 int id = COMPOSITE_CHAR_ID (XFASTINT (ch));
697 XSETFASTINT (val, (id < n_cmpchars ? cmpchar_table[id]->width : 0));
698 }
699 else
700 {
701 int charset = CHAR_CHARSET (c);
702
703 XSETFASTINT (val, CHARSET_WIDTH (charset));
704 }
705 return val;
706}
707
708/* Return width of string STR of length LEN when displayed in the
709 current buffer. The width is measured by how many columns it
710 occupies on the screen. */
711int
712strwidth (str, len)
713 unsigned char *str;
714 int len;
715{
716 unsigned char *endp = str + len;
717 int width = 0;
718
719 while (str < endp) {
720 if (*str == LEADING_CODE_COMPOSITION)
721 {
722 int id = str_cmpchar_id (str, endp - str);
723
724 if (id < 0)
725 {
726 width += 4;
727 str++;
728 }
729 else
730 {
731 width += cmpchar_table[id]->width;
732 str += cmpchar_table[id]->len;
733 }
734 }
735 else
736 {
737 width += ONE_BYTE_CHAR_WIDTH (*str);
738 str += BYTES_BY_CHAR_HEAD (*str);
739 }
740 }
741 return width;
742}
743
744DEFUN ("string-width", Fstring_width, Sstring_width, 1, 1, 0,
745 "Return width of STRING when displayed in the current buffer.\n\
746Width is measured by how many columns it occupies on the screen.\n\
747When calculating width of a multi-byte character in STRING,\n\
748 only the base leading-code is considered and the validity of\n\
749 the following bytes are not checked.")
750 (str)
751 Lisp_Object str;
752{
753 Lisp_Object val;
754
755 CHECK_STRING (str, 0);
756 XSETFASTINT (val, strwidth (XSTRING (str)->data, XSTRING (str)->size));
757 return val;
758}
759
760DEFUN ("char-direction", Fchar_direction, Schar_direction, 1, 1, 0,
761 "Return the direction of CHAR.\n\
762The returned value is 0 for left-to-right and 1 for right-to-left.")
763 (ch)
764 Lisp_Object ch;
765{
766 int charset;
767
768 CHECK_NUMBER (ch, 0);
769 charset = CHAR_CHARSET (XFASTINT (ch));
770 if (!CHARSET_DEFINED_P (charset))
771 error ("Invalid character: %d", XINT (ch));
772 return CHARSET_TABLE_INFO (charset, CHARSET_DIRECTION_IDX);
773}
774
775DEFUN ("chars-in-string", Fchars_in_string, Schars_in_string, 1, 1, 0,
776 "Return number of characters in STRING.")
777 (str)
778 Lisp_Object str;
779{
780 Lisp_Object val;
781 unsigned char *p, *endp;
782 int chars;
783
784 CHECK_STRING (str, 0);
785
786 p = XSTRING (str)->data; endp = p + XSTRING (str)->size;
787 chars = 0;
788 while (p < endp)
789 {
790 if (*p == LEADING_CODE_COMPOSITION)
791 {
792 p++;
793 while (p < endp && ! CHAR_HEAD_P (p)) p++;
794 }
795 else
796 p += BYTES_BY_CHAR_HEAD (*p);
797 chars++;
798 }
799
800 XSETFASTINT (val, chars);
801 return val;
802}
803
804DEFUN ("char-boundary-p", Fchar_boundary_p, Schar_boundary_p, 1, 1, 0,
805 "Return non-nil value if POS is at character boundary of multibyte form.\n\
806The return value is:\n\
807 0 if POS is at an ASCII character or at the end of range,\n\
808 1 if POS is at a head of 2-byte length multi-byte form,\n\
809 2 if POS is at a head of 3-byte length multi-byte form,\n\
810 3 if POS is at a head of 4-byte length multi-byte form,\n\
811 4 if POS is at a head of multi-byte form of a composite character.\n\
812If POS is out of range or not at character boundary, return NIL.")
813 (pos)
814 Lisp_Object pos;
815{
816 Lisp_Object val;
817 int n;
818
819 CHECK_NUMBER_COERCE_MARKER (pos, 0);
820
821 n = XINT (pos);
822 if (n < BEGV || n > ZV)
823 return Qnil;
824
825 if (n == ZV || NILP (current_buffer->enable_multibyte_characters))
826 XSETFASTINT (val, 0);
827 else
828 {
829 unsigned char *p = POS_ADDR (n);
830
831 if (SINGLE_BYTE_CHAR_P (*p))
832 XSETFASTINT (val, 0);
833 else if (*p == LEADING_CODE_COMPOSITION)
834 XSETFASTINT (val, 4);
835 else if (BYTES_BY_CHAR_HEAD (*p) > 1)
836 XSETFASTINT (val, BYTES_BY_CHAR_HEAD (*p) - 1);
837 else
838 val = Qnil;
839 }
840 return val;
841}
842
843DEFUN ("concat-chars", Fconcat_chars, Sconcat_chars, 1, MANY, 0,
844 "Concatenate all the argument characters and make the result a string.")
845 (nargs, args)
846 int nargs;
847 Lisp_Object *args;
848{
849 int i, n = XINT (nargs);
850 unsigned char *buf
851 = (unsigned char *) malloc (MAX_LENGTH_OF_MULTI_BYTE_FORM * n);
852 unsigned char *p = buf;
853 Lisp_Object val;
854
855 for (i = 0; i < n; i++)
856 {
857 int c, len;
858 unsigned char *str;
859
860 if (!INTEGERP (args[i]))
861 {
862 free (buf);
863 CHECK_NUMBER (args[i], 0);
864 }
865 c = XINT (args[i]);
866 len = CHAR_STRING (c, p, str);
867 if (p != str)
868 /* C is a composite character. */
869 bcopy (str, p, len);
870 p += len;
871 }
872
873 val = make_string (buf, p - buf);
874 free (buf);
875 return val;
876}
877
878#endif /* emacs */
879
880/*** Composite characters staffs ***/
881
882/* Each composite character is identified by CMPCHAR-ID which is
883 assigned when Emacs needs the character code of the composite
884 character (e.g. when displaying it on the screen). See the
885 document "GENERAL NOTE on COMPOSITE CHARACTER" in `charset.h' how a
886 composite character is represented in Emacs. */
887
888/* If `static' is defined, it means that it is defined to null string. */
889#ifndef static
890/* The following function is copied from lread.c. */
891static int
892hash_string (ptr, len)
893 unsigned char *ptr;
894 int len;
895{
896 register unsigned char *p = ptr;
897 register unsigned char *end = p + len;
898 register unsigned char c;
899 register int hash = 0;
900
901 while (p != end)
902 {
903 c = *p++;
904 if (c >= 0140) c -= 40;
905 hash = ((hash<<3) + (hash>>28) + c);
906 }
907 return hash & 07777777777;
908}
909#endif
910
911/* Table of pointers to the structure `cmpchar_info' indexed by
912 CMPCHAR-ID. */
913struct cmpchar_info **cmpchar_table;
914/* The current size of `cmpchar_table'. */
915static int cmpchar_table_size;
916/* Number of the current composite characters. */
917int n_cmpchars;
918
919#define CMPCHAR_HASH_TABLE_SIZE 0xFFF
920
921static int *cmpchar_hash_table[CMPCHAR_HASH_TABLE_SIZE];
922
923/* Each element of `cmpchar_hash_table' is a pointer to an array of
924 integer, where the 1st element is the size of the array, the 2nd
925 element is how many elements are actually used in the array, and
926 the remaining elements are CMPCHAR-IDs of composite characters of
927 the same hash value. */
928#define CMPCHAR_HASH_SIZE(table) table[0]
929#define CMPCHAR_HASH_USED(table) table[1]
930#define CMPCHAR_HASH_CMPCHAR_ID(table, i) table[i]
931
932/* Return CMPCHAR-ID of the composite character in STR of the length
933 LEN. If the composite character has not yet been registered,
934 register it in `cmpchar_table' and assign new CMPCHAR-ID. This
935 is the sole function for assigning CMPCHAR-ID. */
936int
937str_cmpchar_id (str, len)
938 unsigned char *str;
939 int len;
940{
941 int hash_idx, *hashp;
942 unsigned char *buf;
943 int embedded_rule; /* 1 if composition rule is embedded. */
944 int chars; /* number of components. */
945 int i;
946 struct cmpchar_info *cmpcharp;
947
948 if (len < 5)
949 /* Any composite char have at least 3-byte length. */
950 return -1;
951
952 /* The second byte 0xFF means compostion rule is embedded. */
953 embedded_rule = (str[1] == 0xFF);
954
955 /* At first, get the actual length of the composite character. */
956 {
957 unsigned char *p, *endp = str + 1, *lastp = str + len;
958 int bytes;
959
960 while (endp < lastp && ! CHAR_HEAD_P (endp)) endp++;
961 chars = 0;
962 p = str + 1 + embedded_rule;
963 while (p < endp)
964 {
965 /* No need of checking if *P is 0xA0 because
966 BYTES_BY_CHAR_HEAD (0x80) surely returns 2. */
967 p += (bytes = BYTES_BY_CHAR_HEAD (*p - 0x20) + embedded_rule);
968 chars++;
969 }
970 len = (p -= embedded_rule) - str;
971 if (p > endp)
972 len -= - bytes, chars--;
973
974 if (chars < 2 || chars > MAX_COMPONENT_COUNT)
975 /* Invalid number of components. */
976 return -1;
977 }
978 hash_idx = hash_string (str, len) % CMPCHAR_HASH_TABLE_SIZE;
979 hashp = cmpchar_hash_table[hash_idx];
980
981 /* Then, look into the hash table. */
982 if (hashp != NULL)
983 /* Find the correct one among composite characters of the same
984 hash value. */
985 for (i = 2; i < CMPCHAR_HASH_USED (hashp); i++)
986 {
987 cmpcharp = cmpchar_table[CMPCHAR_HASH_CMPCHAR_ID (hashp, i)];
988 if (len == cmpcharp->len
989 && ! bcmp (str, cmpcharp->data, len))
990 return CMPCHAR_HASH_CMPCHAR_ID (hashp, i);
991 }
992
993 /* We have to register the composite character in cmpchar_table. */
994 /* Make the entry in hash table. */
995 if (hashp == NULL)
996 {
997 /* Make a table for 8 composite characters initially. */
998 hashp = (cmpchar_hash_table[hash_idx]
999 = (int *) xmalloc (sizeof (int) * (2 + 8)));
1000 CMPCHAR_HASH_SIZE (hashp) = 10;
1001 CMPCHAR_HASH_USED (hashp) = 2;
1002 }
1003 else if (CMPCHAR_HASH_USED (hashp) >= CMPCHAR_HASH_SIZE (hashp))
1004 {
1005 CMPCHAR_HASH_SIZE (hashp) += 8;
1006 hashp = (cmpchar_hash_table[hash_idx]
1007 = (int *) xrealloc (hashp,
1008 sizeof (int) * CMPCHAR_HASH_SIZE (hashp)));
1009 }
1010 CMPCHAR_HASH_CMPCHAR_ID (hashp, CMPCHAR_HASH_USED (hashp)) = n_cmpchars;
1011 CMPCHAR_HASH_USED (hashp)++;
1012
1013 /* Set information of the composite character in cmpchar_table. */
1014 if (cmpchar_table_size == 0)
1015 {
1016 /* This is the first composite character to be registered. */
1017 cmpchar_table_size = 256;
1018 cmpchar_table
1019 = (struct cmpchar_info **) xmalloc (sizeof (cmpchar_table[0])
1020 * cmpchar_table_size);
1021 }
1022 else if (cmpchar_table_size <= n_cmpchars)
1023 {
1024 cmpchar_table_size += 256;
1025 cmpchar_table
1026 = (struct cmpchar_info **) xrealloc (cmpchar_table,
1027 sizeof (cmpchar_table[0])
1028 * cmpchar_table_size);
1029 }
1030
1031 cmpcharp = (struct cmpchar_info *) xmalloc (sizeof (struct cmpchar_info));
1032
1033 cmpcharp->len = len;
1034 cmpcharp->data = (unsigned char *) xmalloc (len + 1);
1035 bcopy (str, cmpcharp->data, len);
1036 cmpcharp->data[len] = 0;
1037 cmpcharp->glyph_len = chars;
1038 cmpcharp->glyph = (GLYPH *) xmalloc (sizeof (GLYPH) * chars);
1039 if (embedded_rule)
1040 {
1041 cmpcharp->cmp_rule = (unsigned char *) xmalloc (chars);
1042 cmpcharp->col_offset = (float *) xmalloc (sizeof (float) * chars);
1043 }
1044 else
1045 {
1046 cmpcharp->cmp_rule = NULL;
1047 cmpcharp->col_offset = NULL;
1048 }
1049
1050 /* Setup GLYPH data and composition rules (if any) so as not to make
1051 them every time on displaying. */
1052 {
1053 unsigned char *bufp;
1054 int width;
1055 float leftmost = 0.0, rightmost = 1.0;
1056
1057 if (embedded_rule)
1058 /* At first, col_offset[N] is set to relative to col_offset[0]. */
1059 cmpcharp->col_offset[0] = 0;
1060
1061 for (i = 0, bufp = cmpcharp->data + 1; i < chars; i++)
1062 {
1063 if (embedded_rule)
1064 cmpcharp->cmp_rule[i] = *bufp++;
1065
1066 if (*bufp == 0xA0) /* This is an ASCII character. */
1067 {
1068 cmpcharp->glyph[i] = FAST_MAKE_GLYPH ((*++bufp & 0x7F), 0);
1069 width = 1;
1070 bufp++;
1071 }
1072 else /* Multibyte character. */
1073 {
1074 /* Make `bufp' point normal multi-byte form temporally. */
1075 *bufp -= 0x20;
1076 cmpcharp->glyph[i]
1077 = FAST_MAKE_GLYPH (string_to_non_ascii_char (bufp, 4, 0), 0);
1078 width = WIDTH_BY_CHAR_HEAD (*bufp);
1079 *bufp += 0x20;
1080 bufp += BYTES_BY_CHAR_HEAD (*bufp - 0x20);
1081 }
1082
1083 if (embedded_rule && i > 0)
1084 {
1085 /* Reference points (global_ref and new_ref) are
1086 encoded as below:
1087
1088 0--1--2 -- ascent
1089 | |
1090 | |
1091 | 4 -+--- center
1092 -- 3 5 -- baseline
1093 | |
1094 6--7--8 -- descent
1095
1096 Now, we calculate the column offset of the new glyph
1097 from the left edge of the first glyph. This can avoid
1098 the same calculation everytime displaying this
1099 composite character. */
1100
1101 /* Reference points of global glyph and new glyph. */
1102 int global_ref = (cmpcharp->cmp_rule[i] - 0xA0) / 9;
1103 int new_ref = (cmpcharp->cmp_rule[i] - 0xA0) % 9;
1104 /* Column offset relative to the first glyph. */
1105 float left = (leftmost
1106 + (global_ref % 3) * (rightmost - leftmost) / 2.0
1107 - (new_ref % 3) * width / 2.0);
1108
1109 cmpcharp->col_offset[i] = left;
1110 if (left < leftmost)
1111 leftmost = left;
1112 if (left + width > rightmost)
1113 rightmost = left + width;
1114 }
1115 else
1116 {
1117 if (width > rightmost)
1118 rightmost = width;
1119 }
1120 }
1121 if (embedded_rule)
1122 {
1123 /* Now col_offset[N] are relative to the left edge of the
1124 first component. Make them relative to the left edge of
1125 overall glyph. */
1126 for (i = 0; i < chars; i++)
1127 cmpcharp->col_offset[i] -= leftmost;
1128 /* Make rightmost holds width of overall glyph. */
1129 rightmost -= leftmost;
1130 }
1131
1132 cmpcharp->width = rightmost;
1133 if (cmpcharp->width < rightmost)
1134 /* To get a ceiling integer value. */
1135 cmpcharp->width++;
1136 }
1137
1138 cmpchar_table[n_cmpchars] = cmpcharp;
1139
1140 return n_cmpchars++;
1141}
1142
1143/* Return the Nth element of the composite character C. */
1144int
1145cmpchar_component (c, n)
1146 unsigned int c, n;
1147{
1148 int id = COMPOSITE_CHAR_ID (c);
1149
1150 if (id >= n_cmpchars /* C is not a valid composite character. */
1151 || n >= cmpchar_table[id]->glyph_len) /* No such component. */
1152 return -1;
1153 /* No face data is stored in glyph code. */
1154 return ((int) (cmpchar_table[id]->glyph[n]));
1155}
1156
1157DEFUN ("cmpcharp", Fcmpcharp, Scmpcharp, 1, 1, 0,
1158 "T if CHAR is a composite character.")
1159 (ch)
1160 Lisp_Object ch;
1161{
1162 CHECK_NUMBER (ch, 0);
1163 return (COMPOSITE_CHAR_P (XINT (ch)) ? Qt : Qnil);
1164}
1165
1166DEFUN ("composite-char-component", Fcmpchar_component, Scmpchar_component,
1167 2, 2, 0,
1168 "Return the IDXth component character of composite character CHARACTER.")
1169 (character, idx)
1170 Lisp_Object character, idx;
1171{
1172 int c;
1173
1174 CHECK_NUMBER (character, 0);
1175 CHECK_NUMBER (idx, 1);
1176
1177 if ((c = cmpchar_component (XINT (character), XINT (idx))) < 0)
1178 args_out_of_range (character, idx);
1179
1180 return make_number (c);
1181}
1182
1183DEFUN ("composite-char-composition-rule", Fcmpchar_cmp_rule, Scmpchar_cmp_rule,
1184 2, 2, 0,
1185 "Return the IDXth composition rule embedded in composite character CHARACTER.
1186The returned rule is for composing the IDXth component
1187on the (IDX-1)th component. If IDX is 0, the returned value is always 255.")
1188 (character, idx)
1189 Lisp_Object character, idx;
1190{
1191 int id, i;
1192
1193 CHECK_NUMBER (character, 0);
1194 CHECK_NUMBER (idx, 1);
1195
1196 id = COMPOSITE_CHAR_ID (XINT (character));
1197 if (id < 0 || id >= n_cmpchars)
1198 error ("Invalid composite character: %d", XINT (character));
1199 i = XINT (idx);
1200 if (i > cmpchar_table[id]->glyph_len)
1201 args_out_of_range (character, idx);
1202
1203 return make_number (cmpchar_table[id]->cmp_rule[i]);
1204}
1205
1206DEFUN ("composite-char-composition-rule-p", Fcmpchar_cmp_rule_p,
1207 Scmpchar_cmp_rule_p, 1, 1, 0,
1208 "Return non-nil if composite character CHARACTER contains a embedded rule.")
1209 (character)
1210 Lisp_Object character;
1211{
1212 int id;
1213
1214 CHECK_NUMBER (character, 0);
1215 id = COMPOSITE_CHAR_ID (XINT (character));
1216 if (id < 0 || id >= n_cmpchars)
1217 error ("Invalid composite character: %d", XINT (character));
1218
1219 return (cmpchar_table[id]->cmp_rule ? Qt : Qnil);
1220}
1221
1222DEFUN ("composite-char-component-count", Fcmpchar_cmp_count,
1223 Scmpchar_cmp_count, 1, 1, 0,
1224 "Return number of compoents of composite character CHARACTER.")
1225 (character)
1226 Lisp_Object character;
1227{
1228 int id;
1229
1230 CHECK_NUMBER (character, 0);
1231 id = COMPOSITE_CHAR_ID (XINT (character));
1232 if (id < 0 || id >= n_cmpchars)
1233 error ("Invalid composite character: %d", XINT (character));
1234
1235 return (make_number (cmpchar_table[id]->glyph_len));
1236}
1237
1238DEFUN ("compose-string", Fcompose_string, Scompose_string,
1239 1, 1, 0,
1240 "Return one char string composed from all characters in STRING.")
1241 (str)
1242 Lisp_Object str;
1243{
1244 unsigned char buf[MAX_LENGTH_OF_MULTI_BYTE_FORM], *p, *pend, *ptemp;
1245 int len, i;
1246
1247 CHECK_STRING (str, 0);
1248
1249 buf[0] = LEADING_CODE_COMPOSITION;
1250 p = XSTRING (str)->data;
1251 pend = p + XSTRING (str)->size;
1252 i = 1;
1253 while (p < pend)
1254 {
1255 if (*p < 0x20 || *p == 127) /* control code */
1256 error ("Invalid component character: %d", *p);
1257 else if (*p < 0x80) /* ASCII */
1258 {
1259 if (i + 2 >= MAX_LENGTH_OF_MULTI_BYTE_FORM)
1260 error ("Too long string to be composed: %s", XSTRING (str)->data);
1261 /* Prepend an ASCII charset indicator 0xA0, set MSB of the
1262 code itself. */
1263 buf[i++] = 0xA0;
1264 buf[i++] = *p++ + 0x80;
1265 }
1266 else if (*p == LEADING_CODE_COMPOSITION) /* composite char */
1267 {
1268 /* Already composed. Eliminate the heading
1269 LEADING_CODE_COMPOSITION, keep the remaining bytes
1270 unchanged. */
1271 p++;
1272 ptemp = p;
1273 while (! CHAR_HEAD_P (p)) p++;
1274 if (i + (p - ptemp) >= MAX_LENGTH_OF_MULTI_BYTE_FORM)
1275 error ("Too long string to be composed: %s", XSTRING (str)->data);
1276 bcopy (ptemp, buf + i, p - ptemp);
1277 i += p - ptemp;
1278 }
1279 else /* multibyte char */
1280 {
1281 /* Add 0x20 to the base leading-code, keep the remaining
1282 bytes unchanged. */
1283 len = BYTES_BY_CHAR_HEAD (*p);
1284 if (i + len >= MAX_LENGTH_OF_MULTI_BYTE_FORM)
1285 error ("Too long string to be composed: %s", XSTRING (str)->data);
1286 bcopy (p, buf + i, len);
1287 buf[i] += 0x20;
1288 p += len, i += len;
1289 }
1290 }
1291
1292 if (i < 5)
1293 /* STR contains only one character, which can't be composed. */
1294 error ("Too short string to be composed: %s", XSTRING (str)->data);
1295
1296 return make_string (buf, i);
1297}
1298
1299
1300charset_id_internal (charset_name)
1301 char *charset_name;
1302{
1303 Lisp_Object val = Fget (intern (charset_name), Qcharset);
1304
1305 if (!VECTORP (val))
1306 error ("Charset %s is not defined", charset_name);
1307
1308 return (XINT (XVECTOR (val)->contents[0]));
1309}
1310
1311DEFUN ("setup-special-charsets", Fsetup_special_charsets,
1312 Ssetup_special_charsets, 0, 0, 0, "Internal use only.")
1313 ()
1314{
1315 charset_latin_iso8859_1 = charset_id_internal ("latin-iso8859-1");
1316 charset_jisx0208_1978 = charset_id_internal ("japanese-jisx0208-1978");
1317 charset_jisx0208 = charset_id_internal ("japanese-jisx0208");
1318 charset_katakana_jisx0201 = charset_id_internal ("katakana-jisx0201");
1319 charset_latin_jisx0201 = charset_id_internal ("latin-jisx0201");
1320 charset_big5_1 = charset_id_internal ("chinese-big5-1");
1321 charset_big5_2 = charset_id_internal ("chinese-big5-2");
1322 return Qnil;
1323}
1324
1325init_charset_once ()
1326{
1327 int i, j, k;
1328
1329 staticpro (&Vcharset_table);
1330 staticpro (&Vcharset_symbol_table);
1331
1332 /* This has to be done here, before we call Fmake_char_table. */
1333 Qcharset_table = intern ("charset-table");
1334 staticpro (&Qcharset_table);
1335
1336 /* Intern this now in case it isn't already done.
1337 Setting this variable twice is harmless.
1338 But don't staticpro it here--that is done in alloc.c. */
1339 Qchar_table_extra_slots = intern ("char-table-extra-slots");
1340
1341 /* Now we are ready to set up this property, so we can
1342 create the charset table. */
1343 Fput (Qcharset_table, Qchar_table_extra_slots, make_number (0));
1344 Vcharset_table = Fmake_char_table (Qcharset_table, Qnil);
1345
1346 Vcharset_symbol_table = Fmake_vector (make_number (MAX_CHARSET), Qnil);
1347
1348 /* Setup tables. */
1349 for (i = 0; i < 2; i++)
1350 for (j = 0; j < 2; j++)
1351 for (k = 0; k < 128; k++)
1352 iso_charset_table [i][j][k] = -1;
1353
1354 bzero (cmpchar_hash_table, sizeof cmpchar_hash_table);
1355 cmpchar_table_size = n_cmpchars = 0;
1356
1357 for (i = 0; i < 256; i++)
1358 BYTES_BY_CHAR_HEAD (i) = 1;
1359 BYTES_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_11) = 3;
1360 BYTES_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_12) = 3;
1361 BYTES_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_21) = 4;
1362 BYTES_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_22) = 4;
1363 /* The following doesn't reflect the actual bytes, but just to tell
1364 that it is a start of a multibyte character. */
1365 BYTES_BY_CHAR_HEAD (LEADING_CODE_COMPOSITION) = 2;
1366
1367 for (i = 0; i < 128; i++)
1368 WIDTH_BY_CHAR_HEAD (i) = 1;
1369 for (; i < 256; i++)
1370 WIDTH_BY_CHAR_HEAD (i) = 4;
1371 WIDTH_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_11) = 1;
1372 WIDTH_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_12) = 2;
1373 WIDTH_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_21) = 1;
1374 WIDTH_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_22) = 2;
1375}
1376
1377#ifdef emacs
1378
1379syms_of_charset ()
1380{
1381 Qascii = intern ("ascii");
1382 staticpro (&Qascii);
1383
1384 Qcharset = intern ("charset");
1385 staticpro (&Qcharset);
1386
1387 /* Define ASCII charset now. */
1388 update_charset_table (make_number (CHARSET_ASCII),
1389 make_number (1), make_number (94),
1390 make_number (1),
1391 make_number (0),
1392 make_number ('B'),
1393 make_number (0),
1394 build_string ("ASCII"),
1395 build_string ("ASCII"),
1396 build_string ("ASCII (ISO646 IRV)"));
1397 CHARSET_SYMBOL (CHARSET_ASCII) = Qascii;
1398 Fput (Qascii, Qcharset, CHARSET_TABLE_ENTRY (CHARSET_ASCII));
1399
1400 Qcomposition = intern ("composition");
1401 staticpro (&Qcomposition);
1402 CHARSET_SYMBOL (CHARSET_COMPOSITION) = Qcomposition;
1403
1404 defsubr (&Sdefine_charset);
1405 defsubr (&Sdeclare_equiv_charset);
1406 defsubr (&Sfind_charset_region);
1407 defsubr (&Sfind_charset_string);
1408 defsubr (&Smake_char_internal);
1409 defsubr (&Ssplit_char);
1410 defsubr (&Schar_charset);
1411 defsubr (&Siso_charset);
1412 defsubr (&Schar_bytes);
1413 defsubr (&Schar_width);
1414 defsubr (&Sstring_width);
1415 defsubr (&Schar_direction);
1416 defsubr (&Schars_in_string);
1417 defsubr (&Schar_boundary_p);
1418 defsubr (&Sconcat_chars);
1419 defsubr (&Scmpcharp);
1420 defsubr (&Scmpchar_component);
1421 defsubr (&Scmpchar_cmp_rule);
1422 defsubr (&Scmpchar_cmp_rule_p);
1423 defsubr (&Scmpchar_cmp_count);
1424 defsubr (&Scompose_string);
1425 defsubr (&Ssetup_special_charsets);
1426
1427 DEFVAR_LISP ("charset-list", &Vcharset_list,
1428 "List of charsets ever defined.");
1429 Vcharset_list = Fcons (Qascii, Qnil);
1430
1431 DEFVAR_INT ("leading-code-composition", &leading_code_composition,
1432 "Leading-code of composite characters.");
1433 leading_code_composition = LEADING_CODE_COMPOSITION;
1434
1435 DEFVAR_INT ("leading-code-private-11", &leading_code_private_11,
1436 "Leading-code of private TYPE9N charset of column-width 1.");
1437 leading_code_private_11 = LEADING_CODE_PRIVATE_11;
1438
1439 DEFVAR_INT ("leading-code-private-12", &leading_code_private_12,
1440 "Leading-code of private TYPE9N charset of column-width 2.");
1441 leading_code_private_12 = LEADING_CODE_PRIVATE_12;
1442
1443 DEFVAR_INT ("leading-code-private-21", &leading_code_private_21,
1444 "Leading-code of private TYPE9Nx9N charset of column-width 1.");
1445 leading_code_private_21 = LEADING_CODE_PRIVATE_21;
1446
1447 DEFVAR_INT ("leading-code-private-22", &leading_code_private_22,
1448 "Leading-code of private TYPE9Nx9N charset of column-width 2.");
1449 leading_code_private_22 = LEADING_CODE_PRIVATE_22;
1450}
1451
1452#endif /* emacs */
diff --git a/src/charset.h b/src/charset.h
new file mode 100644
index 00000000000..677a5755adf
--- /dev/null
+++ b/src/charset.h
@@ -0,0 +1,649 @@
1/* Header for multilingual character handler.
2 Ver.1.0
3
4 Copyright (C) 1995 Free Software Foundation, Inc.
5 Copyright (C) 1995 Electrotechnical Laboratory, JAPAN.
6
7 This program is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2, or (at your option)
10 any later version.
11
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with this program; if not, write to the Free Software
19 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
20
21#ifndef _CHARSET_H
22#define _CHARSET_H
23
24/*** GENERAL NOTE on CHARACTER SET (CHARSET) ***
25
26 A character set ("charset" hereafter) is a meaningful collection
27 (i.e. language, culture, functionality, etc) of characters. Emacs
28 handles multiple charsets at once. Each charset corresponds to one
29 of ISO charsets (except for a special charset for composition
30 characters). Emacs identifies a charset by a unique identification
31 number, whereas ISO identifies a charset by a triplet of DIMENSION,
32 CHARS and FINAL-CHAR. So, hereafter, just saying "charset" means an
33 identification number (integer value).
34
35 The value range of charset is 0x00, 0x80..0xFE. There are four
36 kinds of charset depending on DIMENSION (1 or 2) and CHARS (94 or
37 96). For instance, a charset of DIMENSION2_CHARS94 contains 94x94
38
39
40 Within Emacs Lisp, a charset is treated as a symbol which has a
41 property `charset'. The property value is a vector containing
42 various information about the charset. For readability of C codes,
43 we use the following convention on C variable names:
44 charset_symbol: Emacs Lisp symbol of a charset
45 charset_id: Emacs Lisp integer of an identification number of a charset
46 charset: C integer of an identification number of a charset
47
48 Each charset (except for ASCII) is assigned a base leading-code
49 (range 0x80..0x9D). In addition, a charset of greater than 0xA0
50 (whose base leading-code is 0x9A..0x9D) is assigned an extended
51 leading-code (range 0xA0..0xFE). In this case, each base
52 leading-code specify the allowable range of extended leading-code as
53 shown in the table below. A leading-code is used to represent a
54 character in Emacs' buffer and string.
55
56 We call a charset which has extended leading-code as "private
57 charset" because those are mainly for a charset which is not
58 registered by ISO. On the contrary, we call a charset which does
59 not have extended leading-code as "official charset".
60
61 ---------------------------------------------------------------------------
62 charset dimension base leading-code extended leading-code
63 ---------------------------------------------------------------------------
64 0x00 official dim1 -- none -- -- none --
65 (ASCII)
66 0x01..0x7F --never used--
67 0x80 COMPOSITION same as charset -- none --
68 0x81..0x8F official dim1 same as charset -- none --
69 0x90..0x99 official dim2 same as charset -- none --
70 0x9A..0x9F --never used--
71 0xA0..0xDF private dim1 0x9A same as charset
72 of 1-column width
73 0xE0..0xEF private dim1 0x9B same as charset
74 of 2-column width
75 0xF0..0xF4 private dim2 0x9C same as charset
76 of 1-column width
77 0xF5..0xFE private dim2 0x9D same as charset
78 of 2-column width
79 0xFF --never used--
80 ---------------------------------------------------------------------------
81
82 In the table, "COMPOSITION" means a charset for a composite
83 character which is a character composed from several (up to 16)
84 non-composite characters (components). Although a composite
85 character can contain components of many charsets, a composite
86 character itself belongs to the charset CHARSET-COMPOSITION. See
87 the document "GENERAL NOTE on COMPOSITE CHARACTER" below for more
88 detail.
89
90*/
91
92/* Definition of special leading-codes. */
93/* Base leading-code. */
94/* Special leading-code followed by components of a composite character. */
95#define LEADING_CODE_COMPOSITION 0x80
96/* Leading-code followed by extended leading-code. */
97#define LEADING_CODE_PRIVATE_11 0x9A /* for private DIMENSION1 of 1-column */
98#define LEADING_CODE_PRIVATE_12 0x9B /* for private DIMENSION1 of 2-column */
99#define LEADING_CODE_PRIVATE_21 0x9C /* for private DIMENSION2 of 1-column */
100#define LEADING_CODE_PRIVATE_22 0x9D /* for private DIMENSION2o f 2-column */
101
102/* Extended leading-code. */
103/* Start of each extended leading-codes. */
104#define LEADING_CODE_EXT_11 0xA0 /* follows LEADING_CODE_PRIVATE_11 */
105#define LEADING_CODE_EXT_12 0xE0 /* follows LEADING_CODE_PRIVATE_12 */
106#define LEADING_CODE_EXT_21 0xF0 /* follows LEADING_CODE_PRIVATE_21 */
107#define LEADING_CODE_EXT_22 0xF5 /* follows LEADING_CODE_PRIVATE_22 */
108/* Maximum value of extended leading-codes. */
109#define LEADING_CODE_EXT_MAX 0xFE
110
111/* Definition of minimum/maximum charset of each DIMENSION. */
112#define MIN_CHARSET_OFFICIAL_DIMENSION1 0x81
113#define MAX_CHARSET_OFFICIAL_DIMENSION1 0x8F
114#define MIN_CHARSET_OFFICIAL_DIMENSION2 0x90
115#define MAX_CHARSET_OFFICIAL_DIMENSION2 0x99
116#define MIN_CHARSET_PRIVATE_DIMENSION1 LEADING_CODE_EXT_11
117#define MIN_CHARSET_PRIVATE_DIMENSION2 LEADING_CODE_EXT_21
118
119/* Definition of special charsets. */
120#define CHARSET_ASCII 0
121#define CHARSET_COMPOSITION 0x80
122
123extern int charset_ascii; /* ASCII */
124extern int charset_composition; /* for a composite character */
125extern int charset_latin_iso8859_1; /* ISO8859-1 (Latin-1) */
126extern int charset_jisx0208_1978; /* JISX0208.1978 (Japanese Kanji old set) */
127extern int charset_jisx0208; /* JISX0208.1983 (Japanese Kanji) */
128extern int charset_katakana_jisx0201; /* JISX0201.Kana (Japanese Katakana) */
129extern int charset_latin_jisx0201; /* JISX0201.Roman (Japanese Roman) */
130extern int charset_big5_1; /* Big5 Level 1 (Chinese Traditional) */
131extern int charset_big5_2; /* Big5 Level 2 (Chinese Traditional) */
132
133/* Check if STR points the head of multi-byte form, i.e. *STR is an
134 ASCII character or a base leading-code. */
135#define CHAR_HEAD_P(str) ((unsigned char) *(str) < 0xA0)
136
137/*** GENERAL NOTE on CHARACTER REPRESENTATION ***
138
139 At first, the term "character" or "char" is used for a multilingual
140 character (of course, including ASCII character), not for a byte in
141 computer memory. We use the term "code" or "byte" for the latter
142 case.
143
144 A character is identified by charset and one or two POSITION-CODEs.
145 POSITION-CODE is the position of the character in the charset. A
146 character of DIMENSION1 charset has one POSITION-CODE: POSITION-CODE-1.
147 A character of DIMENSION2 charset has two POSITION-CODE:
148 POSITION-CODE-1 and POSITION-CODE-2. The code range of
149 POSITION-CODE is 0x20..0x7F.
150
151 Emacs has two kinds of representation of a character: multi-byte
152 form (for buffer and string) and single-word form (for character
153 object in Emacs Lisp). The latter is called "character code" here
154 after. Both representation encode the information of charset and
155 POSITION-CODE but in a different way (for instance, MSB of
156 POSITION-CODE is set in multi-byte form).
157
158 For details of multi-byte form, see the section "2. Emacs internal
159 format handlers" of `coding.c'.
160
161 Emacs uses 19 bits for a character code. The bits are divided into
162 3 fields: FIELD1(5bits):FIELD2(7bits):FIELD3(7bits).
163
164 A character code of DIMENSION1 character uses FIELD2 to hold charset
165 and FIELD3 to hold POSITION-CODE-1. A character code of DIMENSION2
166 character uses FIELD1 to hold charset, FIELD2 and FIELD3 to hold
167 POSITION-CODE-1 and POSITION-CODE-2 respectively.
168
169 More precisely...
170
171 FIELD2 of DIMENSION1 character (except for ASCII) is "charset - 0x70".
172 This is to make all character codes except for ASCII greater than
173 256 (ASCII's FIELD2 is 0). So, the range of FIELD2 of DIMENSION1
174 character is 0 or 0x11..0x7F.
175
176 FIELD1 of DIMENSION2 character is "charset - 0x8F" for official
177 charset and "charset - 0xE0" for private charset. So, the range of
178 FIELD1 of DIMENSION2 character is 0x01..0x1E.
179
180 -----------------------------------------------------------------------
181 charset FIELD1 (5-bit) FIELD2 (7-bit) FIELD3 (7-bit)
182 -----------------------------------------------------------------------
183 ASCII 0 0 POSITION-CODE-1
184 DIMENSION1 0 charset - 0x70 POSITION-CODE-1
185 DIMENSION2(o) charset - 0x8F POSITION-CODE-1 POSITION-CODE-2
186 DIMENSION2(p) charset - 0xE0 POSITION-CODE-1 POSITION-CODE-2
187 -----------------------------------------------------------------------
188 "(o)": official, "(p)": private
189 -----------------------------------------------------------------------
190
191*/
192
193/*** GENERAL NOTE on COMPOSITE CHARACTER ***
194
195 A composite character is a character composed from several (up to
196 16) non-composite characters (components). Although each components
197 can belong to any charset, a composite character itself belongs to
198 the charset `charset-composition' and is assigned a special
199 leading-code `LEADING_CODE_COMPOSITION' for multi-byte form. See
200 the document "2. Emacs internal format handlers" in `coding.c' for
201 more detail about multi-byte form.
202
203 A character code of composite character has special format. In the
204 above document, FIELD1 of a composite character is 0x1F. Each
205 composite character is assigned a sequential number CMPCHAR-ID.
206 FIELD2 and FIELD3 are combined to make 14bits field for holding
207 CMPCHAR-ID, which means that Emacs can handle at most 2^14 (= 16384)
208 composite characters at once.
209
210 -----------------------------------------------------------------------
211 charset FIELD1 (5-bit) FIELD2&3 (14-bit)
212 -----------------------------------------------------------------------
213 CHARSET-COMPOSITION 0x1F CMPCHAR-ID
214 -----------------------------------------------------------------------
215
216 Emacs assigns CMPCHAR-ID to a composite character only when it
217 requires the character code of the composite character (e.g. while
218 displaying the composite character).
219
220*/
221
222/* Masks of each field of character code. */
223#define CHAR_FIELD1_MASK (0x1F << 14)
224#define CHAR_FIELD2_MASK (0x7F << 7)
225#define CHAR_FIELD3_MASK 0x7F
226
227/* Macros to access each field of character C. */
228#define CHAR_FIELD1(c) (((c) & CHAR_FIELD1_MASK) >> 14)
229#define CHAR_FIELD2(c) (((c) & CHAR_FIELD2_MASK) >> 7)
230#define CHAR_FIELD3(c) ((c) & CHAR_FIELD3_MASK)
231
232/* Minimum character code of character of each DIMENSION. */
233#define MIN_CHAR_OFFICIAL_DIMENSION1 \
234 ((MIN_CHARSET_OFFICIAL_DIMENSION1 - 0x70) << 7)
235#define MIN_CHAR_PRIVATE_DIMENSION1 \
236 ((MIN_CHARSET_PRIVATE_DIMENSION1 - 0x70) << 7)
237#define MIN_CHAR_OFFICIAL_DIMENSION2 \
238 ((MIN_CHARSET_OFFICIAL_DIMENSION2 - 0x8F) << 14)
239#define MIN_CHAR_PRIVATE_DIMENSION2 \
240 ((MIN_CHARSET_PRIVATE_DIMENSION2 - 0xE0) << 14)
241#define MIN_CHAR_COMPOSITION \
242 (0x1F << 14)
243
244/* 1 if C is an ASCII character, else 0. */
245#define SINGLE_BYTE_CHAR_P(c) ((c) < 0x100)
246/* 1 if C is an composite character, else 0. */
247#define COMPOSITE_CHAR_P(c) ((c) >= MIN_CHAR_COMPOSITION)
248
249/* A char-table containing information of each character set.
250
251 Unlike ordinary char-tables, this doesn't contain any nested table.
252 Only the top level elements are used. Each element is a vector of
253 the following information:
254 CHARSET-ID, BYTES, DIMENSION, CHARS, WIDTH, DIRECTION,
255 LEADING-CODE-BASE, LEADING-CODE-EXT,
256 ISO-FINAL-CHAR, ISO-GRAPHIC-PLANE,
257 REVERSE-CHARSET, SHORT-NAME, LONG-NAME, DESCRIPTION,
258 PLIST.
259
260 CHARSET-ID (integer) is the identification number of the charset.
261
262 BYTE (integer) is the length of multi-byte form of a character in
263 the charset: one of 1, 2, 3, and 4.
264
265 DIMENSION (integer) is the number of bytes to represent a character: 1 or 2.
266
267 CHARS (integer) is the number of characters in a dimension: 94 or 96.
268
269 WIDTH (integer) is the number of columns a character in the charset
270 occupies on the screen: one of 0, 1, and 2.
271
272 DIRECTION (integer) is the rendering direction of characters in the
273 charset when rendering. If 0, render from right to left, else
274 render from left to right.
275
276 LEADING-CODE-BASE (integer) is the base leading-code for the
277 charset.
278
279 LEADING-CODE-EXT (integer) is the extended leading-code for the
280 charset. All charsets of less than 0xA0 has the value 0.
281
282 ISO-FINAL-CHAR (character) is the final character of the
283 corresponding ISO 2022 charset.
284
285 ISO-GRAPHIC-PLANE (integer) is the graphic plane to be invoked
286 while encoding to variants of ISO 2022 coding system, one of the
287 following: 0/graphic-plane-left(GL), 1/graphic-plane-right(GR).
288
289 REVERSE-CHARSET (integer) is the charset which differs only in
290 LEFT-TO-RIGHT value from the charset. If there's no such a
291 charset, the value is -1.
292
293 SHORT-NAME (string) is the short name to refer to the charset.
294
295 LONG-NAME (string) is the long name to refer to the charset.
296
297 DESCRIPTION (string) is the description string of the charset.
298
299 PLIST (property list) may contain any type of information a user
300 want to put and get by functions `put-charset-property' and
301 `get-charset-property' respectively. */
302extern Lisp_Object Vcharset_table;
303
304/* Macros to access various information of CHARSET in Vcharset_table.
305 We provide these macros for efficiency. No range check of CHARSET. */
306
307/* Return entry of CHARSET (lisp integer) in Vcharset_table. */
308#define CHARSET_TABLE_ENTRY(charset) \
309 XCHAR_TABLE (Vcharset_table)->contents[charset]
310
311/* Return information INFO-IDX of CHARSET. */
312#define CHARSET_TABLE_INFO(charset, info_idx) \
313 XVECTOR (CHARSET_TABLE_ENTRY (charset))->contents[info_idx]
314
315#define CHARSET_ID_IDX (0)
316#define CHARSET_BYTES_IDX (1)
317#define CHARSET_DIMENSION_IDX (2)
318#define CHARSET_CHARS_IDX (3)
319#define CHARSET_WIDTH_IDX (4)
320#define CHARSET_DIRECTION_IDX (5)
321#define CHARSET_LEADING_CODE_BASE_IDX (6)
322#define CHARSET_LEADING_CODE_EXT_IDX (7)
323#define CHARSET_ISO_FINAL_CHAR_IDX (8)
324#define CHARSET_ISO_GRAPHIC_PLANE_IDX (9)
325#define CHARSET_REVERSE_CHARSET_IDX (10)
326#define CHARSET_SHORT_NAME_IDX (11)
327#define CHARSET_LONG_NAME_IDX (12)
328#define CHARSET_DESCRIPTION_IDX (13)
329#define CHARSET_PLIST_IDX (14)
330/* Size of a vector of each entry of Vcharset_table. */
331#define CHARSET_MAX_IDX (15)
332
333/* And several more macros to be used frequently. */
334#define CHARSET_BYTES(charset) \
335 XFASTINT (CHARSET_TABLE_INFO (charset, CHARSET_BYTES_IDX))
336#define CHARSET_DIMENSION(charset) \
337 XFASTINT (CHARSET_TABLE_INFO (charset, CHARSET_DIMENSION_IDX))
338#define CHARSET_CHARS(charset) \
339 XFASTINT (CHARSET_TABLE_INFO (charset, CHARSET_CHARS_IDX))
340#define CHARSET_WIDTH(charset) \
341 XFASTINT (CHARSET_TABLE_INFO (charset, CHARSET_WIDTH_IDX))
342#define CHARSET_DIRECTION(charset) \
343 XFASTINT (CHARSET_TABLE_INFO (charset, CHARSET_DIRECTION_IDX))
344#define CHARSET_LEADING_CODE_BASE(charset) \
345 XFASTINT (CHARSET_TABLE_INFO (charset, CHARSET_LEADING_CODE_BASE_IDX))
346#define CHARSET_LEADING_CODE_EXT(charset) \
347 XFASTINT (CHARSET_TABLE_INFO (charset, CHARSET_LEADING_CODE_EXT_IDX))
348#define CHARSET_ISO_FINAL_CHAR(charset) \
349 XFASTINT (CHARSET_TABLE_INFO (charset, CHARSET_ISO_FINAL_CHAR_IDX))
350#define CHARSET_ISO_GRAPHIC_PLANE(charset) \
351 XFASTINT (CHARSET_TABLE_INFO (charset, CHARSET_ISO_GRAPHIC_PLANE_IDX))
352#define CHARSET_REVERSE_CHARSET(charset) \
353 XINT (CHARSET_TABLE_INFO (charset, CHARSET_REVERSE_CHARSET_IDX))
354
355/* Macros to specify direction of a charset. */
356#define CHARSET_DIRECTION_LEFT_TO_RIGHT 0
357#define CHARSET_DIRECTION_RIGHT_TO_LEFT 1
358
359/* A vector of charset symbol indexed by charset-id. This is used
360 only for returning charset symbol from C functions. */
361extern Lisp_Object Vcharset_symbol_table;
362
363/* Return symbol of CHARSET. */
364#define CHARSET_SYMBOL(charset) \
365 XVECTOR (Vcharset_symbol_table)->contents[charset]
366
367/* 1 if CHARSET is valid, else 0. */
368#define CHARSET_VALID_P(charset) \
369 ((charset) == 0 \
370 || ((charset) >= 0x80 && (charset) <= MAX_CHARSET_OFFICIAL_DIMENSION2) \
371 || ((charset) >= MIN_CHARSET_PRIVATE_DIMENSION1 && (charset) < MAX_CHARSET))
372
373/* 1 if CHARSET is already defined, else 0. */
374#define CHARSET_DEFINED_P(charset) \
375 (((charset) >= 0) && ((charset) < MAX_CHARSET) \
376 && !NILP (CHARSET_TABLE_ENTRY (charset)))
377
378/* Since the information CHARSET-BYTES and CHARSET-WIDTH of
379 Vcharset_table can be retrieved only from the first byte of
380 multi-byte form (an ASCII code or a base leading-code), we provide
381 here tables to be used by macros BYTES_BY_CHAR_HEAD and
382 WIDTH_BY_CHAR_HEAD for faster information retrieval. */
383extern int bytes_by_char_head[256];
384extern int width_by_char_head[256];
385
386#define BYTES_BY_CHAR_HEAD(char_head) bytes_by_char_head[char_head]
387#define WIDTH_BY_CHAR_HEAD(char_head) width_by_char_head[char_head]
388
389/* Charset of the character C. */
390#define CHAR_CHARSET(c) \
391 (SINGLE_BYTE_CHAR_P (c) \
392 ? CHARSET_ASCII \
393 : ((c) < MIN_CHAR_OFFICIAL_DIMENSION2 \
394 ? CHAR_FIELD2 (c) + 0x70 \
395 : ((c) < MIN_CHAR_PRIVATE_DIMENSION2 \
396 ? CHAR_FIELD1 (c) + 0x8F \
397 : ((c) < MIN_CHAR_COMPOSITION \
398 ? CHAR_FIELD1 (c) + 0xE0 \
399 : CHARSET_COMPOSITION))))
400
401/* Return charset at the place pointed by P. */
402#define CHARSET_AT(p) \
403 (*(p) < 0x80 \
404 ? CHARSET_ASCII \
405 : (*(p) == LEADING_CODE_COMPOSITION \
406 ? CHARSET_COMPOSITION \
407 : (*(p) < LEADING_CODE_PRIVATE_11 \
408 ? (int)*(p) \
409 : (*(p) <= LEADING_CODE_PRIVATE_22 \
410 ? (int)*((p) + 1) \
411 : -1))))
412
413/* Same as `CHARSET_AT ()' but perhaps runs faster because of an
414 additional argument C which is the code (byte) at P. */
415#define FIRST_CHARSET_AT(p, c) \
416 ((c) < 0x80 \
417 ? CHARSET_ASCII \
418 : ((c) == LEADING_CODE_COMPOSITION \
419 ? CHARSET_COMPOSITION \
420 : ((c) < LEADING_CODE_PRIVATE_11 \
421 ? (int)(c) \
422 : ((c) <= LEADING_CODE_PRIVATE_22 \
423 ? (int)*((p) + 1) \
424 : -1))))
425
426/* Check if two characters C1 and C2 belong to the same charset.
427 Always return 0 for composite characters. */
428#define SAME_CHARSET_P(c1, c2) \
429 (c1 < MIN_CHAR_COMPOSITION \
430 && (SINGLE_BYTE_CHAR_P (c1) \
431 ? SINGLE_BYTE_CHAR_P (c2) \
432 : (c1 < MIN_CHAR_OFFICIAL_DIMENSION2 \
433 ? (c1 & CHAR_FIELD2_MASK) == (c2 & CHAR_FIELD2_MASK) \
434 : (c1 & CHAR_FIELD1_MASK) == (c2 & CHAR_FIELD1_MASK))))
435
436/* Return a non-ASCII character of which charset is CHARSET and
437 position-codes are C1 and C2. DIMENSION1 character ignores C2. */
438#define MAKE_NON_ASCII_CHAR(charset, c1, c2) \
439 ((charset) == CHARSET_COMPOSITION \
440 ? MAKE_COMPOSITE_CHAR (((c1) << 7) + (c2)) \
441 : (CHARSET_DIMENSION (charset) == 1 \
442 ? (((charset) - 0x70) << 7) | (c1) \
443 : ((charset) < MIN_CHARSET_PRIVATE_DIMENSION2 \
444 ? (((charset) - 0x8F) << 14) | ((c1) << 7) | (c2) \
445 : (((charset) - 0xE0) << 14) | ((c1) << 7) | (c2))))
446
447/* Return a composite character of which CMPCHAR-ID is ID. */
448#define MAKE_COMPOSITE_CHAR(id) (MIN_CHAR_COMPOSITION + (id))
449
450/* Return CMPCHAR-ID of a composite character C. */
451#define COMPOSITE_CHAR_ID(c) ((c) - MIN_CHAR_COMPOSITION)
452
453/* Return a character of which charset is CHARSET and position-codes
454 are C1 and C2. DIMENSION1 character ignores C2. */
455#define MAKE_CHAR(charset, c1, c2) \
456 ((charset) == CHARSET_ASCII \
457 ? (c1) \
458 : MAKE_NON_ASCII_CHAR ((charset), (c1) & 0x7F, (c2) & 0x7F))
459
460/* The charset of non-ASCII character C is set to CHARSET, and the
461 position-codes of C are set to C1 and C2. C2 of DIMENSION1 character
462 is 0. */
463#define SPLIT_NON_ASCII_CHAR(c, charset, c1, c2) \
464 ((c) < MIN_CHAR_OFFICIAL_DIMENSION2 \
465 ? (charset = CHAR_FIELD2 (c) + 0x70, \
466 c1 = CHAR_FIELD3 (c), \
467 c2 = 0) \
468 : (charset = ((c) < MIN_CHAR_COMPOSITION \
469 ? (CHAR_FIELD1 (c) \
470 + ((c) < MIN_CHAR_PRIVATE_DIMENSION2 ? 0x8F : 0xE0)) \
471 : CHARSET_COMPOSITION), \
472 c1 = CHAR_FIELD2 (c), \
473 c2 = CHAR_FIELD3 (c)))
474
475/* The charset of character C is set to CHARSET, and the
476 position-codes of C are set to C1 and C2. C2 of DIMENSION1 character
477 is 0. */
478#define SPLIT_CHAR(c, charset, c1, c2) \
479 (SINGLE_BYTE_CHAR_P (c) \
480 ? charset = CHARSET_ASCII, c1 = (c), c2 = 0 \
481 : SPLIT_NON_ASCII_CHAR (c, charset, c1, c2))
482
483/* The charset of the character at STR is set to CHARSET, and the
484 position-codes are set to C1 and C2. C2 of DIMENSION1 character is 0.
485 If the character is a composite character, the upper 7-bit and
486 lower 7-bit of CMPCHAR-ID are set in C1 and C2 respectively. No
487 range checking. */
488#define SPLIT_STRING(str, len, charset, c1, c2) \
489 ((BYTES_BY_CHAR_HEAD ((unsigned char) *(str)) < 2 \
490 || BYTES_BY_CHAR_HEAD ((unsigned char) *(str)) > len \
491 || split_non_ascii_string (str, len, &charset, &c1, &c2, 0) < 0) \
492 ? c1 = *(str), charset = CHARSET_ASCII \
493 : charset)
494
495#define MAX_CHARSET 0xFF
496
497/* Mapping table from ISO2022's charset (specified by DIMENSION,
498 CHARS, and FINAL_CHAR) to Emacs' charset. Should be accessed by
499 macro ISO_CHARSET_TABLE (DIMENSION, CHARS, FINAL_CHAR). */
500extern int iso_charset_table[2][2][128];
501
502#define ISO_CHARSET_TABLE(dimension, chars, final_char) \
503 iso_charset_table[XINT (dimension) - 1][XINT (chars) > 94][XINT (final_char)]
504
505#define BASE_LEADING_CODE_P(c) (BYTES_BY_CHAR_HEAD ((unsigned char) (c)) > 1)
506
507/* The following two macros CHAR_STRING and STRING_CHAR are the main
508 entry points to convert between Emacs two types of character
509 representations: multi-byte form and single-word form (character
510 code). */
511
512/* Set STR a pointer to the multi-byte form of the character C. If C
513 is not a composite character, the multi-byte form is set in WORKBUF
514 and STR points WORKBUF. The caller should allocate at least 4-byte
515 area at WORKBUF in advance. Returns the length of the multi-byte
516 form. */
517
518#define CHAR_STRING(c, workbuf, str) \
519 (SINGLE_BYTE_CHAR_P (c) \
520 ? *(str = workbuf) = (unsigned char)(c), 1 \
521 : non_ascii_char_to_string (c, workbuf, &str))
522
523/* Return a character code of the character of which multi-byte form
524 is at STR and the length is LEN. If STR doesn't contain valid
525 multi-byte form, only the first byte in STR is returned. */
526
527#define STRING_CHAR(str, len) \
528 ((BYTES_BY_CHAR_HEAD ((unsigned char) *(str)) == 1 \
529 || BYTES_BY_CHAR_HEAD ((unsigned char) *(str)) > (len)) \
530 ? (unsigned char) *(str) \
531 : string_to_non_ascii_char (str, len, 0))
532
533/* This is like STRING_CHAR but the third arg ACTUAL_LEN is set to
534 the length of the multi-byte form. Just to know the length, use
535 MULTIBYTE_FORM_LENGTH. */
536
537#define STRING_CHAR_AND_LENGTH(str, len, actual_len) \
538 ((BYTES_BY_CHAR_HEAD ((unsigned char) *(str)) == 1 \
539 || BYTES_BY_CHAR_HEAD ((unsigned char) *(str)) > (len)) \
540 ? (actual_len = 1), (unsigned char) *(str) \
541 : string_to_non_ascii_char (str, len, &actual_len))
542
543/* Return the length of the multi-byte form at string STR of length LEN. */
544
545#define MULTIBYTE_FORM_LENGTH(str, len) \
546 ((BYTES_BY_CHAR_HEAD (*(unsigned char *)(str)) == 1 \
547 || BYTES_BY_CHAR_HEAD (*(unsigned char *)(str)) > (len)) \
548 ? 1 \
549 : multibyte_form_length (str, len))
550
551/* Set C a (possibly multibyte) character at P. P points into a
552 string which is the virtual concatenation of STR1 (which ends at
553 END1) or STR2 (which ends at END2). */
554
555#define GET_CHAR_AFTER_2(c, p, str1, end1, str2, end2) \
556 do { \
557 const char *dtemp = (p) == (end1) ? (str2) : (p); \
558 const char *dlimit = ((p) >= (str1) && (p) < (end1)) ? (end1) : (end2); \
559 c = STRING_CHAR (dtemp, dlimit - dtemp); \
560 } while (0)
561
562/* Set C a (possibly multibyte) character before P. P points into a
563 string which is the virtual concatenation of STR1 (which ends at
564 END1) or STR2 (which ends at END2). */
565
566#define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2) \
567 do { \
568 const char *dtemp = (p); \
569 const char *dlimit = ((p) > (str2) && (p) <= (end2)) ? (str2) : (str1); \
570 while (dtemp-- > dlimit && (unsigned char) *dtemp >= 0xA0); \
571 c = STRING_CHAR (dtemp, p - dtemp); \
572 } while (0)
573
574#ifdef emacs
575
576/* Increase the buffer point POS of the current buffer to the next
577 character boundary. This macro relies on the fact that *GPT_ADDR
578 and *Z_ADDR are always accessible and the values are '\0'. No
579 range checking of POS. */
580#define INC_POS(pos) \
581 do { \
582 unsigned char *p = POS_ADDR (pos) + 1; \
583 pos++; \
584 while (!CHAR_HEAD_P (p)) p++, pos++; \
585 } while (0)
586
587/* Decrease the buffer point POS of the current buffer to the previous
588 character boundary. No range checking of POS. */
589#define DEC_POS(pos) \
590 do { \
591 unsigned char *p, *p_min; \
592 if (--pos < GPT) \
593 p = BEG_ADDR + pos - 1, p_min = BEG_ADDR; \
594 else \
595 p = BEG_ADDR + GAP_SIZE + pos - 1, p_min = GAP_END_ADDR; \
596 while (p > p_min && !CHAR_HEAD_P (p)) p--, pos--; \
597 } while (0)
598
599#endif /* emacs */
600
601/* Maximum counts of components in one composite character. */
602#define MAX_COMPONENT_COUNT 16
603
604/* Structure to hold information of a composite character. */
605struct cmpchar_info {
606 /* Byte length of the composite character. */
607 int len;
608
609 /* Multi-byte form of the composite character. */
610 unsigned char *data;
611
612 /* Length of glyph codes. */
613 int glyph_len;
614
615 /* Width of the overall glyph of the composite character. */
616 int width;
617
618 /* Pointer to an array of glyph codes of the composite character.
619 This actually contains only character code, no face. */
620 GLYPH *glyph;
621
622 /* Pointer to an array of composition rules. The value has the form:
623 (0xA0 + ((GLOBAL-REF-POINT << 2) | NEW-REF-POINT))
624 where each XXX-REF-POINT is 0..8. */
625 unsigned char *cmp_rule;
626
627 /* Pointer to an array of x-axis offset of left edge of glyphs
628 relative to the left of of glyph[0] except for the first element
629 which is the absolute offset from the left edge of overall glyph.
630 The actual pixel offset should be calculated by multiplying each
631 frame's one column width by this value:
632 (i.e. FONT_WIDTH (f->output_data.x->font) * col_offset[N]). */
633 float *col_offset;
634
635 /* Work slot used by `dumpglyphs' (xterm.c). */
636 int face_work;
637};
638
639/* Table of pointers to the structure `cmpchar_info' indexed by
640 CMPCHAR-ID. */
641extern struct cmpchar_info **cmpchar_table;
642/* Number of the current composite characters. */
643extern int n_cmpchars;
644
645/* This is the maximum length of multi-byte form. */
646#define MAX_LENGTH_OF_MULTI_BYTE_FORM (MAX_COMPONENT_COUNT * 6)
647
648#endif /* _CHARSET_H */
649
diff --git a/src/coding.c b/src/coding.c
new file mode 100644
index 00000000000..95bbd26fef9
--- /dev/null
+++ b/src/coding.c
@@ -0,0 +1,3520 @@
1/* Coding system handler (conversion, detection, and etc).
2 Ver.1.0.
3
4 Copyright (C) 1995 Free Software Foundation, Inc.
5 Copyright (C) 1995 Electrotechnical Laboratory, JAPAN.
6
7 This program is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2, or (at your option)
10 any later version.
11
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with this program; if not, write to the Free Software
19 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
20
21/*** TABLE OF CONTENTS ***
22
23 1. Preamble
24 2. Emacs' internal format handlers
25 3. ISO2022 handlers
26 4. Shift-JIS and BIG5 handlers
27 5. End-of-line handlers
28 6. C library functions
29 7. Emacs Lisp library functions
30 8. Post-amble
31
32*/
33
34/*** GENERAL NOTE on CODING SYSTEM ***
35
36 Coding system is an encoding mechanism of one or more character
37 sets. Here's a list of coding systems which Emacs can handle. When
38 we say "decode", it means converting some other coding system to
39 Emacs' internal format, and when we say "encode", it means
40 converting Emacs' internal format to some other coding system.
41
42 0. Emacs' internal format
43
44 Emacs itself holds a multi-lingual character in a buffer and a string
45 in a special format. Details are described in the section 2.
46
47 1. ISO2022
48
49 The most famous coding system for multiple character sets. X's
50 Compound Text, various EUCs (Extended Unix Code), and such coding
51 systems used in Internet communication as ISO-2022-JP are all
52 variants of ISO2022. Details are described in the section 3.
53
54 2. SJIS (or Shift-JIS or MS-Kanji-Code)
55
56 A coding system to encode character sets: ASCII, JISX0201, and
57 JISX0208. Widely used for PC's in Japan. Details are described in
58 the section 4.
59
60 3. BIG5
61
62 A coding system to encode character sets: ASCII and Big5. Widely
63 used by Chinese (mainly in Taiwan and Hong Kong). Details are
64 described in the section 4. In this file, when written as "BIG5"
65 (all uppercase), it means the coding system, and when written as
66 "Big5" (capitalized), it means the character set.
67
68 4. Else
69
70 If a user want to read/write a text encoded in a coding system not
71 listed above, he can supply a decoder and an encoder for it in CCL
72 (Code Conversion Language) programs. Emacs executes the CCL program
73 while reading/writing.
74
75 Emacs represent a coding-system by a Lisp symbol that has a property
76 `coding-system'. But, before actually using the coding-system, the
77 information about it is set in a structure of type `struct
78 coding_system' for rapid processing. See the section 6 for more
79 detail.
80
81*/
82
83/*** GENERAL NOTES on END-OF-LINE FORMAT ***
84
85 How end-of-line of a text is encoded depends on a system. For
86 instance, Unix's format is just one byte of `line-feed' code,
87 whereas DOS's format is two bytes sequence of `carriage-return' and
88 `line-feed' codes. MacOS's format is one byte of `carriage-return'.
89
90 Since how characters in a text is encoded and how end-of-line is
91 encoded is independent, any coding system described above can take
92 any format of end-of-line. So, Emacs has information of format of
93 end-of-line in each coding-system. See the section 6 for more
94 detail.
95
96*/
97
98/*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
99
100 These functions check if a text between SRC and SRC_END is encoded
101 in the coding system category XXX. Each returns an integer value in
102 which appropriate flag bits for the category XXX is set. The flag
103 bits are defined in macros CODING_CATEGORY_MASK_XXX. Below is the
104 template of these functions. */
105#if 0
106int
107detect_coding_internal (src, src_end)
108 unsigned char *src, *src_end;
109{
110 ...
111}
112#endif
113
114/*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
115
116 These functions decode SRC_BYTES length text at SOURCE encoded in
117 CODING to Emacs' internal format. The resulting text goes to a
118 place pointed by DESTINATION, the length of which should not exceed
119 DST_BYTES. The bytes actually processed is returned as *CONSUMED.
120 The return value is the length of the decoded text. Below is a
121 template of these functions. */
122#if 0
123decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed)
124 struct coding_system *coding;
125 unsigned char *source, *destination;
126 int src_bytes, dst_bytes;
127 int *consumed;
128{
129 ...
130}
131#endif
132
133/*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
134
135 These functions encode SRC_BYTES length text at SOURCE of Emacs
136 internal format to CODING. The resulting text goes to a place
137 pointed by DESTINATION, the length of which should not exceed
138 DST_BYTES. The bytes actually processed is returned as *CONSUMED.
139 The return value is the length of the encoded text. Below is a
140 template of these functions. */
141#if 0
142encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed)
143 struct coding_system *coding;
144 unsigned char *source, *destination;
145 int src_bytes, dst_bytes;
146 int *consumed;
147{
148 ...
149}
150#endif
151
152/*** COMMONLY USED MACROS ***/
153
154/* The following three macros ONE_MORE_BYTE, TWO_MORE_BYTES, and
155 THREE_MORE_BYTES safely get one, two, and three bytes from the
156 source text respectively. If there are not enough bytes in the
157 source, they jump to `label_end_of_loop'. The caller should set
158 variables `src' and `src_end' to appropriate areas in advance. */
159
160#define ONE_MORE_BYTE(c1) \
161 do { \
162 if (src < src_end) \
163 c1 = *src++; \
164 else \
165 goto label_end_of_loop; \
166 } while (0)
167
168#define TWO_MORE_BYTES(c1, c2) \
169 do { \
170 if (src + 1 < src_end) \
171 c1 = *src++, c2 = *src++; \
172 else \
173 goto label_end_of_loop; \
174 } while (0)
175
176#define THREE_MORE_BYTES(c1, c2, c3) \
177 do { \
178 if (src + 2 < src_end) \
179 c1 = *src++, c2 = *src++, c3 = *src++; \
180 else \
181 goto label_end_of_loop; \
182 } while (0)
183
184/* The following three macros DECODE_CHARACTER_ASCII,
185 DECODE_CHARACTER_DIMENSION1, and DECODE_CHARACTER_DIMENSION2 put
186 the multi-byte form of a character of each class at the place
187 pointed by `dst'. The caller should set the variable `dst' to
188 point to an appropriate area and the variable `coding' to point to
189 the coding-system of the currently decoding text in advance. */
190
191/* Decode one ASCII character C. */
192
193#define DECODE_CHARACTER_ASCII(c) \
194 do { \
195 if (COMPOSING_P (coding->composing)) \
196 *dst++ = 0xA0, *dst++ = (c) | 0x80; \
197 else \
198 *dst++ = (c); \
199 } while (0)
200
201/* Decode one DIMENSION1 character of which charset is CHARSET and
202 position-code is C. */
203
204#define DECODE_CHARACTER_DIMENSION1(charset, c) \
205 do { \
206 unsigned char leading_code = CHARSET_LEADING_CODE_BASE (charset); \
207 if (COMPOSING_P (coding->composing)) \
208 *dst++ = leading_code + 0x20; \
209 else \
210 *dst++ = leading_code; \
211 if (leading_code = CHARSET_LEADING_CODE_EXT (charset)) \
212 *dst++ = leading_code; \
213 *dst++ = (c) | 0x80; \
214 } while (0)
215
216/* Decode one DIMENSION2 character of which charset is CHARSET and
217 position-codes are C1 and C2. */
218
219#define DECODE_CHARACTER_DIMENSION2(charset, c1, c2) \
220 do { \
221 DECODE_CHARACTER_DIMENSION1 (charset, c1); \
222 *dst++ = (c2) | 0x80; \
223 } while (0)
224
225
226/*** 1. Preamble ***/
227
228#include <stdio.h>
229
230#ifdef emacs
231
232#include <config.h>
233#include "lisp.h"
234#include "buffer.h"
235#include "charset.h"
236#include "ccl.h"
237#include "coding.h"
238#include "window.h"
239
240#else /* not emacs */
241
242#include "mulelib.h"
243
244#endif /* not emacs */
245
246Lisp_Object Qcoding_system, Qeol_type;
247Lisp_Object Qbuffer_file_coding_system;
248Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
249
250extern Lisp_Object Qinsert_file_contents, Qwrite_region;
251Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
252Lisp_Object Qstart_process, Qopen_network_stream;
253Lisp_Object Qtarget_idx;
254
255/* Mnemonic character of each format of end-of-line. */
256int eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
257/* Mnemonic character to indicate format of end-of-line is not yet
258 decided. */
259int eol_mnemonic_undecided;
260
261#ifdef emacs
262
263Lisp_Object Qcoding_system_vector, Qcoding_system_p, Qcoding_system_error;
264
265/* Coding-systems are handed between Emacs Lisp programs and C internal
266 routines by the following three variables. */
267/* Coding-system for reading files and receiving data from process. */
268Lisp_Object Vcoding_system_for_read;
269/* Coding-system for writing files and sending data to process. */
270Lisp_Object Vcoding_system_for_write;
271/* Coding-system actually used in the latest I/O. */
272Lisp_Object Vlast_coding_system_used;
273
274/* Coding-system of what terminal accept for displaying. */
275struct coding_system terminal_coding;
276
277/* Coding-system of what is sent from terminal keyboard. */
278struct coding_system keyboard_coding;
279
280Lisp_Object Vcoding_system_alist;
281
282#endif /* emacs */
283
284Lisp_Object Qcoding_category_index;
285
286/* List of symbols `coding-category-xxx' ordered by priority. */
287Lisp_Object Vcoding_category_list;
288
289/* Table of coding-systems currently assigned to each coding-category. */
290Lisp_Object coding_category_table[CODING_CATEGORY_IDX_MAX];
291
292/* Table of names of symbol for each coding-category. */
293char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
294 "coding-category-internal",
295 "coding-category-sjis",
296 "coding-category-iso-7",
297 "coding-category-iso-8-1",
298 "coding-category-iso-8-2",
299 "coding-category-iso-else",
300 "coding-category-big5",
301 "coding-category-binary"
302};
303
304/* Alist of charsets vs the alternate charsets. */
305Lisp_Object Valternate_charset_table;
306
307/* Alist of charsets vs revision number. */
308Lisp_Object Vcharset_revision_alist;
309
310
311/*** 2. Emacs internal format handlers ***/
312
313/* Emacs' internal format for encoding multiple character sets is a
314 kind of multi-byte encoding, i.e. encoding a character by a sequence
315 of one-byte codes of variable length. ASCII characters and control
316 characters (e.g. `tab', `newline') are represented by one-byte as
317 is. It takes the range 0x00 through 0x7F. The other characters
318 are represented by a sequence of `base leading-code', optional
319 `extended leading-code', and one or two `position-code's. Length
320 of the sequence is decided by the base leading-code. Leading-code
321 takes the range 0x80 through 0x9F, whereas extended leading-code
322 and position-code take the range 0xA0 through 0xFF. See the
323 document of `charset.h' for more detail about leading-code and
324 position-code.
325
326 There's one exception in this rule. Special leading-code
327 `leading-code-composition' denotes that the following several
328 characters should be composed into one character. Leading-codes of
329 components (except for ASCII) are added 0x20. An ASCII character
330 component is represented by a 2-byte sequence of `0xA0' and
331 `ASCII-code + 0x80'. See also the document in `charset.h' for the
332 detail of composite character. Hence, we can summarize the code
333 range as follows:
334
335 --- CODE RANGE of Emacs' internal format ---
336 (character set) (range)
337 ASCII 0x00 .. 0x7F
338 ELSE (1st byte) 0x80 .. 0x9F
339 (rest bytes) 0xA0 .. 0xFF
340 ---------------------------------------------
341
342 */
343
344enum emacs_code_class_type emacs_code_class[256];
345
346/* Go to the next statement only if *SRC is accessible and the code is
347 greater than 0xA0. */
348#define CHECK_CODE_RANGE_A0_FF \
349 do { \
350 if (src >= src_end) \
351 goto label_end_of_switch; \
352 else if (*src++ < 0xA0) \
353 return 0; \
354 } while (0)
355
356/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
357 Check if a text is encoded in Emacs' internal format. If it is,
358 return CODING_CATEGORY_MASK_INTERNAL, else return 0. */
359
360int
361detect_coding_internal (src, src_end)
362 unsigned char *src, *src_end;
363{
364 unsigned char c;
365 int composing = 0;
366
367 while (src < src_end)
368 {
369 c = *src++;
370
371 if (composing)
372 {
373 if (c < 0xA0)
374 composing = 0;
375 else
376 c -= 0x20;
377 }
378
379 switch (emacs_code_class[c])
380 {
381 case EMACS_ascii_code:
382 case EMACS_linefeed_code:
383 break;
384
385 case EMACS_control_code:
386 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
387 return 0;
388 break;
389
390 case EMACS_invalid_code:
391 return 0;
392
393 case EMACS_leading_code_composition: /* c == 0x80 */
394 if (composing)
395 CHECK_CODE_RANGE_A0_FF;
396 else
397 composing = 1;
398 break;
399
400 case EMACS_leading_code_4:
401 CHECK_CODE_RANGE_A0_FF;
402 /* fall down to check it two more times ... */
403
404 case EMACS_leading_code_3:
405 CHECK_CODE_RANGE_A0_FF;
406 /* fall down to check it one more time ... */
407
408 case EMACS_leading_code_2:
409 CHECK_CODE_RANGE_A0_FF;
410 break;
411
412 default:
413 label_end_of_switch:
414 break;
415 }
416 }
417 return CODING_CATEGORY_MASK_INTERNAL;
418}
419
420
421/*** 3. ISO2022 handlers ***/
422
423/* The following note describes the coding system ISO2022 briefly.
424 Since the intension of this note is to help understanding of the
425 programs in this file, some parts are NOT ACCURATE or OVERLY
426 SIMPLIFIED. For the thorough understanding, please refer to the
427 original document of ISO2022.
428
429 ISO2022 provides many mechanisms to encode several character sets
430 in 7-bit and 8-bit environment. If one choose 7-bite environment,
431 all text is encoded by codes of less than 128. This may make the
432 encoded text a little bit longer, but the text get more stability
433 to pass through several gateways (some of them split MSB off).
434
435 There are two kind of character set: control character set and
436 graphic character set. The former contains control characters such
437 as `newline' and `escape' to provide control functions (control
438 functions are provided also by escape sequence). The latter
439 contains graphic characters such as ' A' and '-'. Emacs recognizes
440 two control character sets and many graphic character sets.
441
442 Graphic character sets are classified into one of the following
443 four classes, DIMENSION1_CHARS94, DIMENSION1_CHARS96,
444 DIMENSION2_CHARS94, DIMENSION2_CHARS96 according to the number of
445 bytes (DIMENSION) and the number of characters in one dimension
446 (CHARS) of the set. In addition, each character set is assigned an
447 identification tag (called "final character" and denoted as <F>
448 here after) which is unique in each class. <F> of each character
449 set is decided by ECMA(*) when it is registered in ISO. Code range
450 of <F> is 0x30..0x7F (0x30..0x3F are for private use only).
451
452 Note (*): ECMA = European Computer Manufacturers Association
453
454 Here are examples of graphic character set [NAME(<F>)]:
455 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
456 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
457 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
458 o DIMENSION2_CHARS96 -- none for the moment
459
460 A code area (1byte=8bits) is divided into 4 areas, C0, GL, C1, and GR.
461 C0 [0x00..0x1F] -- control character plane 0
462 GL [0x20..0x7F] -- graphic character plane 0
463 C1 [0x80..0x9F] -- control character plane 1
464 GR [0xA0..0xFF] -- graphic character plane 1
465
466 A control character set is directly designated and invoked to C0 or
467 C1 by an escape sequence. The most common case is that ISO646's
468 control character set is designated/invoked to C0 and ISO6429's
469 control character set is designated/invoked to C1, and usually
470 these designations/invocations are omitted in a coded text. With
471 7-bit environment, only C0 can be used, and a control character for
472 C1 is encoded by an appropriate escape sequence to fit in the
473 environment. All control characters for C1 are defined the
474 corresponding escape sequences.
475
476 A graphic character set is at first designated to one of four
477 graphic registers (G0 through G3), then these graphic registers are
478 invoked to GL or GR. These designations and invocations can be
479 done independently. The most common case is that G0 is invoked to
480 GL, G1 is invoked to GR, and ASCII is designated to G0, and usually
481 these invocations and designations are omitted in a coded text.
482 With 7-bit environment, only GL can be used.
483
484 When a graphic character set of CHARS94 is invoked to GL, code 0x20
485 and 0x7F of GL area work as control characters SPACE and DEL
486 respectively, and code 0xA0 and 0xFF of GR area should not be used.
487
488 There are two ways of invocation: locking-shift and single-shift.
489 With locking-shift, the invocation lasts until the next different
490 invocation, whereas with single-shift, the invocation works only
491 for the following character and doesn't affect locking-shift.
492 Invocations are done by the following control characters or escape
493 sequences.
494
495 ----------------------------------------------------------------------
496 function control char escape sequence description
497 ----------------------------------------------------------------------
498 SI (shift-in) 0x0F none invoke G0 to GL
499 SI (shift-out) 0x0E none invoke G1 to GL
500 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
501 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
502 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 into GL
503 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 into GL
504 ----------------------------------------------------------------------
505 The first four are for locking-shift. Control characters for these
506 functions are defined by macros ISO_CODE_XXX in `coding.h'.
507
508 Designations are done by the following escape sequences.
509 ----------------------------------------------------------------------
510 escape sequence description
511 ----------------------------------------------------------------------
512 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
513 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
514 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
515 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
516 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
517 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
518 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
519 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
520 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
521 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
522 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
523 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
524 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
525 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
526 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
527 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
528 ----------------------------------------------------------------------
529
530 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
531 of dimension 1, chars 94, and final character <F>, and etc.
532
533 Note (*): Although these designations are not allowed in ISO2022,
534 Emacs accepts them on decoding, and produces them on encoding
535 CHARS96 character set in a coding system which is characterized as
536 7-bit environment, non-locking-shift, and non-single-shift.
537
538 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
539 '(' can be omitted. We call this as "short-form" here after.
540
541 Now you may notice that there are a lot of ways for encoding the
542 same multilingual text in ISO2022. Actually, there exist many
543 coding systems such as Compound Text (used in X's inter client
544 communication, ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
545 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
546 localized platforms), and all of these are variants of ISO2022.
547
548 In addition to the above, Emacs handles two more kinds of escape
549 sequences: ISO6429's direction specification and Emacs' private
550 sequence for specifying character composition.
551
552 ISO6429's direction specification takes the following format:
553 o CSI ']' -- end of the current direction
554 o CSI '0' ']' -- end of the current direction
555 o CSI '1' ']' -- start of left-to-right text
556 o CSI '2' ']' -- start of right-to-left text
557 The control character CSI (0x9B: control sequence introducer) is
558 abbreviated to the escape sequence ESC '[' in 7-bit environment.
559
560 Character composition specification takes the following format:
561 o ESC '0' -- start character composition
562 o ESC '1' -- end character composition
563 Since these are not standard escape sequences of any ISO, the use
564 of them for these meaning is restricted to Emacs only. */
565
566enum iso_code_class_type iso_code_class[256];
567
568/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
569 Check if a text is encoded in ISO2022. If it is, returns an
570 integer in which appropriate flag bits any of:
571 CODING_CATEGORY_MASK_ISO_7
572 CODING_CATEGORY_MASK_ISO_8_1
573 CODING_CATEGORY_MASK_ISO_8_2
574 CODING_CATEGORY_MASK_ISO_ELSE
575 are set. If a code which should never appear in ISO2022 is found,
576 returns 0. */
577
578int
579detect_coding_iso2022 (src, src_end)
580 unsigned char *src, *src_end;
581{
582 unsigned char graphic_register[4];
583 unsigned char c, esc_cntl;
584 int mask = (CODING_CATEGORY_MASK_ISO_7
585 | CODING_CATEGORY_MASK_ISO_8_1
586 | CODING_CATEGORY_MASK_ISO_8_2);
587 /* We may look ahead maximum 3 bytes. */
588 unsigned char *adjusted_src_end = src_end - 3;
589 int i;
590
591 for (i = 0; i < 4; i++)
592 graphic_register[i] = CHARSET_ASCII;
593
594 while (src < adjusted_src_end)
595 {
596 c = *src++;
597 switch (c)
598 {
599 case ISO_CODE_ESC:
600 if (src >= adjusted_src_end)
601 break;
602 c = *src++;
603 if (c == '$')
604 {
605 /* Designation of 2-byte character set. */
606 if (src >= adjusted_src_end)
607 break;
608 c = *src++;
609 }
610 if ((c >= ')' && c <= '+') || (c >= '-' && c <= '/'))
611 /* Designation to graphic register 1, 2, or 3. */
612 mask &= ~CODING_CATEGORY_MASK_ISO_7;
613 else if (c == 'N' || c == 'O' || c == 'n' || c == 'o')
614 return CODING_CATEGORY_MASK_ISO_ELSE;
615 break;
616
617 case ISO_CODE_SI:
618 case ISO_CODE_SO:
619 return CODING_CATEGORY_MASK_ISO_ELSE;
620
621 case ISO_CODE_CSI:
622 case ISO_CODE_SS2:
623 case ISO_CODE_SS3:
624 mask &= ~CODING_CATEGORY_MASK_ISO_7;
625 break;
626
627 default:
628 if (c < 0x80)
629 break;
630 else if (c < 0xA0)
631 return 0;
632 else
633 {
634 int count = 1;
635
636 mask &= ~CODING_CATEGORY_MASK_ISO_7;
637 while (src < adjusted_src_end && *src >= 0xA0)
638 count++, src++;
639 if (count & 1 && src < adjusted_src_end)
640 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
641 }
642 break;
643 }
644 }
645
646 return mask;
647}
648
649/* Decode a character of which charset is CHARSET and the 1st position
650 code is C1. If dimension of CHARSET 2, the 2nd position code is
651 fetched from SRC and set to C2. If CHARSET is negative, it means
652 that we are decoding ill formed text, and what we can do is just to
653 read C1 as is. */
654
655#define DECODE_ISO_CHARACTER(charset, c1) \
656 do { \
657 if ((charset) >= 0 && CHARSET_DIMENSION (charset) == 2) \
658 ONE_MORE_BYTE (c2); \
659 if (COMPOSING_HEAD_P (coding->composing)) \
660 { \
661 *dst++ = LEADING_CODE_COMPOSITION; \
662 if (COMPOSING_WITH_RULE_P (coding->composing)) \
663 /* To tell composition rules are embeded. */ \
664 *dst++ = 0xFF; \
665 coding->composing += 2; \
666 } \
667 if ((charset) < 0) \
668 *dst++ = c1; \
669 else if ((charset) == CHARSET_ASCII) \
670 DECODE_CHARACTER_ASCII (c1); \
671 else if (CHARSET_DIMENSION (charset) == 1) \
672 DECODE_CHARACTER_DIMENSION1 (charset, c1); \
673 else \
674 DECODE_CHARACTER_DIMENSION2 (charset, c1, c2); \
675 if (COMPOSING_WITH_RULE_P (coding->composing)) \
676 /* To tell a composition rule follows. */ \
677 coding->composing = COMPOSING_WITH_RULE_RULE; \
678 } while (0)
679
680/* Set designation state into CODING. */
681#define DECODE_DESIGNATION(reg, dimension, chars, final_char) \
682 do { \
683 int charset = ISO_CHARSET_TABLE (dimension, chars, final_char); \
684 Lisp_Object temp \
685 = Fassq (CHARSET_SYMBOL (charset), Valternate_charset_table); \
686 if (! NILP (temp)) \
687 charset = get_charset_id (XCONS (temp)->cdr); \
688 if (charset >= 0) \
689 { \
690 if (coding->direction == 1 \
691 && CHARSET_REVERSE_CHARSET (charset) >= 0) \
692 charset = CHARSET_REVERSE_CHARSET (charset); \
693 CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset; \
694 } \
695 } while (0)
696
697/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
698
699int
700decode_coding_iso2022 (coding, source, destination,
701 src_bytes, dst_bytes, consumed)
702 struct coding_system *coding;
703 unsigned char *source, *destination;
704 int src_bytes, dst_bytes;
705 int *consumed;
706{
707 unsigned char *src = source;
708 unsigned char *src_end = source + src_bytes;
709 unsigned char *dst = destination;
710 unsigned char *dst_end = destination + dst_bytes;
711 /* Since the maximum bytes produced by each loop is 7, we subtract 6
712 from DST_END to assure that overflow checking is necessary only
713 at the head of loop. */
714 unsigned char *adjusted_dst_end = dst_end - 6;
715 int charset;
716 /* Charsets invoked to graphic plane 0 and 1 respectively. */
717 int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
718 int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
719
720 while (src < src_end && dst < adjusted_dst_end)
721 {
722 /* SRC_BASE remembers the start position in source in each loop.
723 The loop will be exited when there's not enough source text
724 to analyze long escape sequence or 2-byte code (within macros
725 ONE_MORE_BYTE or TWO_MORE_BYTES). In that case, SRC is reset
726 to SRC_BASE before exiting. */
727 unsigned char *src_base = src;
728 unsigned char c1 = *src++, c2, cmprule;
729
730 switch (iso_code_class [c1])
731 {
732 case ISO_0x20_or_0x7F:
733 if (!coding->composing
734 && (charset0 < 0 || CHARSET_CHARS (charset0) == 94))
735 {
736 /* This is SPACE or DEL. */
737 *dst++ = c1;
738 break;
739 }
740 /* This is a graphic character, we fall down ... */
741
742 case ISO_graphic_plane_0:
743 if (coding->composing == COMPOSING_WITH_RULE_RULE)
744 {
745 /* This is a composition rule. */
746 *dst++ = c1 | 0x80;
747 coding->composing = COMPOSING_WITH_RULE_TAIL;
748 }
749 else
750 DECODE_ISO_CHARACTER (charset0, c1);
751 break;
752
753 case ISO_0xA0_or_0xFF:
754 if (charset1 < 0 || CHARSET_CHARS (charset1) == 94)
755 {
756 /* Invalid code. */
757 *dst++ = c1;
758 break;
759 }
760 /* This is a graphic character, we fall down ... */
761
762 case ISO_graphic_plane_1:
763 DECODE_ISO_CHARACTER (charset1, c1);
764 break;
765
766 case ISO_control_code:
767 /* All ISO2022 control characters in this class have the
768 same representation in Emacs internal format. */
769 *dst++ = c1;
770 break;
771
772 case ISO_carriage_return:
773 if (coding->eol_type == CODING_EOL_CR)
774 {
775 *dst++ = '\n';
776 }
777 else if (coding->eol_type == CODING_EOL_CRLF)
778 {
779 ONE_MORE_BYTE (c1);
780 if (c1 == ISO_CODE_LF)
781 *dst++ = '\n';
782 else
783 {
784 src--;
785 *dst++ = c1;
786 }
787 }
788 else
789 {
790 *dst++ = c1;
791 }
792 break;
793
794 case ISO_shift_out:
795 CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
796 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
797 break;
798
799 case ISO_shift_in:
800 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
801 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
802 break;
803
804 case ISO_single_shift_2_7:
805 case ISO_single_shift_2:
806 /* SS2 is handled as an escape sequence of ESC 'N' */
807 c1 = 'N';
808 goto label_escape_sequence;
809
810 case ISO_single_shift_3:
811 /* SS2 is handled as an escape sequence of ESC 'O' */
812 c1 = 'O';
813 goto label_escape_sequence;
814
815 case ISO_control_sequence_introducer:
816 /* CSI is handled as an escape sequence of ESC '[' ... */
817 c1 = '[';
818 goto label_escape_sequence;
819
820 case ISO_escape:
821 ONE_MORE_BYTE (c1);
822 label_escape_sequence:
823 /* Escape sequences handled by Emacs are invocation,
824 designation, direction specification, and character
825 composition specification. */
826 switch (c1)
827 {
828 case '&': /* revision of following character set */
829 ONE_MORE_BYTE (c1);
830 if (!(c1 >= '@' && c1 <= '~'))
831 {
832 goto label_invalid_escape_sequence;
833 }
834 ONE_MORE_BYTE (c1);
835 if (c1 != ISO_CODE_ESC)
836 {
837 goto label_invalid_escape_sequence;
838 }
839 ONE_MORE_BYTE (c1);
840 goto label_escape_sequence;
841
842 case '$': /* designation of 2-byte character set */
843 ONE_MORE_BYTE (c1);
844 if (c1 >= '@' && c1 <= 'B')
845 { /* designation of JISX0208.1978, GB2312.1980,
846 or JISX0208.1980 */
847 DECODE_DESIGNATION (0, 2, 94, c1);
848 }
849 else if (c1 >= 0x28 && c1 <= 0x2B)
850 { /* designation of DIMENSION2_CHARS94 character set */
851 ONE_MORE_BYTE (c2);
852 DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
853 }
854 else if (c1 >= 0x2C && c1 <= 0x2F)
855 { /* designation of DIMENSION2_CHARS96 character set */
856 ONE_MORE_BYTE (c2);
857 DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
858 }
859 else
860 {
861 goto label_invalid_escape_sequence;
862 }
863 break;
864
865 case 'n': /* invocation of locking-shift-2 */
866 CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
867 break;
868
869 case 'o': /* invocation of locking-shift-3 */
870 CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
871 break;
872
873 case 'N': /* invocation of single-shift-2 */
874 ONE_MORE_BYTE (c1);
875 charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
876 DECODE_ISO_CHARACTER (charset, c1);
877 break;
878
879 case 'O': /* invocation of single-shift-3 */
880 ONE_MORE_BYTE (c1);
881 charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
882 DECODE_ISO_CHARACTER (charset, c1);
883 break;
884
885 case '0': /* start composing without embeded rules */
886 coding->composing = COMPOSING_NO_RULE_HEAD;
887 break;
888
889 case '1': /* end composing */
890 coding->composing = COMPOSING_NO;
891 break;
892
893 case '2': /* start composing with embeded rules */
894 coding->composing = COMPOSING_WITH_RULE_HEAD;
895 break;
896
897 case '[': /* specification of direction */
898 /* For the moment, nested direction is not supported.
899 So, the value of `coding->direction' is 0 or 1: 0
900 means left-to-right, 1 means right-to-left. */
901 ONE_MORE_BYTE (c1);
902 switch (c1)
903 {
904 case ']': /* end of the current direction */
905 coding->direction = 0;
906
907 case '0': /* end of the current direction */
908 case '1': /* start of left-to-right direction */
909 ONE_MORE_BYTE (c1);
910 if (c1 == ']')
911 coding->direction = 0;
912 else
913 goto label_invalid_escape_sequence;
914 break;
915
916 case '2': /* start of right-to-left direction */
917 ONE_MORE_BYTE (c1);
918 if (c1 == ']')
919 coding->direction= 1;
920 else
921 goto label_invalid_escape_sequence;
922 break;
923
924 default:
925 goto label_invalid_escape_sequence;
926 }
927 break;
928
929 default:
930 if (c1 >= 0x28 && c1 <= 0x2B)
931 { /* designation of DIMENSION1_CHARS94 character set */
932 ONE_MORE_BYTE (c2);
933 DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
934 }
935 else if (c1 >= 0x2C && c1 <= 0x2F)
936 { /* designation of DIMENSION1_CHARS96 character set */
937 ONE_MORE_BYTE (c2);
938 DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
939 }
940 else
941 {
942 goto label_invalid_escape_sequence;
943 }
944 }
945 /* We must update these variables now. */
946 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
947 charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
948 break;
949
950 label_invalid_escape_sequence:
951 {
952 int length = src - src_base;
953
954 bcopy (src_base, dst, length);
955 dst += length;
956 }
957 }
958 continue;
959
960 label_end_of_loop:
961 coding->carryover_size = src - src_base;
962 bcopy (src_base, coding->carryover, coding->carryover_size);
963 src = src_base;
964 break;
965 }
966
967 /* If this is the last block of the text to be decoded, we had
968 better just flush out all remaining codes in the text although
969 they are not valid characters. */
970 if (coding->last_block)
971 {
972 bcopy (src, dst, src_end - src);
973 dst += (src_end - src);
974 src = src_end;
975 }
976 *consumed = src - source;
977 return dst - destination;
978}
979
980/* ISO2022 encoding staffs. */
981
982/*
983 It is not enough to say just "ISO2022" on encoding, but we have to
984 specify more details. In Emacs, each coding-system of ISO2022
985 variant has the following specifications:
986 1. Initial designation to G0 thru G3.
987 2. Allows short-form designation?
988 3. ASCII should be designated to G0 before control characters?
989 4. ASCII should be designated to G0 at end of line?
990 5. 7-bit environment or 8-bit environment?
991 6. Use locking-shift?
992 7. Use Single-shift?
993 And the following two are only for Japanese:
994 8. Use ASCII in place of JIS0201-1976-Roman?
995 9. Use JISX0208-1983 in place of JISX0208-1978?
996 These specifications are encoded in `coding->flags' as flag bits
997 defined by macros CODING_FLAG_ISO_XXX. See `coding.h' for more
998 detail.
999*/
1000
1001/* Produce codes (escape sequence) for designating CHARSET to graphic
1002 register REG. If <final-char> of CHARSET is '@', 'A', or 'B' and
1003 the coding system CODING allows, produce designation sequence of
1004 short-form. */
1005
1006#define ENCODE_DESIGNATION(charset, reg, coding) \
1007 do { \
1008 unsigned char final_char = CHARSET_ISO_FINAL_CHAR (charset); \
1009 char *intermediate_char_94 = "()*+"; \
1010 char *intermediate_char_96 = ",-./"; \
1011 Lisp_Object temp \
1012 = Fassq (make_number (charset), Vcharset_revision_alist); \
1013 if (! NILP (temp)) \
1014 { \
1015 *dst++ = ISO_CODE_ESC; \
1016 *dst++ = '&'; \
1017 *dst++ = XINT (XCONS (temp)->cdr) + '@'; \
1018 } \
1019 *dst++ = ISO_CODE_ESC; \
1020 if (CHARSET_DIMENSION (charset) == 1) \
1021 { \
1022 if (CHARSET_CHARS (charset) == 94) \
1023 *dst++ = (unsigned char) (intermediate_char_94[reg]); \
1024 else \
1025 *dst++ = (unsigned char) (intermediate_char_96[reg]); \
1026 } \
1027 else \
1028 { \
1029 *dst++ = '$'; \
1030 if (CHARSET_CHARS (charset) == 94) \
1031 { \
1032 if (! (coding->flags & CODING_FLAG_ISO_SHORT_FORM) \
1033 || reg != 0 \
1034 || final_char < '@' || final_char > 'B') \
1035 *dst++ = (unsigned char) (intermediate_char_94[reg]); \
1036 } \
1037 else \
1038 *dst++ = (unsigned char) (intermediate_char_96[reg]); \
1039 } \
1040 *dst++ = final_char; \
1041 CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset; \
1042 } while (0)
1043
1044/* The following two macros produce codes (control character or escape
1045 sequence) for ISO2022 single-shift functions (single-shift-2 and
1046 single-shift-3). */
1047
1048#define ENCODE_SINGLE_SHIFT_2 \
1049 do { \
1050 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
1051 *dst++ = ISO_CODE_ESC, *dst++ = 'N'; \
1052 else \
1053 *dst++ = ISO_CODE_SS2; \
1054 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1; \
1055 } while (0)
1056
1057#define ENCODE_SINGLE_SHIFT_3 \
1058 do { \
1059 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
1060 *dst++ = ISO_CODE_ESC, *dst++ = 'O'; \
1061 else \
1062 *dst++ = ISO_CODE_SS3; \
1063 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1; \
1064 } while (0)
1065
1066/* The following four macros produce codes (control character or
1067 escape sequence) for ISO2022 locking-shift functions (shift-in,
1068 shift-out, locking-shift-2, and locking-shift-3). */
1069
1070#define ENCODE_SHIFT_IN \
1071 do { \
1072 *dst++ = ISO_CODE_SI; \
1073 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0; \
1074 } while (0)
1075
1076#define ENCODE_SHIFT_OUT \
1077 do { \
1078 *dst++ = ISO_CODE_SO; \
1079 CODING_SPEC_ISO_INVOCATION (coding, 0) = 1; \
1080 } while (0)
1081
1082#define ENCODE_LOCKING_SHIFT_2 \
1083 do { \
1084 *dst++ = ISO_CODE_ESC, *dst++ = 'n'; \
1085 CODING_SPEC_ISO_INVOCATION (coding, 0) = 2; \
1086 } while (0)
1087
1088#define ENCODE_LOCKING_SHIFT_3 \
1089 do { \
1090 *dst++ = ISO_CODE_ESC, *dst++ = 'o'; \
1091 CODING_SPEC_ISO_INVOCATION (coding, 0) = 3; \
1092 } while (0)
1093
1094/* Produce codes for a DIMENSION1 character of which character set is
1095 CHARSET and position-code is C1. Designation and invocation
1096 sequences are also produced in advance if necessary. */
1097
1098
1099#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1) \
1100 do { \
1101 if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding)) \
1102 { \
1103 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
1104 *dst++ = c1 & 0x7F; \
1105 else \
1106 *dst++ = c1 | 0x80; \
1107 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0; \
1108 break; \
1109 } \
1110 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0)) \
1111 { \
1112 *dst++ = c1 & 0x7F; \
1113 break; \
1114 } \
1115 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1)) \
1116 { \
1117 *dst++ = c1 | 0x80; \
1118 break; \
1119 } \
1120 else \
1121 /* Since CHARSET is not yet invoked to any graphic planes, we \
1122 must invoke it, or, at first, designate it to some graphic \
1123 register. Then repeat the loop to actually produce the \
1124 character. */ \
1125 dst = encode_invocation_designation (charset, coding, dst); \
1126 } while (1)
1127
1128/* Produce codes for a DIMENSION2 character of which character set is
1129 CHARSET and position-codes are C1 and C2. Designation and
1130 invocation codes are also produced in advance if necessary. */
1131
1132#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2) \
1133 do { \
1134 if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding)) \
1135 { \
1136 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
1137 *dst++ = c1 & 0x7F, *dst++ = c2 & 0x7F; \
1138 else \
1139 *dst++ = c1 | 0x80, *dst++ = c2 | 0x80; \
1140 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0; \
1141 break; \
1142 } \
1143 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0)) \
1144 { \
1145 *dst++ = c1 & 0x7F, *dst++= c2 & 0x7F; \
1146 break; \
1147 } \
1148 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1)) \
1149 { \
1150 *dst++ = c1 | 0x80, *dst++= c2 | 0x80; \
1151 break; \
1152 } \
1153 else \
1154 /* Since CHARSET is not yet invoked to any graphic planes, we \
1155 must invoke it, or, at first, designate it to some graphic \
1156 register. Then repeat the loop to actually produce the \
1157 character. */ \
1158 dst = encode_invocation_designation (charset, coding, dst); \
1159 } while (1)
1160
1161/* Produce designation and invocation codes at a place pointed by DST
1162 to use CHARSET. The element `spec.iso2022' of *CODING is updated.
1163 Return new DST. */
1164
1165unsigned char *
1166encode_invocation_designation (charset, coding, dst)
1167 int charset;
1168 struct coding_system *coding;
1169 unsigned char *dst;
1170{
1171 int reg; /* graphic register number */
1172
1173 /* At first, check designations. */
1174 for (reg = 0; reg < 4; reg++)
1175 if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
1176 break;
1177
1178 if (reg >= 4)
1179 {
1180 /* CHARSET is not yet designated to any graphic registers. */
1181 /* At first check the requested designation. */
1182 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1183 if (reg < 0)
1184 /* Since CHARSET requests no special designation, designate to
1185 graphic register 0. */
1186 reg = 0;
1187
1188 ENCODE_DESIGNATION (charset, reg, coding);
1189 }
1190
1191 if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
1192 && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
1193 {
1194 /* Since the graphic register REG is not invoked to any graphic
1195 planes, invoke it to graphic plane 0. */
1196 switch (reg)
1197 {
1198 case 0: /* graphic register 0 */
1199 ENCODE_SHIFT_IN;
1200 break;
1201
1202 case 1: /* graphic register 1 */
1203 ENCODE_SHIFT_OUT;
1204 break;
1205
1206 case 2: /* graphic register 2 */
1207 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1208 ENCODE_SINGLE_SHIFT_2;
1209 else
1210 ENCODE_LOCKING_SHIFT_2;
1211 break;
1212
1213 case 3: /* graphic register 3 */
1214 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1215 ENCODE_SINGLE_SHIFT_3;
1216 else
1217 ENCODE_LOCKING_SHIFT_3;
1218 break;
1219 }
1220 }
1221 return dst;
1222}
1223
1224/* The following two macros produce codes for indicating composition. */
1225#define ENCODE_COMPOSITION_NO_RULE_START *dst++ = ISO_CODE_ESC, *dst++ = '0'
1226#define ENCODE_COMPOSITION_WITH_RULE_START *dst++ = ISO_CODE_ESC, *dst++ = '2'
1227#define ENCODE_COMPOSITION_END *dst++ = ISO_CODE_ESC, *dst++ = '1'
1228
1229/* The following three macros produce codes for indicating direction
1230 of text. */
1231#define ENCODE_CONTROL_SEQUENCE_INTRODUCER \
1232 do { \
1233 if (coding->flags == CODING_FLAG_ISO_SEVEN_BITS) \
1234 *dst++ = ISO_CODE_ESC, *dst++ = '['; \
1235 else \
1236 *dst++ = ISO_CODE_CSI; \
1237 } while (0)
1238
1239#define ENCODE_DIRECTION_R2L \
1240 ENCODE_CONTROL_SEQUENCE_INTRODUCER, *dst++ = '2', *dst++ = ']'
1241
1242#define ENCODE_DIRECTION_L2R \
1243 ENCODE_CONTROL_SEQUENCE_INTRODUCER, *dst++ = '0', *dst++ = ']'
1244
1245/* Produce codes for designation and invocation to reset the graphic
1246 planes and registers to initial state. */
1247#define ENCODE_RESET_PLANE_AND_REGISTER(eol) \
1248 do { \
1249 int reg; \
1250 if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0) \
1251 ENCODE_SHIFT_IN; \
1252 for (reg = 0; reg < 4; reg++) \
1253 { \
1254 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg) < 0) \
1255 { \
1256 if (eol) CODING_SPEC_ISO_DESIGNATION (coding, reg) = -1; \
1257 } \
1258 else if (CODING_SPEC_ISO_DESIGNATION (coding, reg) \
1259 != CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg)) \
1260 ENCODE_DESIGNATION \
1261 (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg), reg, coding); \
1262 } \
1263 } while (0)
1264
1265/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
1266
1267int
1268encode_coding_iso2022 (coding, source, destination,
1269 src_bytes, dst_bytes, consumed)
1270 struct coding_system *coding;
1271 unsigned char *source, *destination;
1272 int src_bytes, dst_bytes;
1273 int *consumed;
1274{
1275 unsigned char *src = source;
1276 unsigned char *src_end = source + src_bytes;
1277 unsigned char *dst = destination;
1278 unsigned char *dst_end = destination + dst_bytes;
1279 /* Since the maximum bytes produced by each loop is 6, we subtract 5
1280 from DST_END to assure overflow checking is necessary only at the
1281 head of loop. */
1282 unsigned char *adjusted_dst_end = dst_end - 5;
1283
1284 while (src < src_end && dst < adjusted_dst_end)
1285 {
1286 /* SRC_BASE remembers the start position in source in each loop.
1287 The loop will be exited when there's not enough source text
1288 to analyze multi-byte codes (within macros ONE_MORE_BYTE,
1289 TWO_MORE_BYTES, and THREE_MORE_BYTES). In that case, SRC is
1290 reset to SRC_BASE before exiting. */
1291 unsigned char *src_base = src;
1292 unsigned char c1 = *src++, c2, c3, c4;
1293 int charset;
1294
1295 /* If we are seeing a component of a composite character, we are
1296 seeing a leading-code specially encoded for composition, or a
1297 composition rule if composing with rule. We must set C1
1298 to a normal leading-code or an ASCII code. If we are not at
1299 a composed character, we must reset the composition state. */
1300 if (COMPOSING_P (coding->composing))
1301 {
1302 if (c1 < 0xA0)
1303 {
1304 /* We are not in a composite character any longer. */
1305 coding->composing = COMPOSING_NO;
1306 ENCODE_COMPOSITION_END;
1307 }
1308 else
1309 {
1310 if (coding->composing == COMPOSING_WITH_RULE_RULE)
1311 {
1312 *dst++ = c1 & 0x7F;
1313 coding->composing = COMPOSING_WITH_RULE_HEAD;
1314 continue;
1315 }
1316 else if (coding->composing == COMPOSING_WITH_RULE_HEAD)
1317 coding->composing = COMPOSING_WITH_RULE_RULE;
1318 if (c1 == 0xA0)
1319 {
1320 /* This is an ASCII component. */
1321 ONE_MORE_BYTE (c1);
1322 c1 &= 0x7F;
1323 }
1324 else
1325 /* This is a leading-code of non ASCII component. */
1326 c1 -= 0x20;
1327 }
1328 }
1329
1330 /* Now encode one character. C1 is a control character, an
1331 ASCII character, or a leading-code of multi-byte character. */
1332 switch (emacs_code_class[c1])
1333 {
1334 case EMACS_ascii_code:
1335 ENCODE_ISO_CHARACTER_DIMENSION1 (CHARSET_ASCII, c1);
1336 break;
1337
1338 case EMACS_control_code:
1339 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
1340 ENCODE_RESET_PLANE_AND_REGISTER (0);
1341 *dst++ = c1;
1342 break;
1343
1344 case EMACS_carriage_return_code:
1345 if (!coding->selective)
1346 {
1347 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
1348 ENCODE_RESET_PLANE_AND_REGISTER (0);
1349 *dst++ = c1;
1350 break;
1351 }
1352 /* fall down to treat '\r' as '\n' ... */
1353
1354 case EMACS_linefeed_code:
1355 if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
1356 ENCODE_RESET_PLANE_AND_REGISTER (1);
1357 if (coding->eol_type == CODING_EOL_LF
1358 || coding->eol_type == CODING_EOL_AUTOMATIC)
1359 *dst++ = ISO_CODE_LF;
1360 else if (coding->eol_type == CODING_EOL_CRLF)
1361 *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
1362 else
1363 *dst++ = ISO_CODE_CR;
1364 break;
1365
1366 case EMACS_leading_code_2:
1367 ONE_MORE_BYTE (c2);
1368 ENCODE_ISO_CHARACTER_DIMENSION1 (c1, c2);
1369 break;
1370
1371 case EMACS_leading_code_3:
1372 TWO_MORE_BYTES (c2, c3);
1373 if (c1 < LEADING_CODE_PRIVATE_11)
1374 ENCODE_ISO_CHARACTER_DIMENSION2 (c1, c2, c3);
1375 else
1376 ENCODE_ISO_CHARACTER_DIMENSION1 (c2, c3);
1377 break;
1378
1379 case EMACS_leading_code_4:
1380 THREE_MORE_BYTES (c2, c3, c4);
1381 ENCODE_ISO_CHARACTER_DIMENSION2 (c2, c3, c4);
1382 break;
1383
1384 case EMACS_leading_code_composition:
1385 ONE_MORE_BYTE (c1);
1386 if (c1 == 0xFF)
1387 {
1388 coding->composing = COMPOSING_WITH_RULE_HEAD;
1389 ENCODE_COMPOSITION_WITH_RULE_START;
1390 }
1391 else
1392 {
1393 /* Rewind one byte because it is a character code of
1394 composition elements. */
1395 src--;
1396 coding->composing = COMPOSING_NO_RULE_HEAD;
1397 ENCODE_COMPOSITION_NO_RULE_START;
1398 }
1399 break;
1400
1401 case EMACS_invalid_code:
1402 *dst++ = c1;
1403 break;
1404 }
1405 continue;
1406 label_end_of_loop:
1407 coding->carryover_size = src - src_base;
1408 bcopy (src_base, coding->carryover, coding->carryover_size);
1409 src = src_base;
1410 break;
1411 }
1412
1413 /* If this is the last block of the text to be encoded, we must
1414 reset the state of graphic planes and registers to initial one.
1415 In addition, we had better just flush out all remaining codes in
1416 the text although they are not valid characters. */
1417 if (coding->last_block)
1418 {
1419 ENCODE_RESET_PLANE_AND_REGISTER (1);
1420 bcopy(src, dst, src_end - src);
1421 dst += (src_end - src);
1422 src = src_end;
1423 }
1424 *consumed = src - source;
1425 return dst - destination;
1426}
1427
1428
1429/*** 4. SJIS and BIG5 handlers ***/
1430
1431/* Although SJIS and BIG5 are not ISO's coding system, They are used
1432 quite widely. So, for the moment, Emacs supports them in the bare
1433 C code. But, in the future, they may be supported only by CCL. */
1434
1435/* SJIS is a coding system encoding three character sets: ASCII, right
1436 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
1437 as is. A character of charset katakana-jisx0201 is encoded by
1438 "position-code + 0x80". A character of charset japanese-jisx0208
1439 is encoded in 2-byte but two position-codes are divided and shifted
1440 so that it fit in the range below.
1441
1442 --- CODE RANGE of SJIS ---
1443 (character set) (range)
1444 ASCII 0x00 .. 0x7F
1445 KATAKANA-JISX0201 0xA0 .. 0xDF
1446 JISX0208 (1st byte) 0x80 .. 0x9F and 0xE0 .. 0xFF
1447 (2nd byte) 0x40 .. 0xFF
1448 -------------------------------
1449
1450*/
1451
1452/* BIG5 is a coding system encoding two character sets: ASCII and
1453 Big5. An ASCII character is encoded as is. Big5 is a two-byte
1454 character set and is encoded in two-byte.
1455
1456 --- CODE RANGE of BIG5 ---
1457 (character set) (range)
1458 ASCII 0x00 .. 0x7F
1459 Big5 (1st byte) 0xA1 .. 0xFE
1460 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
1461 --------------------------
1462
1463 Since the number of characters in Big5 is larger than maximum
1464 characters in Emacs' charset (96x96), it can't be handled as one
1465 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
1466 and `charset-big5-2'. Both are DIMENSION2 and CHARS94. The former
1467 contains frequently used characters and the latter contains less
1468 frequently used characters. */
1469
1470/* Macros to decode or encode a character of Big5 in BIG5. B1 and B2
1471 are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
1472 C1 and C2 are the 1st and 2nd position-codes of of Emacs' internal
1473 format. CHARSET is `charset_big5_1' or `charset_big5_2'. */
1474
1475/* Number of Big5 characters which have the same code in 1st byte. */
1476#define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
1477
1478#define DECODE_BIG5(b1, b2, charset, c1, c2) \
1479 do { \
1480 unsigned int temp \
1481 = (b1 - 0xA1) * BIG5_SAME_ROW + b2 - (b2 < 0x7F ? 0x40 : 0x62); \
1482 if (b1 < 0xC9) \
1483 charset = charset_big5_1; \
1484 else \
1485 { \
1486 charset = charset_big5_2; \
1487 temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW; \
1488 } \
1489 c1 = temp / (0xFF - 0xA1) + 0x21; \
1490 c2 = temp % (0xFF - 0xA1) + 0x21; \
1491 } while (0)
1492
1493#define ENCODE_BIG5(charset, c1, c2, b1, b2) \
1494 do { \
1495 unsigned int temp = (c1 - 0x21) * (0xFF - 0xA1) + (c2 - 0x21); \
1496 if (charset == charset_big5_2) \
1497 temp += BIG5_SAME_ROW * (0xC9 - 0xA1); \
1498 b1 = temp / BIG5_SAME_ROW + 0xA1; \
1499 b2 = temp % BIG5_SAME_ROW; \
1500 b2 += b2 < 0x3F ? 0x40 : 0x62; \
1501 } while (0)
1502
1503/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1504 Check if a text is encoded in SJIS. If it is, return
1505 CODING_CATEGORY_MASK_SJIS, else return 0. */
1506
1507int
1508detect_coding_sjis (src, src_end)
1509 unsigned char *src, *src_end;
1510{
1511 unsigned char c;
1512
1513 while (src < src_end)
1514 {
1515 c = *src++;
1516 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
1517 return 0;
1518 if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
1519 {
1520 if (src < src_end && *src++ < 0x40)
1521 return 0;
1522 }
1523 }
1524 return CODING_CATEGORY_MASK_SJIS;
1525}
1526
1527/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1528 Check if a text is encoded in BIG5. If it is, return
1529 CODING_CATEGORY_MASK_BIG5, else return 0. */
1530
1531int
1532detect_coding_big5 (src, src_end)
1533 unsigned char *src, *src_end;
1534{
1535 unsigned char c;
1536
1537 while (src < src_end)
1538 {
1539 c = *src++;
1540 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
1541 return 0;
1542 if (c >= 0xA1)
1543 {
1544 if (src >= src_end)
1545 break;
1546 c = *src++;
1547 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
1548 return 0;
1549 }
1550 }
1551 return CODING_CATEGORY_MASK_BIG5;
1552}
1553
1554/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
1555 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
1556
1557int
1558decode_coding_sjis_big5 (coding, source, destination,
1559 src_bytes, dst_bytes, consumed, sjis_p)
1560 struct coding_system *coding;
1561 unsigned char *source, *destination;
1562 int src_bytes, dst_bytes;
1563 int *consumed;
1564 int sjis_p;
1565{
1566 unsigned char *src = source;
1567 unsigned char *src_end = source + src_bytes;
1568 unsigned char *dst = destination;
1569 unsigned char *dst_end = destination + dst_bytes;
1570 /* Since the maximum bytes produced by each loop is 4, we subtract 3
1571 from DST_END to assure overflow checking is necessary only at the
1572 head of loop. */
1573 unsigned char *adjusted_dst_end = dst_end - 3;
1574
1575 while (src < src_end && dst < adjusted_dst_end)
1576 {
1577 /* SRC_BASE remembers the start position in source in each loop.
1578 The loop will be exited when there's not enough source text
1579 to analyze two-byte character (within macro ONE_MORE_BYTE).
1580 In that case, SRC is reset to SRC_BASE before exiting. */
1581 unsigned char *src_base = src;
1582 unsigned char c1 = *src++, c2, c3, c4;
1583
1584 if (c1 == '\r')
1585 {
1586 if (coding->eol_type == CODING_EOL_CRLF)
1587 {
1588 ONE_MORE_BYTE (c2);
1589 if (c2 == '\n')
1590 *dst++ = c2;
1591 else
1592 /* To process C2 again, SRC is subtracted by 1. */
1593 *dst++ = c1, src--;
1594 }
1595 else
1596 *dst++ = c1;
1597 }
1598 else if (c1 < 0x80)
1599 *dst++ = c1;
1600 else if (c1 < 0xA0 || c1 >= 0xE0)
1601 {
1602 /* SJIS -> JISX0208, BIG5 -> Big5 (only if 0xE0 <= c1 < 0xFF) */
1603 if (sjis_p)
1604 {
1605 ONE_MORE_BYTE (c2);
1606 DECODE_SJIS (c1, c2, c3, c4);
1607 DECODE_CHARACTER_DIMENSION2 (charset_jisx0208, c3, c4);
1608 }
1609 else if (c1 >= 0xE0 && c1 < 0xFF)
1610 {
1611 int charset;
1612
1613 ONE_MORE_BYTE (c2);
1614 DECODE_BIG5 (c1, c2, charset, c3, c4);
1615 DECODE_CHARACTER_DIMENSION2 (charset, c3, c4);
1616 }
1617 else /* Invalid code */
1618 *dst++ = c1;
1619 }
1620 else
1621 {
1622 /* SJIS -> JISX0201-Kana, BIG5 -> Big5 */
1623 if (sjis_p)
1624 DECODE_CHARACTER_DIMENSION1 (charset_katakana_jisx0201, c1);
1625 else
1626 {
1627 int charset;
1628
1629 ONE_MORE_BYTE (c2);
1630 DECODE_BIG5 (c1, c2, charset, c3, c4);
1631 DECODE_CHARACTER_DIMENSION2 (charset, c3, c4);
1632 }
1633 }
1634 continue;
1635
1636 label_end_of_loop:
1637 coding->carryover_size = src - src_base;
1638 bcopy (src_base, coding->carryover, coding->carryover_size);
1639 src = src_base;
1640 break;
1641 }
1642
1643 *consumed = src - source;
1644 return dst - destination;
1645}
1646
1647/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
1648 This function can encode `charset_ascii', `charset_katakana_jisx0201',
1649 `charset_jisx0208', `charset_big5_1', and `charset_big5-2'. We are
1650 sure that all these charsets are registered as official charset
1651 (i.e. do not have extended leading-codes). Characters of other
1652 charsets are produced without any encoding. If SJIS_P is 1, encode
1653 SJIS text, else encode BIG5 text. */
1654
1655int
1656encode_coding_sjis_big5 (coding, source, destination,
1657 src_bytes, dst_bytes, consumed, sjis_p)
1658 struct coding_system *coding;
1659 unsigned char *source, *destination;
1660 int src_bytes, dst_bytes;
1661 int *consumed;
1662 int sjis_p;
1663{
1664 unsigned char *src = source;
1665 unsigned char *src_end = source + src_bytes;
1666 unsigned char *dst = destination;
1667 unsigned char *dst_end = destination + dst_bytes;
1668 /* Since the maximum bytes produced by each loop is 2, we subtract 1
1669 from DST_END to assure overflow checking is necessary only at the
1670 head of loop. */
1671 unsigned char *adjusted_dst_end = dst_end - 1;
1672
1673 while (src < src_end && dst < adjusted_dst_end)
1674 {
1675 /* SRC_BASE remembers the start position in source in each loop.
1676 The loop will be exited when there's not enough source text
1677 to analyze multi-byte codes (within macros ONE_MORE_BYTE and
1678 TWO_MORE_BYTES). In that case, SRC is reset to SRC_BASE
1679 before exiting. */
1680 unsigned char *src_base = src;
1681 unsigned char c1 = *src++, c2, c3, c4;
1682
1683 if (coding->composing)
1684 {
1685 if (c1 == 0xA0)
1686 {
1687 ONE_MORE_BYTE (c1);
1688 c1 &= 0x7F;
1689 }
1690 else if (c1 >= 0xA0)
1691 c1 -= 0x20;
1692 else
1693 coding->composing = 0;
1694 }
1695
1696 switch (emacs_code_class[c1])
1697 {
1698 case EMACS_ascii_code:
1699 case EMACS_control_code:
1700 *dst++ = c1;
1701 break;
1702
1703 case EMACS_carriage_return_code:
1704 if (!coding->selective)
1705 {
1706 *dst++ = c1;
1707 break;
1708 }
1709 /* fall down to treat '\r' as '\n' ... */
1710
1711 case EMACS_linefeed_code:
1712 if (coding->eol_type == CODING_EOL_LF
1713 || coding->eol_type == CODING_EOL_AUTOMATIC)
1714 *dst++ = '\n';
1715 else if (coding->eol_type == CODING_EOL_CRLF)
1716 *dst++ = '\r', *dst++ = '\n';
1717 else
1718 *dst++ = '\r';
1719 break;
1720
1721 case EMACS_leading_code_2:
1722 ONE_MORE_BYTE (c2);
1723 if (sjis_p && c1 == charset_katakana_jisx0201)
1724 *dst++ = c2;
1725 else
1726 *dst++ = c1, *dst++ = c2;
1727 break;
1728
1729 case EMACS_leading_code_3:
1730 TWO_MORE_BYTES (c2, c3);
1731 c2 &= 0x7F, c3 &= 0x7F;
1732 if (sjis_p && c1 == charset_jisx0208)
1733 {
1734 unsigned char s1, s2;
1735
1736 ENCODE_SJIS (c2, c3, s1, s2);
1737 *dst++ = s1, *dst++ = s2;
1738 }
1739 else if (!sjis_p && (c1 == charset_big5_1 || c1 == charset_big5_2))
1740 {
1741 unsigned char b1, b2;
1742
1743 ENCODE_BIG5 (c1, c2, c3, b1, b2);
1744 *dst++ = b1, *dst++ = b2;
1745 }
1746 else
1747 *dst++ = c1, *dst++ = c2, *dst++ = c3;
1748 break;
1749
1750 case EMACS_leading_code_4:
1751 THREE_MORE_BYTES (c2, c3, c4);
1752 *dst++ = c1, *dst++ = c2, *dst++ = c3, *dst++ = c4;
1753 break;
1754
1755 case EMACS_leading_code_composition:
1756 coding->composing = 1;
1757 break;
1758
1759 default: /* i.e. case EMACS_invalid_code: */
1760 *dst++ = c1;
1761 }
1762 continue;
1763
1764 label_end_of_loop:
1765 coding->carryover_size = src - src_base;
1766 bcopy (src_base, coding->carryover, coding->carryover_size);
1767 src = src_base;
1768 break;
1769 }
1770
1771 *consumed = src - source;
1772 return dst - destination;
1773}
1774
1775
1776/*** 5. End-of-line handlers ***/
1777
1778/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
1779 This function is called only when `coding->eol_type' is
1780 CODING_EOL_CRLF or CODING_EOL_CR. */
1781
1782decode_eol (coding, source, destination, src_bytes, dst_bytes, consumed)
1783 struct coding_system *coding;
1784 unsigned char *source, *destination;
1785 int src_bytes, dst_bytes;
1786 int *consumed;
1787{
1788 unsigned char *src = source;
1789 unsigned char *src_end = source + src_bytes;
1790 unsigned char *dst = destination;
1791 unsigned char *dst_end = destination + dst_bytes;
1792 int produced;
1793
1794 switch (coding->eol_type)
1795 {
1796 case CODING_EOL_CRLF:
1797 {
1798 /* Since the maximum bytes produced by each loop is 2, we
1799 subtract 1 from DST_END to assure overflow checking is
1800 necessary only at the head of loop. */
1801 unsigned char *adjusted_dst_end = dst_end - 1;
1802
1803 while (src < src_end && dst < adjusted_dst_end)
1804 {
1805 unsigned char *src_base = src;
1806 unsigned char c = *src++;
1807 if (c == '\r')
1808 {
1809 ONE_MORE_BYTE (c);
1810 if (c != '\n')
1811 *dst++ = '\r';
1812
1813 }
1814 else
1815 *dst++ = c;
1816 continue;
1817
1818 label_end_of_loop:
1819 coding->carryover_size = src - src_base;
1820 bcopy (src_base, coding->carryover, coding->carryover_size);
1821 src = src_base;
1822 break;
1823 }
1824 *consumed = src - source;
1825 produced = dst - destination;
1826 break;
1827 }
1828
1829 case CODING_EOL_CR:
1830 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
1831 bcopy (source, destination, produced);
1832 dst_end = destination + produced;
1833 while (dst < dst_end)
1834 if (*dst++ == '\r') dst[-1] = '\n';
1835 *consumed = produced;
1836 break;
1837
1838 default: /* i.e. case: CODING_EOL_LF */
1839 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
1840 bcopy (source, destination, produced);
1841 *consumed = produced;
1842 break;
1843 }
1844
1845 return produced;
1846}
1847
1848/* See "GENERAL NOTES about `encode_coding_XXX ()' functions". Encode
1849 format of end-of-line according to `coding->eol_type'. If
1850 `coding->selective' is 1, code '\r' in source text also means
1851 end-of-line. */
1852
1853encode_eol (coding, source, destination, src_bytes, dst_bytes, consumed)
1854 struct coding_system *coding;
1855 unsigned char *source, *destination;
1856 int src_bytes, dst_bytes;
1857 int *consumed;
1858{
1859 unsigned char *src = source;
1860 unsigned char *dst = destination;
1861 int produced;
1862
1863 if (src_bytes <= 0)
1864 return 0;
1865
1866 switch (coding->eol_type)
1867 {
1868 case CODING_EOL_LF:
1869 case CODING_EOL_AUTOMATIC:
1870 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
1871 bcopy (source, destination, produced);
1872 if (coding->selective)
1873 {
1874 int i = produced;
1875 while (i--)
1876 if (*dst++ == '\r') dst[-1] = '\n';
1877 }
1878 *consumed = produced;
1879
1880 case CODING_EOL_CRLF:
1881 {
1882 unsigned char c;
1883 unsigned char *src_end = source + src_bytes;
1884 unsigned char *dst_end = destination + dst_bytes;
1885 /* Since the maximum bytes produced by each loop is 2, we
1886 subtract 1 from DST_END to assure overflow checking is
1887 necessary only at the head of loop. */
1888 unsigned char *adjusted_dst_end = dst_end - 1;
1889
1890 while (src < src_end && dst < adjusted_dst_end)
1891 {
1892 c = *src++;
1893 if (c == '\n' || (c == '\r' && coding->selective))
1894 *dst++ = '\r', *dst++ = '\n';
1895 else
1896 *dst++ = c;
1897 }
1898 produced = dst - destination;
1899 *consumed = src - source;
1900 break;
1901 }
1902
1903 default: /* i.e. case CODING_EOL_CR: */
1904 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
1905 bcopy (source, destination, produced);
1906 {
1907 int i = produced;
1908 while (i--)
1909 if (*dst++ == '\n') dst[-1] = '\r';
1910 }
1911 *consumed = produced;
1912 }
1913
1914 return produced;
1915}
1916
1917
1918/*** 6. C library functions ***/
1919
1920/* In Emacs Lisp, coding system is represented by a Lisp symbol which
1921 has a property `coding-system'. The value of this property is a
1922 vector of length 5 (called as coding-vector). Among elements of
1923 this vector, the first (element[0]) and the fifth (element[4])
1924 carry important information for decoding/encoding. Before
1925 decoding/encoding, this information should be set in fields of a
1926 structure of type `coding_system'.
1927
1928 A value of property `coding-system' can be a symbol of another
1929 subsidiary coding-system. In that case, Emacs gets coding-vector
1930 from that symbol.
1931
1932 `element[0]' contains information to be set in `coding->type'. The
1933 value and its meaning is as follows:
1934
1935 0 -- coding_system_internal
1936 1 -- coding_system_sjis
1937 2 -- coding_system_iso2022
1938 3 -- coding_system_big5
1939 4 -- coding_system_ccl
1940 nil -- coding_system_no_conversion
1941 t -- coding_system_automatic
1942
1943 `element[4]' contains information to be set in `coding->flags' and
1944 `coding->spec'. The meaning varies by `coding->type'.
1945
1946 If `coding->type' is `coding_type_iso2022', element[4] is a vector
1947 of length 32 (of which the first 13 sub-elements are used now).
1948 Meanings of these sub-elements are:
1949
1950 sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
1951 If the value is an integer of valid charset, the charset is
1952 assumed to be designated to graphic register N initially.
1953
1954 If the value is minus, it is a minus value of charset which
1955 reserves graphic register N, which means that the charset is
1956 not designated initially but should be designated to graphic
1957 register N just before encoding a character in that charset.
1958
1959 If the value is nil, graphic register N is never used on
1960 encoding.
1961
1962 sub-element[N] where N is 4 through 11: to be set in `coding->flags'
1963 Each value takes t or nil. See the section ISO2022 of
1964 `coding.h' for more information.
1965
1966 If `coding->type' is `coding_type_big5', element[4] is t to denote
1967 BIG5-ETen or nil to denote BIG5-HKU.
1968
1969 If `coding->type' takes the other value, element[4] is ignored.
1970
1971 Emacs Lisp's coding system also carries information about format of
1972 end-of-line in a value of property `eol-type'. If the value is
1973 integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
1974 means CODING_EOL_CR. If it is not integer, it should be a vector
1975 of subsidiary coding systems of which property `eol-type' has one
1976 of above values.
1977
1978*/
1979
1980/* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
1981 and set it in CODING. If CODING_SYSTEM_SYMBOL is invalid, CODING
1982 is setup so that no conversion is necessary and return -1, else
1983 return 0. */
1984
1985int
1986setup_coding_system (coding_system_symbol, coding)
1987 Lisp_Object coding_system_symbol;
1988 struct coding_system *coding;
1989{
1990 Lisp_Object coding_system_vector = Qnil;
1991 Lisp_Object type, eol_type;
1992
1993 /* At first, set several fields default values. */
1994 coding->require_flushing = 0;
1995 coding->last_block = 0;
1996 coding->selective = 0;
1997 coding->composing = 0;
1998 coding->direction = 0;
1999 coding->carryover_size = 0;
2000 coding->symbol = Qnil;
2001 coding->post_read_conversion = coding->pre_write_conversion = Qnil;
2002
2003 /* Get value of property `coding-system'. If it is a Lisp symbol
2004 pointing another coding system, fetch its property until we get a
2005 vector. */
2006 while (!NILP (coding_system_symbol))
2007 {
2008 coding->symbol = coding_system_symbol;
2009 if (NILP (coding->post_read_conversion))
2010 coding->post_read_conversion = Fget (coding_system_symbol,
2011 Qpost_read_conversion);
2012 if (NILP (coding->pre_write_conversion))
2013 coding->pre_write_conversion = Fget (coding_system_symbol,
2014 Qpre_write_conversion);
2015
2016 coding_system_vector = Fget (coding_system_symbol, Qcoding_system);
2017 if (VECTORP (coding_system_vector))
2018 break;
2019 coding_system_symbol = coding_system_vector;
2020 }
2021 Vlast_coding_system_used = coding->symbol;
2022
2023 if (!VECTORP (coding_system_vector)
2024 || XVECTOR (coding_system_vector)->size != 5)
2025 goto label_invalid_coding_system;
2026
2027 /* Get value of property `eol-type' by searching from the root
2028 coding-system. */
2029 coding_system_symbol = coding->symbol;
2030 eol_type = Qnil;
2031 while (SYMBOLP (coding_system_symbol) && !NILP (coding_system_symbol))
2032 {
2033 eol_type = Fget (coding_system_symbol, Qeol_type);
2034 if (!NILP (eol_type))
2035 break;
2036 coding_system_symbol = Fget (coding_system_symbol, Qcoding_system);
2037 }
2038
2039 if (VECTORP (eol_type))
2040 coding->eol_type = CODING_EOL_AUTOMATIC;
2041 else if (XFASTINT (eol_type) == 1)
2042 coding->eol_type = CODING_EOL_CRLF;
2043 else if (XFASTINT (eol_type) == 2)
2044 coding->eol_type = CODING_EOL_CR;
2045 else
2046 coding->eol_type = CODING_EOL_LF;
2047
2048 type = XVECTOR (coding_system_vector)->contents[0];
2049 switch (XFASTINT (type))
2050 {
2051 case 0:
2052 coding->type = coding_type_internal;
2053 break;
2054
2055 case 1:
2056 coding->type = coding_type_sjis;
2057 break;
2058
2059 case 2:
2060 coding->type = coding_type_iso2022;
2061 {
2062 Lisp_Object val = XVECTOR (coding_system_vector)->contents[4];
2063 Lisp_Object *flags;
2064 int i, charset, default_reg_bits = 0;
2065
2066 if (!VECTORP (val) || XVECTOR (val)->size != 32)
2067 goto label_invalid_coding_system;
2068
2069 flags = XVECTOR (val)->contents;
2070 coding->flags
2071 = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
2072 | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
2073 | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
2074 | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
2075 | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
2076 | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
2077 | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
2078 | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
2079 | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION));
2080
2081 /* Invoke graphic register 0 to plane 0. */
2082 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
2083 /* Invoke graphic register 1 to plane 1 if we can use full 8-bit. */
2084 CODING_SPEC_ISO_INVOCATION (coding, 1)
2085 = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
2086 /* Not single shifting at first. */
2087 CODING_SPEC_ISO_SINGLE_SHIFTING(coding) = 0;
2088
2089 /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
2090 FLAGS[REG] can be one of below:
2091 integer CHARSET: CHARSET occupies register I,
2092 t: designate nothing to REG initially, but can be used
2093 by any charsets,
2094 list of integer, nil, or t: designate the first
2095 element (if integer) to REG initially, the remaining
2096 elements (if integer) is designated to REG on request,
2097 if an element is t, REG can be used by any charset,
2098 nil: REG is never used. */
2099 for (charset = 0; charset < MAX_CHARSET; charset++)
2100 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = -1;
2101 for (i = 0; i < 4; i++)
2102 {
2103 if (INTEGERP (flags[i])
2104 && (charset = XINT (flags[i]), CHARSET_VALID_P (charset)))
2105 {
2106 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
2107 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
2108 }
2109 else if (EQ (flags[i], Qt))
2110 {
2111 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2112 default_reg_bits |= 1 << i;
2113 }
2114 else if (CONSP (flags[i]))
2115 {
2116 Lisp_Object tail = flags[i];
2117
2118 if (INTEGERP (XCONS (tail)->car)
2119 && (charset = XINT (XCONS (tail)->car),
2120 CHARSET_VALID_P (charset)))
2121 {
2122 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
2123 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
2124 }
2125 else
2126 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2127 tail = XCONS (tail)->cdr;
2128 while (CONSP (tail))
2129 {
2130 if (INTEGERP (XCONS (tail)->car)
2131 && (charset = XINT (XCONS (tail)->car),
2132 CHARSET_VALID_P (charset)))
2133 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2134 = i;
2135 else if (EQ (XCONS (tail)->car, Qt))
2136 default_reg_bits |= 1 << i;
2137 tail = XCONS (tail)->cdr;
2138 }
2139 }
2140 else
2141 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2142
2143 CODING_SPEC_ISO_DESIGNATION (coding, i)
2144 = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
2145 }
2146
2147 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
2148 {
2149 /* REG 1 can be used only by locking shift in 7-bit env. */
2150 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
2151 default_reg_bits &= ~2;
2152 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
2153 /* Without any shifting, only REG 0 and 1 can be used. */
2154 default_reg_bits &= 3;
2155 }
2156
2157 for (charset = 0; charset < MAX_CHARSET; charset++)
2158 if (CHARSET_VALID_P (charset)
2159 && CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) < 0)
2160 {
2161 /* We have not yet decided where to designate CHARSET. */
2162 int reg_bits = default_reg_bits;
2163
2164 if (CHARSET_CHARS (charset) == 96)
2165 /* A charset of CHARS96 can't be designated to REG 0. */
2166 reg_bits &= ~1;
2167
2168 if (reg_bits)
2169 /* There exist some default graphic register. */
2170 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2171 = (reg_bits & 1
2172 ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
2173 else
2174 /* We anyway have to designate CHARSET to somewhere. */
2175 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2176 = (CHARSET_CHARS (charset) == 94
2177 ? 0
2178 : ((coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT
2179 || ! coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
2180 ? 1
2181 : (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT
2182 ? 2 : 0)));
2183 }
2184 }
2185 coding->require_flushing = 1;
2186 break;
2187
2188 case 3:
2189 coding->type = coding_type_big5;
2190 coding->flags
2191 = (NILP (XVECTOR (coding_system_vector)->contents[4])
2192 ? CODING_FLAG_BIG5_HKU
2193 : CODING_FLAG_BIG5_ETEN);
2194 break;
2195
2196 case 4:
2197 coding->type = coding_type_ccl;
2198 {
2199 Lisp_Object val = XVECTOR (coding_system_vector)->contents[4];
2200 if (CONSP (val)
2201 && VECTORP (XCONS (val)->car)
2202 && VECTORP (XCONS (val)->cdr))
2203 {
2204 setup_ccl_program (&(coding->spec.ccl.decoder), XCONS (val)->car);
2205 setup_ccl_program (&(coding->spec.ccl.encoder), XCONS (val)->cdr);
2206 }
2207 else
2208 goto label_invalid_coding_system;
2209 }
2210 coding->require_flushing = 1;
2211 break;
2212
2213 default:
2214 if (EQ (type, Qt))
2215 coding->type = coding_type_automatic;
2216 else
2217 coding->type = coding_type_no_conversion;
2218 break;
2219 }
2220 return 0;
2221
2222 label_invalid_coding_system:
2223 coding->type = coding_type_no_conversion;
2224 return -1;
2225}
2226
2227/* Emacs has a mechanism to automatically detect a coding system if it
2228 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
2229 it's impossible to distinguish some coding systems accurately
2230 because they use the same range of codes. So, at first, coding
2231 systems are categorized into 7, those are:
2232
2233 o coding-category-internal
2234
2235 The category for a coding system which has the same code range
2236 as Emacs' internal format. Assigned the coding-system (Lisp
2237 symbol) `coding-system-internal' by default.
2238
2239 o coding-category-sjis
2240
2241 The category for a coding system which has the same code range
2242 as SJIS. Assigned the coding-system (Lisp
2243 symbol) `coding-system-sjis' by default.
2244
2245 o coding-category-iso-7
2246
2247 The category for a coding system which has the same code range
2248 as ISO2022 of 7-bit environment. Assigned the coding-system
2249 (Lisp symbol) `coding-system-junet' by default.
2250
2251 o coding-category-iso-8-1
2252
2253 The category for a coding system which has the same code range
2254 as ISO2022 of 8-bit environment and graphic plane 1 used only
2255 for DIMENSION1 charset. Assigned the coding-system (Lisp
2256 symbol) `coding-system-ctext' by default.
2257
2258 o coding-category-iso-8-2
2259
2260 The category for a coding system which has the same code range
2261 as ISO2022 of 8-bit environment and graphic plane 1 used only
2262 for DIMENSION2 charset. Assigned the coding-system (Lisp
2263 symbol) `coding-system-euc-japan' by default.
2264
2265 o coding-category-iso-else
2266
2267 The category for a coding system which has the same code range
2268 as ISO2022 but not belongs to any of the above three
2269 categories. Assigned the coding-system (Lisp symbol)
2270 `coding-system-iso-2022-ss2-7' by default.
2271
2272 o coding-category-big5
2273
2274 The category for a coding system which has the same code range
2275 as BIG5. Assigned the coding-system (Lisp symbol)
2276 `coding-system-big5' by default.
2277
2278 o coding-category-binary
2279
2280 The category for a coding system not categorized in any of the
2281 above. Assigned the coding-system (Lisp symbol)
2282 `coding-system-noconv' by default.
2283
2284 Each of them is a Lisp symbol and the value is an actual
2285 `coding-system's (this is also a Lisp symbol) assigned by a user.
2286 What Emacs does actually is to detect a category of coding system.
2287 Then, it uses a `coding-system' assigned to it. If Emacs can't
2288 decide only one possible category, it selects a category of the
2289 highest priority. Priorities of categories are also specified by a
2290 user in a Lisp variable `coding-category-list'.
2291
2292*/
2293
2294/* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
2295 If it detects possible coding systems, return an integer in which
2296 appropriate flag bits are set. Flag bits are defined by macros
2297 CODING_CATEGORY_MASK_XXX in `coding.h'. */
2298
2299int
2300detect_coding_mask (src, src_bytes)
2301 unsigned char *src;
2302 int src_bytes;
2303{
2304 register unsigned char c;
2305 unsigned char *src_end = src + src_bytes;
2306 int mask;
2307
2308 /* At first, skip all ASCII characters and control characters except
2309 for three ISO2022 specific control characters. */
2310 while (src < src_end)
2311 {
2312 c = *src;
2313 if (c >= 0x80
2314 || (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
2315 break;
2316 src++;
2317 }
2318
2319 if (src >= src_end)
2320 /* We found nothing other than ASCII. There's nothing to do. */
2321 return CODING_CATEGORY_MASK_ANY;
2322
2323 /* The text seems to be encoded in some multilingual coding system.
2324 Now, try to find in which coding system the text is encoded. */
2325 if (c < 0x80)
2326 /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
2327 /* C is an ISO2022 specific control code of C0. */
2328 mask = detect_coding_iso2022 (src, src_end);
2329
2330 else if (c == ISO_CODE_SS2 || c == ISO_CODE_SS3 || c == ISO_CODE_CSI)
2331 /* C is an ISO2022 specific control code of C1,
2332 or the first byte of SJIS's 2-byte character code,
2333 or a leading code of Emacs. */
2334 mask = (detect_coding_iso2022 (src, src_end)
2335 | detect_coding_sjis (src, src_end)
2336 | detect_coding_internal (src, src_end));
2337
2338 else if (c < 0xA0)
2339 /* C is the first byte of SJIS character code,
2340 or a leading-code of Emacs. */
2341 mask = (detect_coding_sjis (src, src_end)
2342 | detect_coding_internal (src, src_end));
2343
2344 else
2345 /* C is a character of ISO2022 in graphic plane right,
2346 or a SJIS's 1-byte character code (i.e. JISX0201),
2347 or the first byte of BIG5's 2-byte code. */
2348 mask = (detect_coding_iso2022 (src, src_end)
2349 | detect_coding_sjis (src, src_end)
2350 | detect_coding_big5 (src, src_end));
2351
2352 return mask;
2353}
2354
2355/* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
2356 The information of the detected coding system is set in CODING. */
2357
2358void
2359detect_coding (coding, src, src_bytes)
2360 struct coding_system *coding;
2361 unsigned char *src;
2362 int src_bytes;
2363{
2364 int mask = detect_coding_mask (src, src_bytes);
2365 int idx;
2366
2367 if (mask == CODING_CATEGORY_MASK_ANY)
2368 /* We found nothing other than ASCII. There's nothing to do. */
2369 return;
2370
2371 if (!mask)
2372 /* The source text seems to be encoded in unknown coding system.
2373 Emacs regards the category of such a kind of coding system as
2374 `coding-category-binary'. We assume that a user has assigned
2375 an appropriate coding system for a `coding-category-binary'. */
2376 idx = CODING_CATEGORY_IDX_BINARY;
2377 else
2378 {
2379 /* We found some plausible coding systems. Let's use a coding
2380 system of the highest priority. */
2381 Lisp_Object val = Vcoding_category_list;
2382
2383 if (CONSP (val))
2384 while (!NILP (val))
2385 {
2386 idx = XFASTINT (Fget (XCONS (val)->car, Qcoding_category_index));
2387 if ((idx < CODING_CATEGORY_IDX_MAX) && (mask & (1 << idx)))
2388 break;
2389 val = XCONS (val)->cdr;
2390 }
2391 else
2392 val = Qnil;
2393
2394 if (NILP (val))
2395 {
2396 /* For unknown reason, `Vcoding_category_list' contains none
2397 of found categories. Let's use any of them. */
2398 for (idx = 0; idx < CODING_CATEGORY_IDX_MAX; idx++)
2399 if (mask & (1 << idx))
2400 break;
2401 }
2402 }
2403 setup_coding_system (XSYMBOL (coding_category_table[idx])->value, coding);
2404}
2405
2406/* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
2407 is encoded. Return one of CODING_EOL_LF, CODING_EOL_CRLF,
2408 CODING_EOL_CR, and CODING_EOL_AUTOMATIC. */
2409
2410int
2411detect_eol_type (src, src_bytes)
2412 unsigned char *src;
2413 int src_bytes;
2414{
2415 unsigned char *src_end = src + src_bytes;
2416 unsigned char c;
2417
2418 while (src < src_end)
2419 {
2420 c = *src++;
2421 if (c == '\n')
2422 return CODING_EOL_LF;
2423 else if (c == '\r')
2424 {
2425 if (src < src_end && *src == '\n')
2426 return CODING_EOL_CRLF;
2427 else
2428 return CODING_EOL_CR;
2429 }
2430 }
2431 return CODING_EOL_AUTOMATIC;
2432}
2433
2434/* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
2435 is encoded. If it detects an appropriate format of end-of-line, it
2436 sets the information in *CODING. */
2437
2438void
2439detect_eol (coding, src, src_bytes)
2440 struct coding_system *coding;
2441 unsigned char *src;
2442 int src_bytes;
2443{
2444 Lisp_Object val;
2445 int eol_type = detect_eol_type (src, src_bytes);
2446
2447 if (eol_type == CODING_EOL_AUTOMATIC)
2448 /* We found no end-of-line in the source text. */
2449 return;
2450
2451 val = Fget (coding->symbol, Qeol_type);
2452 if (VECTORP (val) && XVECTOR (val)->size == 3)
2453 setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
2454}
2455
2456/* See "GENERAL NOTES about `decode_coding_XXX ()' functions". Before
2457 decoding, it may detect coding system and format of end-of-line if
2458 those are not yet decided. */
2459
2460int
2461decode_coding (coding, source, destination, src_bytes, dst_bytes, consumed)
2462 struct coding_system *coding;
2463 unsigned char *source, *destination;
2464 int src_bytes, dst_bytes;
2465 int *consumed;
2466{
2467 int produced;
2468
2469 if (src_bytes <= 0)
2470 {
2471 *consumed = 0;
2472 return 0;
2473 }
2474
2475 if (coding->type == coding_type_automatic)
2476 detect_coding (coding, source, src_bytes);
2477
2478 if (coding->eol_type == CODING_EOL_AUTOMATIC)
2479 detect_eol (coding, source, src_bytes);
2480
2481 coding->carryover_size = 0;
2482 switch (coding->type)
2483 {
2484 case coding_type_no_conversion:
2485 label_no_conversion:
2486 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2487 bcopy (source, destination, produced);
2488 *consumed = produced;
2489 break;
2490
2491 case coding_type_internal:
2492 case coding_type_automatic:
2493 if (coding->eol_type == CODING_EOL_LF
2494 || coding->eol_type == CODING_EOL_AUTOMATIC)
2495 goto label_no_conversion;
2496 produced = decode_eol (coding, source, destination,
2497 src_bytes, dst_bytes, consumed);
2498 break;
2499
2500 case coding_type_sjis:
2501 produced = decode_coding_sjis_big5 (coding, source, destination,
2502 src_bytes, dst_bytes, consumed,
2503 1);
2504 break;
2505
2506 case coding_type_iso2022:
2507 produced = decode_coding_iso2022 (coding, source, destination,
2508 src_bytes, dst_bytes, consumed);
2509 break;
2510
2511 case coding_type_big5:
2512 produced = decode_coding_sjis_big5 (coding, source, destination,
2513 src_bytes, dst_bytes, consumed,
2514 0);
2515 break;
2516
2517 case coding_type_ccl:
2518 produced = ccl_driver (&coding->spec.ccl.decoder, source, destination,
2519 src_bytes, dst_bytes, consumed);
2520 break;
2521 }
2522
2523 return produced;
2524}
2525
2526/* See "GENERAL NOTES about `encode_coding_XXX ()' functions". */
2527
2528int
2529encode_coding (coding, source, destination, src_bytes, dst_bytes, consumed)
2530 struct coding_system *coding;
2531 unsigned char *source, *destination;
2532 int src_bytes, dst_bytes;
2533 int *consumed;
2534{
2535 int produced;
2536
2537 coding->carryover_size = 0;
2538 switch (coding->type)
2539 {
2540 case coding_type_no_conversion:
2541 label_no_conversion:
2542 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2543 if (produced > 0)
2544 {
2545 bcopy (source, destination, produced);
2546 if (coding->selective)
2547 {
2548 unsigned char *p = destination, *pend = destination + produced;
2549 while (p < pend)
2550 if (*p++ = '\015') p[-1] = '\n';
2551 }
2552 }
2553 *consumed = produced;
2554 break;
2555
2556 case coding_type_internal:
2557 case coding_type_automatic:
2558 if (coding->eol_type == CODING_EOL_LF
2559 || coding->eol_type == CODING_EOL_AUTOMATIC)
2560 goto label_no_conversion;
2561 produced = encode_eol (coding, source, destination,
2562 src_bytes, dst_bytes, consumed);
2563 break;
2564
2565 case coding_type_sjis:
2566 produced = encode_coding_sjis_big5 (coding, source, destination,
2567 src_bytes, dst_bytes, consumed,
2568 1);
2569 break;
2570
2571 case coding_type_iso2022:
2572 produced = encode_coding_iso2022 (coding, source, destination,
2573 src_bytes, dst_bytes, consumed);
2574 break;
2575
2576 case coding_type_big5:
2577 produced = encode_coding_sjis_big5 (coding, source, destination,
2578 src_bytes, dst_bytes, consumed,
2579 0);
2580 break;
2581
2582 case coding_type_ccl:
2583 produced = ccl_driver (&coding->spec.ccl.encoder, source, destination,
2584 src_bytes, dst_bytes, consumed);
2585 break;
2586 }
2587
2588 return produced;
2589}
2590
2591#define CONVERSION_BUFFER_EXTRA_ROOM 256
2592
2593/* Return maximum size (bytes) of a buffer enough for decoding
2594 SRC_BYTES of text encoded in CODING. */
2595
2596int
2597decoding_buffer_size (coding, src_bytes)
2598 struct coding_system *coding;
2599 int src_bytes;
2600{
2601 int magnification;
2602
2603 if (coding->type == coding_type_iso2022)
2604 magnification = 3;
2605 else if (coding->type == coding_type_ccl)
2606 magnification = coding->spec.ccl.decoder.buf_magnification;
2607 else
2608 magnification = 2;
2609
2610 return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
2611}
2612
2613/* Return maximum size (bytes) of a buffer enough for encoding
2614 SRC_BYTES of text to CODING. */
2615
2616int
2617encoding_buffer_size (coding, src_bytes)
2618 struct coding_system *coding;
2619 int src_bytes;
2620{
2621 int magnification;
2622
2623 if (coding->type == coding_type_ccl)
2624 magnification = coding->spec.ccl.encoder.buf_magnification;
2625 else
2626 magnification = 3;
2627
2628 return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
2629}
2630
2631#ifndef MINIMUM_CONVERSION_BUFFER_SIZE
2632#define MINIMUM_CONVERSION_BUFFER_SIZE 1024
2633#endif
2634
2635char *conversion_buffer;
2636int conversion_buffer_size;
2637
2638/* Return a pointer to a SIZE bytes of buffer to be used for encoding
2639 or decoding. Sufficient memory is allocated automatically. If we
2640 run out of memory, return NULL. */
2641
2642char *
2643get_conversion_buffer (size)
2644 int size;
2645{
2646 if (size > conversion_buffer_size)
2647 {
2648 char *buf;
2649 int real_size = conversion_buffer_size * 2;
2650
2651 while (real_size < size) real_size *= 2;
2652 buf = (char *) xmalloc (real_size);
2653 xfree (conversion_buffer);
2654 conversion_buffer = buf;
2655 conversion_buffer_size = real_size;
2656 }
2657 return conversion_buffer;
2658}
2659
2660
2661#ifdef emacs
2662/*** 7. Emacs Lisp library functions ***/
2663
2664DEFUN ("coding-system-vector", Fcoding_system_vector, Scoding_system_vector,
2665 1, 1, 0,
2666 "Return coding-vector of CODING-SYSTEM.\n\
2667If CODING-SYSTEM is not a valid coding-system, return nil.")
2668 (obj)
2669 Lisp_Object obj;
2670{
2671 while (SYMBOLP (obj) && !NILP (obj))
2672 obj = Fget (obj, Qcoding_system);
2673 return ((NILP (obj) || !VECTORP (obj) || XVECTOR (obj)->size != 5)
2674 ? Qnil : obj);
2675}
2676
2677DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
2678 "Return t if OBJECT is nil or a coding-system.\n\
2679See document of make-coding-system for coding-system object.")
2680 (obj)
2681 Lisp_Object obj;
2682{
2683 return ((NILP (obj) || !NILP (Fcoding_system_vector (obj))) ? Qt : Qnil);
2684}
2685
2686DEFUN ("read-non-nil-coding-system",
2687 Fread_non_nil_coding_system, Sread_non_nil_coding_system, 1, 1, 0,
2688 "Read a coding-system from the minibuffer, prompting with string PROMPT.")
2689 (prompt)
2690 Lisp_Object prompt;
2691{
2692 return Fintern (Fcompleting_read (prompt, Vobarray, Qcoding_system_vector,
2693 Qt, Qnil, Qnil),
2694 Qnil);
2695}
2696
2697DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 1, 0,
2698 "Read a coding-system or nil from the minibuffer, prompting with string PROMPT.")
2699 (prompt)
2700 Lisp_Object prompt;
2701{
2702 return Fintern (Fcompleting_read (prompt, Vobarray, Qcoding_system_p,
2703 Qt, Qnil, Qnil),
2704 Qnil);
2705}
2706
2707DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
2708 1, 1, 0,
2709 "Check validity of CODING-SYSTEM.\n\
2710If valid, return CODING-SYSTEM, else `coding-system-error' is signaled.\n\
2711CODING-SYSTEM is valid if it is a symbol and has \"coding-system\" property.\n\
2712The value of property should be a vector of length 5.")
2713 (coding_system)
2714 Lisp_Object coding_system;
2715{
2716 CHECK_SYMBOL (coding_system, 0);
2717 if (!NILP (Fcoding_system_p (coding_system)))
2718 return coding_system;
2719 while (1)
2720 Fsignal (Qcoding_system_error, coding_system);
2721}
2722
2723DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
2724 2, 2, 0,
2725 "Detect coding-system of the text in the region between START and END.\n\
2726Return a list of possible coding-systems ordered by priority.\n\
2727If only ASCII characters are found, it returns `coding-system-automatic'\n\
2728 or its subsidiary coding-system according to a detected end-of-line format.")
2729 (b, e)
2730 Lisp_Object b, e;
2731{
2732 int coding_mask, eol_type;
2733 Lisp_Object val;
2734 int beg, end;
2735
2736 validate_region (&b, &e);
2737 beg = XINT (b), end = XINT (e);
2738 if (beg < GPT && end >= GPT) move_gap (end);
2739
2740 coding_mask = detect_coding_mask (POS_ADDR (beg), end - beg);
2741 eol_type = detect_eol_type (POS_ADDR (beg), end - beg);
2742
2743 if (coding_mask == CODING_CATEGORY_MASK_ANY)
2744 {
2745 val = intern ("coding-system-automatic");
2746 if (eol_type != CODING_EOL_AUTOMATIC)
2747 {
2748 Lisp_Object val2 = Fget (val, Qeol_type);
2749 if (VECTORP (val2))
2750 val = XVECTOR (val2)->contents[eol_type];
2751 }
2752 }
2753 else
2754 {
2755 Lisp_Object val2;
2756
2757 /* At first, gather possible coding-systems in VAL in a reverse
2758 order. */
2759 val = Qnil;
2760 for (val2 = Vcoding_category_list;
2761 !NILP (val2);
2762 val2 = XCONS (val2)->cdr)
2763 {
2764 int idx
2765 = XFASTINT (Fget (XCONS (val2)->car, Qcoding_category_index));
2766 if (coding_mask & (1 << idx))
2767 val = Fcons (Fsymbol_value (XCONS (val2)->car), val);
2768 }
2769
2770 /* Then, change the order of the list, while getting subsidiary
2771 coding-systems. */
2772 val2 = val;
2773 val = Qnil;
2774 for (; !NILP (val2); val2 = XCONS (val2)->cdr)
2775 {
2776 if (eol_type == CODING_EOL_AUTOMATIC)
2777 val = Fcons (XCONS (val2)->car, val);
2778 else
2779 {
2780 Lisp_Object val3 = Fget (XCONS (val2)->car, Qeol_type);
2781 if (VECTORP (val3))
2782 val = Fcons (XVECTOR (val3)->contents[eol_type], val);
2783 else
2784 val = Fcons (XCONS (val2)->car, val);
2785 }
2786 }
2787 }
2788
2789 return val;
2790}
2791
2792/* Scan text in the region between *BEGP and *ENDP, skip characters
2793 which we never have to encode to (iff ENCODEP is 1) or decode from
2794 coding system CODING at the head and tail, then set BEGP and ENDP
2795 to the addresses of start and end of the text we actually convert. */
2796
2797void
2798shrink_conversion_area (begp, endp, coding, encodep)
2799 unsigned char **begp, **endp;
2800 struct coding_system *coding;
2801 int encodep;
2802{
2803 register unsigned char *beg_addr = *begp, *end_addr = *endp;
2804
2805 if (coding->eol_type != CODING_EOL_LF
2806 && coding->eol_type != CODING_EOL_AUTOMATIC)
2807 /* Since we anyway have to convert end-of-line format, it is not
2808 worth skipping at most 100 bytes or so. */
2809 return;
2810
2811 if (encodep) /* for encoding */
2812 {
2813 switch (coding->type)
2814 {
2815 case coding_type_no_conversion:
2816 case coding_type_internal:
2817 case coding_type_automatic:
2818 /* We need no conversion. */
2819 *begp = *endp;
2820 return;
2821 case coding_type_ccl:
2822 /* We can't skip any data. */
2823 return;
2824 default:
2825 /* We can skip all ASCII characters at the head and tail. */
2826 while (beg_addr < end_addr && *beg_addr < 0x80) beg_addr++;
2827 while (beg_addr < end_addr && *(end_addr - 1) < 0x80) end_addr--;
2828 break;
2829 }
2830 }
2831 else /* for decoding */
2832 {
2833 switch (coding->type)
2834 {
2835 case coding_type_no_conversion:
2836 /* We need no conversion. */
2837 *begp = *endp;
2838 return;
2839 case coding_type_internal:
2840 if (coding->eol_type == CODING_EOL_LF)
2841 {
2842 /* We need no conversion. */
2843 *begp = *endp;
2844 return;
2845 }
2846 /* We can skip all but carriage-return. */
2847 while (beg_addr < end_addr && *beg_addr != '\r') beg_addr++;
2848 while (beg_addr < end_addr && *(end_addr - 1) != '\r') end_addr--;
2849 break;
2850 case coding_type_sjis:
2851 case coding_type_big5:
2852 /* We can skip all ASCII characters at the head. */
2853 while (beg_addr < end_addr && *beg_addr < 0x80) beg_addr++;
2854 /* We can skip all ASCII characters at the tail except for
2855 the second byte of SJIS or BIG5 code. */
2856 while (beg_addr < end_addr && *(end_addr - 1) < 0x80) end_addr--;
2857 if (end_addr != *endp)
2858 end_addr++;
2859 break;
2860 case coding_type_ccl:
2861 /* We can't skip any data. */
2862 return;
2863 default: /* i.e. case coding_type_iso2022: */
2864 {
2865 unsigned char c;
2866
2867 /* We can skip all ASCII characters except for a few
2868 control codes at the head. */
2869 while (beg_addr < end_addr && (c = *beg_addr) < 0x80
2870 && c != ISO_CODE_CR && c != ISO_CODE_SO
2871 && c != ISO_CODE_SI && c != ISO_CODE_ESC)
2872 beg_addr++;
2873 }
2874 break;
2875 }
2876 }
2877 *begp = beg_addr;
2878 *endp = end_addr;
2879 return;
2880}
2881
2882/* Encode to (iff ENCODEP is 1) or decode form coding system CODING a
2883 text between B and E. B and E are buffer position. */
2884
2885Lisp_Object
2886code_convert_region (b, e, coding, encodep)
2887 Lisp_Object b, e;
2888 struct coding_system *coding;
2889 int encodep;
2890{
2891 int beg, end, len, consumed, produced;
2892 char *buf;
2893 unsigned char *begp, *endp;
2894 int pos = PT;
2895
2896 validate_region (&b, &e);
2897 beg = XINT (b), end = XINT (e);
2898 if (beg < GPT && end >= GPT)
2899 move_gap (end);
2900
2901 if (encodep && !NILP (coding->pre_write_conversion))
2902 {
2903 /* We must call a pre-conversion function which may put a new
2904 text to be converted in a new buffer. */
2905 struct buffer *old = current_buffer, *new;
2906
2907 TEMP_SET_PT (beg);
2908 call2 (coding->pre_write_conversion, b, e);
2909 if (old != current_buffer)
2910 {
2911 /* Replace the original text by the text just generated. */
2912 len = ZV - BEGV;
2913 new = current_buffer;
2914 set_buffer_internal (old);
2915 del_range (beg, end);
2916 insert_from_buffer (new, 1, len, 0);
2917 end = beg + len;
2918 }
2919 }
2920
2921 /* We may be able to shrink the conversion region. */
2922 begp = POS_ADDR (beg); endp = begp + (end - beg);
2923 shrink_conversion_area (&begp, &endp, coding, encodep);
2924
2925 if (begp == endp)
2926 /* We need no conversion. */
2927 len = end - beg;
2928 else
2929 {
2930 beg += begp - POS_ADDR (beg);
2931 end = beg + (endp - begp);
2932
2933 if (encodep)
2934 len = encoding_buffer_size (coding, end - beg);
2935 else
2936 len = decoding_buffer_size (coding, end - beg);
2937 buf = get_conversion_buffer (len);
2938
2939 coding->last_block = 1;
2940 produced = (encodep
2941 ? encode_coding (coding, POS_ADDR (beg), buf, end - beg, len,
2942 &consumed)
2943 : decode_coding (coding, POS_ADDR (beg), buf, end - beg, len,
2944 &consumed));
2945
2946 len = produced + (beg - XINT (b)) + (XINT (e) - end);
2947
2948 TEMP_SET_PT (beg);
2949 insert (buf, produced);
2950 del_range (PT, PT + end - beg);
2951 if (pos >= end)
2952 pos = PT + (pos - end);
2953 else if (pos > beg)
2954 pos = beg;
2955 TEMP_SET_PT (pos);
2956 }
2957
2958 if (!encodep && !NILP (coding->post_read_conversion))
2959 {
2960 /* We must call a post-conversion function which may alter
2961 the text just converted. */
2962 Lisp_Object insval;
2963
2964 beg = XINT (b);
2965 TEMP_SET_PT (beg);
2966 insval = call1 (coding->post_read_conversion, make_number (len));
2967 CHECK_NUMBER (insval, 0);
2968 len = XINT (insval);
2969 }
2970
2971 return make_number (len);
2972}
2973
2974Lisp_Object
2975code_convert_string (str, coding, encodep)
2976 Lisp_Object str;
2977 struct coding_system *coding;
2978 int encodep;
2979{
2980 int len, consumed, produced;
2981 char *buf;
2982 unsigned char *begp, *endp;
2983 int head_skip, tail_skip;
2984 struct gcpro gcpro1;
2985
2986 if (encodep && !NILP (coding->pre_write_conversion)
2987 || !encodep && !NILP (coding->post_read_conversion))
2988 {
2989 /* Since we have to call Lisp functions which assume target text
2990 is in a buffer, after setting a temporary buffer, call
2991 code_convert_region. */
2992 int count = specpdl_ptr - specpdl;
2993 int len = XSTRING (str)->size;
2994 Lisp_Object result;
2995 struct buffer *old = current_buffer;
2996
2997 record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
2998 temp_output_buffer_setup (" *code-converting-work*");
2999 set_buffer_internal (XBUFFER (Vstandard_output));
3000 insert_from_string (str, 0, len, 0);
3001 code_convert_region (make_number (BEGV), make_number (ZV),
3002 coding, encodep);
3003 result = make_buffer_string (BEGV, ZV, 0);
3004 set_buffer_internal (old);
3005 return unbind_to (count, result);
3006 }
3007
3008 /* We may be able to shrink the conversion region. */
3009 begp = XSTRING (str)->data;
3010 endp = begp + XSTRING (str)->size;
3011 shrink_conversion_area (&begp, &endp, coding, encodep);
3012
3013 if (begp == endp)
3014 /* We need no conversion. */
3015 return str;
3016
3017 head_skip = begp - XSTRING (str)->data;
3018 tail_skip = XSTRING (str)->size - head_skip - (endp - begp);
3019
3020 GCPRO1 (str);
3021
3022 if (encodep)
3023 len = encoding_buffer_size (coding, endp - begp);
3024 else
3025 len = decoding_buffer_size (coding, endp - begp);
3026 buf = get_conversion_buffer (len + head_skip + tail_skip);
3027
3028 bcopy (XSTRING (str)->data, buf, head_skip);
3029 coding->last_block = 1;
3030 produced = (encodep
3031 ? encode_coding (coding, XSTRING (str)->data + head_skip,
3032 buf + head_skip, endp - begp, len, &consumed)
3033 : decode_coding (coding, XSTRING (str)->data + head_skip,
3034 buf + head_skip, endp - begp, len, &consumed));
3035 bcopy (XSTRING (str)->data + head_skip + (endp - begp),
3036 buf + head_skip + produced,
3037 tail_skip);
3038
3039 UNGCPRO;
3040
3041 return make_string (buf, head_skip + produced + tail_skip);
3042}
3043
3044DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
3045 3, 3, 0,
3046 "Decode the text between START and END which is encoded in CODING-SYSTEM.\n\
3047Return length of decoded text.")
3048 (b, e, coding_system)
3049 Lisp_Object b, e, coding_system;
3050{
3051 struct coding_system coding;
3052
3053 CHECK_NUMBER_COERCE_MARKER (b, 0);
3054 CHECK_NUMBER_COERCE_MARKER (e, 1);
3055 CHECK_SYMBOL (coding_system, 2);
3056
3057 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3058 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3059
3060 return code_convert_region (b, e, &coding, 0);
3061}
3062
3063DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
3064 3, 3, 0,
3065 "Encode the text between START and END to CODING-SYSTEM.\n\
3066Return length of encoded text.")
3067 (b, e, coding_system)
3068 Lisp_Object b, e, coding_system;
3069{
3070 struct coding_system coding;
3071
3072 CHECK_NUMBER_COERCE_MARKER (b, 0);
3073 CHECK_NUMBER_COERCE_MARKER (e, 1);
3074 CHECK_SYMBOL (coding_system, 2);
3075
3076 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3077 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3078
3079 return code_convert_region (b, e, &coding, 1);
3080}
3081
3082DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
3083 2, 2, 0,
3084 "Decode STRING which is encoded in CODING-SYSTEM, and return the result.")
3085 (string, coding_system)
3086 Lisp_Object string, coding_system;
3087{
3088 struct coding_system coding;
3089
3090 CHECK_STRING (string, 0);
3091 CHECK_SYMBOL (coding_system, 1);
3092
3093 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3094 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3095
3096 return code_convert_string (string, &coding, 0);
3097}
3098
3099DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
3100 2, 2, 0,
3101 "Encode STRING to CODING-SYSTEM, and return the result.")
3102 (string, coding_system)
3103 Lisp_Object string, coding_system;
3104{
3105 struct coding_system coding;
3106
3107 CHECK_STRING (string, 0);
3108 CHECK_SYMBOL (coding_system, 1);
3109
3110 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3111 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3112
3113 return code_convert_string (string, &coding, 1);
3114}
3115
3116DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
3117 "Decode a JISX0208 character of SJIS coding-system-sjis.\n\
3118CODE is the character code in SJIS.\n\
3119Return the corresponding character.")
3120 (code)
3121 Lisp_Object code;
3122{
3123 unsigned char c1, c2, s1, s2;
3124 Lisp_Object val;
3125
3126 CHECK_NUMBER (code, 0);
3127 s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
3128 DECODE_SJIS (s1, s2, c1, c2);
3129 XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset_jisx0208, c1, c2));
3130 return val;
3131}
3132
3133DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
3134 "Encode a JISX0208 character CHAR to SJIS coding-system.\n\
3135Return the corresponding character code in SJIS.")
3136 (ch)
3137 Lisp_Object ch;
3138{
3139 int charset;
3140 unsigned char c1, c2, s1, s2;
3141 Lisp_Object val;
3142
3143 CHECK_NUMBER (ch, 0);
3144 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
3145 if (charset == charset_jisx0208)
3146 {
3147 ENCODE_SJIS (c1, c2, s1, s2);
3148 XSETFASTINT (val, ((int)s1 << 8) | s2);
3149 }
3150 else
3151 XSETFASTINT (val, 0);
3152 return val;
3153}
3154
3155DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
3156 "Decode a Big5 character CODE of BIG5 coding-system.\n\
3157CODE is the character code in BIG5.\n\
3158Return the corresponding character.")
3159 (code)
3160 Lisp_Object code;
3161{
3162 int charset;
3163 unsigned char b1, b2, c1, c2;
3164 Lisp_Object val;
3165
3166 CHECK_NUMBER (code, 0);
3167 b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
3168 DECODE_BIG5 (b1, b2, charset, c1, c2);
3169 XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset, c1, c2));
3170 return val;
3171}
3172
3173DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
3174 "Encode the Big5 character CHAR to BIG5 coding-system.\n\
3175Return the corresponding character code in Big5.")
3176 (ch)
3177 Lisp_Object ch;
3178{
3179 int charset;
3180 unsigned char c1, c2, b1, b2;
3181 Lisp_Object val;
3182
3183 CHECK_NUMBER (ch, 0);
3184 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
3185 if (charset == charset_big5_1 || charset == charset_big5_2)
3186 {
3187 ENCODE_BIG5 (charset, c1, c2, b1, b2);
3188 XSETFASTINT (val, ((int)b1 << 8) | b2);
3189 }
3190 else
3191 XSETFASTINT (val, 0);
3192 return val;
3193}
3194
3195DEFUN ("set-terminal-coding-system",
3196 Fset_terminal_coding_system, Sset_terminal_coding_system, 1, 1,
3197 "zCoding-system for terminal display: ",
3198 "Set coding-system of your terminal to CODING-SYSTEM.\n\
3199All outputs to terminal are encoded to this coding-system.")
3200 (coding_system)
3201 Lisp_Object coding_system;
3202{
3203 CHECK_SYMBOL (coding_system, 0);
3204 setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
3205 update_mode_lines++;
3206 if (!NILP (Finteractive_p ()))
3207 Fredraw_display ();
3208 return Qnil;
3209}
3210
3211DEFUN ("terminal-coding-system",
3212 Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
3213 "Return coding-system of your terminal.")
3214 ()
3215{
3216 return terminal_coding.symbol;
3217}
3218
3219DEFUN ("set-keyboard-coding-system",
3220 Fset_keyboard_coding_system, Sset_keyboard_coding_system, 1, 1,
3221 "zCoding-system for keyboard input: ",
3222 "Set coding-system of what is sent from terminal keyboard to CODING-SYSTEM.\n\
3223All inputs from terminal are decoded from this coding-system.")
3224 (coding_system)
3225 Lisp_Object coding_system;
3226{
3227 CHECK_SYMBOL (coding_system, 0);
3228 setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
3229 return Qnil;
3230}
3231
3232DEFUN ("keyboard-coding-system",
3233 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
3234 "Return coding-system of what is sent from terminal keyboard.")
3235 ()
3236{
3237 return keyboard_coding.symbol;
3238}
3239
3240
3241DEFUN ("find-coding-system", Ffind_coding_system, Sfind_coding_system,
3242 1, MANY, 0,
3243 "Return a cons of coding systems for I/O primitive OPERATION.\n\
3244Remaining arguments are for OPERATION.\n\
3245OPERATION is one of the following Emacs I/O primitives:\n\
3246 For file I/O, insert-file-contents or write-region.\n\
3247 For process I/O, call-process, call-process-region, or start-process.\n\
3248 For network I/O, open-network-stream.\n\
3249For each OPERATION, TARGET is selected from the arguments as below:\n\
3250 For file I/O, TARGET is a file name.\n\
3251 For process I/O, TARGET is a process name.\n\
3252 For network I/O, TARGET is a service name or a port number\n\
3253\n\
3254The return value is a cons of coding systems for decoding and encoding\n\
3255registered in nested alist `coding-system-alist' (which see) at a slot\n\
3256corresponding to OPERATION and TARGET.
3257If a function symbol is at the slot, return a result of the function call.\n\
3258The function is called with one argument, a list of all the arguments.")
3259 (nargs, args)
3260 int nargs;
3261 Lisp_Object *args;
3262{
3263 Lisp_Object operation, target_idx, target, val;
3264 register Lisp_Object chain;
3265
3266 if (nargs < 2)
3267 error ("Too few arguments");
3268 operation = args[0];
3269 if (!SYMBOLP (operation)
3270 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
3271 error ("Invalid first arguement");
3272 if (nargs < 1 + XINT (target_idx))
3273 error ("Too few arguments for operation: %s",
3274 XSYMBOL (operation)->name->data);
3275 target = args[XINT (target_idx) + 1];
3276 if (!(STRINGP (target)
3277 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
3278 error ("Invalid %dth argument", XINT (target_idx) + 1);
3279
3280 chain = Fassq (operation, Vcoding_system_alist);
3281 if (NILP (chain))
3282 return Qnil;
3283
3284 for (chain = XCONS (chain)->cdr; CONSP (chain); chain = XCONS (chain)->cdr)
3285 {
3286 Lisp_Object elt = XCONS (chain)->car;
3287
3288 if (CONSP (elt)
3289 && ((STRINGP (target)
3290 && STRINGP (XCONS (elt)->car)
3291 && fast_string_match (XCONS (elt)->car, target) >= 0)
3292 || (INTEGERP (target) && EQ (target, XCONS (elt)->car))))
3293 return (CONSP (val = XCONS (elt)->cdr)
3294 ? val
3295 : ((SYMBOLP (val) && Fboundp (val)
3296 ? call2 (val, Flist (nargs, args))
3297 : Qnil)));
3298 }
3299 return Qnil;
3300}
3301
3302#endif /* emacs */
3303
3304
3305/*** 8. Post-amble ***/
3306
3307init_coding_once ()
3308{
3309 int i;
3310
3311 /* Emacs internal format specific initialize routine. */
3312 for (i = 0; i <= 0x20; i++)
3313 emacs_code_class[i] = EMACS_control_code;
3314 emacs_code_class[0x0A] = EMACS_linefeed_code;
3315 emacs_code_class[0x0D] = EMACS_carriage_return_code;
3316 for (i = 0x21 ; i < 0x7F; i++)
3317 emacs_code_class[i] = EMACS_ascii_code;
3318 emacs_code_class[0x7F] = EMACS_control_code;
3319 emacs_code_class[0x80] = EMACS_leading_code_composition;
3320 for (i = 0x81; i < 0xFF; i++)
3321 emacs_code_class[i] = EMACS_invalid_code;
3322 emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
3323 emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
3324 emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
3325 emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
3326
3327 /* ISO2022 specific initialize routine. */
3328 for (i = 0; i < 0x20; i++)
3329 iso_code_class[i] = ISO_control_code;
3330 for (i = 0x21; i < 0x7F; i++)
3331 iso_code_class[i] = ISO_graphic_plane_0;
3332 for (i = 0x80; i < 0xA0; i++)
3333 iso_code_class[i] = ISO_control_code;
3334 for (i = 0xA1; i < 0xFF; i++)
3335 iso_code_class[i] = ISO_graphic_plane_1;
3336 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
3337 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
3338 iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
3339 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
3340 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
3341 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
3342 iso_code_class[ISO_CODE_ESC] = ISO_escape;
3343 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
3344 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
3345 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
3346
3347 Qcoding_system = intern ("coding-system");
3348 staticpro (&Qcoding_system);
3349
3350 Qeol_type = intern ("eol-type");
3351 staticpro (&Qeol_type);
3352
3353 Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
3354 staticpro (&Qbuffer_file_coding_system);
3355
3356 Qpost_read_conversion = intern ("post-read-conversion");
3357 staticpro (&Qpost_read_conversion);
3358
3359 Qpre_write_conversion = intern ("pre-write-conversion");
3360 staticpro (&Qpre_write_conversion);
3361
3362 Qcoding_system_vector = intern ("coding-system-vector");
3363 staticpro (&Qcoding_system_vector);
3364
3365 Qcoding_system_p = intern ("coding-system-p");
3366 staticpro (&Qcoding_system_p);
3367
3368 Qcoding_system_error = intern ("coding-system-error");
3369 staticpro (&Qcoding_system_error);
3370
3371 Fput (Qcoding_system_error, Qerror_conditions,
3372 Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
3373 Fput (Qcoding_system_error, Qerror_message,
3374 build_string ("Coding-system error"));
3375
3376 Qcoding_category_index = intern ("coding-category-index");
3377 staticpro (&Qcoding_category_index);
3378
3379 {
3380 int i;
3381 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3382 {
3383 coding_category_table[i] = intern (coding_category_name[i]);
3384 staticpro (&coding_category_table[i]);
3385 Fput (coding_category_table[i], Qcoding_category_index,
3386 make_number (i));
3387 }
3388 }
3389
3390 conversion_buffer_size = MINIMUM_CONVERSION_BUFFER_SIZE;
3391 conversion_buffer = (char *) xmalloc (MINIMUM_CONVERSION_BUFFER_SIZE);
3392
3393 setup_coding_system (Qnil, &keyboard_coding);
3394 setup_coding_system (Qnil, &terminal_coding);
3395}
3396
3397#ifdef emacs
3398
3399syms_of_coding ()
3400{
3401 Qtarget_idx = intern ("target-idx");
3402 staticpro (&Qtarget_idx);
3403
3404 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
3405 Fput (Qwrite_region, Qtarget_idx, make_number (2));
3406
3407 Qcall_process = intern ("call-process");
3408 staticpro (&Qcall_process);
3409 Fput (Qcall_process, Qtarget_idx, make_number (0));
3410
3411 Qcall_process_region = intern ("call-process-region");
3412 staticpro (&Qcall_process_region);
3413 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
3414
3415 Qstart_process = intern ("start-process");
3416 staticpro (&Qstart_process);
3417 Fput (Qstart_process, Qtarget_idx, make_number (2));
3418
3419 Qopen_network_stream = intern ("open-network-stream");
3420 staticpro (&Qopen_network_stream);
3421 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
3422
3423 defsubr (&Scoding_system_vector);
3424 defsubr (&Scoding_system_p);
3425 defsubr (&Sread_coding_system);
3426 defsubr (&Sread_non_nil_coding_system);
3427 defsubr (&Scheck_coding_system);
3428 defsubr (&Sdetect_coding_region);
3429 defsubr (&Sdecode_coding_region);
3430 defsubr (&Sencode_coding_region);
3431 defsubr (&Sdecode_coding_string);
3432 defsubr (&Sencode_coding_string);
3433 defsubr (&Sdecode_sjis_char);
3434 defsubr (&Sencode_sjis_char);
3435 defsubr (&Sdecode_big5_char);
3436 defsubr (&Sencode_big5_char);
3437 defsubr (&Sset_terminal_coding_system);
3438 defsubr (&Sterminal_coding_system);
3439 defsubr (&Sset_keyboard_coding_system);
3440 defsubr (&Skeyboard_coding_system);
3441 defsubr (&Sfind_coding_system);
3442
3443 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
3444 "List of coding-categories (symbols) ordered by priority.");
3445 {
3446 int i;
3447
3448 Vcoding_category_list = Qnil;
3449 for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
3450 Vcoding_category_list
3451 = Fcons (coding_category_table[i], Vcoding_category_list);
3452 }
3453
3454 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
3455 "A variable of internal use only.\n\
3456If the value is a coding system, it is used for decoding on read operation.\n\
3457If not, an appropriate element in `coding-system-alist' (which see) is used.");
3458 Vcoding_system_for_read = Qnil;
3459
3460 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
3461 "A variable of internal use only.\n\
3462If the value is a coding system, it is used for encoding on write operation.\n\
3463If not, an appropriate element in `coding-system-alist' (which see) is used.");
3464 Vcoding_system_for_write = Qnil;
3465
3466 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
3467 "Coding-system used in the latest file or process I/O.");
3468 Vlast_coding_system_used = Qnil;
3469
3470 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
3471 "Nested alist to decide a coding system for a specific I/O operation.\n\
3472The format is ((OPERATION . ((REGEXP . CODING-SYSTEMS) ...)) ...).\n\
3473
3474OPERATION is one of the following Emacs I/O primitives:\n\
3475 For file I/O, insert-file-contents and write-region.\n\
3476 For process I/O, call-process, call-process-region, and start-process.\n\
3477 For network I/O, open-network-stream.\n\
3478In addition, for process I/O, `process-argument' can be specified for\n\
3479encoding arguments of the process.\n\
3480\n\
3481REGEXP is a regular expression matching a target of OPERATION, where\n\
3482target is a file name for file I/O operations, a process name for\n\
3483process I/O operations, or a service name for network I/O\n\
3484operations. REGEXP might be a port number for network I/O operation.\n\
3485\n\
3486CODING-SYSTEMS is a cons of coding systems to encode and decode\n\
3487character code on OPERATION, or a function symbol returning the cons.\n\
3488See the documentation of `find-coding-system' for more detail.");
3489 Vcoding_system_alist = Qnil;
3490
3491 DEFVAR_INT ("eol-mnemonic-unix", &eol_mnemonic_unix,
3492 "Mnemonic character indicating UNIX-like end-of-line format (i.e. LF) .");
3493 eol_mnemonic_unix = '.';
3494
3495 DEFVAR_INT ("eol-mnemonic-dos", &eol_mnemonic_dos,
3496 "Mnemonic character indicating DOS-like end-of-line format (i.e. CRLF).");
3497 eol_mnemonic_dos = ':';
3498
3499 DEFVAR_INT ("eol-mnemonic-mac", &eol_mnemonic_mac,
3500 "Mnemonic character indicating MAC-like end-of-line format (i.e. CR).");
3501 eol_mnemonic_mac = '\'';
3502
3503 DEFVAR_INT ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
3504 "Mnemonic character indicating end-of-line format is not yet decided.");
3505 eol_mnemonic_undecided = '-';
3506
3507 DEFVAR_LISP ("alternate-charset-table", &Valternate_charset_table,
3508 "Alist of charsets vs the alternate charsets.\n\
3509While decoding, if a charset (car part of an element) is found,\n\
3510decode it as the alternate charset (cdr part of the element).");
3511 Valternate_charset_table = Qnil;
3512
3513 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
3514 "Alist of charsets vs revision numbers.\n\
3515While encoding, if a charset (car part of an element) is found,\n\
3516designate it with the escape sequence identifing revision (cdr part of the element).");
3517 Vcharset_revision_alist = Qnil;
3518}
3519
3520#endif /* emacs */
diff --git a/src/coding.h b/src/coding.h
new file mode 100644
index 00000000000..3ec2fcc32f3
--- /dev/null
+++ b/src/coding.h
@@ -0,0 +1,409 @@
1/* Header for coding system handler.
2 Ver.1.0
3
4 Copyright (C) 1995 Free Software Foundation, Inc.
5 Copyright (C) 1995 Electrotechnical Laboratory, JAPAN.
6
7 This program is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2, or (at your option)
10 any later version.
11
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with this program; if not, write to the Free Software
19 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
20
21#ifndef _CODING_H
22#define _CODING_H
23
24#ifndef _CCL_H
25#include "../src/ccl.h"
26#endif
27
28/*** EMACS' INTERNAL FORMAT section ***/
29
30/* All code (1-byte) of Emacs' internal format is classified into one
31 of the followings. See also `charset.h'. */
32enum emacs_code_class_type
33 {
34 EMACS_control_code, /* Control codes in the range
35 0x00..0x1F and 0x7F except for the
36 following two codes. */
37 EMACS_linefeed_code, /* 0x0A (linefeed) to denote
38 end-of-line. */
39 EMACS_carriage_return_code, /* 0x0D (carriage-return) to be used
40 in selective display mode. */
41 EMACS_ascii_code, /* ASCII characters. */
42 EMACS_leading_code_composition, /* Leading code of a composite
43 character. */
44 EMACS_leading_code_2, /* Base leading code of official
45 TYPE9N character. */
46 EMACS_leading_code_3, /* Base leading code of private TYPE9N
47 or official TYPE9Nx9N character. */
48 EMACS_leading_code_4, /* Base leading code of private
49 TYPE9Nx9N character. */
50 EMACS_invalid_code /* Invalid code, i.e. a base leading
51 code not yet assigned to any
52 charset, or a code of the range
53 0xA0..0xFF. */
54 };
55
56extern enum emacs_code_class_type emacs_code_class[256];
57
58/*** ISO2022 section ***/
59
60/* Macros to define code of control characters for ISO2022's functions. */
61 /* code */ /* function */
62#define ISO_CODE_LF 0x0A /* line-feed */
63#define ISO_CODE_CR 0x0D /* carriage-return */
64#define ISO_CODE_SO 0x0E /* shift-out */
65#define ISO_CODE_SI 0x0F /* shift-in */
66#define ISO_CODE_SS2_7 0x19 /* single-shift-2 for 7-bit code */
67#define ISO_CODE_ESC 0x1B /* escape */
68#define ISO_CODE_SS2 0x8E /* single-shift-2 */
69#define ISO_CODE_SS3 0x8F /* single-shift-3 */
70#define ISO_CODE_CSI 0x9B /* control-sequence-introduce */
71
72/* All code (1-byte) of ISO2022 is classified into one of the
73 followings. */
74enum iso_code_class_type
75 {
76 ISO_control_code, /* Control codes in the range
77 0x00..0x1F, 0x7F, and 0x80..0x9F,
78 except for the following seven
79 codes. */
80 ISO_carriage_return, /* ISO_CODE_CR (0x0D) */
81 ISO_shift_out, /* ISO_CODE_SO (0x0E) */
82 ISO_shift_in, /* ISO_CODE_SI (0x0F) */
83 ISO_single_shift_2_7, /* ISO_CODE_SS2_7 (0x19) */
84 ISO_escape, /* ISO_CODE_SO (0x1B) */
85 ISO_single_shift_2, /* ISO_CODE_SS2 (0x8E) */
86 ISO_single_shift_3, /* ISO_CODE_SS3 (0x8F) */
87 ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
88 ISO_0x20_or_0x7F, /* Codes of the values 0x20 or 0x7F. */
89 ISO_graphic_plane_0, /* Graphic codes in the range 0x21..0x7E. */
90 ISO_0xA0_or_0xFF, /* Codes of the values 0xA0 or 0xFF. */
91 ISO_graphic_plane_1 /* Graphic codes in the range 0xA1..0xFE. */
92 };
93
94/** The macros CODING_FLAG_ISO_XXX defines a flag bit of the `flags'
95 element in the structure `coding_system'. This information is used
96 while encoding a text to ISO2022. **/
97
98/* If set, produce short-form designation sequence (e.g. ESC $ A)
99 instead of long-form sequence (e.g. ESC $ ( A). */
100#define CODING_FLAG_ISO_SHORT_FORM 0x0001
101
102/* If set, reset graphic planes and registers at end-of-line to the
103 initial state. */
104#define CODING_FLAG_ISO_RESET_AT_EOL 0x0002
105
106/* If set, reset graphic planes and registers before any control
107 characters to the initial state. */
108#define CODING_FLAG_ISO_RESET_AT_CNTL 0x0004
109
110/* If set, encode by 7-bit environment. */
111#define CODING_FLAG_ISO_SEVEN_BITS 0x0008
112
113/* If set, use locking-shift function. */
114#define CODING_FLAG_ISO_LOCKING_SHIFT 0x0010
115
116/* If set, use single-shift function. Overwrite
117 CODING_FLAG_ISO_LOCKING_SHIFT. */
118#define CODING_FLAG_ISO_SINGLE_SHIFT 0x0020
119
120/* If set, designate JISX0201-Roman instead of ASCII. */
121#define CODING_FLAG_ISO_USE_ROMAN 0x0040
122
123/* If set, designate JISX0208-1978 instead of JISX0208-1983. */
124#define CODING_FLAG_ISO_USE_OLDJIS 0x0080
125
126/* If set, do not produce ISO6429's direction specifying sequence. */
127#define CODING_FLAG_ISO_NO_DIRECTION 0x0100
128
129/* Structure of the field `spec.iso2022' in the structure `coding_system'. */
130struct iso2022_spec
131{
132 /* The current graphic register invoked to each graphic plane. */
133 int current_invocation[2];
134
135 /* The current charset designated to each graphic register. */
136 int current_designation[4];
137
138 /* A charset initially designated to each graphic register. */
139 int initial_designation[4];
140
141 /* A graphic register to which each charset should be designated. */
142 int requested_designation[MAX_CHARSET];
143
144 /* Set to 1 temporarily only when graphic register 2 or 3 is invoked
145 by single-shift while encoding. */
146 int single_shifting;
147};
148
149/* Macros to access each field in the structure `spec.iso2022'. */
150#define CODING_SPEC_ISO_INVOCATION(coding, plane) \
151 coding->spec.iso2022.current_invocation[plane]
152#define CODING_SPEC_ISO_DESIGNATION(coding, reg) \
153 coding->spec.iso2022.current_designation[reg]
154#define CODING_SPEC_ISO_INITIAL_DESIGNATION(coding, reg) \
155 coding->spec.iso2022.initial_designation[reg]
156#define CODING_SPEC_ISO_REQUESTED_DESIGNATION(coding, charset) \
157 coding->spec.iso2022.requested_designation[charset]
158
159/* Set to 1 temporarily only when encoding a character with
160 single-shift function. */
161#define CODING_SPEC_ISO_SINGLE_SHIFTING(coding) \
162 coding->spec.iso2022.single_shifting
163
164/* Return a charset which is currently designated to the graphic plane
165 PLANE in the coding-system CODING. */
166#define CODING_SPEC_ISO_PLANE_CHARSET(coding, plane) \
167 CODING_SPEC_ISO_DESIGNATION \
168 (coding, CODING_SPEC_ISO_INVOCATION (coding, plane))
169
170/*** BIG5 section ***/
171
172/* Macros to denote each type of BIG5 coding system. */
173#define CODING_FLAG_BIG5_HKU 0x00 /* BIG5-HKU is one of variants of
174 BIG5 developed by Hong Kong
175 University. */
176#define CODING_FLAG_BIG5_ETEN 0x01 /* BIG5_ETen is one of variants
177 of BIG5 developed by the
178 company ETen in Taiwan. */
179
180/*** GENERAL section ***/
181
182/* Types of coding system. */
183enum coding_type
184 {
185 coding_type_no_conversion, /* A coding system which requires no
186 conversion for reading and writing
187 including end-of-line format. */
188 coding_type_internal, /* A coding system used in Emacs'
189 buffer and string. Requires no
190 conversion for reading and writing
191 except for end-of-line format. */
192 coding_type_automatic, /* A coding system which requires
193 automatic detection of a real
194 coding system. */
195 coding_type_sjis, /* SJIS coding system for Japanese. */
196 coding_type_iso2022, /* Any coding system of ISO2022
197 variants. */
198 coding_type_big5, /* BIG5 coding system for Chinese. */
199 coding_type_ccl /* The coding system of which decoder
200 and encoder are written in CCL. */
201 };
202
203/* Formats of end-of-line. */
204#define CODING_EOL_LF 0 /* Line-feed only, same as Emacs'
205 internal format. */
206#define CODING_EOL_CRLF 1 /* Sequence of carriage-return and
207 line-feed. */
208#define CODING_EOL_CR 2 /* Carriage-return only. */
209#define CODING_EOL_AUTOMATIC 3 /* This value is used to denote the
210 eol-type is not yet decided. */
211
212/* Character composition status while encoding/decoding. */
213#define COMPOSING_NO 0 /* not composing */
214#define COMPOSING_WITH_RULE_HEAD 1 /* 1st char of with-rule composing follow */
215#define COMPOSING_NO_RULE_HEAD 2 /* 1st char of no-rule composing follow */
216#define COMPOSING_WITH_RULE_TAIL 3 /* Nth char of with-rule composing follow */
217#define COMPOSING_NO_RULE_TAIL 4 /* Nth char of no-rule composing follow */
218#define COMPOSING_WITH_RULE_RULE 5 /* composition rule follow */
219
220/* 1 iff composing. */
221#define COMPOSING_P(composing) (composing)
222/* 1 iff 1st char of composing element follows. */
223#define COMPOSING_HEAD_P(composing) \
224 ((composing) && (composing) <= COMPOSING_NO_RULE_HEAD)
225/* 1 iff composing with embeded composition rule. */
226#define COMPOSING_WITH_RULE_P(composing) ((composing) & 1)
227
228struct coding_system
229{
230 /* Type of the coding system. */
231 enum coding_type type;
232
233 /* If the coding system requires specific code to be attached at the
234 tail of converted text, this value should be set to `1'. */
235 int require_flushing;
236
237 /* Flag bits of the coding system. The meaning of each bit depends
238 on the type of the coding system. */
239 unsigned int flags;
240
241 /* Type of end-of-line format (LF, CRLF, or CR) of the coding system. */
242 int eol_type;
243
244 /* Non-zero means that the current source text is the last block of the
245 whole text to be converted. */
246 int last_block;
247
248 /* Non-zero means that characters are being composed currently while
249 decoding or encoding. See macros COMPOSING_XXXX above for the
250 meaing of each non-zero value. */
251 int composing;
252
253 /* 0 (left-to-right) or 1 (right-to-left): the direction of the text
254 being processed currently. */
255 int direction;
256
257 /* Non-zero means that the current source text is in a buffer which
258 enables selective display. */
259 int selective;
260
261 /* Detailed information specific to each type of coding system. */
262 union spec
263 {
264 struct iso2022_spec iso2022;
265 struct ccl_spec ccl; /* Defined in ccl.h. */
266 } spec;
267
268 /* Backward pointer to the Lisp symbol of the coding system. */
269 Lisp_Object symbol;
270
271 /* Lisp function (symbol) to be called after decoding to do
272 additional conversion. */
273 Lisp_Object post_read_conversion;
274
275 /* Lisp function (symbol) to be called before encoding to do
276 additional conversion. */
277 Lisp_Object pre_write_conversion;
278
279 /* Carryover yielded by decoding/encoding incomplete source. No
280 coding-system yields more than 7-byte of carryover. This does
281 not include a text which is not processed because of short of
282 output buffer. */
283 char carryover[8];
284
285 /* Actual data length in the above array. */
286 int carryover_size;
287};
288
289/* Return 1 if the coding-system CODING requires conversion of
290 representation of a visible character (text). */
291#define CODING_REQUIRE_TEXT_CONVERSION(coding) \
292 ((coding)->type != coding_type_no_conversion \
293 && (coding)->type != coding_type_internal)
294
295/* Return 1 if the coding-system CODING requires conversion of the
296 format of end-of-line. */
297#define CODING_REQUIRE_EOL_CONVERSION(coding) \
298 ((coding)->eol_type != CODING_EOL_AUTOMATIC \
299 && (coding)->eol_type != CODING_EOL_LF)
300
301/* Return 1 if the coding-system CODING requires some conversion. */
302#define CODING_REQUIRE_CONVERSION(coding) \
303 (CODING_REQUIRE_TEXT_CONVERSION (coding) \
304 || CODING_REQUIRE_EOL_CONVERSION (coding))
305
306/* Index for each coding category in `coding_category_table' */
307#define CODING_CATEGORY_IDX_INTERNAL 0
308#define CODING_CATEGORY_IDX_SJIS 1
309#define CODING_CATEGORY_IDX_ISO_7 2
310#define CODING_CATEGORY_IDX_ISO_8_1 3
311#define CODING_CATEGORY_IDX_ISO_8_2 4
312#define CODING_CATEGORY_IDX_ISO_ELSE 5
313#define CODING_CATEGORY_IDX_BIG5 6
314#define CODING_CATEGORY_IDX_BINARY 7
315#define CODING_CATEGORY_IDX_MAX 8
316
317/* Definitions of flag bits returned by the function
318 detect_coding_mask (). */
319#define CODING_CATEGORY_MASK_INTERNAL (1 << CODING_CATEGORY_IDX_INTERNAL)
320#define CODING_CATEGORY_MASK_SJIS (1 << CODING_CATEGORY_IDX_SJIS)
321#define CODING_CATEGORY_MASK_ISO_7 (1 << CODING_CATEGORY_IDX_ISO_7)
322#define CODING_CATEGORY_MASK_ISO_8_1 (1 << CODING_CATEGORY_IDX_ISO_8_1)
323#define CODING_CATEGORY_MASK_ISO_8_2 (1 << CODING_CATEGORY_IDX_ISO_8_2)
324#define CODING_CATEGORY_MASK_ISO_ELSE (1 << CODING_CATEGORY_IDX_ISO_ELSE)
325#define CODING_CATEGORY_MASK_BIG5 (1 << CODING_CATEGORY_IDX_BIG5)
326
327/* This value is returned if detect_coding_mask () find nothing other
328 than ASCII characters. */
329#define CODING_CATEGORY_MASK_ANY \
330 ( CODING_CATEGORY_MASK_INTERNAL \
331 | CODING_CATEGORY_MASK_SJIS \
332 | CODING_CATEGORY_MASK_ISO_7 \
333 | CODING_CATEGORY_MASK_ISO_8_1 \
334 | CODING_CATEGORY_MASK_ISO_8_2 \
335 | CODING_CATEGORY_MASK_ISO_ELSE \
336 | CODING_CATEGORY_MASK_BIG5)
337
338/* Macros to decode or encode a character of JISX0208 in SJIS. S1 and
339 S2 are the 1st and 2nd position-codes of JISX0208 in SJIS coding
340 system. C1 and C2 are the 1st and 2nd position codes of Emacs'
341 internal format. */
342
343#define DECODE_SJIS(s1, s2, c1, c2) \
344 do { \
345 if (s2 >= 0x9F) \
346 c1 = s1 * 2 - (s1 >= 0xE0 ? 0x160 : 0xE0), \
347 c2 = s2 - 0x7E; \
348 else \
349 c1 = s1 * 2 - ((s1 >= 0xE0) ? 0x161 : 0xE1), \
350 c2 = s2 - ((s2 >= 0x7F) ? 0x20 : 0x1F); \
351 } while (0)
352
353#define ENCODE_SJIS(c1, c2, s1, s2) \
354 do { \
355 if (c1 & 1) \
356 s1 = c1 / 2 + ((c1 < 0x5F) ? 0x71 : 0xB1), \
357 s2 = c2 + ((c2 >= 0x60) ? 0x20 : 0x1F); \
358 else \
359 s1 = c1 / 2 + ((c1 < 0x5F) ? 0x70 : 0xB0), \
360 s2 = c2 + 0x7E; \
361 } while (0)
362
363/* Extern declarations. */
364extern int decode_coding (), encode_coding ();
365extern int decoding_buffer_size (), encoding_buffer_size ();
366extern int conversion_buffer_size;
367extern char *conversion_buffer, *get_conversion_buffer ();
368extern Lisp_Object Fcheck_coding_system ();
369extern Lisp_Object Qcoding_system, Qeol_type, Qcoding_category_index;
370extern Lisp_Object Qbuffer_file_coding_system;
371extern Lisp_Object Vcoding_category_list;
372
373/* Mnemonic character to indicate each type of end-of-line. */
374extern int eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
375/* Mnemonic character to indicate type of end-of-line is not yet decided. */
376extern int eol_mnemonic_undecided;
377
378/* Table of coding-systems currently assigned to each coding-category. */
379extern Lisp_Object coding_category_table[CODING_CATEGORY_IDX_MAX];
380/* Table of names of symbol for each coding-category. */
381extern char *coding_category_name[CODING_CATEGORY_IDX_MAX];
382
383#ifdef emacs
384extern Lisp_Object Qfile_coding_system;
385extern Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
386extern Lisp_Object Qstart_process, Qopen_network_stream;
387
388/* Coding-system for reading files and receiving data from process. */
389extern Lisp_Object Vcoding_system_for_read;
390/* Coding-system for writing files and sending data to process. */
391extern Lisp_Object Vcoding_system_for_write;
392/* Coding-system actually used in the latest I/O. */
393extern Lisp_Object Vlast_coding_system_used;
394
395/* Coding-system to be used for encoding terminal output. This
396 structure contains information of a coding-system specified by the
397 function `set-terminal-coding-system'. */
398extern struct coding_system terminal_coding;
399
400/* Coding-system of what is sent from terminal keyboard. This
401 structure contains information of a coding-system specified by the
402 function `set-keyboard-coding-system'. */
403extern struct coding_system keyboard_coding;
404
405extern Lisp_Object Vcoding_system_alist;
406
407#endif
408
409#endif /* _CODING_H */
diff --git a/src/fontset.c b/src/fontset.c
new file mode 100644
index 00000000000..7d88e90ae89
--- /dev/null
+++ b/src/fontset.c
@@ -0,0 +1,819 @@
1/* Fontset handler.
2 Ver.1.0
3
4 Copyright (C) 1995 Free Software Foundation, Inc.
5 Copyright (C) 1995 Electrotechnical Laboratory, JAPAN.
6
7 This program is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2, or (at your option)
10 any later version.
11
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with this program; if not, write to the Free Software
19 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
20
21#include <config.h>
22#if HAVE_ALLOCA_H
23#include <alloca.h>
24#endif /* HAVE_ALLOCA_H */
25#include "lisp.h"
26#include "charset.h"
27#include "ccl.h"
28#include "fontset.h"
29#include "frame.h"
30
31Lisp_Object Vglobal_fontset_alist;
32
33Lisp_Object Vfont_encoding_alist;
34
35/* We had better have our own strcasecmp function because some system
36 doesn't have it. */
37static char my_strcasetbl[256];
38
39/* Compare two strings S0 and S1 while ignoring differences in case.
40 Return 1 if they differ, else return 0. */
41static int
42my_strcasecmp (s0, s1)
43 unsigned char *s0, *s1;
44{
45 while (*s0)
46 if (my_strcasetbl[*s0++] != my_strcasetbl[*s1++]) return 1;
47 return (int) *s1;
48}
49
50/* The following six are window system dependent functions. See
51 the comments in src/fontset.h for more detail. */
52
53/* Return a pointer to struct font_info of font FONT_IDX of frame F. */
54struct font_info *(*get_font_info_func) (/* FRAME_PTR f; int font_idx */);
55
56/* Return a list of font names which matches PATTERN. See the document of
57 `x-list-fonts' for more detail. */
58Lisp_Object (*list_fonts_func) (/* Lisp_Object pattern, face, frame, width */);
59
60/* Load a font named NAME for frame F and return a pointer to the
61 information of the loaded font. If loading is failed, return 0. */
62struct font_info *(*load_font_func) (/* FRAME_PTR f; char *name */);
63
64/* Return a pointer to struct font_info of a font named NAME for frame F. */
65struct font_info *(*query_font_func) (/* FRAME_PTR f; char *name */);
66
67/* Additional function for setting fontset or changing fontset
68 contents of frame F. */
69void (*set_frame_fontset_func) (/* FRAME_PTR f; Lisp_Object arg, oldval */);
70
71/* Check if any window system is used now. */
72void (*check_window_system_func) ();
73
74struct fontset_data *
75alloc_fontset_data ()
76{
77 struct fontset_data *fontset_data
78 = (struct fontset_data *) xmalloc (sizeof (struct fontset_data));
79
80 bzero (fontset_data, sizeof (struct fontset_data));
81
82 return fontset_data;
83}
84
85void
86free_fontset_data (fontset_data)
87 struct fontset_data *fontset_data;
88{
89 int i;
90
91 for (i = 0; i < fontset_data->n_fontsets; i++)
92 {
93 int j;
94
95 xfree (fontset_data->fontset_table[i]->name);
96 for (j = 0; j < MAX_CHARSET; j++)
97 if (fontset_data->fontset_table[i]->fontname[j])
98 xfree (fontset_data->fontset_table[i]->fontname[j]);
99 xfree (fontset_data->fontset_table[i]);
100 }
101 xfree (fontset_data->fontset_table);
102
103 xfree (fontset_data);
104}
105
106/* Load a font named FONTNAME for displaying CHARSET on frame F.
107 All fonts for frame F is stored in a table pointed by FONT_TABLE.
108 Return a pointer to the struct font_info of the loaded font.
109 If loading fails, return 0;
110 If FONTNAME is NULL, the name is taken from the information of FONTSET.
111 If FONTSET is given, try to load a font whose size matches that of
112 FONTSET, and, the font index is stored in the table for FONTSET. */
113
114struct font_info *
115fs_load_font (f, font_table, charset, fontname, fontset)
116 FRAME_PTR f;
117 struct font_info *font_table;
118 int charset, fontset;
119 char *fontname;
120{
121 Lisp_Object font_list;
122 Lisp_Object list, elt;
123 int font_idx;
124 int size = 0;
125 struct fontset_info *fontsetp = 0;
126 struct font_info *fontp;
127
128 if (fontset >= 0 && fontset < FRAME_FONTSET_DATA (f)->n_fontsets)
129 {
130 fontsetp = FRAME_FONTSET_DATA (f)->fontset_table[fontset];
131 font_idx = fontsetp->font_indexes[charset];
132 if (font_idx >= 0)
133 /* We have already loaded a font. */
134 return font_table + font_idx;
135 else if (font_idx == FONT_NOT_FOUND)
136 /* We have already tried loading a font and failed. */
137 return 0;
138 if (!fontname)
139 fontname = fontsetp->fontname[charset];
140 }
141
142 if (!fontname)
143 /* No way to get fontname. */
144 return 0;
145
146 /* If a fontset is specified and we have already loaded some fonts
147 in the fontset, we need a font of appropriate size to be used
148 with the fonts. */
149 if (fontsetp && fontsetp->size)
150 size = fontsetp->size * CHARSET_WIDTH (charset);
151
152 fontp = (*load_font_func) (f, fontname, size);
153
154 if (!fontp)
155 {
156 if (fontsetp)
157 fontsetp->font_indexes[charset] = FONT_NOT_FOUND;
158 return 0;
159 }
160
161 /* Fill in fields (CHARSET, ENCODING, and FONT_ENCODER) which are
162 not set by (*load_font_func). */
163 fontp->charset = charset;
164
165 if (fontp->encoding[1] >= 0)
166 {
167 /* The font itself tells which code points to be used. Use this
168 encoding for all other charsets. */
169 int i;
170
171 fontp->encoding[0] = fontp->encoding[1];
172 for (i = MIN_CHARSET_OFFICIAL_DIMENSION1; i < MAX_CHARSET; i++)
173 fontp->encoding[i] = fontp->encoding[1];
174 }
175 else
176 {
177 /* The font itself doesn't tell which code points to be used. */
178 int i;
179
180 /* At first, set 1 (means 0xA0..0xFF) as the default. */
181 fontp->encoding[0] = 1;
182 for (i = MIN_CHARSET_OFFICIAL_DIMENSION1; i < MAX_CHARSET; i++)
183 fontp->encoding[i] = 1;
184 /* Then override them by a specification in Vfont_encoding_alist. */
185 for (list = Vfont_encoding_alist; CONSP (list); list = XCONS (list)->cdr)
186 {
187 elt = XCONS (list)->car;
188 if (CONSP (elt)
189 && STRINGP (XCONS (elt)->car) && CONSP (XCONS (elt)->cdr)
190 && (fast_string_match_ignore_case (XCONS (elt)->car, fontname)
191 >= 0))
192 {
193 Lisp_Object tmp;
194
195 for (tmp = XCONS (elt)->cdr; CONSP (tmp); tmp = XCONS (tmp)->cdr)
196 if (CONSP (XCONS (tmp)->car)
197 && INTEGERP (XCONS (XCONS (tmp)->car)->car)
198 && ((i = get_charset_id (XCONS (XCONS (tmp)->car)->car))
199 >= 0)
200 && INTEGERP (XCONS (XCONS (tmp)->car)->cdr)
201 && XFASTINT (XCONS (XCONS (tmp)->car)->cdr) < 4)
202 fontp->encoding[i]
203 = XFASTINT (XCONS (XCONS (tmp)->car)->cdr);
204 }
205 }
206 }
207
208 fontp->font_encoder = (struct ccl_program *) 0;
209 for (list = Vfont_ccl_encoder_alist; CONSP (list); list = XCONS (list)->cdr)
210 {
211 elt = XCONS (list)->car;
212 if (CONSP (elt)
213 && STRINGP (XCONS (elt)->car) && VECTORP (XCONS (elt)->cdr)
214 && fast_string_match_ignore_case (XCONS (elt)->car, fontname) >= 0)
215 {
216 fontp->font_encoder
217 = (struct ccl_program *) xmalloc (sizeof (struct ccl_program));
218 setup_ccl_program (fontp->font_encoder, XCONS (elt)->cdr);
219 break;
220 }
221 }
222
223 if (fontsetp)
224 {
225 fontsetp->font_indexes[charset] = fontp->font_idx;
226 if (fontsetp->size == 0)
227 fontsetp->size = fontp->size / CHARSET_WIDTH (charset);
228
229 if (charset == CHARSET_ASCII
230 && fontsetp->size != fontp->size)
231 {
232 /* When loading ASCII font of the different size from the
233 size of FONTSET, we have to update the size of FONTSET.
234 Since changing the size of FONTSET may make some fonts
235 already loaded inappropriate to be used in FONTSET, we
236 must delete the record of such fonts. In that case, we
237 also have to calculate the height of FONTSET from the
238 remaining fonts. */
239 int i;
240
241 fontsetp->size = fontp->size;
242 fontsetp->height = fontp->height;
243 for (i = CHARSET_ASCII + 1; i < MAX_CHARSET; i++)
244 {
245 font_idx = fontsetp->font_indexes[i];
246 if (font_idx >= 0)
247 {
248 struct font_info *fontp2 = font_table + font_idx;
249
250 if (fontp2->size != fontp->size * CHARSET_WIDTH (i))
251 fontsetp->font_indexes[i] = FONT_NOT_OPENED;
252 else if (fontsetp->height < fontp->height)
253 fontsetp->height = fontp->height;
254 }
255 }
256 }
257 else if (fontsetp->height < fontp->height)
258 fontsetp->height = fontp->height;
259 }
260
261 return fontp;
262}
263
264/* Return ID of the fontset named NAME on frame F. */
265
266int
267fs_query_fontset (f, name)
268 FRAME_PTR f;
269 char *name;
270{
271 struct fontset_data *fontset_data = FRAME_FONTSET_DATA (f);
272 int i;
273
274 for (i = 0; i < fontset_data->n_fontsets; i++)
275 if (!my_strcasecmp(name, fontset_data->fontset_table[i]->name))
276 return i;
277 return -1;
278}
279
280/* Register a fontset specified by FONTSET_INFO for frame FRAME.
281 Return the fontset ID if successfully registered, else return -1.
282 FONTSET_INFO is a cons of name of the fontset and FONTLIST, where
283 FONTLIST is an alist of charsets vs fontnames. */
284
285int
286fs_register_fontset (f, fontset_info)
287 FRAME_PTR f;
288 Lisp_Object fontset_info;
289{
290 struct fontset_data *fontset_data = FRAME_FONTSET_DATA (f);
291 Lisp_Object name, fontlist;
292 int fontset;
293 struct fontset_info *fontsetp;
294 int i;
295
296 if (!CONSP (fontset_info)
297 || !STRINGP (XCONS (fontset_info)->car)
298 || !CONSP (XCONS (fontset_info)->cdr))
299 /* Invalid data in FONTSET_INFO. */
300 return -1;
301
302 name = XCONS (fontset_info)->car;
303 if ((fontset = fs_query_fontset (f, XSTRING (name)->data)) >= 0)
304 /* This fontset already exists on frame F. */
305 return fontset;
306
307 fontsetp = (struct fontset_info *) xmalloc (sizeof (struct fontset_info));
308
309 fontsetp->name = (char *) xmalloc (XSTRING (name)->size + 1);
310 bcopy(XSTRING (name)->data, fontsetp->name, XSTRING (name)->size + 1);
311
312 fontsetp->size = fontsetp->height = 0;
313
314 for (i = 0; i < MAX_CHARSET; i++)
315 {
316 fontsetp->fontname[i] = (char *) 0;
317 fontsetp->font_indexes[i] = FONT_NOT_OPENED;
318 }
319
320 for (fontlist = XCONS (fontset_info)->cdr; CONSP (fontlist);
321 fontlist = XCONS (fontlist)->cdr)
322 {
323 Lisp_Object tem = Fcar (fontlist);
324 int charset;
325
326 if (CONSP (tem)
327 && (charset = get_charset_id (XCONS (tem)->car)) >= 0
328 && STRINGP (XCONS (tem)->cdr))
329 {
330 fontsetp->fontname[charset]
331 = (char *) xmalloc (XSTRING (XCONS (tem)->cdr)->size + 1);
332 bcopy (XSTRING (XCONS (tem)->cdr)->data,
333 fontsetp->fontname[charset],
334 XSTRING (XCONS (tem)->cdr)->size + 1);
335 }
336 else
337 /* Broken or invalid data structure. */
338 return -1;
339 }
340
341 /* Do we need to create the table? */
342 if (fontset_data->fontset_table_size == 0)
343 {
344 fontset_data->fontset_table_size = 8;
345 fontset_data->fontset_table
346 = (struct fontset_info **) xmalloc (fontset_data->fontset_table_size
347 * sizeof (struct fontset_info *));
348 }
349 /* Do we need to grow the table? */
350 else if (fontset_data->n_fontsets >= fontset_data->fontset_table_size)
351 {
352 fontset_data->fontset_table_size += 8;
353 fontset_data->fontset_table
354 = (struct fontset_info **) xrealloc (fontset_data->fontset_table,
355 fontset_data->fontset_table_size
356 * sizeof (struct fontset_info *));
357 }
358 fontset = fontset_data->n_fontsets++;
359 fontset_data->fontset_table[fontset] = fontsetp;
360
361 return fontset;
362}
363
364/* Cache data used by fontset_pattern_regexp. The car part is a
365 pattern string containing at least one wild card, the cdr part is
366 the corresponding regular expression. */
367static Lisp_Object Vcached_fontset_data;
368
369#define CACHED_FONTSET_NAME (XSTRING (XCONS (Vcached_fontset_data)->car)->data)
370#define CACHED_FONTSET_REGEX (XCONS (Vcached_fontset_data)->cdr)
371
372/* If fontset name PATTERN contains any wild card, return regular
373 expression corresponding to PATTERN. */
374
375Lisp_Object
376fontset_pattern_regexp (pattern)
377 Lisp_Object pattern;
378{
379 int nickname = 0;
380
381 if (!index (XSTRING (pattern)->data, '*')
382 && !index (XSTRING (pattern)->data, '?'))
383 /* PATTERN does not contain any wild cards. */
384 {
385 if (XSTRING (pattern)->size > 8
386 && ! bcmp (XSTRING (pattern)->data, "fontset-", 8))
387 /* Just a nickname of a fontset is specified. */
388 nickname = 1;
389 else
390 return Qnil;
391 }
392
393 if (!CONSP (Vcached_fontset_data)
394 || strcmp (XSTRING (pattern)->data, CACHED_FONTSET_NAME))
395 {
396 /* We must at first update the cached data. */
397 char *regex = (char *) alloca (XSTRING (pattern)->size * 2 + 3);
398 char *p0, *p1 = regex;
399
400 if (nickname)
401 {
402 /* Just prepend ".*-" to PATTERN. */
403 *p1++= '.'; *p1++= '*', *p1++= '-';
404 bcopy (XSTRING (pattern)->data, p1, XSTRING (pattern)->size);
405 p1 += XSTRING (pattern)->size;
406 }
407 else
408 {
409 /* Convert "*" to ".*", "?" to ".". */
410 *p1++ = '^';
411 for (p0 = XSTRING (pattern)->data; *p0; p0++)
412 {
413 if (*p0 == '*')
414 {
415 *p1++ = '.';
416 *p1++ = '*';
417 }
418 else if (*p0 == '?')
419 *p1++ == '.';
420 else
421 *p1++ = *p0;
422 }
423 }
424 *p1++ = '$';
425 *p1++ = 0;
426
427 Vcached_fontset_data = Fcons (build_string (XSTRING (pattern)->data),
428 build_string (regex));
429 }
430
431 return CACHED_FONTSET_REGEX;
432}
433
434DEFUN ("query-fontset", Fquery_fontset, Squery_fontset, 1, 1, 0,
435 "Return a fontset name which matches PATTERN, nil if no matching fontset.\n\
436PATTERN can contain `*' or `?' as a wild card\n\
437just like X's font name matching algorithm allows.")
438 (pattern)
439 Lisp_Object pattern;
440{
441 Lisp_Object regexp, tem;
442
443 (*check_window_system_func) ();
444
445 CHECK_STRING (pattern, 0);
446
447 if (XSTRING (pattern)->size == 0)
448 return Qnil;
449
450 regexp = fontset_pattern_regexp (pattern);
451
452 for (tem = Vglobal_fontset_alist; CONSP (tem); tem = XCONS (tem)->cdr)
453 {
454 Lisp_Object fontset_name = XCONS (XCONS (tem)->car)->car;
455 if (!NILP (regexp))
456 {
457 if (fast_string_match_ignore_case (regexp,
458 XSTRING (fontset_name)->data)
459 >= 0)
460 return fontset_name;
461 }
462 else
463 {
464 if (!my_strcasecmp (XSTRING (pattern)->data,
465 XSTRING (fontset_name)->data))
466 return fontset_name;
467 }
468 }
469
470 return Qnil;
471}
472
473Lisp_Object Fframe_char_width ();
474
475/* Return a list of names of available fontsets matching PATTERN on
476 frame F. If SIZE is not 0, it is the size (maximum bound width) of
477 fontsets to be listed. */
478
479Lisp_Object
480list_fontsets (f, pattern, size)
481 FRAME_PTR f;
482 Lisp_Object pattern;
483 int size;
484{
485 int i;
486 Lisp_Object regexp, val;
487
488 regexp = fontset_pattern_regexp (pattern);
489
490 val = Qnil;
491 for (i = 0; i < FRAME_FONTSET_DATA (f)->n_fontsets; i++)
492 {
493 struct fontset_info *fontsetp = FRAME_FONTSET_DATA (f)->fontset_table[i];
494 int name_matched = 0;
495 int size_matched = 0;
496
497 if (!NILP (regexp))
498 {
499 if (fast_string_match_ignore_case (regexp, fontsetp->name) >= 0)
500 name_matched = 1;
501 }
502 else
503 {
504 if (!my_strcasecmp (XSTRING (pattern)->data, fontsetp->name))
505 name_matched = 1;
506 }
507
508 if (name_matched)
509 {
510 if (!size || fontsetp->size == size)
511 size_matched = 1;
512 else if (fontsetp->size == 0)
513 {
514 /* No font of this fontset has loaded yet. Try loading
515 one with SIZE. */
516 int j;
517
518 for (j = 0; j < MAX_CHARSET; j++)
519 if (fontsetp->fontname[j])
520 {
521 if ((*load_font_func) (f, fontsetp->fontname[j], size))
522 size_matched = 1;
523 break;
524 }
525 }
526
527 if (size_matched)
528 val = Fcons (build_string (fontsetp->name), val);
529 }
530 }
531
532 return val;
533}
534
535DEFUN ("new-fontset", Fnew_fontset, Snew_fontset, 2, 2, 0,
536 "Create a new fontset NAME which contains fonts in FONTLIST.\n\
537FONTLIST is an alist of charsets vs corresponding font names.")
538 (name, fontlist)
539 Lisp_Object name, fontlist;
540{
541 Lisp_Object fullname, fontset_info;
542 Lisp_Object tail;
543
544 (*check_window_system_func) ();
545
546 CHECK_STRING (name, 0);
547 CHECK_LIST (fontlist, 1);
548
549 fullname = Fquery_fontset (name);
550 if (!NILP (fullname))
551 error ("Fontset \"%s\" matches the existing fontset \"%s\"",
552 XSTRING (name)->data, XSTRING (fullname)->data);
553
554 /* Check the validity of FONTLIST. */
555 for (tail = fontlist; CONSP (tail); tail = XCONS (tail)->cdr)
556 {
557 Lisp_Object tem = XCONS (tail)->car;
558 int charset;
559
560 if (!CONSP (tem)
561 || (charset = get_charset_id (XCONS (tem)->car)) < 0
562 || !STRINGP (XCONS (tem)->cdr))
563 error ("Elements of fontlist must be a cons of charset and font name");
564 }
565
566 fontset_info = Fcons (name, fontlist);
567 Vglobal_fontset_alist = Fcons (fontset_info, Vglobal_fontset_alist);
568
569 /* Register this fontset for all existing frames. */
570 {
571 Lisp_Object framelist, frame;
572
573 FOR_EACH_FRAME (framelist, frame)
574 if (!FRAME_TERMCAP_P (XFRAME (frame)))
575 fs_register_fontset (XFRAME (frame), fontset_info);
576 }
577
578 return Qnil;
579}
580
581extern Lisp_Object Fframe_parameters ();
582extern Lisp_Object Qfont;
583Lisp_Object Qfontset;
584
585DEFUN ("set-fontset-font", Fset_fontset_font, Sset_fontset_font, 3, 4, 0,
586 "Set FONTNAME for a font of CHARSET in fontset NAME on frame FRAME.\n\
587If FRAME is omitted or nil, all frames are affected.")
588 (name, charset_symbol, fontname, frame)
589 Lisp_Object name, charset_symbol, fontname, frame;
590{
591 int charset;
592 Lisp_Object fullname, fontlist;
593
594 (*check_window_system_func) ();
595
596 CHECK_STRING (name, 0);
597 CHECK_SYMBOL (charset_symbol, 1);
598 CHECK_STRING (fontname, 2);
599 if (!NILP (frame))
600 CHECK_LIVE_FRAME (frame, 3);
601
602 if ((charset = get_charset_id (charset_symbol)) < 0)
603 error ("Invalid charset: %s", XSYMBOL (charset_symbol)->name->data);
604
605 fullname = Fquery_fontset (name);
606 if (NILP (fullname))
607 error ("Fontset \"%s\" does not exist", XSTRING (name)->data);
608
609 /* If FRAME is not specified, we must, at first, update contents of
610 `global-fontset-alist' for a frame created in the future. */
611 if (NILP (frame))
612 {
613 Lisp_Object fontset_info = Fassoc (fullname, Vglobal_fontset_alist);
614 Lisp_Object tem = Fassq (charset, XCONS (fontset_info)->cdr);
615
616 if (NILP (tem))
617 XCONS (fontset_info)->cdr
618 = Fcons (Fcons (charset, fontname), XCONS (fontset_info)->cdr);
619 else
620 XCONS (tem)->cdr = fontname;
621 }
622
623 /* Then, update information in the specified frame or all existing
624 frames. */
625 {
626 Lisp_Object framelist, tem;
627
628 FOR_EACH_FRAME (framelist, tem)
629 if (!FRAME_TERMCAP_P (XFRAME (tem))
630 && (NILP (frame) || EQ (frame, tem)))
631 {
632 FRAME_PTR f = XFRAME (tem);
633 int fontset = fs_query_fontset (f, XSTRING (fullname)->data);
634 struct fontset_info *fontsetp
635 = FRAME_FONTSET_DATA (f)->fontset_table[fontset];
636
637 if (fontsetp->fontname[XINT (charset)])
638 xfree (fontsetp->fontname[XINT (charset)]);
639 fontsetp->fontname[XINT (charset)]
640 = (char *) xmalloc (XSTRING (fontname)->size + 1);
641 bcopy (XSTRING (fontname)->data, fontsetp->fontname[XINT (charset)],
642 XSTRING (fontname)->size + 1);
643 fontsetp->font_indexes[XINT (charset)] = FONT_NOT_OPENED;
644
645 if (charset == CHARSET_ASCII)
646 {
647 Lisp_Object font_param = Fassq (Qfont, Fframe_parameters (tem));
648
649 if (set_frame_fontset_func
650 && !NILP (font_param)
651 && !strcmp (XSTRING (fullname)->data,
652 XSTRING (XCONS (font_param)->cdr)->data))
653 /* This fontset is the default fontset on frame TEM.
654 We may have to resize this frame because of new
655 ASCII font. */
656 (*set_frame_fontset_func) (f, fullname, Qnil);
657 }
658 }
659 }
660
661 return Qnil;
662}
663
664DEFUN ("font-info", Ffont_info, Sfont_info, 1, 2, 0,
665 "Return information about a font named NAME on frame FRAME.\n\
666If FRAME is omitted or nil, use the selected frame.\n\
667The returned value is a vector of OPENED-NAME, FULL-NAME, CHARSET, SIZE,\n\
668 HEIGHT, BASELINE-OFFSET, and RELATIVE-COMPOSE,\n\
669where\n\
670 OPENED-NAME is the name used for opening the font,\n\
671 FULL-NAME is the full name of the font,\n\
672 CHARSET is the charset displayed by the font,\n\
673 SIZE is the minimum bound width of the font,\n\
674 HEIGHT is the height of the font,\n\
675 BASELINE-OFFSET is the upward offset pixels from ASCII baseline,\n\
676 RELATIVE-COMPOSE is the number controlling how to compose characters.\n\
677If the named font is not yet loaded, return nil.")
678 (name, frame)
679 Lisp_Object name, frame;
680{
681 FRAME_PTR f;
682 struct font_info *fontp;
683 Lisp_Object info;
684
685 (*check_window_system_func) ();
686
687 CHECK_STRING (name, 0);
688 if (NILP (frame))
689 f = selected_frame;
690 else
691 {
692 CHECK_LIVE_FRAME (frame, 1);
693 f = XFRAME (frame);
694 }
695
696 if (!query_font_func)
697 error ("Font query function is not supported");
698
699 fontp = (*query_font_func) (f, XSTRING (name)->data);
700 if (!fontp)
701 return Qnil;
702
703 info = Fmake_vector (make_number (6), Qnil);
704
705 XVECTOR (info)->contents[0] = build_string (fontp->name);
706 XVECTOR (info)->contents[1] = build_string (fontp->full_name);
707 XVECTOR (info)->contents[2] = CHARSET_SYMBOL (fontp->charset);
708 XVECTOR (info)->contents[3] = make_number (fontp->size);
709 XVECTOR (info)->contents[4] = make_number (fontp->height);
710 XVECTOR (info)->contents[5] = make_number (fontp->baseline_offset);
711 XVECTOR (info)->contents[6] = make_number (fontp->relative_compose);
712
713 return info;
714}
715
716DEFUN ("fontset-info", Ffontset_info, Sfontset_info, 1, 2, 0,
717 "Return information about a fontset named NAME on frame FRAME.\n\
718If FRAME is omitted or nil, use the selected frame.\n\
719The returned value is a vector of SIZE, HEIGHT, and FONT-LIST,\n\
720where\n\
721 SIZE is the minimum bound width of ASCII font of the fontset,\n\
722 HEIGHT is the height of the tallest font in the fontset, and\n\
723 FONT-LIST is an alist of the format:\n\
724 (CHARSET REQUESTED-FONT-NAME LOADED-FONT-NAME).\n\
725LOADED-FONT-NAME t means the font is not yet loaded, nil means the\n\
726loading failed.")
727 (name, frame)
728 Lisp_Object name, frame;
729{
730 FRAME_PTR f;
731 int fontset;
732 struct fontset_info *fontsetp;
733 Lisp_Object info, val;
734 int i;
735
736 (*check_window_system_func) ();
737
738 CHECK_STRING(name, 0);
739 if (NILP (frame))
740 f = selected_frame;
741 else
742 {
743 CHECK_LIVE_FRAME (frame, 1);
744 f = XFRAME (frame);
745 }
746
747 fontset = fs_query_fontset (f, XSTRING (name)->data);
748 if (fontset < 0)
749 error ("Fontset \"%s\" does not exist", XSTRING (name)->data);
750
751 info = Fmake_vector (make_number (3), Qnil);
752
753 fontsetp = FRAME_FONTSET_DATA (f)->fontset_table[fontset];
754
755 XVECTOR (info)->contents[0] = make_number (fontsetp->size);
756 XVECTOR (info)->contents[1] = make_number (fontsetp->height);
757 val = Qnil;
758 for (i = 0; i < MAX_CHARSET; i++)
759 if (fontsetp->fontname[i])
760 {
761 int font_idx = fontsetp->font_indexes[i];
762 Lisp_Object loaded;
763
764 if (font_idx == FONT_NOT_OPENED)
765 loaded = Qt;
766 else if (font_idx == FONT_NOT_FOUND)
767 loaded = Qnil;
768 else
769 loaded
770 = build_string ((*get_font_info_func) (f, font_idx)->full_name);
771 val = Fcons (Fcons (CHARSET_SYMBOL (i),
772 Fcons (build_string (fontsetp->fontname[i]),
773 Fcons (loaded, Qnil))),
774 val);
775 }
776 XVECTOR (info)->contents[2] = val;
777 return info;
778}
779
780syms_of_fontset ()
781{
782 int i;
783
784 for (i = 0; i < 256; i++)
785 my_strcasetbl[i] = (i >= 'A' && i <= 'Z') ? i + 'a' - 'A' : i;
786
787 if (!load_font_func)
788 /* Window system initializer should have set proper functions. */
789 abort ();
790
791 staticpro (&Qfontset);
792
793 Vcached_fontset_data = Qnil;
794 staticpro (&Vcached_fontset_data);
795
796 DEFVAR_LISP ("global-fontset-alist", &Vglobal_fontset_alist,
797 "Internal data for fontset. Not for external use.\n\
798This is an alist associating fontset names with the lists of fonts\n\
799 contained in them.\n\
800Newly created frames make their own fontset database from here.");
801 Vglobal_fontset_alist = Qnil;
802
803 DEFVAR_LISP ("font-encoding-alist", &Vfont_encoding_alist,
804 "Alist of fontname patterns vs corresponding encoding info.\n\
805Each element looks like (REGEXP . ENCODING-INFO),\n\
806 where ENCODING-INFO is an alist of CHARSET vs ENCODING.\n\
807ENCODING is one of the following integer values:\n\
808 0: code points 0x20..0x7F or 0x2020..0x7F7F are used,\n\
809 1: code points 0xA0..0xFF or 0xA0A0..0xFFFF are used,\n\
810 2: code points 0x20A0..0x7FFF are used,\n\
811 3: code points 0xA020..0xFF7F are used.");
812 Vfont_encoding_alist = Qnil;
813
814 defsubr (&Squery_fontset);
815 defsubr (&Snew_fontset);
816 defsubr (&Sset_fontset_font);
817 defsubr (&Sfont_info);
818 defsubr (&Sfontset_info);
819}
diff --git a/src/fontset.h b/src/fontset.h
new file mode 100644
index 00000000000..902f1691d6f
--- /dev/null
+++ b/src/fontset.h
@@ -0,0 +1,201 @@
1/* Header for fontset handler.
2 Ver.1.0
3
4 Copyright (C) 1995 Free Software Foundation, Inc.
5 Copyright (C) 1995 Electrotechnical Laboratory, JAPAN.
6
7 This program is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2, or (at your option)
10 any later version.
11
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with this program; if not, write to the Free Software
19 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
20
21#ifndef _FONTSET_H
22#define _FONTSET_H
23
24/*
25
26#define GENERIC_FONT_PTR void
27
28/* This data type is used for the font_table field of window system
29 depending data area (e.g. struct x_display_info on X window). */
30
31struct font_info
32{
33 /* Pointer to window system dependent font structure. On X window,
34 this value should be coerced to (XFontStruct *). */
35 void *font;
36
37 /* Index number of the font. */
38 int font_idx;
39
40 /* Name to be used to find the font. */
41 char *name;
42
43 /* Full name of the font given by a window system. */
44 char *full_name;
45
46 /* Charset of characters displayed by the font. */
47 int charset;
48
49 /* Maximum bound width over all existing characters of the font. On
50 X window, this is same as (font->max_bounds.width) */
51 int size;
52
53 /* Height of the font. On X window, this is same as (font->ascent
54 + font->descent). */
55 int height;
56
57 /* Encodings of the font indexed by CHARSET. The value an integer
58 0, 1, 2, or 3:
59 0: code points 0x20..0x7F or 0x2020..0x7F7F are used
60 1: code points 0xA0..0xFF or 0xA0A0..0xFFFF are used
61 2: code points 0x20A0..0x7FFF are used
62 3: code points 0xA020..0xFF7F are used
63 For instance, ASCII and Latin-1 characters may use the same font
64 but different code points (ASCII uses 0x20..0x7F and Latin-1 uses
65 0xA0..0xFF).
66
67 If the value can't be decided from information of the font, we
68 consult `font-encoding-alist' to get of the corresponding charset
69 whose default value is defined in lisp/fontset.el. Since there's
70 no charset whose id is 1, we use encoding[1] to store the
71 encoding information decided by the font itself. */
72 char encoding[MAX_CHARSET];
73
74 /* The baseline position of a font is normally `ascent' value of the
75 font. However, there exists many fonts which don't set `ascent'
76 an appropriate value to be used as baseline position. This is
77 typical in such ASCII fonts which are designed to be used with
78 Chinese, Japanese, Korean characters. When we use mixture of
79 such fonts and normal fonts (having correct `ascent' value), a
80 display line gets very ugly. Since we have no way to fix it
81 automatically, it is users responsibility to supply well designed
82 fonts or correct `ascent' value of fonts. But, the latter
83 requires heavy work (modifying all bitmap data in BDF files).
84 So, Emacs accepts a private font property
85 `_MULE_BASELINE_OFFSET'. If a font has this property, we
86 calculate the baseline position by subtracting the value from
87 `ascent'. In other words, the value indicates how many bits
88 higher we should draw a character of the font than normal ASCII
89 text for a better looking.
90
91 We also have to consider the fact that the concept of `baseline'
92 differs among languages to which each character belongs. For
93 instance, baseline should be at the bottom most position of all
94 glyphs for Chinese, Japanese, and Korean. But, many of existing
95 fonts for those characters doesn't have correct `ascent' values
96 because they are designed to be used with ASCII fonts. To
97 display characters of different language on the same line, the
98 best way will be to arrange them in the middle of the line. So,
99 in such a case, again, we utilize the font property
100 `_MULE_BASELINE_OFFSET'. If the value is larger than `ascent' we
101 calculate baseline so that a character is arranged in the middle
102 of a line. */
103
104 int baseline_offset;
105
106 /* Non zero means a character should be composed at a position
107 relative to the height (or depth) of previous glyphs in the
108 following cases:
109 (1) The bottom of the character is higher than this value. In
110 this case, the character is drawn above the previous glyphs.
111 (2) The top of the character is lower than 0 (i.e. baseline
112 height). In this case, the character is drawn beneath the
113 previous glyphs.
114
115 This value is take from a private font property
116 `_MULE_RELATIVE_COMPOSE' which is introduced by Emacs. */
117 int relative_compose;
118
119 /* CCL program to calculate code points of the font. */
120 struct ccl_program *font_encoder;
121};
122
123#define FONT_NOT_OPENED -1
124#define FONT_NOT_FOUND -2
125
126struct fontset_info
127{
128 /* Name of the fontset. */
129 char *name;
130
131 /* Size of the fontset. This is the same as the size of ASCII font
132 of this fontset. */
133 int size;
134
135 /* Height of the tallest font in the fontset. */
136 int height;
137
138 /* Table of font name for each character set. */
139 char *fontname[MAX_CHARSET];
140
141 /* Table of index numbers of fonts indexed by charset. If a font is
142 not yet loaded, the value is -1 (FONT_NOT_OPENED). If font
143 loading is failed, the value is -2 (FONT_NOT_FOUND). */
144 int font_indexes[MAX_CHARSET];
145};
146
147/* This data type is used for the fontset_data field of struct frame. */
148
149struct fontset_data
150{
151 /* A table of pointers to all the fontsets. */
152 struct fontset_info **fontset_table;
153
154 /* The current capacity of fontset_table. */
155 int fontset_table_size;
156
157 /* The number of fontsets actually stored in fontset_table.
158 fontset_table[n] is used and valid iff 0 <= n < n_fontsets.
159 0 <= n_fontsets <= fontset_table_size. */
160 int n_fontsets;
161};
162
163/* The following six are window system dependent functions.
164 Initialization routine of each window system should set appropriate
165 functions to these variables. For instance, in case of X window,
166 x_term_init does this. */
167
168/* Return a pointer to struct font_info of font FONT_IDX of frame F. */
169extern struct font_info *(*get_font_info_func) (/* FRAME_PTR f;
170 int font_idx */);
171
172/* Return a list of font names which matches PATTERN. See the document of
173 `x-list-fonts' for more detail. */
174extern Lisp_Object (*list_fonts_func) (/* Lisp_Object pattern, face, frame,
175 width */);
176
177/* Load a font named NAME for frame F and return a pointer to the
178 information of the loaded font. If loading is failed, return -1. */
179extern struct font_info *(*load_font_func) (/* FRAME_PTR f; char *name */);
180
181/* Return a pointer to struct font_info of a font named NAME for frame F.
182 If no such font is loaded, return NULL. */
183extern struct font_info *(*query_font_func) (/* FRAME_PTR f; char *name */);
184
185/* Additional function for setting fontset or changing fontset
186 contents of frame F. This function may change the coordinate of
187 the frame. */
188extern void (*set_frame_fontset_func) (/* FRAME_PTR f; Lisp_Object arg, oldval */);
189
190/* Check if any window system is used now. */
191extern void (*check_window_system_func) ();
192
193extern struct fontset_data *alloc_fontset_data ();
194extern void free_fontset_data ();
195extern struct font_info *fs_load_font ();
196extern Lisp_Object list_fontsets ();
197extern Lisp_Object Vglobal_fontset_alist;
198
199extern Lisp_Object Qfontset;
200
201#endif /* _FONTSET_H */