diff options
| author | Dave Love | 2002-07-24 23:01:32 +0000 |
|---|---|---|
| committer | Dave Love | 2002-07-24 23:01:32 +0000 |
| commit | fc2938d180aa5f1bb9ba918173daa6c45eadf000 (patch) | |
| tree | 4811a5787dc9cf73722639973dabbf8becd65908 | |
| parent | e65186d5b293d718c927e179a64103e661b24c95 (diff) | |
| download | emacs-fc2938d180aa5f1bb9ba918173daa6c45eadf000.tar.gz emacs-fc2938d180aa5f1bb9ba918173daa6c45eadf000.zip | |
*** empty log message ***
| -rw-r--r-- | etc/NEWS | 52 | ||||
| -rw-r--r-- | etc/PROBLEMS | 112 | ||||
| -rw-r--r-- | lisp/ChangeLog | 28 | ||||
| -rw-r--r-- | lisp/international/utf-16.el | 288 |
4 files changed, 420 insertions, 60 deletions
| @@ -110,13 +110,48 @@ now look at the character after point. If a face or faces are | |||
| 110 | specified for that character, the commands by default customize those | 110 | specified for that character, the commands by default customize those |
| 111 | faces. | 111 | faces. |
| 112 | 112 | ||
| 113 | ** New language environments: French, Cyrillic-KOI8-U, Windows-1251, | ||
| 114 | Cyrillic-KOI8-T, Bulgarian, Belarusian, Ukrainian, UTF-8, | ||
| 115 | Windows-1255, Welsh, Latin-7, Lithuanian, Latvian. | ||
| 116 | |||
| 117 | ** New input methods: latin-alt-postfix, latin-postfix, latin-prefix, | ||
| 118 | ukrainian-computer, belarusian, bulgarian-bds, russian-computer, | ||
| 119 | vietnamese-telex, lithuanian-numeric, lithuanian-keyboard, | ||
| 120 | latvian-keyboard, welsh, georgian, rfc1345, ucs, sgml, | ||
| 121 | bulgarian-phonetic, dutch. | ||
| 122 | |||
| 123 | ** Many new coding systems are available by loading the `code-pages' | ||
| 124 | library. These include complete versions of most of those in | ||
| | 125 | codepage.el, based on Unicode mappings. | ||
| 126 | |||
| 127 | ** The utf-8 coding system has been enhanced. Untranslatable utf-8 | ||
| 128 | sequences (mostly representing CJK characters) are composed into | ||
| 129 | single quasi-characters. By loading the library utf-8-subst, you can | ||
| 130 | arrange to translate many utf-8 CJK character sequences into real | ||
| 131 | Emacs characters in a similar way to the Mule-UCS system. The utf-8 | ||
| 132 | coding system will now encode characters from most of Emacs's | ||
| 133 | one-dimensional internal charsets, specifically the ISO-8859 ones. | ||
| 134 | |||
| 135 | ** New command `ucs-insert' inserts a character specified by its | ||
| 136 | Unicode. | ||
| 137 | |||
| 113 | +++ | 138 | +++ |
| 114 | ** Limited support for charset unification has been added. | 139 | ** Limited support for character unification has been added. |
| 115 | By default, Emacs now knows how to translate latin-N chars between their | 140 | Emacs now knows how to translate Latin-N chars between their charset |
| 116 | charset and some other latin-N charset or unicode. You can force a | 141 | and some other Latin-N charset or Unicode. By default this |
| 117 | more complete unification by calling (unify-8859-on-decoding-mode 1). | 142 | translation will happen automatically on encoding. Quail input |
| 118 | That maps all the Latin-N character sets into either Latin-1 | 143 | methods use the translations to make the input conformant with the |
| 119 | or Unicode characters. | 144 | encoding of the buffer in which it's being used where possible. |
| 145 | |||
| 146 | You can force a more complete unification with the user option | ||
| 147 | unify-8859-on-decoding-mode. That maps all the Latin-N character sets | ||
| 148 | into Unicode characters (from the latin-iso8859-1 and | ||
| 149 | mule-unicode-0100-24ff charsets) on decoding. | ||
| 150 | |||
| 151 | ** There is support for decoding Greek and Cyrillic characters into | ||
| 152 | either Unicode (the mule-unicode charsets) or the iso-8859 charsets, | ||
| 153 | when possible. The latter are more space-efficient. This is | ||
| 154 | controlled by user option utf-8-fragment-on-decoding. | ||
| 120 | 155 | ||
| 121 | --- | 156 | --- |
| 122 | ** The scrollbar under LessTif or Motif has a smoother drag-scrolling. | 157 | ** The scrollbar under LessTif or Motif has a smoother drag-scrolling. |
| @@ -940,6 +975,9 @@ mode-lines in inverse-video. | |||
| 940 | 975 | ||
| 941 | * Lisp Changes in Emacs 21.4 | 976 | * Lisp Changes in Emacs 21.4 |
| 942 | 977 | ||
| 978 | ** New CCL functions `lookup-character' and `lookup-integer' access | ||
| 979 | hash tables defined by the Lisp function `define-translation-hash-table'. | ||
| 980 | |||
| 943 | ** There is a new Warnings facility; see the functions `warn' | 981 | ** There is a new Warnings facility; see the functions `warn' |
| 944 | and `display-warning'. | 982 | and `display-warning'. |
| 945 | 983 | ||
| @@ -10825,7 +10863,7 @@ select one of those items. | |||
| 10825 | ---------------------------------------------------------------------- | 10863 | ---------------------------------------------------------------------- |
| 10826 | Copyright information: | 10864 | Copyright information: |
| 10827 | 10865 | ||
| 10828 | Copyright (C) 1999, 2000, 2001 Free Software Foundation, Inc. | 10866 | Copyright (C) 1999, 2000, 2001, 2002 Free Software Foundation, Inc. |
| 10829 | 10867 | ||
| 10830 | Permission is granted to anyone to make or distribute verbatim copies | 10868 | Permission is granted to anyone to make or distribute verbatim copies |
| 10831 | of this document as received, in any medium, provided that the | 10869 | of this document as received, in any medium, provided that the |
diff --git a/etc/PROBLEMS b/etc/PROBLEMS index e4bc7f4af98..42727efbaba 100644 --- a/etc/PROBLEMS +++ b/etc/PROBLEMS | |||
| @@ -1,6 +1,50 @@ | |||
| 1 | This file describes various problems that have been encountered | 1 | This file describes various problems that have been encountered |
| 2 | in compiling, installing and running GNU Emacs. | 2 | in compiling, installing and running GNU Emacs. |
| 3 | 3 | ||
| 4 | |||
| 5 | * Mule-UCS loads very slowly. | ||
| 6 | |||
| 7 | Changes to Emacs internals interact badly with Mule-UCS's `un-define' | ||
| 8 | library, which is the usual interface to Mule-UCS. Apply the | ||
| 9 | following patch to Mule-UCS 0.84 and rebuild it. That will help, | ||
| 10 | though loading will still be slower than in Emacs 20. (Some | ||
| 11 | distributions, such as Debian, may already have applied such a patch.) | ||
| 12 | |||
| 13 | --- lisp/un-define.el 6 Mar 2001 22:41:38 -0000 1.30 | ||
| 14 | +++ lisp/un-define.el 19 Apr 2002 18:34:26 -0000 | ||
| 15 | @@ -610,13 +624,21 @@ by calling post-read-conversion and pre- | ||
| 16 | |||
| 17 | (mapcar | ||
| 18 | (lambda (x) | ||
| 19 | - (mapcar | ||
| 20 | - (lambda (y) | ||
| 21 | - (mucs-define-coding-system | ||
| 22 | - (nth 0 y) (nth 1 y) (nth 2 y) | ||
| 23 | - (nth 3 y) (nth 4 y) (nth 5 y) (nth 6 y)) | ||
| 24 | - (coding-system-put (car y) 'alias-coding-systems (list (car x)))) | ||
| 25 | - (cdr x))) | ||
| 26 | + (if (fboundp 'register-char-codings) | ||
| 27 | + ;; Mule 5, where we don't need the eol-type specified and | ||
| 28 | + ;; register-char-codings may be very slow for these coding | ||
| 29 | + ;; system definitions. | ||
| 30 | + (let ((y (cadr x))) | ||
| 31 | + (mucs-define-coding-system | ||
| 32 | + (car x) (nth 1 y) (nth 2 y) | ||
| 33 | + (nth 3 y) (nth 4 y) (nth 5 y))) | ||
| 34 | + (mapcar | ||
| 35 | + (lambda (y) | ||
| 36 | + (mucs-define-coding-system | ||
| 37 | + (nth 0 y) (nth 1 y) (nth 2 y) | ||
| 38 | + (nth 3 y) (nth 4 y) (nth 5 y) (nth 6 y)) | ||
| 39 | + (coding-system-put (car y) 'alias-coding-systems (list (car x))))) | ||
| 40 | + (cdr x))) | ||
| 41 | `((utf-8 | ||
| 42 | (utf-8-unix | ||
| 43 | ?u "UTF-8 coding system" | ||
| 44 | |||
| 45 | Note that Emacs has native support for Unicode, roughly equivalent to | ||
| 46 | Mule-UCS's, so you may not need it. | ||
| 47 | |||
| 4 | * Building Emacs with GCC 2.9x fails in the `src' directory. | 48 | * Building Emacs with GCC 2.9x fails in the `src' directory. |
| 5 | 49 | ||
| 6 | This may happen if you use a development version of GNU `cpp' from one | 50 | This may happen if you use a development version of GNU `cpp' from one |
| @@ -115,7 +159,9 @@ should now succeed. | |||
| 115 | * JPEG images aren't displayed. | 159 | * JPEG images aren't displayed. |
| 116 | 160 | ||
| 117 | This has been reported when Emacs is built with jpeg-6a library. | 161 | This has been reported when Emacs is built with jpeg-6a library. |
| 118 | Upgrading to jpeg-6b solves the problem. | 162 | Upgrading to jpeg-6b solves the problem. Configure checks for the |
| 163 | correct version, but this problem could occur if a binary built | ||
| 164 | against a shared libjpeg is run on a system with an older version. | ||
| 119 | 165 | ||
| 120 | * Building `ctags' for MS-Windows with the MinGW port of GCC fails. | 166 | * Building `ctags' for MS-Windows with the MinGW port of GCC fails. |
| 121 | 167 | ||
| @@ -386,14 +432,13 @@ ought to recognize the Windows language-change event and set up the | |||
| 386 | appropriate keyboard encoding automatically, but it doesn't do that | 432 | appropriate keyboard encoding automatically, but it doesn't do that |
| 387 | yet.) | 433 | yet.) |
| 388 | 434 | ||
| 389 | Multilingual text put into the Windows clipboard by other Windows | 435 | Windows uses UTF-16 encoding to deal with multilingual text (text not |
| 390 | applications cannot be safely pasted into Emacs (as of v21.2). This | 436 | encodable in the `system codepage') in the clipboard. To deal with |
| 391 | is because Windows uses Unicode to represent multilingual text, but | 437 | this, load the library `utf-16' and use `set-selection-coding-system' |
| 392 | Emacs does not yet support Unicode well enough to decode it. This | 438 | to set the clipboard coding system to `utf-16-le-dos'. This won't |
| 393 | means that Emacs can only interchange non-ASCII text with other | 439 | cope with Far Eastern (`CJK') text; if necessary, install the Mule-UCS |
| 394 | Windows programs if the characters are in the system codepage. | 440 | package (see etc/MORE.STUFF), whose `utf-16-le-dos' coding system does |
| 395 | Reportedly, a partial solution is to install the Mule-UCS package and | 441 | encode a lot of CJK characters. |
| 396 | set selection-coding-system to utf-16-le-dos. | ||
| 397 | 442 | ||
| 398 | The %b specifier for format-time-string does not produce abbreviated | 443 | The %b specifier for format-time-string does not produce abbreviated |
| 399 | month names with consistent widths for some locales on some versions | 444 | month names with consistent widths for some locales on some versions |
| @@ -492,10 +537,9 @@ src/s/hpux10.h. | |||
| 492 | 537 | ||
| 493 | * Crashes when displaying GIF images in Emacs built with version | 538 | * Crashes when displaying GIF images in Emacs built with version |
| 494 | libungif-4.1.0 are resolved by using version libungif-4.1.0b1. | 539 | libungif-4.1.0 are resolved by using version libungif-4.1.0b1. |
| 495 | 540 | Configure checks for the correct version, but this problem could occur | |
| 496 | Beginning with version 21.3, Emacs refuses to link against libungif | 541 | if a binary built against a shared libungif is run on a system with an |
| 497 | whose version is 4.1.0 or older (the `configure' script behaves as if | 542 | older version. |
| 498 | libungif were not available at all). | ||
| 499 | 543 | ||
| 500 | * Font Lock displays portions of the buffer in incorrect faces. | 544 | * Font Lock displays portions of the buffer in incorrect faces. |
| 501 | 545 | ||
| @@ -596,9 +640,8 @@ this problem by putting this in your `.emacs' file: | |||
| 596 | 640 | ||
| 597 | (setq ange-ftp-ftp-program-args '("-i" "-n" "-g" "-v" "--prompt" "")) | 641 | (setq ange-ftp-ftp-program-args '("-i" "-n" "-g" "-v" "--prompt" "")) |
| 598 | 642 | ||
| 599 | * Some versions of the W3 package released before Emacs 21.1 don't run | 643 | * Versions of the W3 package released before Emacs 21.1 don't run |
| 600 | properly with Emacs 21. These problems are fixed in W3 version | 644 | under Emacs 21. This is fixed in W3 version 4.0pre.47. |
| 601 | 4.0pre.47. | ||
| 602 | 645 | ||
| 603 | * On AIX, if linking fails because libXbsd isn't found, check if you | 646 | * On AIX, if linking fails because libXbsd isn't found, check if you |
| 604 | are compiling with the system's `cc' and CFLAGS containing `-O5'. If | 647 | are compiling with the system's `cc' and CFLAGS containing `-O5'. If |
| @@ -633,43 +676,6 @@ Version 1 of OpenLDAP is now deprecated. If you are still using it, | |||
| 633 | please upgrade to version 2. As a temporary workaround, remove | 676 | please upgrade to version 2. As a temporary workaround, remove |
| 634 | argument "-x" from the variable `ldap-ldapsearch-args'. | 677 | argument "-x" from the variable `ldap-ldapsearch-args'. |
| 635 | 678 | ||
| 636 | * Unicode characters are not unified with other Mule charsets. | ||
| 637 | |||
| 638 | As of v21.1, Emacs charsets are still not unified. This means that | ||
| 639 | characters which belong to charsets such as Latin-2, Greek, Hebrew, | ||
| 640 | etc. and the same characters in the `mule-unicode-*' charsets are | ||
| 641 | different characters, as far as Emacs is concerned. For example, text | ||
| 642 | which includes Unicode characters from the Latin-2 locale cannot be | ||
| 643 | encoded by Emacs with ISO 8859-2 coding system; and if you yank Greek | ||
| 644 | text from a buffer whose buffer-file-coding-system is greek-iso-8bit | ||
| 645 | into a mule-unicode-0100-24ff buffer, Emacs won't be able to save that | ||
| 646 | buffer neither as ISO 8859-7 nor as UTF-8. | ||
| 647 | |||
| 648 | To work around this, install some add-on package such as Mule-UCS. | ||
| 649 | |||
| 650 | * Problems when using Emacs with UTF-8 locales | ||
| 651 | |||
| 652 | Some systems, including recent versions of GNU/Linux, have terminals | ||
| 653 | or X11 subsystems that can be configured to provide Unicode/UTF-8 | ||
| 654 | input and display. Normally, such a system sets environment variables | ||
| 655 | such as LANG, LC_CTYPE, or LC_ALL to a string which ends with a | ||
| 656 | `.UTF-8'. For example, a system like this in a French locale might | ||
| 657 | use `fr_FR.UTF-8' as the value of LANG. | ||
| 658 | |||
| 659 | Since Unicode support in Emacs, as of v21.1, is not yet complete (see | ||
| 660 | the previous entry in this file), UTF-8 support is not enabled by | ||
| 661 | default, even in UTF-8 locales. Thus, some Emacs features, such as | ||
| 662 | non-ASCII keyboard input, might appear to be broken in these locales. | ||
| 663 | To solve these problems, you need to turn on some options in your | ||
| 664 | `.emacs' file. Specifically, the following customizations should make | ||
| 665 | Emacs work correctly with UTF-8 input and text: | ||
| 666 | |||
| 667 | (setq locale-coding-system 'utf-8) | ||
| 668 | (set-terminal-coding-system 'utf-8) | ||
| 669 | (set-keyboard-coding-system 'utf-8) | ||
| 670 | (set-selection-coding-system 'utf-8) | ||
| 671 | (prefer-coding-system 'utf-8) | ||
| 672 | |||
| 673 | * The `oc-unicode' package doesn't work with Emacs 21. | 679 | * The `oc-unicode' package doesn't work with Emacs 21. |
| 674 | 680 | ||
| 675 | This package tries to define more private charsets than there are free | 681 | This package tries to define more private charsets than there are free |
diff --git a/lisp/ChangeLog b/lisp/ChangeLog index 8d834cbcd83..91cc8035728 100644 --- a/lisp/ChangeLog +++ b/lisp/ChangeLog | |||
| @@ -1,3 +1,31 @@ | |||
| 1 | 2002-07-24 Dave Love <fx@gnu.org> | ||
| 2 | |||
| 3 | * international/mule.el (set-auto-coding): Doc fix. | ||
| 4 | |||
| 5 | * international/utf-16.el: New file. | ||
| 6 | |||
| 7 | * language/european.el ("German", "French", "Spanish", "Turkish"): | ||
| 8 | Add alternative coding systems. | ||
| 9 | ("Dutch"): Likewise. Add input method. | ||
| 10 | ("Welsh", "Latin-7"): Add nonascii-translation. | ||
| 11 | |||
| 12 | * language/georgian.el ("Georgian"): Add nonascii-translation. | ||
| 13 | |||
| 14 | * international/titdic-cnv.el: Doc fixes. | ||
| 15 | (tit-process-header): Add coding cookie. | ||
| 16 | (titdic-convert): Force writing as iso-2022-7bit. | ||
| 17 | |||
| 18 | * international/ja-dic-cnv.el (skkdic-convert): Add coding cookie. | ||
| 19 | |||
| 20 | * international/mule-cmds.el: Doc fixes. | ||
| 21 | (unencodable-char-position): New. | ||
| 22 | (select-safe-coding-system): Use it to indicate problematic | ||
| 23 | characters and add extra explanation. Avoid checking auto-coding | ||
| 24 | for compressed files. | ||
| 25 | (leim-list-header): Add coding cookie. | ||
| 26 | (input-method-verbose-flag): Modify :type. | ||
| 27 | (locale-language-names): Add bs, wa. Modify cy. | ||
| 28 | |||
| 1 | 2002-07-24 Richard M. Stallman <rms@gnu.org> | 29 | 2002-07-24 Richard M. Stallman <rms@gnu.org> |
| 2 | 30 | ||
| 3 | * emacs-lisp/bytecomp.el (byte-compile-log-warning): | 31 | * emacs-lisp/bytecomp.el (byte-compile-log-warning): |
diff --git a/lisp/international/utf-16.el b/lisp/international/utf-16.el new file mode 100644 index 00000000000..947fb336318 --- /dev/null +++ b/lisp/international/utf-16.el | |||
| @@ -0,0 +1,288 @@ | |||
| 1 | ;;; utf-16.el --- UTF-16 encoding/decoding | ||
| 2 | |||
| 3 | ;; Copyright (C) 2001, 2002 Free Software Foundation, Inc. | ||
| 4 | |||
| 5 | ;; Author: Dave Love <fx@gnu.org> | ||
| 6 | ;; Keywords: Unicode, UTF-16, i18n | ||
| 7 | |||
| 8 | ;; This file is part of GNU Emacs. | ||
| 9 | |||
| 10 | ;; GNU Emacs is free software; you can redistribute it and/or modify | ||
| 11 | ;; it under the terms of the GNU General Public License as published by | ||
| 12 | ;; the Free Software Foundation; either version 2, or (at your option) | ||
| 13 | ;; any later version. | ||
| 14 | |||
| 15 | ;; GNU Emacs is distributed in the hope that it will be useful, | ||
| 16 | ;; but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 17 | ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
| 18 | ;; GNU General Public License for more details. | ||
| 19 | |||
| 20 | ;; You should have received a copy of the GNU General Public License | ||
| 21 | ;; along with GNU Emacs; see the file COPYING. If not, write to the | ||
| 22 | ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
| 23 | ;; Boston, MA 02111-1307, USA. | ||
| 24 | |||
| 25 | ;;; Commentary: | ||
| 26 | |||
| 27 | ;; Support for UTF-16, which is a two-byte encoding (modulo | ||
| 28 | ;; surrogates) of Unicode, written either in little or big endian | ||
| 29 | ;; order: coding-systems `mule-utf-16-le' and `mule-utf-16-be'. | ||
| 30 | ;; (utf-16-le is used by the DozeN'T clipboard, for instance.) The | ||
| | 31 | ;; data are preceded by a two-byte signature which identifies their | ||
| 32 | ;; byte sex. These are used by the coding-category-utf-16-{b,l}e code | ||
| 33 | ;; to identify the coding, but ignored on decoding. | ||
| 34 | |||
| 35 | ;; Note that un-decodable sequences aren't (yet?) preserved as raw | ||
| 36 | ;; bytes, as they are with utf-8, so reading and writing as utf-16 can | ||
| 37 | ;; corrupt data. | ||
| 38 | |||
| 39 | ;;; Code: | ||
| 40 | |||
| 41 | ;; We end up with trivially different -le and -be versions of most | ||
| 42 | ;; things below, sometimes with commonality abstracted into a let | ||
| 43 | ;; binding for maintenance convenience. | ||
| 44 | |||
| 45 | ;; We'd need new charsets distinct from ascii and eight-bit-control to | ||
| 46 | ;; deal with untranslated sequences, since we can't otherwise | ||
| 47 | ;; distinguish the bytes, as we can with utf-8. | ||
| 48 | |||
| 49 | ;; ;; Do a multibyte write for bytes in r3 and r4. | ||
| 50 | ;; ;; Intended for untranslatable utf-16 sequences. | ||
| 51 | ;; (define-ccl-program ccl-mule-utf-16-untrans | ||
| 52 | ;; `(0 | ||
| 53 | ;; (if (r3 < 128) | ||
| 54 | ;; (r0 = ,(charset-id 'ascii)) | ||
| 55 | ;; (if (r3 < 160) | ||
| 56 | ;; (r0 = ,(charset-id 'eight-bit-control)) | ||
| 57 | ;; (r0 = ,(charset-id 'eight-bit-graphic)))) | ||
| 58 | ;; (if (r4 < 128) | ||
| 59 | ;; (r0 = ,(charset-id 'ascii)) | ||
| 60 | ;; (if (r4 < 160) | ||
| 61 | ;; (r0 = ,(charset-id 'eight-bit-control)) | ||
| 62 | ;; (r0 = ,(charset-id 'eight-bit-graphic)))) | ||
| 63 | ;; (r1 = r4))) | ||
| 64 | ;; "Do a multibyte write for bytes in r3 and r4. | ||
| 65 | ;; First swap them if we're big endian, indicated by r5==0. | ||
| 66 | ;; Intended for untranslatable utf-16 sequences.") | ||
| 67 | |||
| 68 | ;; Needed in macro expansion, so can't be let-bound. Zapped after use. | ||
| 69 | (eval-and-compile | ||
| 70 | (defconst utf-16-decode-ucs | ||
| 71 | ;; We have the unicode in r1. Output is character codes in r0, r1, | ||
| 72 | ;; and r2 if appropriate. | ||
| 73 | `((lookup-integer utf-8-subst-table r0 r3) | ||
| 74 | (if r7 (r1 = r3)) ; got a translation | ||
| 75 | (if (r1 < 128) | ||
| 76 | (r0 = ,(charset-id 'ascii)) | ||
| 77 | (if (r1 < 160) | ||
| 78 | (r0 = ,(charset-id 'eight-bit-control)) | ||
| 79 | (if (r1 < 256) | ||
| 80 | ((r0 = ,(charset-id 'latin-iso8859-1)) | ||
| 81 | (r1 -= 128)) | ||
| 82 | (if (r1 < #x2500) | ||
| 83 | ((r0 = ,(charset-id 'mule-unicode-0100-24ff)) | ||
| 84 | (r1 -= #x100) | ||
| 85 | (r2 = (((r1 / 96) + 32) << 7)) | ||
| 86 | (r1 %= 96) | ||
| 87 | (r1 += (r2 + 32))) | ||
| 88 | (if (r1 < #x3400) | ||
| 89 | ((r0 = ,(charset-id 'mule-unicode-2500-33ff)) | ||
| 90 | (r1 -= #x2500) | ||
| 91 | (r2 = (((r1 / 96) + 32) << 7)) | ||
| 92 | (r1 %= 96) | ||
| 93 | (r1 += (r2 + 32))) | ||
| 94 | (if (r1 < #xd800) ; 2 untranslated bytes | ||
| 95 | ;; ;; Assume this is rare, so don't worry about the | ||
| 96 | ;; ;; overhead of the call. | ||
| 97 | ;; (call mule-utf-16-untrans) | ||
| 98 | ((r0 = ,(charset-id 'mule-unicode-e000-ffff)) | ||
| 99 | (r1 = 15037)) ; U+fffd | ||
| 100 | (if (r1 < #xe000) ; surrogate | ||
| 101 | ;; ((call mule-utf-16-untrans) | ||
| 102 | ;; (write-multibyte-character r0 r1) | ||
| 103 | ;; (read r3 r4) | ||
| 104 | ;; (call mule-utf-16-untrans)) | ||
| 105 | ((read r3 r4) | ||
| 106 | (r0 = ,(charset-id 'mule-unicode-e000-ffff)) | ||
| 107 | (r1 = 15037)) | ||
| 108 | ((r0 = ,(charset-id 'mule-unicode-e000-ffff)) | ||
| 109 | (r1 -= #xe000) | ||
| 110 | (r2 = (((r1 / 96) + 32) << 7)) | ||
| 111 | (r1 %= 96) | ||
| 112 | (r1 += (r2 + 32))))))))))))) | ||
| 113 | |||
| 114 | (define-ccl-program ccl-decode-mule-utf-16-le | ||
| 115 | `(2 ; 2 bytes -> 1 to 4 bytes | ||
| 116 | ((read r0 r1) ; signature | ||
| 117 | (loop | ||
| 118 | (read r3 r4) | ||
| 119 | (r1 = (r4 <8 r3)) | ||
| 120 | ,utf-16-decode-ucs | ||
| 121 | (translate-character utf-8-translation-table-for-decode r0 r1) | ||
| 122 | (write-multibyte-character r0 r1) | ||
| 123 | (repeat)))) | ||
| 124 | "Decode little endian UTF-16 (ignoring signature bytes). | ||
| 125 | Basic decoding is done into the charsets ascii, latin-iso8859-1 and | ||
| 126 | mule-unicode-*. Un-representable Unicode characters are | ||
| 127 | decoded as U+fffd. The result is run through translation table | ||
| 128 | `utf-8-translation-table-for-decode' if that is defined.") | ||
| 129 | |||
| 130 | (define-ccl-program ccl-decode-mule-utf-16-be | ||
| 131 | `(2 ; 2 bytes -> 1 to 4 bytes | ||
| 132 | ((read r0 r1) ; signature | ||
| 133 | (loop | ||
| 134 | (read r3 r4) | ||
| 135 | (r1 = (r3 <8 r4)) | ||
| 136 | ,utf-16-decode-ucs | ||
| 137 | (translate-character utf-8-translation-table-for-decode r0 r1) | ||
| 138 | (write-multibyte-character r0 r1) | ||
| 139 | (repeat)))) | ||
| 140 | "Decode big endian UTF-16 (ignoring signature bytes). | ||
| 141 | Basic decoding is done into the charsets ascii, latin-iso8859-1 and | ||
| 142 | mule-unicode-*. Un-representable Unicode characters are | ||
| 143 | decoded as U+fffd. The result is run through translation table | ||
| | 144 | `utf-8-translation-table-for-decode'.") | ||
| 145 | |||
| 146 | (makunbound 'utf-16-decode-ucs) ; done with it | ||
| 147 | |||
| 148 | (eval-and-compile | ||
| 149 | (defconst utf-16-decode-to-ucs | ||
| 150 | ;; CCL which, given the result of a multibyte read in r0 and r1, | ||
| 151 | ;; sets r0 to the character's Unicode if the charset is one of the | ||
| 152 | ;; basic utf-8 coding system ones. Otherwise set to U+fffd. | ||
| 153 | `(if (r0 == ,(charset-id 'ascii)) | ||
| 154 | (r0 = r1) | ||
| 155 | (if (r0 == ,(charset-id 'latin-iso8859-1)) | ||
| 156 | (r0 = (r1 + 128)) | ||
| 157 | (if (r0 == ,(charset-id 'eight-bit-control)) | ||
| 158 | (r0 = r1) | ||
| 159 | (if (r0 == ,(charset-id 'eight-bit-graphic)) | ||
| 160 | (r0 = r1) | ||
| 161 | ((r2 = (r1 & #x7f)) | ||
| 162 | (r1 >>= 7) | ||
| 163 | (r3 = ((r1 - 32) * 96)) | ||
| 164 | (r3 += (r2 - 32)) | ||
| 165 | (if (r0 == ,(charset-id 'mule-unicode-0100-24ff)) | ||
| 166 | (r0 = (r3 + #x100)) | ||
| 167 | (if (r0 == ,(charset-id 'mule-unicode-2500-33ff)) | ||
| 168 | (r0 = (r3 + #x2500)) | ||
| 169 | (if (r0 == ,(charset-id 'mule-unicode-e000-ffff)) | ||
| 170 | (r0 = (r3 + #xe000)) | ||
| 171 | (r0 = #xfffd))))))))))) | ||
| 172 | |||
| 173 | (define-ccl-program ccl-encode-mule-utf-16-le | ||
| 174 | `(1 | ||
| 175 | ((write #xff) | ||
| 176 | (write #xfe) | ||
| 177 | (loop | ||
| 178 | (read-multibyte-character r0 r1) | ||
| 179 | (translate-character ucs-mule-to-mule-unicode r0 r1) | ||
| 180 | ,utf-16-decode-to-ucs | ||
| 181 | (write (r0 & 255)) | ||
| 182 | (write (r0 >> 8)) | ||
| 183 | (repeat)))) | ||
| 184 | "Encode to little endian UTF-16 with signature. | ||
| 185 | Characters from the charsets ascii, eight-bit-control, | ||
| 186 | eight-bit-graphic, latin-iso8859-1 and mule-unicode-* are encoded | ||
| 187 | after translation through the table `ucs-mule-to-mule-unicode'. | ||
| 188 | Others are encoded as U+FFFD.") | ||
| 189 | |||
| 190 | (define-ccl-program ccl-encode-mule-utf-16-be | ||
| 191 | `(1 | ||
| 192 | ((write #xfe) | ||
| 193 | (write #xff) | ||
| 194 | (loop | ||
| 195 | (read-multibyte-character r0 r1) | ||
| 196 | (translate-character ucs-mule-to-mule-unicode r0 r1) | ||
| 197 | ,utf-16-decode-to-ucs | ||
| 198 | (write (r0 >> 8)) | ||
| 199 | (write (r0 & 255)) | ||
| 200 | (repeat)))) | ||
| 201 | "Encode to big endian UTF-16 with signature. | ||
| 202 | Characters from the charsets ascii, eight-bit-control, | ||
| 203 | eight-bit-graphic, latin-iso8859-1 and mule-unicode-* are encoded | ||
| 204 | after translation through the table `ucs-mule-to-mule-unicode'. | ||
| 205 | Others are encoded as U+FFFD.") | ||
| 206 | |||
| 207 | (makunbound 'utf-16-decode-to-ucs) | ||
| 208 | |||
| 209 | (defun utf-16-le-pre-write-conversion (beg end) | ||
| 210 | "Semi-dummy pre-write function effectively to autoload ucs-tables." | ||
| 211 | ;; Ensure translation table is loaded. | ||
| 212 | (require 'ucs-tables) | ||
| 213 | ;; Don't do this again. | ||
| 214 | (coding-system-put 'mule-utf-16-le 'pre-write-conversion nil) | ||
| 215 | nil) | ||
| 216 | |||
| 217 | (defun utf-16-be-pre-write-conversion (beg end) | ||
| 218 | "Semi-dummy pre-write function effectively to autoload ucs-tables." | ||
| 219 | ;; Ensure translation table is loaded. | ||
| 220 | (require 'ucs-tables) | ||
| 221 | ;; Don't do this again. | ||
| 222 | (coding-system-put 'mule-utf-16-be 'pre-write-conversion nil) | ||
| 223 | nil) | ||
| 224 | |||
| 225 | (let ((doc " | ||
| 226 | |||
| 227 | Assumes and ignores the leading two-byte signature. | ||
| 228 | |||
| 229 | The supported Emacs character sets are the following, plus others | ||
| 230 | which may be included in the translation table | ||
| 231 | `ucs-mule-to-mule-unicode': | ||
| 232 | ascii | ||
| 233 | eight-bit-control | ||
| 234 | latin-iso8859-1 | ||
| 235 | mule-unicode-0100-24ff | ||
| 236 | mule-unicode-2500-33ff | ||
| 237 | mule-unicode-e000-ffff | ||
| 238 | |||
| 239 | Note that Unicode characters out of the ranges U+0000-U+33FF and | ||
| | 240 | U+E000-U+FFFF are decoded as U+FFFD, effectively corrupting the data | ||
| 241 | if they are re-encoded. Emacs characters without Unicode conversions | ||
| 242 | are encoded as U+FFFD.")) | ||
| 243 | (make-coding-system | ||
| 244 | 'mule-utf-16-le 4 | ||
| 245 | ?u ; Mule-UCS uses ?U, but code-pages uses that for koi8-u. | ||
| 246 | (concat | ||
| 247 | "Little endian UTF-16 encoding for Emacs-supported Unicode characters." | ||
| 248 | doc) | ||
| 249 | |||
| 250 | '(ccl-decode-mule-utf-16-le . ccl-encode-mule-utf-16-le) | ||
| 251 | '((safe-charsets | ||
| 252 | ascii | ||
| 253 | eight-bit-control | ||
| 254 | latin-iso8859-1 | ||
| 255 | mule-unicode-0100-24ff | ||
| 256 | mule-unicode-2500-33ff | ||
| 257 | mule-unicode-e000-ffff) | ||
| 258 | (mime-charset . utf-16le) | ||
| 259 | (coding-category . coding-category-utf-16-le) | ||
| 260 | (valid-codes (0 . 255)) | ||
| 261 | (pre-write-conversion . utf-16-le-pre-write-conversion))) | ||
| 262 | |||
| 263 | (make-coding-system | ||
| 264 | 'mule-utf-16-be 4 ?u | ||
| 265 | (concat | ||
| 266 | "Big endian UTF-16 encoding for Emacs-supported Unicode characters." | ||
| 267 | doc) | ||
| 268 | |||
| 269 | '(ccl-decode-mule-utf-16-be . ccl-encode-mule-utf-16-be) | ||
| 270 | '((safe-charsets | ||
| 271 | ascii | ||
| 272 | eight-bit-control | ||
| 273 | latin-iso8859-1 | ||
| 274 | mule-unicode-0100-24ff | ||
| 275 | mule-unicode-2500-33ff | ||
| 276 | mule-unicode-e000-ffff) | ||
| 277 | (mime-charset . utf-16be) | ||
| 278 | (coding-category . coding-category-utf-16-be) | ||
| 279 | (valid-codes (0 . 255)) | ||
| 280 | (pre-write-conversion . utf-16-be-pre-write-conversion))) | ||
| 281 | ) | ||
| 282 | |||
| 283 | (define-coding-system-alias 'utf-16-le 'mule-utf-16-le) | ||
| 284 | (define-coding-system-alias 'utf-16-be 'mule-utf-16-be) | ||
| 285 | |||
| 286 | (provide 'utf-16) | ||
| 287 | |||
| 288 | ;;; utf-16.el ends here | ||