(ctext-non-standard-encodings-alist):

Renamed from non-standard-icccm-encodings-alist. (ctext-non-standard-encodings-regexp): New variable (ctext-post-read-conversion): Full rewrite. (ctext-non-standard-designations-alist): Renamed from non-standard-designations-alist. (ctext-pre-write-conversion): Full rewrite.
author: Kenichi Handa 2003-05-29 01:28:24 +0000
committer: Kenichi Handa 2003-05-29 01:28:24 +0000
commit: cc9269031d5a7bf5e6e61835610658148bda683e (patch)
tree: fa1ce18b8206ab9d5aee6956a94e640f0bafc28c
parent: 0651bdbbb8ef5b5c664eea562160d91d9c78c488 (diff)
download: emacs-cc9269031d5a7bf5e6e61835610658148bda683e.tar.gz
emacs-cc9269031d5a7bf5e6e61835610658148bda683e.zip
1 files changed, 93 insertions, 127 deletions
diff --git a/lisp/international/mule.el b/lisp/international/mule.el
index 17f47376f29..40fd3107a91 100644
--- a/lisp/international/mule.el
+++ b/lisp/international/mule.el
@@ -1316,108 +1316,73 @@ ARG is a list of coding categories ordered by priority."
 ;;; X selections
-(defvar non-standard-icccm-encodings-alist
+(defvar ctext-non-standard-encodings-alist
  '(("ISO8859-15" . latin-iso8859-15)
    ("ISO8859-14" . latin-iso8859-14)
    ("KOI8-R" . koi8-r)
    ("BIG5-0" . big5))
-  "Alist of font charset names defined by XLFD.
+  "Alist of non-standard encoding names vs Emacs coding systems.
-The cdr of each element is the corresponding Emacs charset or coding system.")
+This alist is used to decode an extened segment of a compound text.")
+(defvar ctext-non-standard-encodings-regexp
+  (string-to-multibyte
+   (concat
+    ;; For non-standard encodings.
+    "\\(\e%/[0-4][\200-\377][\200-\377]\\([^\002]+\\)\002\\)"
+    "\\|"
+    ;; For UTF-8 encoding.
+    "\\(\e%G[^\e]*\e%@\\)")))
 ;; Functions to support "Non-Standard Character Set Encodings" defined
 ;; by the COMPOUND-TEXT spec.
-;; We support that by converting the leading sequence of the
+;; We support that by decoding the whole data by `ctext' which just
-;; ``extended segment'' to the corresponding ISO-2022 sequences (if
+;; pertains byte sequences belonging to ``extended segment'', then
-;; the leading sequence names an Emacs charset), or decode the segment
+;; decoding those byte sequences one by one in Lisp.
-;; (if it names a coding system).  Encoding does the reverse.
 ;; This function also supports "The UTF-8 encoding" described in the
 ;; section 7 of the documentation fo COMPOUND-TEXT distributed with
 ;; XFree86.
 (defun ctext-post-read-conversion (len)
  "Decode LEN characters encoded as Compound Text with Extended Segments."
-  (buffer-disable-undo) ; minimize consing due to insertions and deletions
-  (narrow-to-region (point) (+ (point) len))
  (save-match-data
-    (let ((pt (point-marker))
+    (save-restriction
-          (oldpt (point-marker))
+      (let ((case-fold-search nil)
-          (newpt (make-marker))
+            (in-workbuf (string= (buffer-name) " *code-converting-work*"))
-          (modified-p (buffer-modified-p))
+            last-coding-system-used
-          (case-fold-search nil)
+            pos bytes)
-          ;; We need multibyte conversion of "TO" type because the
+        (or in-workbuf
-          ;; buffer may be multibyte, and, in that case, the pattern
+            (narrow-to-region (point) (+ (point) len)))
-          ;; must contain eight-bit-control/graphic characters.
+        (decode-coding-region (point-min) (point-max) 'ctext)
-          (pattern (string-to-multibyte "\\(\e\\)%/[0-4]\\([\200-\377][\200-\377]\\)\\([^\002]+\\)\002\\|\e%G[^\e]+\e%@"))
+        (if in-workbuf
-          last-coding-system-used
+            (set-buffer-multibyte t))
-          encoding textlen chset)
+        (while (re-search-forward ctext-non-standard-encodings-regexp
-      (while (re-search-forward pattern nil 'move)
+                                  nil 'move)
-        (set-marker newpt (point))
+          (setq pos (match-beginning 0))
-        (set-marker pt (match-beginning 0))
+          (if (match-beginning 1)
-        (if (= (preceding-char) ?@)
+              ;; ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
-            ;; We found embedded utf-8 sequence.
+              (let* ((M (char-after (+ pos 4)))
-            (progn
+                     (L (char-after (+ pos 5)))
-              (delete-char -3)          ; delete ESC % @ at the tail
+                     (encoding (match-string 2))
-              (goto-char pt)
+                     (coding (or (cdr (assoc-ignore-case 
-              (delete-char 3)           ; delete ESC % G at the head
+                                       encoding
-              (if (> pt oldpt)
+                                       ctext-non-standard-encodings-alist))
-                  (decode-coding-region oldpt pt 'ctext-no-compositions))
+                                 (coding-system-p
-              (decode-coding-region pt newpt 'mule-utf-8)
+                                  (intern (downcase encoding))))))
-              (goto-char newpt)
+                (setq bytes (- (+ (* (- M 128) 128) (- L 128))
-              (set-marker oldpt newpt))
+                               (- (point) (+ pos 6))))
-          (setq encoding (match-string 3))
+                (when coding
-          (setq textlen (- (+ (* (- (aref (match-string 2) 0) 128) 128)
+                  (delete-region pos (point))
-                              (- (aref (match-string 2) 1) 128))
+                  (forward-char bytes)
-                           (1+ (length encoding))))
+                  (decode-coding-region (- (point) bytes) (point) coding)))
-          (setq
+            ;; ESC % G --UTF-8-BYTES-- ESC % @
-           chset (cdr (assoc-ignore-case encoding
+            (setq bytes (- (point) pos))
-                                         non-standard-icccm-encodings-alist)))
+            (decode-coding-region (- (point) bytes) (point) 'utf-8))))
-          (cond ((null chset)
+      (goto-char (point-min))
-                 ;; This charset is not supported--leave this extended
+      (- (point-max) (point)))))
-                 ;; segment unaltered and skip over it.
-                 (goto-char (+ (point) textlen)))
-                ((charsetp chset)
-                 ;; If it's a charset, replace the leading escape sequence
-                 ;; with a standard ISO-2022 sequence.  We will decode all
-                 ;; such segments later, in one go, when we exit the loop
-                 ;; or find an extended segment that names a coding
-                 ;; system, not a charset.
-                 (replace-match
-                  (concat "\\1"
-                          (if (= 0 (charset-iso-graphic-plane chset))
-                              ;; GL charsets
-                              (if (= 1 (charset-dimension chset)) "(" "$(")
-                            ;; GR charsets
-                            (if (= 96 (charset-chars chset))
-                                "-"
-                              (if (= 1 (charset-dimension chset)) ")" "$)")))
-                          (string (charset-iso-final-char chset)))
-                  t)
-                 (goto-char (+ (point) textlen)))
-                ((coding-system-p chset)
-                 ;; If it's a coding system, we need to decode the segment
-                 ;; right away.  But first, decode what we've skipped
-                 ;; across until now.
-                 (when (> pt oldpt)
-                   (decode-coding-region oldpt pt 'ctext-no-compositions))
-                 (delete-region pt newpt)
-                 (set-marker newpt (+ newpt textlen))
-                 (decode-coding-region pt newpt chset)
-                 (goto-char newpt)
-                 (set-marker oldpt newpt)))))
-      ;; Decode what's left.
-      (when (> (point) oldpt)
-        (decode-coding-region oldpt (point) 'ctext-no-compositions))
-      ;; This buffer started as unibyte, because the string we get from
-      ;; the X selection is a unibyte string.  We must now make it
-      ;; multibyte, so that the decoded text is inserted as multibyte
-      ;; into its buffer.
-      (set-buffer-multibyte t)
-      (set-buffer-modified-p modified-p)
-      (- (point-max) (point-min)))))
 ;; If you add charsets here, be sure to modify the regexp used by
 ;; ctext-pre-write-conversion to look up non-standard charsets.
-(defvar non-standard-designations-alist
+(defvar ctext-non-standard-designations-alist
  '(("$(0" . (big5 "big5-0" 2))
    ("$(1" . (big5 "big5-0" 2))
    ;; The following are actually standard; generating extended
@@ -1449,44 +1414,47 @@ of octets per character is variable), 1, 2, 3, or 4.")
  "Encode characters between FROM and TO as Compound Text w/Extended Segments.
 If FROM is a string, or if the current buffer is not the one set up for us
-by run_pre_post_conversion_on_str, generate a new temp buffer, insert the
+by encode-coding-string, generate a new temp buffer, insert the
 text, and convert it in the temporary buffer.  Otherwise, convert in-place."
-  (cond ((and (string= (buffer-name) " *code-converting-work*")
-              (not (stringp from)))
-         ; Minimize consing due to subsequent insertions and deletions.
-         (buffer-disable-undo)
-         (narrow-to-region from to))
-        (t
-         (let ((buf (current-buffer)))
-           (set-buffer (generate-new-buffer " *temp"))
-           (buffer-disable-undo)
-           (if (stringp from)
-               (insert from)
-             (insert-buffer-substring buf from to))
-           (setq from (point-min) to (point-max)))))
-  (encode-coding-region from to 'ctext-no-compositions)
-  ;; Replace ISO-2022 charset designations with extended segments, for
-  ;; those charsets that are not part of the official X registry.
  (save-match-data
-    (goto-char (point-min))
+    ;; Setup a working buffer if necessary.
-    (let ((newpt (make-marker))
+    (cond ((stringp from)
-          (case-fold-search nil)
+           (let ((buf (current-buffer)))
-          pt desig encode-info encoding chset noctets textlen)
+             (set-buffer (generate-new-buffer " *temp"))
-      (set-buffer-multibyte nil)
+             (set-buffer-multibyte (multibyte-string-p from))
-      ;; The regexp below finds the leading sequences for big5.
+             (insert from)))
+          ((not (string= (buffer-name) " *code-converting-work*"))
+           (let ((buf (current-buffer))
+                 (multibyte enable-multibyte-characters))
+             (set-buffer (generate-new-buffer " *temp"))
+             (set-buffer-multibyte multibyte)
+             (insert-buffer-substring buf from to))))
+    ;; Now we can encode the whole buffer.
+    (let ((case-fold-search nil)
+          last-coding-system-used
+          pos posend desig encode-info encoding chset noctets textlen)
+      (goto-char (point-min))
+      ;; At first encode the whole buffer.
+      (encode-coding-region (point-min) (point-max) 'ctext-no-compositions)
+      ;; Then replace ISO-2022 charset designations with extended
+      ;; segments, for those charsets that are not part of the
+      ;; official X registry.  The regexp below finds the leading
+      ;; sequences for big5.
      (while (re-search-forward "\e\\(\$([01]\\)" nil 'move)
-        (setq desig (match-string 1)
+        (setq pos (match-beginning 0)
-              pt (point-marker)
+              posend (point)
-              encode-info (cdr (assoc desig non-standard-designations-alist))
+              desig (match-string 1)
+              encode-info (cdr (assoc desig
+                                      ctext-non-standard-designations-alist))
              encoding (car encode-info)
              chset (cadr encode-info)
              noctets (car (cddr encode-info)))
        (skip-chars-forward "^\e")
-        (set-marker newpt (point))
        (cond
         ((eq encoding t)  ; only the leading sequence needs to be changed
-          (setq textlen (+ (- newpt pt) (length chset) 1))
+          (setq textlen (+ (- (point) posend) (length chset) 1))
-          ;; Generate the ICCCM control sequence for an extended segment.
+          ;; Generate the control sequence for an extended segment.
          (replace-match (format "\e%%/%d%c%c%s"
                                 noctets
                                 (+ (/ textlen 128) 128)
@@ -1494,20 +1462,18 @@ text, and convert it in the temporary buffer.  Otherwise, convert in-place."
                                 chset)
                         t t))
         ((coding-system-p encoding) ; need to recode the entire segment...
-          (set-marker pt (match-beginning 0))
+          (decode-coding-region pos (point) 'ctext-no-compositions)
-          (decode-coding-region pt newpt 'ctext-no-compositions)
+          (encode-coding-region pos (point) encoding)
-          (set-buffer-multibyte t)
-          (encode-coding-region pt newpt encoding)
          (set-buffer-multibyte nil)
-          (setq textlen (+ (- newpt pt) (length chset) 1))
+          (setq textlen (+ (- (point) pos) (length chset) 1))
-          (goto-char pt)
+          (save-excursion
-          (insert (format "\e%%/%d%c%c%s"
+            (goto-char pos)
-                          noctets
+            (insert (format "\e%%/%d%c%c%s"
-                          (+ (/ textlen 128) 128)
+                            noctets
-                          (+ (% textlen 128) 128)
+                            (+ (/ textlen 128) 128)
-                          chset))))
+                            (+ (% textlen 128) 128)
-        (goto-char newpt))))
+                            chset))))))
-  (set-buffer-multibyte t)
+      (goto-char (point-min))))
  ;; Must return nil, as build_annotations_2 expects that.
  nil)
author	Kenichi Handa	2003-05-29 01:28:24 +0000
committer	Kenichi Handa	2003-05-29 01:28:24 +0000
commit	cc9269031d5a7bf5e6e61835610658148bda683e (patch)
tree	fa1ce18b8206ab9d5aee6956a94e640f0bafc28c
parent	0651bdbbb8ef5b5c664eea562160d91d9c78c488 (diff)
download	emacs-cc9269031d5a7bf5e6e61835610658148bda683e.tar.gz emacs-cc9269031d5a7bf5e6e61835610658148bda683e.zip