(non-standard-icccm-encodings-alist, non-standard-designations-alist): New

variables. (ctext-post-read-conversion, ctext-pre-write-conversion): New functions.
author: Eli Zaretskii 2002-02-22 13:44:21 +0000
committer: Eli Zaretskii 2002-02-22 13:44:21 +0000
commit: 835cbadb3fa1792c6fb9716b502f7502d018f57b (patch)
tree: 978743f3f71b1c1b1acef71763478cd4f1f691da
parent: 23e16093ecd087847f494aaebbf58816415eed06 (diff)
download: emacs-835cbadb3fa1792c6fb9716b502f7502d018f57b.tar.gz
emacs-835cbadb3fa1792c6fb9716b502f7502d018f57b.zip
1 files changed, 155 insertions, 0 deletions
diff --git a/lisp/international/mule.el b/lisp/international/mule.el
index 124761c0d2a..9bb9c4bf5cc 100644
--- a/lisp/international/mule.el
+++ b/lisp/international/mule.el
@@ -1284,6 +1284,161 @@ ARG is a list of coding categories ordered by priority."
    (setq coding-category-list (append arg current-list))
    (set-coding-priority-internal)))
+;;; X selections
+(defvar non-standard-icccm-encodings-alist
+  '(("ISO8859-15" . latin-iso8859-15)
+    ("ISO8859-14" . latin-iso8859-14)
+    ("KOI8-R" . koi8-r)
+    ("BIG5-0" . big5))
+  "Alist of font charset names defined by XLFD, and the corresponding Emacs
+charsets or coding systems.")
+;; Functions to support "Non-Standard Character Set Encodings" defined
+;; by the ICCCM spec.  We support that by converting the leading
+;; sequence of the ``extended segment'' to the corresponding ISO-2022
+;; sequences (if the leading sequence names an Emacs charset), or decode
+;; the segment (if it names a coding system).  Encoding does the reverse.
+(defun ctext-post-read-conversion (len)
+  "Decode LEN characters encoded as Compound Text with Extended Segments."
+  (buffer-disable-undo) ; minimize consing due to insertions and deletions
+  (narrow-to-region (point) (+ (point) len))
+  (save-match-data
+    (let ((pt (point-marker))
+          (oldpt (point-marker))
+          (newpt (make-marker))
+          (modified-p (buffer-modified-p))
+          (case-fold-search nil)
+          last-coding-system-used
+          encoding textlen chset)
+      (while (re-search-forward
+              "\\(\e\\)%/[0-4]\\([\200-\377][\200-\377]\\)\\([^\002]+\\)\002"
+              nil 'move)
+        (set-marker newpt (point))
+        (set-marker pt (match-beginning 0))
+        (setq encoding (match-string 3))
+        (setq textlen (- (+ (* (- (aref (match-string 2) 0) 128) 128)
+                            (- (aref (match-string 2) 1) 128))
+                         (1+ (length encoding))))
+        (setq
+         chset (cdr (assoc-ignore-case encoding
+                                       non-standard-icccm-encodings-alist)))
+        (cond ((null chset)
+               ;; This charset is not supported--leave this extended
+               ;; segment unaltered and skip over it.
+               (goto-char (+ (point) textlen)))
+              ((charsetp chset)
+             ;; If it's a charset, replace the leading escape sequence
+             ;; with a standard ISO-2022 sequence.  We will decode all
+              ;; such segments later, in one go, when we exit the loop
+               ;; or find an extended segment that names a coding
+               ;; system, not a charset.
+               (replace-match
+                (concat "\\1"
+                        (if (= 0 (charset-iso-graphic-plane chset))
+                            ;; GL charsets
+                            (if (= 1 (charset-dimension chset)) "(" "$(")
+                          ;; GR charsets
+                          (if (= 96 (charset-chars chset))
+                              "-"
+                            (if (= 1 (charset-dimension chset)) ")" "$)")))
+                        (string (charset-iso-final-char chset)))
+                t)
+               (goto-char (+ (point) textlen)))
+              ((coding-system-p chset)
+             ;; If it's a coding system, we need to decode the segment
+               ;; right away.  But first, decode what we've skipped
+               ;; across until now.
+               (when (> pt oldpt)
+                 (decode-coding-region oldpt pt 'ctext-no-compositions))
+               (delete-region pt newpt)
+               (set-marker newpt (+ newpt textlen))
+               (decode-coding-region pt newpt chset)
+               (goto-char newpt)
+               (set-marker oldpt newpt))))
+      ;; Decode what's left.
+      (when (> (point) oldpt)
+        (decode-coding-region oldpt (point) 'ctext-no-compositions))
+     ;; This buffer started as unibyte, because the string we get from
+      ;; the X selection is a unibyte string.  We must now make it
+      ;; multibyte, so that the decoded text is inserted as multibyte
+      ;; into its buffer.
+      (set-buffer-multibyte t)
+      (set-buffer-modified-p modified-p)
+      (- (point-max) (point-min)))))
+(defvar non-standard-designations-alist
+  '(("$(0" . (big5 "big5-0" 2))
+    ("$(1" . (big5 "big5-0" 2))
+    ("-V"  . (t "iso8859-10" 1))
+    ("-Y"  . (t "iso8859-13" 1))
+    ("-_"  . (t "iso8859-14" 1))
+    ("-b"  . (t "iso8859-15" 1))
+    ("-f"  . (t "iso8859-16" 1)))
+  "Alist of ctext control sequences that introduce character sets which
+are not in the list of approved ICCCM encodings, and the corresponding
+coding system, identifier string, and number of octets per encoded
+character.
+Each element has the form (CTLSEQ . (ENCODING CHARSET NOCTETS)).  CTLSEQ
+is the control sequence (sans the leading ESC) that introduces the character
+set in the text encoded by compound-text.  ENCODING is a coding system
+symbol; if it is t, it means that the ctext coding system already encodes
+the text correctly, and only the leading control sequence needs to be altered.
+If ENCODING is a coding system, we need to re-encode the text with that
+coding system.  CHARSET is the ICCCM name of the charset we need to put into
+the leading control sequence.  NOCTETS is the number of octets (bytes) that
+encode each character in this charset.  NOCTETS can be 0 (meaning the number
+of octets per character is variable), 1, 2, 3, or 4.")
+(defun ctext-pre-write-conversion (from to)
+  "Encode characters between FROM and TO as Compound Text w/Extended Segments."
+  (buffer-disable-undo) ; minimize consing due to insertions and deletions
+  (narrow-to-region from to)
+  (encode-coding-region from to 'ctext-no-compositions)
+  ;; Replace ISO-2022 charset designations with extended segments, for
+  ;; those charsets that are not part of the official X registry.
+  (save-match-data
+    (goto-char (point-min))
+    (let ((newpt (make-marker))
+          (case-fold-search nil)
+          pt desig encode-info encoding chset noctets textlen)
+      (set-buffer-multibyte nil)
+      (while (re-search-forward "\e\\(\$([01]\\|-[VY_bf]\\)" nil 'move)
+        (setq desig (match-string 1)
+              pt (point-marker)
+              encode-info (cdr (assoc desig non-standard-designations-alist))
+              encoding (car encode-info)
+              chset (cadr encode-info)
+              noctets (car (cddr encode-info)))
+        (skip-chars-forward "^\e")
+        (set-marker newpt (point))
+        (cond
+         ((eq encoding t)  ; only the leading sequence needs to be changed
+          (setq textlen (+ (- newpt pt) (length chset) 1))
+          (replace-match (format "\e%%/%d%c%c%s"
+                                 noctets
+                                 (+ (/ textlen 128) 128)
+                                 (+ (% textlen 128) 128)
+                                 chset)
+                         t t))
+         ((coding-system-p encoding) ; need to recode the entire segment...
+          (set-marker pt (match-beginning 0))
+          (decode-coding-region pt newpt 'ctext-no-compositions)
+          (set-buffer-multibyte t)
+          (encode-coding-region pt newpt encoding)
+          (set-buffer-multibyte nil)
+          (setq textlen (+ (- newpt pt) (length chset) 1))
+          (goto-char pt)
+          (insert (format "\e%%/%d%c%c%s"
+                          noctets
+                          (+ (/ textlen 128) 128)
+                          (+ (% textlen 128) 128)
+                          chset))))
+        (goto-char newpt))))
+  (set-buffer-multibyte t)
+  nil)
 ;;; FILE I/O
 (defcustom auto-coding-alist
author	Eli Zaretskii	2002-02-22 13:44:21 +0000
committer	Eli Zaretskii	2002-02-22 13:44:21 +0000
commit	835cbadb3fa1792c6fb9716b502f7502d018f57b (patch)
tree	978743f3f71b1c1b1acef71763478cd4f1f691da
parent	23e16093ecd087847f494aaebbf58816415eed06 (diff)
download	emacs-835cbadb3fa1792c6fb9716b502f7502d018f57b.tar.gz emacs-835cbadb3fa1792c6fb9716b502f7502d018f57b.zip

diff --git a/lisp/international/mule.el b/lisp/international/mule.el index 124761c0d2a..9bb9c4bf5cc 100644 --- a/lisp/international/mule.el +++ b/lisp/international/mule.el
@@ -1284,6 +1284,161 @@ ARG is a list of coding categories ordered by priority."
1284	(setq coding-category-list (append arg current-list))	1284	(setq coding-category-list (append arg current-list))
1285	(set-coding-priority-internal)))	1285	(set-coding-priority-internal)))
1286		1286
		1287	;;; X selections
		1288
		1289	(defvar non-standard-icccm-encodings-alist
		1290	'(("ISO8859-15" . latin-iso8859-15)
		1291	("ISO8859-14" . latin-iso8859-14)
		1292	("KOI8-R" . koi8-r)
		1293	("BIG5-0" . big5))
		1294	"Alist of font charset names defined by XLFD, and the corresponding Emacs
		1295	charsets or coding systems.")
		1296
		1297	;; Functions to support "Non-Standard Character Set Encodings" defined
		1298	;; by the ICCCM spec. We support that by converting the leading
		1299	;; sequence of the ``extended segment'' to the corresponding ISO-2022
		1300	;; sequences (if the leading sequence names an Emacs charset), or decode
		1301	;; the segment (if it names a coding system). Encoding does the reverse.
		1302	(defun ctext-post-read-conversion (len)
		1303	"Decode LEN characters encoded as Compound Text with Extended Segments."
		1304	(buffer-disable-undo) ; minimize consing due to insertions and deletions
		1305	(narrow-to-region (point) (+ (point) len))
		1306	(save-match-data
		1307	(let ((pt (point-marker))
		1308	(oldpt (point-marker))
		1309	(newpt (make-marker))
		1310	(modified-p (buffer-modified-p))
		1311	(case-fold-search nil)
		1312	last-coding-system-used
		1313	encoding textlen chset)
		1314	(while (re-search-forward
		1315	"\\(\e\\)%/[0-4]\\([\200-\377][\200-\377]\\)\\([^\002]+\\)\002"
		1316	nil 'move)
		1317	(set-marker newpt (point))
		1318	(set-marker pt (match-beginning 0))
		1319	(setq encoding (match-string 3))
		1320	(setq textlen (- (+ (* (- (aref (match-string 2) 0) 128) 128)
		1321	(- (aref (match-string 2) 1) 128))
		1322	(1+ (length encoding))))
		1323	(setq
		1324	chset (cdr (assoc-ignore-case encoding
		1325	non-standard-icccm-encodings-alist)))
		1326	(cond ((null chset)
		1327	;; This charset is not supported--leave this extended
		1328	;; segment unaltered and skip over it.
		1329	(goto-char (+ (point) textlen)))
		1330	((charsetp chset)
		1331	;; If it's a charset, replace the leading escape sequence
		1332	;; with a standard ISO-2022 sequence. We will decode all
		1333	;; such segments later, in one go, when we exit the loop
		1334	;; or find an extended segment that names a coding
		1335	;; system, not a charset.
		1336	(replace-match
		1337	(concat "\\1"
		1338	(if (= 0 (charset-iso-graphic-plane chset))
		1339	;; GL charsets
		1340	(if (= 1 (charset-dimension chset)) "(" "$(")
		1341	;; GR charsets
		1342	(if (= 96 (charset-chars chset))
		1343	"-"
		1344	(if (= 1 (charset-dimension chset)) ")" "$)")))
		1345	(string (charset-iso-final-char chset)))
		1346	t)
		1347	(goto-char (+ (point) textlen)))
		1348	((coding-system-p chset)
		1349	;; If it's a coding system, we need to decode the segment
		1350	;; right away. But first, decode what we've skipped
		1351	;; across until now.
		1352	(when (> pt oldpt)
		1353	(decode-coding-region oldpt pt 'ctext-no-compositions))
		1354	(delete-region pt newpt)
		1355	(set-marker newpt (+ newpt textlen))
		1356	(decode-coding-region pt newpt chset)
		1357	(goto-char newpt)
		1358	(set-marker oldpt newpt))))
		1359	;; Decode what's left.
		1360	(when (> (point) oldpt)
		1361	(decode-coding-region oldpt (point) 'ctext-no-compositions))
		1362	;; This buffer started as unibyte, because the string we get from
		1363	;; the X selection is a unibyte string. We must now make it
		1364	;; multibyte, so that the decoded text is inserted as multibyte
		1365	;; into its buffer.
		1366	(set-buffer-multibyte t)
		1367	(set-buffer-modified-p modified-p)
		1368	(- (point-max) (point-min)))))
		1369
		1370	(defvar non-standard-designations-alist
		1371	'(("$(0" . (big5 "big5-0" 2))
		1372	("$(1" . (big5 "big5-0" 2))
		1373	("-V" . (t "iso8859-10" 1))
		1374	("-Y" . (t "iso8859-13" 1))
		1375	("-_" . (t "iso8859-14" 1))
		1376	("-b" . (t "iso8859-15" 1))
		1377	("-f" . (t "iso8859-16" 1)))
		1378	"Alist of ctext control sequences that introduce character sets which
		1379	are not in the list of approved ICCCM encodings, and the corresponding
		1380	coding system, identifier string, and number of octets per encoded
		1381	character.
		1382
		1383	Each element has the form (CTLSEQ . (ENCODING CHARSET NOCTETS)). CTLSEQ
		1384	is the control sequence (sans the leading ESC) that introduces the character
		1385	set in the text encoded by compound-text. ENCODING is a coding system
		1386	symbol; if it is t, it means that the ctext coding system already encodes
		1387	the text correctly, and only the leading control sequence needs to be altered.
		1388	If ENCODING is a coding system, we need to re-encode the text with that
		1389	coding system. CHARSET is the ICCCM name of the charset we need to put into
		1390	the leading control sequence. NOCTETS is the number of octets (bytes) that
		1391	encode each character in this charset. NOCTETS can be 0 (meaning the number
		1392	of octets per character is variable), 1, 2, 3, or 4.")
		1393
		1394	(defun ctext-pre-write-conversion (from to)
		1395	"Encode characters between FROM and TO as Compound Text w/Extended Segments."
		1396	(buffer-disable-undo) ; minimize consing due to insertions and deletions
		1397	(narrow-to-region from to)
		1398	(encode-coding-region from to 'ctext-no-compositions)
		1399	;; Replace ISO-2022 charset designations with extended segments, for
		1400	;; those charsets that are not part of the official X registry.
		1401	(save-match-data
		1402	(goto-char (point-min))
		1403	(let ((newpt (make-marker))
		1404	(case-fold-search nil)
		1405	pt desig encode-info encoding chset noctets textlen)
		1406	(set-buffer-multibyte nil)
		1407	(while (re-search-forward "\e\\(\$([01]\\\|-[VY_bf]\\)" nil 'move)
		1408	(setq desig (match-string 1)
		1409	pt (point-marker)
		1410	encode-info (cdr (assoc desig non-standard-designations-alist))
		1411	encoding (car encode-info)
		1412	chset (cadr encode-info)
		1413	noctets (car (cddr encode-info)))
		1414	(skip-chars-forward "^\e")
		1415	(set-marker newpt (point))
		1416	(cond
		1417	((eq encoding t) ; only the leading sequence needs to be changed
		1418	(setq textlen (+ (- newpt pt) (length chset) 1))
		1419	(replace-match (format "\e%%/%d%c%c%s"
		1420	noctets
		1421	(+ (/ textlen 128) 128)
		1422	(+ (% textlen 128) 128)
		1423	chset)
		1424	t t))
		1425	((coding-system-p encoding) ; need to recode the entire segment...
		1426	(set-marker pt (match-beginning 0))
		1427	(decode-coding-region pt newpt 'ctext-no-compositions)
		1428	(set-buffer-multibyte t)
		1429	(encode-coding-region pt newpt encoding)
		1430	(set-buffer-multibyte nil)
		1431	(setq textlen (+ (- newpt pt) (length chset) 1))
		1432	(goto-char pt)
		1433	(insert (format "\e%%/%d%c%c%s"
		1434	noctets
		1435	(+ (/ textlen 128) 128)
		1436	(+ (% textlen 128) 128)
		1437	chset))))
		1438	(goto-char newpt))))
		1439	(set-buffer-multibyte t)
		1440	nil)
		1441
1287	;;; FILE I/O	1442	;;; FILE I/O
1288		1443
1289	(defcustom auto-coding-alist	1444	(defcustom auto-coding-alist