aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorKenichi Handa2003-05-29 01:28:24 +0000
committerKenichi Handa2003-05-29 01:28:24 +0000
commitcc9269031d5a7bf5e6e61835610658148bda683e (patch)
treefa1ce18b8206ab9d5aee6956a94e640f0bafc28c
parent0651bdbbb8ef5b5c664eea562160d91d9c78c488 (diff)
downloademacs-cc9269031d5a7bf5e6e61835610658148bda683e.tar.gz
emacs-cc9269031d5a7bf5e6e61835610658148bda683e.zip
(ctext-non-standard-encodings-alist):
Renamed from non-standard-icccm-encodings-alist. (ctext-non-standard-encodings-regexp): New variable (ctext-post-read-conversion): Full rewrite. (ctext-non-standard-designations-alist): Renamed from non-standard-designations-alist. (ctext-pre-write-conversion): Full rewrite.
-rw-r--r--lisp/international/mule.el220
1 files changed, 93 insertions, 127 deletions
diff --git a/lisp/international/mule.el b/lisp/international/mule.el
index 17f47376f29..40fd3107a91 100644
--- a/lisp/international/mule.el
+++ b/lisp/international/mule.el
@@ -1316,108 +1316,73 @@ ARG is a list of coding categories ordered by priority."
1316 1316
1317;;; X selections 1317;;; X selections
1318 1318
1319(defvar non-standard-icccm-encodings-alist 1319(defvar ctext-non-standard-encodings-alist
1320 '(("ISO8859-15" . latin-iso8859-15) 1320 '(("ISO8859-15" . latin-iso8859-15)
1321 ("ISO8859-14" . latin-iso8859-14) 1321 ("ISO8859-14" . latin-iso8859-14)
1322 ("KOI8-R" . koi8-r) 1322 ("KOI8-R" . koi8-r)
1323 ("BIG5-0" . big5)) 1323 ("BIG5-0" . big5))
1324 "Alist of font charset names defined by XLFD. 1324 "Alist of non-standard encoding names vs Emacs coding systems.
1325The cdr of each element is the corresponding Emacs charset or coding system.") 1325This alist is used to decode an extened segment of a compound text.")
1326
1327(defvar ctext-non-standard-encodings-regexp
1328 (string-to-multibyte
1329 (concat
1330 ;; For non-standard encodings.
1331 "\\(\e%/[0-4][\200-\377][\200-\377]\\([^\002]+\\)\002\\)"
1332 "\\|"
1333 ;; For UTF-8 encoding.
1334 "\\(\e%G[^\e]*\e%@\\)")))
1326 1335
1327;; Functions to support "Non-Standard Character Set Encodings" defined 1336;; Functions to support "Non-Standard Character Set Encodings" defined
1328;; by the COMPOUND-TEXT spec. 1337;; by the COMPOUND-TEXT spec.
1329;; We support that by converting the leading sequence of the 1338;; We support that by decoding the whole data by `ctext' which just
1330;; ``extended segment'' to the corresponding ISO-2022 sequences (if 1339;; pertains byte sequences belonging to ``extended segment'', then
1331;; the leading sequence names an Emacs charset), or decode the segment 1340;; decoding those byte sequences one by one in Lisp.
1332;; (if it names a coding system). Encoding does the reverse.
1333;; This function also supports "The UTF-8 encoding" described in the 1341;; This function also supports "The UTF-8 encoding" described in the
1334;; section 7 of the documentation fo COMPOUND-TEXT distributed with 1342;; section 7 of the documentation fo COMPOUND-TEXT distributed with
1335;; XFree86. 1343;; XFree86.
1336 1344
1337(defun ctext-post-read-conversion (len) 1345(defun ctext-post-read-conversion (len)
1338 "Decode LEN characters encoded as Compound Text with Extended Segments." 1346 "Decode LEN characters encoded as Compound Text with Extended Segments."
1339 (buffer-disable-undo) ; minimize consing due to insertions and deletions
1340 (narrow-to-region (point) (+ (point) len))
1341 (save-match-data 1347 (save-match-data
1342 (let ((pt (point-marker)) 1348 (save-restriction
1343 (oldpt (point-marker)) 1349 (let ((case-fold-search nil)
1344 (newpt (make-marker)) 1350 (in-workbuf (string= (buffer-name) " *code-converting-work*"))
1345 (modified-p (buffer-modified-p)) 1351 last-coding-system-used
1346 (case-fold-search nil) 1352 pos bytes)
1347 ;; We need multibyte conversion of "TO" type because the 1353 (or in-workbuf
1348 ;; buffer may be multibyte, and, in that case, the pattern 1354 (narrow-to-region (point) (+ (point) len)))
1349 ;; must contain eight-bit-control/graphic characters. 1355 (decode-coding-region (point-min) (point-max) 'ctext)
1350 (pattern (string-to-multibyte "\\(\e\\)%/[0-4]\\([\200-\377][\200-\377]\\)\\([^\002]+\\)\002\\|\e%G[^\e]+\e%@")) 1356 (if in-workbuf
1351 last-coding-system-used 1357 (set-buffer-multibyte t))
1352 encoding textlen chset) 1358 (while (re-search-forward ctext-non-standard-encodings-regexp
1353 (while (re-search-forward pattern nil 'move) 1359 nil 'move)
1354 (set-marker newpt (point)) 1360 (setq pos (match-beginning 0))
1355 (set-marker pt (match-beginning 0)) 1361 (if (match-beginning 1)
1356 (if (= (preceding-char) ?@) 1362 ;; ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
1357 ;; We found embedded utf-8 sequence. 1363 (let* ((M (char-after (+ pos 4)))
1358 (progn 1364 (L (char-after (+ pos 5)))
1359 (delete-char -3) ; delete ESC % @ at the tail 1365 (encoding (match-string 2))
1360 (goto-char pt) 1366 (coding (or (cdr (assoc-ignore-case
1361 (delete-char 3) ; delete ESC % G at the head 1367 encoding
1362 (if (> pt oldpt) 1368 ctext-non-standard-encodings-alist))
1363 (decode-coding-region oldpt pt 'ctext-no-compositions)) 1369 (coding-system-p
1364 (decode-coding-region pt newpt 'mule-utf-8) 1370 (intern (downcase encoding))))))
1365 (goto-char newpt) 1371 (setq bytes (- (+ (* (- M 128) 128) (- L 128))
1366 (set-marker oldpt newpt)) 1372 (- (point) (+ pos 6))))
1367 (setq encoding (match-string 3)) 1373 (when coding
1368 (setq textlen (- (+ (* (- (aref (match-string 2) 0) 128) 128) 1374 (delete-region pos (point))
1369 (- (aref (match-string 2) 1) 128)) 1375 (forward-char bytes)
1370 (1+ (length encoding)))) 1376 (decode-coding-region (- (point) bytes) (point) coding)))
1371 (setq 1377 ;; ESC % G --UTF-8-BYTES-- ESC % @
1372 chset (cdr (assoc-ignore-case encoding 1378 (setq bytes (- (point) pos))
1373 non-standard-icccm-encodings-alist))) 1379 (decode-coding-region (- (point) bytes) (point) 'utf-8))))
1374 (cond ((null chset) 1380 (goto-char (point-min))
1375 ;; This charset is not supported--leave this extended 1381 (- (point-max) (point)))))
1376 ;; segment unaltered and skip over it.
1377 (goto-char (+ (point) textlen)))
1378 ((charsetp chset)
1379 ;; If it's a charset, replace the leading escape sequence
1380 ;; with a standard ISO-2022 sequence. We will decode all
1381 ;; such segments later, in one go, when we exit the loop
1382 ;; or find an extended segment that names a coding
1383 ;; system, not a charset.
1384 (replace-match
1385 (concat "\\1"
1386 (if (= 0 (charset-iso-graphic-plane chset))
1387 ;; GL charsets
1388 (if (= 1 (charset-dimension chset)) "(" "$(")
1389 ;; GR charsets
1390 (if (= 96 (charset-chars chset))
1391 "-"
1392 (if (= 1 (charset-dimension chset)) ")" "$)")))
1393 (string (charset-iso-final-char chset)))
1394 t)
1395 (goto-char (+ (point) textlen)))
1396 ((coding-system-p chset)
1397 ;; If it's a coding system, we need to decode the segment
1398 ;; right away. But first, decode what we've skipped
1399 ;; across until now.
1400 (when (> pt oldpt)
1401 (decode-coding-region oldpt pt 'ctext-no-compositions))
1402 (delete-region pt newpt)
1403 (set-marker newpt (+ newpt textlen))
1404 (decode-coding-region pt newpt chset)
1405 (goto-char newpt)
1406 (set-marker oldpt newpt)))))
1407 ;; Decode what's left.
1408 (when (> (point) oldpt)
1409 (decode-coding-region oldpt (point) 'ctext-no-compositions))
1410 ;; This buffer started as unibyte, because the string we get from
1411 ;; the X selection is a unibyte string. We must now make it
1412 ;; multibyte, so that the decoded text is inserted as multibyte
1413 ;; into its buffer.
1414 (set-buffer-multibyte t)
1415 (set-buffer-modified-p modified-p)
1416 (- (point-max) (point-min)))))
1417 1382
1418;; If you add charsets here, be sure to modify the regexp used by 1383;; If you add charsets here, be sure to modify the regexp used by
1419;; ctext-pre-write-conversion to look up non-standard charsets. 1384;; ctext-pre-write-conversion to look up non-standard charsets.
1420(defvar non-standard-designations-alist 1385(defvar ctext-non-standard-designations-alist
1421 '(("$(0" . (big5 "big5-0" 2)) 1386 '(("$(0" . (big5 "big5-0" 2))
1422 ("$(1" . (big5 "big5-0" 2)) 1387 ("$(1" . (big5 "big5-0" 2))
1423 ;; The following are actually standard; generating extended 1388 ;; The following are actually standard; generating extended
@@ -1449,44 +1414,47 @@ of octets per character is variable), 1, 2, 3, or 4.")
1449 "Encode characters between FROM and TO as Compound Text w/Extended Segments. 1414 "Encode characters between FROM and TO as Compound Text w/Extended Segments.
1450 1415
1451If FROM is a string, or if the current buffer is not the one set up for us 1416If FROM is a string, or if the current buffer is not the one set up for us
1452by run_pre_post_conversion_on_str, generate a new temp buffer, insert the 1417by encode-coding-string, generate a new temp buffer, insert the
1453text, and convert it in the temporary buffer. Otherwise, convert in-place." 1418text, and convert it in the temporary buffer. Otherwise, convert in-place."
1454 (cond ((and (string= (buffer-name) " *code-converting-work*")
1455 (not (stringp from)))
1456 ; Minimize consing due to subsequent insertions and deletions.
1457 (buffer-disable-undo)
1458 (narrow-to-region from to))
1459 (t
1460 (let ((buf (current-buffer)))
1461 (set-buffer (generate-new-buffer " *temp"))
1462 (buffer-disable-undo)
1463 (if (stringp from)
1464 (insert from)
1465 (insert-buffer-substring buf from to))
1466 (setq from (point-min) to (point-max)))))
1467 (encode-coding-region from to 'ctext-no-compositions)
1468 ;; Replace ISO-2022 charset designations with extended segments, for
1469 ;; those charsets that are not part of the official X registry.
1470 (save-match-data 1419 (save-match-data
1471 (goto-char (point-min)) 1420 ;; Setup a working buffer if necessary.
1472 (let ((newpt (make-marker)) 1421 (cond ((stringp from)
1473 (case-fold-search nil) 1422 (let ((buf (current-buffer)))
1474 pt desig encode-info encoding chset noctets textlen) 1423 (set-buffer (generate-new-buffer " *temp"))
1475 (set-buffer-multibyte nil) 1424 (set-buffer-multibyte (multibyte-string-p from))
1476 ;; The regexp below finds the leading sequences for big5. 1425 (insert from)))
1426 ((not (string= (buffer-name) " *code-converting-work*"))
1427 (let ((buf (current-buffer))
1428 (multibyte enable-multibyte-characters))
1429 (set-buffer (generate-new-buffer " *temp"))
1430 (set-buffer-multibyte multibyte)
1431 (insert-buffer-substring buf from to))))
1432
1433 ;; Now we can encode the whole buffer.
1434 (let ((case-fold-search nil)
1435 last-coding-system-used
1436 pos posend desig encode-info encoding chset noctets textlen)
1437 (goto-char (point-min))
1438 ;; At first encode the whole buffer.
1439 (encode-coding-region (point-min) (point-max) 'ctext-no-compositions)
1440 ;; Then replace ISO-2022 charset designations with extended
1441 ;; segments, for those charsets that are not part of the
1442 ;; official X registry. The regexp below finds the leading
1443 ;; sequences for big5.
1477 (while (re-search-forward "\e\\(\$([01]\\)" nil 'move) 1444 (while (re-search-forward "\e\\(\$([01]\\)" nil 'move)
1478 (setq desig (match-string 1) 1445 (setq pos (match-beginning 0)
1479 pt (point-marker) 1446 posend (point)
1480 encode-info (cdr (assoc desig non-standard-designations-alist)) 1447 desig (match-string 1)
1448 encode-info (cdr (assoc desig
1449 ctext-non-standard-designations-alist))
1481 encoding (car encode-info) 1450 encoding (car encode-info)
1482 chset (cadr encode-info) 1451 chset (cadr encode-info)
1483 noctets (car (cddr encode-info))) 1452 noctets (car (cddr encode-info)))
1484 (skip-chars-forward "^\e") 1453 (skip-chars-forward "^\e")
1485 (set-marker newpt (point))
1486 (cond 1454 (cond
1487 ((eq encoding t) ; only the leading sequence needs to be changed 1455 ((eq encoding t) ; only the leading sequence needs to be changed
1488 (setq textlen (+ (- newpt pt) (length chset) 1)) 1456 (setq textlen (+ (- (point) posend) (length chset) 1))
1489 ;; Generate the ICCCM control sequence for an extended segment. 1457 ;; Generate the control sequence for an extended segment.
1490 (replace-match (format "\e%%/%d%c%c%s" 1458 (replace-match (format "\e%%/%d%c%c%s"
1491 noctets 1459 noctets
1492 (+ (/ textlen 128) 128) 1460 (+ (/ textlen 128) 128)
@@ -1494,20 +1462,18 @@ text, and convert it in the temporary buffer. Otherwise, convert in-place."
1494 chset) 1462 chset)
1495 t t)) 1463 t t))
1496 ((coding-system-p encoding) ; need to recode the entire segment... 1464 ((coding-system-p encoding) ; need to recode the entire segment...
1497 (set-marker pt (match-beginning 0)) 1465 (decode-coding-region pos (point) 'ctext-no-compositions)
1498 (decode-coding-region pt newpt 'ctext-no-compositions) 1466 (encode-coding-region pos (point) encoding)
1499 (set-buffer-multibyte t)
1500 (encode-coding-region pt newpt encoding)
1501 (set-buffer-multibyte nil) 1467 (set-buffer-multibyte nil)
1502 (setq textlen (+ (- newpt pt) (length chset) 1)) 1468 (setq textlen (+ (- (point) pos) (length chset) 1))
1503 (goto-char pt) 1469 (save-excursion
1504 (insert (format "\e%%/%d%c%c%s" 1470 (goto-char pos)
1505 noctets 1471 (insert (format "\e%%/%d%c%c%s"
1506 (+ (/ textlen 128) 128) 1472 noctets
1507 (+ (% textlen 128) 128) 1473 (+ (/ textlen 128) 128)
1508 chset)))) 1474 (+ (% textlen 128) 128)
1509 (goto-char newpt)))) 1475 chset))))))
1510 (set-buffer-multibyte t) 1476 (goto-char (point-min))))
1511 ;; Must return nil, as build_annotations_2 expects that. 1477 ;; Must return nil, as build_annotations_2 expects that.
1512 nil) 1478 nil)
1513 1479