diff options
| author | Kenichi Handa | 2003-05-29 01:28:24 +0000 |
|---|---|---|
| committer | Kenichi Handa | 2003-05-29 01:28:24 +0000 |
| commit | cc9269031d5a7bf5e6e61835610658148bda683e (patch) | |
| tree | fa1ce18b8206ab9d5aee6956a94e640f0bafc28c | |
| parent | 0651bdbbb8ef5b5c664eea562160d91d9c78c488 (diff) | |
| download | emacs-cc9269031d5a7bf5e6e61835610658148bda683e.tar.gz emacs-cc9269031d5a7bf5e6e61835610658148bda683e.zip | |
(ctext-non-standard-encodings-alist):
Renamed from non-standard-icccm-encodings-alist.
(ctext-non-standard-encodings-regexp): New variable
(ctext-post-read-conversion): Full rewrite.
(ctext-non-standard-designations-alist): Renamed from
non-standard-designations-alist.
(ctext-pre-write-conversion): Full rewrite.
| -rw-r--r-- | lisp/international/mule.el | 220 |
1 files changed, 93 insertions, 127 deletions
diff --git a/lisp/international/mule.el b/lisp/international/mule.el index 17f47376f29..40fd3107a91 100644 --- a/lisp/international/mule.el +++ b/lisp/international/mule.el | |||
| @@ -1316,108 +1316,73 @@ ARG is a list of coding categories ordered by priority." | |||
| 1316 | 1316 | ||
| 1317 | ;;; X selections | 1317 | ;;; X selections |
| 1318 | 1318 | ||
| 1319 | (defvar non-standard-icccm-encodings-alist | 1319 | (defvar ctext-non-standard-encodings-alist |
| 1320 | '(("ISO8859-15" . latin-iso8859-15) | 1320 | '(("ISO8859-15" . latin-iso8859-15) |
| 1321 | ("ISO8859-14" . latin-iso8859-14) | 1321 | ("ISO8859-14" . latin-iso8859-14) |
| 1322 | ("KOI8-R" . koi8-r) | 1322 | ("KOI8-R" . koi8-r) |
| 1323 | ("BIG5-0" . big5)) | 1323 | ("BIG5-0" . big5)) |
| 1324 | "Alist of font charset names defined by XLFD. | 1324 | "Alist of non-standard encoding names vs Emacs coding systems. |
| 1325 | The cdr of each element is the corresponding Emacs charset or coding system.") | 1325 | This alist is used to decode an extened segment of a compound text.") |
| 1326 | |||
| 1327 | (defvar ctext-non-standard-encodings-regexp | ||
| 1328 | (string-to-multibyte | ||
| 1329 | (concat | ||
| 1330 | ;; For non-standard encodings. | ||
| 1331 | "\\(\e%/[0-4][\200-\377][\200-\377]\\([^\002]+\\)\002\\)" | ||
| 1332 | "\\|" | ||
| 1333 | ;; For UTF-8 encoding. | ||
| 1334 | "\\(\e%G[^\e]*\e%@\\)"))) | ||
| 1326 | 1335 | ||
| 1327 | ;; Functions to support "Non-Standard Character Set Encodings" defined | 1336 | ;; Functions to support "Non-Standard Character Set Encodings" defined |
| 1328 | ;; by the COMPOUND-TEXT spec. | 1337 | ;; by the COMPOUND-TEXT spec. |
| 1329 | ;; We support that by converting the leading sequence of the | 1338 | ;; We support that by decoding the whole data by `ctext' which just |
| 1330 | ;; ``extended segment'' to the corresponding ISO-2022 sequences (if | 1339 | ;; pertains byte sequences belonging to ``extended segment'', then |
| 1331 | ;; the leading sequence names an Emacs charset), or decode the segment | 1340 | ;; decoding those byte sequences one by one in Lisp. |
| 1332 | ;; (if it names a coding system). Encoding does the reverse. | ||
| 1333 | ;; This function also supports "The UTF-8 encoding" described in the | 1341 | ;; This function also supports "The UTF-8 encoding" described in the |
| 1334 | ;; section 7 of the documentation fo COMPOUND-TEXT distributed with | 1342 | ;; section 7 of the documentation fo COMPOUND-TEXT distributed with |
| 1335 | ;; XFree86. | 1343 | ;; XFree86. |
| 1336 | 1344 | ||
| 1337 | (defun ctext-post-read-conversion (len) | 1345 | (defun ctext-post-read-conversion (len) |
| 1338 | "Decode LEN characters encoded as Compound Text with Extended Segments." | 1346 | "Decode LEN characters encoded as Compound Text with Extended Segments." |
| 1339 | (buffer-disable-undo) ; minimize consing due to insertions and deletions | ||
| 1340 | (narrow-to-region (point) (+ (point) len)) | ||
| 1341 | (save-match-data | 1347 | (save-match-data |
| 1342 | (let ((pt (point-marker)) | 1348 | (save-restriction |
| 1343 | (oldpt (point-marker)) | 1349 | (let ((case-fold-search nil) |
| 1344 | (newpt (make-marker)) | 1350 | (in-workbuf (string= (buffer-name) " *code-converting-work*")) |
| 1345 | (modified-p (buffer-modified-p)) | 1351 | last-coding-system-used |
| 1346 | (case-fold-search nil) | 1352 | pos bytes) |
| 1347 | ;; We need multibyte conversion of "TO" type because the | 1353 | (or in-workbuf |
| 1348 | ;; buffer may be multibyte, and, in that case, the pattern | 1354 | (narrow-to-region (point) (+ (point) len))) |
| 1349 | ;; must contain eight-bit-control/graphic characters. | 1355 | (decode-coding-region (point-min) (point-max) 'ctext) |
| 1350 | (pattern (string-to-multibyte "\\(\e\\)%/[0-4]\\([\200-\377][\200-\377]\\)\\([^\002]+\\)\002\\|\e%G[^\e]+\e%@")) | 1356 | (if in-workbuf |
| 1351 | last-coding-system-used | 1357 | (set-buffer-multibyte t)) |
| 1352 | encoding textlen chset) | 1358 | (while (re-search-forward ctext-non-standard-encodings-regexp |
| 1353 | (while (re-search-forward pattern nil 'move) | 1359 | nil 'move) |
| 1354 | (set-marker newpt (point)) | 1360 | (setq pos (match-beginning 0)) |
| 1355 | (set-marker pt (match-beginning 0)) | 1361 | (if (match-beginning 1) |
| 1356 | (if (= (preceding-char) ?@) | 1362 | ;; ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES-- |
| 1357 | ;; We found embedded utf-8 sequence. | 1363 | (let* ((M (char-after (+ pos 4))) |
| 1358 | (progn | 1364 | (L (char-after (+ pos 5))) |
| 1359 | (delete-char -3) ; delete ESC % @ at the tail | 1365 | (encoding (match-string 2)) |
| 1360 | (goto-char pt) | 1366 | (coding (or (cdr (assoc-ignore-case |
| 1361 | (delete-char 3) ; delete ESC % G at the head | 1367 | encoding |
| 1362 | (if (> pt oldpt) | 1368 | ctext-non-standard-encodings-alist)) |
| 1363 | (decode-coding-region oldpt pt 'ctext-no-compositions)) | 1369 | (coding-system-p |
| 1364 | (decode-coding-region pt newpt 'mule-utf-8) | 1370 | (intern (downcase encoding)))))) |
| 1365 | (goto-char newpt) | 1371 | (setq bytes (- (+ (* (- M 128) 128) (- L 128)) |
| 1366 | (set-marker oldpt newpt)) | 1372 | (- (point) (+ pos 6)))) |
| 1367 | (setq encoding (match-string 3)) | 1373 | (when coding |
| 1368 | (setq textlen (- (+ (* (- (aref (match-string 2) 0) 128) 128) | 1374 | (delete-region pos (point)) |
| 1369 | (- (aref (match-string 2) 1) 128)) | 1375 | (forward-char bytes) |
| 1370 | (1+ (length encoding)))) | 1376 | (decode-coding-region (- (point) bytes) (point) coding))) |
| 1371 | (setq | 1377 | ;; ESC % G --UTF-8-BYTES-- ESC % @ |
| 1372 | chset (cdr (assoc-ignore-case encoding | 1378 | (setq bytes (- (point) pos)) |
| 1373 | non-standard-icccm-encodings-alist))) | 1379 | (decode-coding-region (- (point) bytes) (point) 'utf-8)))) |
| 1374 | (cond ((null chset) | 1380 | (goto-char (point-min)) |
| 1375 | ;; This charset is not supported--leave this extended | 1381 | (- (point-max) (point))))) |
| 1376 | ;; segment unaltered and skip over it. | ||
| 1377 | (goto-char (+ (point) textlen))) | ||
| 1378 | ((charsetp chset) | ||
| 1379 | ;; If it's a charset, replace the leading escape sequence | ||
| 1380 | ;; with a standard ISO-2022 sequence. We will decode all | ||
| 1381 | ;; such segments later, in one go, when we exit the loop | ||
| 1382 | ;; or find an extended segment that names a coding | ||
| 1383 | ;; system, not a charset. | ||
| 1384 | (replace-match | ||
| 1385 | (concat "\\1" | ||
| 1386 | (if (= 0 (charset-iso-graphic-plane chset)) | ||
| 1387 | ;; GL charsets | ||
| 1388 | (if (= 1 (charset-dimension chset)) "(" "$(") | ||
| 1389 | ;; GR charsets | ||
| 1390 | (if (= 96 (charset-chars chset)) | ||
| 1391 | "-" | ||
| 1392 | (if (= 1 (charset-dimension chset)) ")" "$)"))) | ||
| 1393 | (string (charset-iso-final-char chset))) | ||
| 1394 | t) | ||
| 1395 | (goto-char (+ (point) textlen))) | ||
| 1396 | ((coding-system-p chset) | ||
| 1397 | ;; If it's a coding system, we need to decode the segment | ||
| 1398 | ;; right away. But first, decode what we've skipped | ||
| 1399 | ;; across until now. | ||
| 1400 | (when (> pt oldpt) | ||
| 1401 | (decode-coding-region oldpt pt 'ctext-no-compositions)) | ||
| 1402 | (delete-region pt newpt) | ||
| 1403 | (set-marker newpt (+ newpt textlen)) | ||
| 1404 | (decode-coding-region pt newpt chset) | ||
| 1405 | (goto-char newpt) | ||
| 1406 | (set-marker oldpt newpt))))) | ||
| 1407 | ;; Decode what's left. | ||
| 1408 | (when (> (point) oldpt) | ||
| 1409 | (decode-coding-region oldpt (point) 'ctext-no-compositions)) | ||
| 1410 | ;; This buffer started as unibyte, because the string we get from | ||
| 1411 | ;; the X selection is a unibyte string. We must now make it | ||
| 1412 | ;; multibyte, so that the decoded text is inserted as multibyte | ||
| 1413 | ;; into its buffer. | ||
| 1414 | (set-buffer-multibyte t) | ||
| 1415 | (set-buffer-modified-p modified-p) | ||
| 1416 | (- (point-max) (point-min))))) | ||
| 1417 | 1382 | ||
| 1418 | ;; If you add charsets here, be sure to modify the regexp used by | 1383 | ;; If you add charsets here, be sure to modify the regexp used by |
| 1419 | ;; ctext-pre-write-conversion to look up non-standard charsets. | 1384 | ;; ctext-pre-write-conversion to look up non-standard charsets. |
| 1420 | (defvar non-standard-designations-alist | 1385 | (defvar ctext-non-standard-designations-alist |
| 1421 | '(("$(0" . (big5 "big5-0" 2)) | 1386 | '(("$(0" . (big5 "big5-0" 2)) |
| 1422 | ("$(1" . (big5 "big5-0" 2)) | 1387 | ("$(1" . (big5 "big5-0" 2)) |
| 1423 | ;; The following are actually standard; generating extended | 1388 | ;; The following are actually standard; generating extended |
| @@ -1449,44 +1414,47 @@ of octets per character is variable), 1, 2, 3, or 4.") | |||
| 1449 | "Encode characters between FROM and TO as Compound Text w/Extended Segments. | 1414 | "Encode characters between FROM and TO as Compound Text w/Extended Segments. |
| 1450 | 1415 | ||
| 1451 | If FROM is a string, or if the current buffer is not the one set up for us | 1416 | If FROM is a string, or if the current buffer is not the one set up for us |
| 1452 | by run_pre_post_conversion_on_str, generate a new temp buffer, insert the | 1417 | by encode-coding-string, generate a new temp buffer, insert the |
| 1453 | text, and convert it in the temporary buffer. Otherwise, convert in-place." | 1418 | text, and convert it in the temporary buffer. Otherwise, convert in-place." |
| 1454 | (cond ((and (string= (buffer-name) " *code-converting-work*") | ||
| 1455 | (not (stringp from))) | ||
| 1456 | ; Minimize consing due to subsequent insertions and deletions. | ||
| 1457 | (buffer-disable-undo) | ||
| 1458 | (narrow-to-region from to)) | ||
| 1459 | (t | ||
| 1460 | (let ((buf (current-buffer))) | ||
| 1461 | (set-buffer (generate-new-buffer " *temp")) | ||
| 1462 | (buffer-disable-undo) | ||
| 1463 | (if (stringp from) | ||
| 1464 | (insert from) | ||
| 1465 | (insert-buffer-substring buf from to)) | ||
| 1466 | (setq from (point-min) to (point-max))))) | ||
| 1467 | (encode-coding-region from to 'ctext-no-compositions) | ||
| 1468 | ;; Replace ISO-2022 charset designations with extended segments, for | ||
| 1469 | ;; those charsets that are not part of the official X registry. | ||
| 1470 | (save-match-data | 1419 | (save-match-data |
| 1471 | (goto-char (point-min)) | 1420 | ;; Setup a working buffer if necessary. |
| 1472 | (let ((newpt (make-marker)) | 1421 | (cond ((stringp from) |
| 1473 | (case-fold-search nil) | 1422 | (let ((buf (current-buffer))) |
| 1474 | pt desig encode-info encoding chset noctets textlen) | 1423 | (set-buffer (generate-new-buffer " *temp")) |
| 1475 | (set-buffer-multibyte nil) | 1424 | (set-buffer-multibyte (multibyte-string-p from)) |
| 1476 | ;; The regexp below finds the leading sequences for big5. | 1425 | (insert from))) |
| 1426 | ((not (string= (buffer-name) " *code-converting-work*")) | ||
| 1427 | (let ((buf (current-buffer)) | ||
| 1428 | (multibyte enable-multibyte-characters)) | ||
| 1429 | (set-buffer (generate-new-buffer " *temp")) | ||
| 1430 | (set-buffer-multibyte multibyte) | ||
| 1431 | (insert-buffer-substring buf from to)))) | ||
| 1432 | |||
| 1433 | ;; Now we can encode the whole buffer. | ||
| 1434 | (let ((case-fold-search nil) | ||
| 1435 | last-coding-system-used | ||
| 1436 | pos posend desig encode-info encoding chset noctets textlen) | ||
| 1437 | (goto-char (point-min)) | ||
| 1438 | ;; At first encode the whole buffer. | ||
| 1439 | (encode-coding-region (point-min) (point-max) 'ctext-no-compositions) | ||
| 1440 | ;; Then replace ISO-2022 charset designations with extended | ||
| 1441 | ;; segments, for those charsets that are not part of the | ||
| 1442 | ;; official X registry. The regexp below finds the leading | ||
| 1443 | ;; sequences for big5. | ||
| 1477 | (while (re-search-forward "\e\\(\$([01]\\)" nil 'move) | 1444 | (while (re-search-forward "\e\\(\$([01]\\)" nil 'move) |
| 1478 | (setq desig (match-string 1) | 1445 | (setq pos (match-beginning 0) |
| 1479 | pt (point-marker) | 1446 | posend (point) |
| 1480 | encode-info (cdr (assoc desig non-standard-designations-alist)) | 1447 | desig (match-string 1) |
| 1448 | encode-info (cdr (assoc desig | ||
| 1449 | ctext-non-standard-designations-alist)) | ||
| 1481 | encoding (car encode-info) | 1450 | encoding (car encode-info) |
| 1482 | chset (cadr encode-info) | 1451 | chset (cadr encode-info) |
| 1483 | noctets (car (cddr encode-info))) | 1452 | noctets (car (cddr encode-info))) |
| 1484 | (skip-chars-forward "^\e") | 1453 | (skip-chars-forward "^\e") |
| 1485 | (set-marker newpt (point)) | ||
| 1486 | (cond | 1454 | (cond |
| 1487 | ((eq encoding t) ; only the leading sequence needs to be changed | 1455 | ((eq encoding t) ; only the leading sequence needs to be changed |
| 1488 | (setq textlen (+ (- newpt pt) (length chset) 1)) | 1456 | (setq textlen (+ (- (point) posend) (length chset) 1)) |
| 1489 | ;; Generate the ICCCM control sequence for an extended segment. | 1457 | ;; Generate the control sequence for an extended segment. |
| 1490 | (replace-match (format "\e%%/%d%c%c%s" | 1458 | (replace-match (format "\e%%/%d%c%c%s" |
| 1491 | noctets | 1459 | noctets |
| 1492 | (+ (/ textlen 128) 128) | 1460 | (+ (/ textlen 128) 128) |
| @@ -1494,20 +1462,18 @@ text, and convert it in the temporary buffer. Otherwise, convert in-place." | |||
| 1494 | chset) | 1462 | chset) |
| 1495 | t t)) | 1463 | t t)) |
| 1496 | ((coding-system-p encoding) ; need to recode the entire segment... | 1464 | ((coding-system-p encoding) ; need to recode the entire segment... |
| 1497 | (set-marker pt (match-beginning 0)) | 1465 | (decode-coding-region pos (point) 'ctext-no-compositions) |
| 1498 | (decode-coding-region pt newpt 'ctext-no-compositions) | 1466 | (encode-coding-region pos (point) encoding) |
| 1499 | (set-buffer-multibyte t) | ||
| 1500 | (encode-coding-region pt newpt encoding) | ||
| 1501 | (set-buffer-multibyte nil) | 1467 | (set-buffer-multibyte nil) |
| 1502 | (setq textlen (+ (- newpt pt) (length chset) 1)) | 1468 | (setq textlen (+ (- (point) pos) (length chset) 1)) |
| 1503 | (goto-char pt) | 1469 | (save-excursion |
| 1504 | (insert (format "\e%%/%d%c%c%s" | 1470 | (goto-char pos) |
| 1505 | noctets | 1471 | (insert (format "\e%%/%d%c%c%s" |
| 1506 | (+ (/ textlen 128) 128) | 1472 | noctets |
| 1507 | (+ (% textlen 128) 128) | 1473 | (+ (/ textlen 128) 128) |
| 1508 | chset)))) | 1474 | (+ (% textlen 128) 128) |
| 1509 | (goto-char newpt)))) | 1475 | chset)))))) |
| 1510 | (set-buffer-multibyte t) | 1476 | (goto-char (point-min)))) |
| 1511 | ;; Must return nil, as build_annotations_2 expects that. | 1477 | ;; Must return nil, as build_annotations_2 expects that. |
| 1512 | nil) | 1478 | nil) |
| 1513 | 1479 | ||