diff options
| author | Michal Nazarewicz | 2016-10-05 00:06:01 +0200 |
|---|---|---|
| committer | Michal Nazarewicz | 2017-04-06 20:54:58 +0200 |
| commit | b3b9b258c4026baa1cad3f2e617f1a637fc8d205 (patch) | |
| tree | 1520ef9f5a3204784c597fcf2bf7a7c7fc1b8d7c /src | |
| parent | 2c87dabd0460cce83d2345b4ddff159969674fef (diff) | |
| download | emacs-b3b9b258c4026baa1cad3f2e617f1a637fc8d205.tar.gz emacs-b3b9b258c4026baa1cad3f2e617f1a637fc8d205.zip | |
Support casing characters which map into multiple code points (bug#24603)
Implement the unconditional special casing rules defined in the Unicode
standard.  Among other things, they deal with cases when a single code
point is replaced by multiple ones because a single-character equivalent
does not exist (e.g. the ‘fi’ ligature turning into ‘FI’) or is not
commonly used (e.g. ß turning into SS).
* admin/unidata/SpecialCasing.txt: New data file pulled from Unicode
standard distribution.
* admin/unidata/README: Mention SpecialCasing.txt.
* admin/unidata/unidata-gen.el (unidata-gen-table-special-casing,
unidata-gen-table-special-casing--do-load): New functions generating
the ‘special-uppercase’, ‘special-lowercase’ and ‘special-titlecase’
character Unicode properties built from the SpecialCasing.txt Unicode
data file.
* src/casefiddle.c (struct casing_str_buf): New structure for
representing short strings used to handle one-to-many character
mappings.
(case_character_impl): New function which can handle one-to-many
character mappings.
(case_character, case_single_character): Wrappers for the above
function. The former may map one character to multiple (or no)
code points while the latter does what case_character used to do (i.e.
handles one-to-one mappings only).
(do_casify_natnum, do_casify_unibyte_string,
do_casify_unibyte_region): Use case_single_character.
(do_casify_multibyte_string, do_casify_multibyte_region): Support new
features of case_character.
(casify_region): Updated to reflect do_casify_multibyte_region
changes.
(casify_word): Handle the situation where the length in characters of a
word can change, which affects where the end of the word is.
(upcase, capitalize, upcase-initials): Update documentation to mention
limitations when working on characters.
* test/src/casefiddle-tests.el (casefiddle-tests-char-properties):
Add test cases for the newly introduced character properties.
(casefiddle-tests-casing): Update test cases which are now passing.
* test/lisp/char-fold-tests.el (char-fold--ascii-upcase,
char-fold--ascii-downcase): New functions which behave like old ‘upcase’
and ‘downcase’.
(char-fold--test-match-exactly): Use the new functions. This is needed
because otherwise fi and similar characters are turned into their
multi-character representation.
* doc/lispref/strings.texi: Describe issue with casing characters versus
strings.
* doc/lispref/nonascii.texi: Describe the new character properties.
Diffstat (limited to 'src')
| -rw-r--r-- | src/casefiddle.c | 289 |
1 files changed, 205 insertions, 84 deletions
diff --git a/src/casefiddle.c b/src/casefiddle.c index b1a5f8e236e..10674d963ec 100644 --- a/src/casefiddle.c +++ b/src/casefiddle.c | |||
| @@ -1,3 +1,4 @@ | |||
| 1 | /* -*- coding: utf-8 -*- */ | ||
| 1 | /* GNU Emacs case conversion functions. | 2 | /* GNU Emacs case conversion functions. |
| 2 | 3 | ||
| 3 | Copyright (C) 1985, 1994, 1997-1999, 2001-2017 Free Software Foundation, | 4 | Copyright (C) 1985, 1994, 1997-1999, 2001-2017 Free Software Foundation, |
| @@ -36,6 +37,9 @@ struct casing_context { | |||
| 36 | /* A char-table with title-case character mappings or nil. Non-nil implies | 37 | /* A char-table with title-case character mappings or nil. Non-nil implies |
| 37 | flag is CASE_CAPITALIZE or CASE_CAPITALIZE_UP. */ | 38 | flag is CASE_CAPITALIZE or CASE_CAPITALIZE_UP. */ |
| 38 | Lisp_Object titlecase_char_table; | 39 | Lisp_Object titlecase_char_table; |
| 40 | /* The unconditional special-casing Unicode property char tables for upper | ||
| 41 | casing, lower casing and title casing respectively. */ | ||
| 42 | Lisp_Object specialcase_char_tables[3]; | ||
| 39 | /* User-requested action. */ | 43 | /* User-requested action. */ |
| 40 | enum case_action flag; | 44 | enum case_action flag; |
| 41 | /* If true, function operates on a buffer as opposed to a string or character. | 45 | /* If true, function operates on a buffer as opposed to a string or character. |
| @@ -58,6 +62,13 @@ prepare_casing_context (struct casing_context *ctx, | |||
| 58 | ctx->inword = flag == CASE_DOWN; | 62 | ctx->inword = flag == CASE_DOWN; |
| 59 | ctx->titlecase_char_table = (int)flag < (int)CASE_CAPITALIZE ? Qnil : | 63 | ctx->titlecase_char_table = (int)flag < (int)CASE_CAPITALIZE ? Qnil : |
| 60 | uniprop_table (intern_c_string ("titlecase")); | 64 | uniprop_table (intern_c_string ("titlecase")); |
| 65 | ctx->specialcase_char_tables[CASE_UP] = flag == CASE_DOWN ? Qnil : | ||
| 66 | uniprop_table (intern_c_string ("special-uppercase")); | ||
| 67 | ctx->specialcase_char_tables[CASE_DOWN] = flag == CASE_UP ? Qnil : | ||
| 68 | uniprop_table (intern_c_string ("special-lowercase")); | ||
| 69 | ctx->specialcase_char_tables[CASE_CAPITALIZE] = | ||
| 70 | (int)flag < (int)CASE_CAPITALIZE ? Qnil : | ||
| 71 | uniprop_table (intern_c_string ("special-titlecase")); | ||
| 61 | 72 | ||
| 62 | /* If the case table is flagged as modified, rescan it. */ | 73 | /* If the case table is flagged as modified, rescan it. */ |
| 63 | if (NILP (XCHAR_TABLE (BVAR (current_buffer, downcase_table))->extras[1])) | 74 | if (NILP (XCHAR_TABLE (BVAR (current_buffer, downcase_table))->extras[1])) |
| @@ -67,25 +78,98 @@ prepare_casing_context (struct casing_context *ctx, | |||
| 67 | SETUP_BUFFER_SYNTAX_TABLE (); /* For syntax_prefix_flag_p. */ | 78 | SETUP_BUFFER_SYNTAX_TABLE (); /* For syntax_prefix_flag_p. */ |
| 68 | } | 79 | } |
| 69 | 80 | ||
| 70 | /* Based on CTX, case character CH accordingly. Update CTX as necessary. | 81 | struct casing_str_buf { |
| 71 | Return cased character. */ | 82 | unsigned char data[MAX_MULTIBYTE_LENGTH > 6 ? MAX_MULTIBYTE_LENGTH : 6]; |
| 83 | unsigned char len_chars; | ||
| 84 | unsigned char len_bytes; | ||
| 85 | }; | ||
| 86 | |||
| 87 | /* Based on CTX, case character CH. If BUF is NULL, return cased character. | ||
| 88 | Otherwise, if BUF is non-NULL, save result in it and return whether the | ||
| 89 | character has been changed. | ||
| 90 | |||
| 91 | Since meaning of return value depends on arguments, it’s more convenient to | ||
| 92 | use case_single_character or case_character instead. */ | ||
| 72 | static int | 93 | static int |
| 73 | case_character (struct casing_context *ctx, int ch) | 94 | case_character_impl (struct casing_str_buf *buf, |
| 95 | struct casing_context *ctx, int ch) | ||
| 74 | { | 96 | { |
| 97 | enum case_action flag; | ||
| 75 | Lisp_Object prop; | 98 | Lisp_Object prop; |
| 99 | bool was_inword; | ||
| 100 | int cased; | ||
| 101 | |||
| 102 | /* Update inword state */ | ||
| 103 | was_inword = ctx->inword; | ||
| 104 | if ((int) ctx->flag >= (int) CASE_CAPITALIZE) | ||
| 105 | ctx->inword = SYNTAX (ch) == Sword && | ||
| 106 | (!ctx->inbuffer || was_inword || !syntax_prefix_flag_p (ch)); | ||
| 107 | |||
| 108 | /* Normalise flag so its one of CASE_UP, CASE_DOWN or CASE_CAPITALIZE. */ | ||
| 109 | if (!was_inword) | ||
| 110 | flag = ctx->flag == CASE_UP ? CASE_UP : CASE_CAPITALIZE; | ||
| 111 | else if (ctx->flag != CASE_CAPITALIZE_UP) | ||
| 112 | flag = CASE_DOWN; | ||
| 113 | else | ||
| 114 | { | ||
| 115 | cased = ch; | ||
| 116 | goto done; | ||
| 117 | } | ||
| 118 | |||
| 119 | /* Look through the special casing entries. */ | ||
| 120 | if (buf && !NILP(ctx->specialcase_char_tables[(int)flag])) | ||
| 121 | { | ||
| 122 | prop = CHAR_TABLE_REF(ctx->specialcase_char_tables[(int)flag], ch); | ||
| 123 | if (STRINGP(prop)) | ||
| 124 | { | ||
| 125 | struct Lisp_String *str = XSTRING(prop); | ||
| 126 | if (STRING_BYTES(str) <= sizeof buf->data) | ||
| 127 | { | ||
| 128 | buf->len_chars = str->size; | ||
| 129 | buf->len_bytes = STRING_BYTES(str); | ||
| 130 | memcpy(buf->data, str->data, buf->len_bytes); | ||
| 131 | return 1; | ||
| 132 | } | ||
| 133 | } | ||
| 134 | } | ||
| 76 | 135 | ||
| 77 | if (ctx->inword) | 136 | /* Handle simple, one-to-one case. */ |
| 78 | ch = ctx->flag == CASE_CAPITALIZE_UP ? ch : downcase (ch); | 137 | if (flag == CASE_DOWN) |
| 138 | cased = downcase (ch); | ||
| 79 | else if (!NILP (ctx->titlecase_char_table) && | 139 | else if (!NILP (ctx->titlecase_char_table) && |
| 80 | CHARACTERP (prop = CHAR_TABLE_REF (ctx->titlecase_char_table, ch))) | 140 | CHARACTERP (prop = CHAR_TABLE_REF (ctx->titlecase_char_table, ch))) |
| 81 | ch = XFASTINT (prop); | 141 | cased = XFASTINT (prop); |
| 82 | else | 142 | else |
| 83 | ch = upcase(ch); | 143 | cased = upcase(ch); |
| 144 | |||
| 145 | /* And we’re done. */ | ||
| 146 | done: | ||
| 147 | if (!buf) | ||
| 148 | return cased; | ||
| 149 | buf->len_chars = 1; | ||
| 150 | buf->len_bytes = CHAR_STRING (cased, buf->data); | ||
| 151 | return cased != ch; | ||
| 152 | } | ||
| 84 | 153 | ||
| 85 | if ((int) ctx->flag >= (int) CASE_CAPITALIZE) | 154 | /* Based on CTX, case character CH accordingly. Update CTX as necessary. |
| 86 | ctx->inword = SYNTAX (ch) == Sword && | 155 | Return cased character. |
| 87 | (!ctx->inbuffer || ctx->inword || !syntax_prefix_flag_p (ch)); | 156 | |
| 88 | return ch; | 157 | Special casing rules (such as upcase(fi) = FI) are not handled. For |
| 158 | characters whose casing results in multiple code points, the character is | ||
| 159 | returned unchanged. */ | ||
| 160 | static inline int | ||
| 161 | case_single_character (struct casing_context *ctx, int ch) | ||
| 162 | { | ||
| 163 | return case_character_impl (NULL, ctx, ch); | ||
| 164 | } | ||
| 165 | |||
| 166 | /* Save in BUF result of casing character CH. Return whether casing changed the | ||
| 167 | character. This is like case_single_character but also handles one-to-many | ||
| 168 | casing rules. */ | ||
| 169 | static inline bool | ||
| 170 | case_character (struct casing_str_buf *buf, struct casing_context *ctx, int ch) | ||
| 171 | { | ||
| 172 | return case_character_impl (buf, ctx, ch); | ||
| 89 | } | 173 | } |
| 90 | 174 | ||
| 91 | static Lisp_Object | 175 | static Lisp_Object |
| @@ -112,7 +196,7 @@ do_casify_natnum (struct casing_context *ctx, Lisp_Object obj) | |||
| 112 | || !NILP (BVAR (current_buffer, enable_multibyte_characters)); | 196 | || !NILP (BVAR (current_buffer, enable_multibyte_characters)); |
| 113 | if (! multibyte) | 197 | if (! multibyte) |
| 114 | MAKE_CHAR_MULTIBYTE (ch); | 198 | MAKE_CHAR_MULTIBYTE (ch); |
| 115 | cased = case_character (ctx, ch); | 199 | cased = case_single_character (ctx, ch); |
| 116 | if (cased == ch) | 200 | if (cased == ch) |
| 117 | return obj; | 201 | return obj; |
| 118 | 202 | ||
| @@ -125,25 +209,34 @@ do_casify_natnum (struct casing_context *ctx, Lisp_Object obj) | |||
| 125 | static Lisp_Object | 209 | static Lisp_Object |
| 126 | do_casify_multibyte_string (struct casing_context *ctx, Lisp_Object obj) | 210 | do_casify_multibyte_string (struct casing_context *ctx, Lisp_Object obj) |
| 127 | { | 211 | { |
| 128 | ptrdiff_t i, i_byte, size = SCHARS (obj); | 212 | /* We assume data is the first member of casing_str_buf structure so that if |
| 129 | int len, ch, cased; | 213 | we cast a (char *) into (struct casing_str_buf *) the representation of the |
| 214 | character is at the beginning of the buffer. This is why we don’t need | ||
| 215 | separate struct casing_str_buf object but rather write directly to o. */ | ||
| 216 | typedef char static_assertion[offsetof(struct casing_str_buf, data) ? -1 : 1]; | ||
| 217 | |||
| 218 | ptrdiff_t size = SCHARS (obj), n; | ||
| 219 | int ch; | ||
| 130 | USE_SAFE_ALLOCA; | 220 | USE_SAFE_ALLOCA; |
| 131 | ptrdiff_t o_size; | 221 | if (INT_MULTIPLY_WRAPV (size, MAX_MULTIBYTE_LENGTH, &n) || |
| 132 | if (INT_MULTIPLY_WRAPV (size, MAX_MULTIBYTE_LENGTH, &o_size)) | 222 | INT_ADD_WRAPV (n, sizeof(struct casing_str_buf), &n)) |
| 133 | o_size = PTRDIFF_MAX; | 223 | n = PTRDIFF_MAX; |
| 134 | unsigned char *dst = SAFE_ALLOCA (o_size); | 224 | unsigned char *const dst = SAFE_ALLOCA (n), *const dst_end = dst + n; |
| 135 | unsigned char *o = dst; | 225 | unsigned char *o = dst; |
| 136 | 226 | ||
| 137 | for (i = i_byte = 0; i < size; i++, i_byte += len) | 227 | const unsigned char *src = SDATA (obj); |
| 228 | |||
| 229 | for (n = 0; size; --size) | ||
| 138 | { | 230 | { |
| 139 | if (o_size - MAX_MULTIBYTE_LENGTH < o - dst) | 231 | if (dst_end - o < sizeof(struct casing_str_buf)) |
| 140 | string_overflow (); | 232 | string_overflow (); |
| 141 | ch = STRING_CHAR_AND_LENGTH (SDATA (obj) + i_byte, len); | 233 | ch = STRING_CHAR_ADVANCE (src); |
| 142 | cased = case_character (ctx, ch); | 234 | case_character ((void *)o, ctx, ch); |
| 143 | o += CHAR_STRING (cased, o); | 235 | n += ((struct casing_str_buf *)o)->len_chars; |
| 236 | o += ((struct casing_str_buf *)o)->len_bytes; | ||
| 144 | } | 237 | } |
| 145 | eassert (o - dst <= o_size); | 238 | eassert (o <= dst_end); |
| 146 | obj = make_multibyte_string ((char *) dst, size, o - dst); | 239 | obj = make_multibyte_string ((char *) dst, n, o - dst); |
| 147 | SAFE_FREE (); | 240 | SAFE_FREE (); |
| 148 | return obj; | 241 | return obj; |
| 149 | } | 242 | } |
| @@ -159,7 +252,7 @@ do_casify_unibyte_string (struct casing_context *ctx, Lisp_Object obj) | |||
| 159 | { | 252 | { |
| 160 | ch = SREF (obj, i); | 253 | ch = SREF (obj, i); |
| 161 | MAKE_CHAR_MULTIBYTE (ch); | 254 | MAKE_CHAR_MULTIBYTE (ch); |
| 162 | cased = case_character (ctx, ch); | 255 | cased = case_single_character (ctx, ch); |
| 163 | if (ch == cased) | 256 | if (ch == cased) |
| 164 | continue; | 257 | continue; |
| 165 | MAKE_CHAR_UNIBYTE (cased); | 258 | MAKE_CHAR_UNIBYTE (cased); |
| @@ -191,7 +284,9 @@ casify_object (enum case_action flag, Lisp_Object obj) | |||
| 191 | DEFUN ("upcase", Fupcase, Supcase, 1, 1, 0, | 284 | DEFUN ("upcase", Fupcase, Supcase, 1, 1, 0, |
| 192 | doc: /* Convert argument to upper case and return that. | 285 | doc: /* Convert argument to upper case and return that. |
| 193 | The argument may be a character or string. The result has the same type. | 286 | The argument may be a character or string. The result has the same type. |
| 194 | The argument object is not altered--the value is a copy. | 287 | The argument object is not altered--the value is a copy. If argument |
| 288 | is a character, characters which map to multiple code points when | ||
| 289 | cased, e.g. fi, are returned unchanged. | ||
| 195 | See also `capitalize', `downcase' and `upcase-initials'. */) | 290 | See also `capitalize', `downcase' and `upcase-initials'. */) |
| 196 | (Lisp_Object obj) | 291 | (Lisp_Object obj) |
| 197 | { | 292 | { |
| @@ -212,7 +307,9 @@ DEFUN ("capitalize", Fcapitalize, Scapitalize, 1, 1, 0, | |||
| 212 | This means that each word's first character is converted to either | 307 | This means that each word's first character is converted to either |
| 213 | title case or upper case, and the rest to lower case. | 308 | title case or upper case, and the rest to lower case. |
| 214 | The argument may be a character or string. The result has the same type. | 309 | The argument may be a character or string. The result has the same type. |
| 215 | The argument object is not altered--the value is a copy. */) | 310 | The argument object is not altered--the value is a copy. If argument |
| 311 | is a character, characters which map to multiple code points when | ||
| 312 | cased, e.g. fi, are returned unchanged. */) | ||
| 216 | (Lisp_Object obj) | 313 | (Lisp_Object obj) |
| 217 | { | 314 | { |
| 218 | return casify_object (CASE_CAPITALIZE, obj); | 315 | return casify_object (CASE_CAPITALIZE, obj); |
| @@ -225,21 +322,28 @@ DEFUN ("upcase-initials", Fupcase_initials, Supcase_initials, 1, 1, 0, | |||
| 225 | This means that each word's first character is converted to either | 322 | This means that each word's first character is converted to either |
| 226 | title case or upper case, and the rest are left unchanged. | 323 | title case or upper case, and the rest are left unchanged. |
| 227 | The argument may be a character or string. The result has the same type. | 324 | The argument may be a character or string. The result has the same type. |
| 228 | The argument object is not altered--the value is a copy. */) | 325 | The argument object is not altered--the value is a copy. If argument |
| 326 | is a character, characters which map to multiple code points when | ||
| 327 | cased, e.g. fi, are returned unchanged. */) | ||
| 229 | (Lisp_Object obj) | 328 | (Lisp_Object obj) |
| 230 | { | 329 | { |
| 231 | return casify_object (CASE_CAPITALIZE_UP, obj); | 330 | return casify_object (CASE_CAPITALIZE_UP, obj); |
| 232 | } | 331 | } |
| 233 | 332 | ||
| 234 | /* Based on CTX, case region in a unibyte buffer from POS to *ENDP. Return | 333 | /* Based on CTX, case region in a unibyte buffer from *STARTP to *ENDP. |
| 235 | first position that has changed and save last position in *ENDP. If no | 334 | |
| 236 | characters were changed, return -1 and *ENDP is unspecified. */ | 335 | Save first and last positions that has changed in *STARTP and *ENDP |
| 336 | respectively. If no characters were changed, save -1 to *STARTP and leave | ||
| 337 | *ENDP unspecified. | ||
| 338 | |||
| 339 | Always return 0. This is so that interface of this function is the same as | ||
| 340 | do_casify_multibyte_region. */ | ||
| 237 | static ptrdiff_t | 341 | static ptrdiff_t |
| 238 | do_casify_unibyte_region (struct casing_context *ctx, | 342 | do_casify_unibyte_region (struct casing_context *ctx, |
| 239 | ptrdiff_t pos, ptrdiff_t *endp) | 343 | ptrdiff_t *startp, ptrdiff_t *endp) |
| 240 | { | 344 | { |
| 241 | ptrdiff_t first = -1, last = -1; /* Position of first and last changes. */ | 345 | ptrdiff_t first = -1, last = -1; /* Position of first and last changes. */ |
| 242 | ptrdiff_t end = *endp; | 346 | ptrdiff_t pos = *startp, end = *endp; |
| 243 | int ch, cased; | 347 | int ch, cased; |
| 244 | 348 | ||
| 245 | for (; pos < end; ++pos) | 349 | for (; pos < end; ++pos) |
| @@ -247,11 +351,11 @@ do_casify_unibyte_region (struct casing_context *ctx, | |||
| 247 | ch = FETCH_BYTE (pos); | 351 | ch = FETCH_BYTE (pos); |
| 248 | MAKE_CHAR_MULTIBYTE (ch); | 352 | MAKE_CHAR_MULTIBYTE (ch); |
| 249 | 353 | ||
| 250 | cased = case_character (ctx, ch); | 354 | cased = case_single_character (ctx, ch); |
| 251 | if (cased == ch) | 355 | if (cased == ch) |
| 252 | continue; | 356 | continue; |
| 253 | 357 | ||
| 254 | last = pos; | 358 | last = pos + 1; |
| 255 | if (first < 0) | 359 | if (first < 0) |
| 256 | first = pos; | 360 | first = pos; |
| 257 | 361 | ||
| @@ -259,88 +363,107 @@ do_casify_unibyte_region (struct casing_context *ctx, | |||
| 259 | FETCH_BYTE (pos) = cased; | 363 | FETCH_BYTE (pos) = cased; |
| 260 | } | 364 | } |
| 261 | 365 | ||
| 262 | *endp = last + 1; | 366 | *startp = first; |
| 263 | return first; | 367 | *endp = last; |
| 368 | return 0; | ||
| 264 | } | 369 | } |
| 265 | 370 | ||
| 266 | /* Based on CTX, case region in a multibyte buffer from POS to *ENDP. Return | 371 | /* Based on CTX, case region in a multibyte buffer from *STARTP to *ENDP. |
| 267 | first position that has changed and save last position in *ENDP. If no | 372 | |
| 268 | characters were changed, return -1 and *ENDP is unspecified. */ | 373 | Return number of added characters (may be negative if more characters were |
| 374 | deleted then inserted), save first and last positions that has changed in | ||
| 375 | *STARTP and *ENDP respectively. If no characters were changed, return 0, | ||
| 376 | save -1 to *STARTP and leave *ENDP unspecified. */ | ||
| 269 | static ptrdiff_t | 377 | static ptrdiff_t |
| 270 | do_casify_multibyte_region (struct casing_context *ctx, | 378 | do_casify_multibyte_region (struct casing_context *ctx, |
| 271 | ptrdiff_t pos, ptrdiff_t *endp) | 379 | ptrdiff_t *startp, ptrdiff_t *endp) |
| 272 | { | 380 | { |
| 273 | ptrdiff_t first = -1, last = -1; /* Position of first and last changes. */ | 381 | ptrdiff_t first = -1, last = -1; /* Position of first and last changes. */ |
| 274 | ptrdiff_t pos_byte = CHAR_TO_BYTE (pos), end = *endp; | 382 | ptrdiff_t pos = *startp, pos_byte = CHAR_TO_BYTE (pos), size = *endp - pos; |
| 275 | ptrdiff_t opoint = PT; | 383 | ptrdiff_t opoint = PT, added = 0; |
| 384 | struct casing_str_buf buf; | ||
| 276 | int ch, cased, len; | 385 | int ch, cased, len; |
| 277 | 386 | ||
| 278 | while (pos < end) | 387 | for (; size; --size) |
| 279 | { | 388 | { |
| 280 | ch = STRING_CHAR_AND_LENGTH (BYTE_POS_ADDR (pos_byte), len); | 389 | ch = STRING_CHAR_AND_LENGTH (BYTE_POS_ADDR (pos_byte), len); |
| 281 | cased = case_character (ctx, ch); | 390 | if (!case_character (&buf, ctx, ch)) |
| 282 | if (cased != ch) | ||
| 283 | { | 391 | { |
| 284 | last = pos; | 392 | pos_byte += len; |
| 285 | if (first < 0) | 393 | ++pos; |
| 286 | first = pos; | 394 | continue; |
| 395 | } | ||
| 287 | 396 | ||
| 288 | if (ASCII_CHAR_P (cased) && ASCII_CHAR_P (ch)) | 397 | last = pos + buf.len_chars; |
| 289 | FETCH_BYTE (pos_byte) = cased; | 398 | if (first < 0) |
| 290 | else | 399 | first = pos; |
| 291 | { | 400 | |
| 292 | unsigned char str[MAX_MULTIBYTE_LENGTH]; | 401 | if (buf.len_chars == 1 && buf.len_bytes == len) |
| 293 | int totlen = CHAR_STRING (cased, str); | 402 | memcpy (BYTE_POS_ADDR (pos_byte), buf.data, len); |
| 294 | if (len == totlen) | 403 | else |
| 295 | memcpy (BYTE_POS_ADDR (pos_byte), str, len); | 404 | { |
| 296 | else | 405 | /* Replace one character with the other(s), keeping text |
| 297 | /* Replace one character with the other(s), keeping text | 406 | properties the same. */ |
| 298 | properties the same. */ | 407 | replace_range_2 (pos, pos_byte, pos + 1, pos_byte + len, |
| 299 | replace_range_2 (pos, pos_byte, pos + 1, pos_byte + len, | 408 | (const char *) buf.data, buf.len_chars, |
| 300 | (char *) str, 9, totlen, 0); | 409 | buf.len_bytes, |
| 301 | len = totlen; | 410 | 0); |
| 302 | } | 411 | added += (ptrdiff_t) buf.len_chars - 1; |
| 412 | if (opoint > pos) | ||
| 413 | opoint += (ptrdiff_t) buf.len_chars - 1; | ||
| 303 | } | 414 | } |
| 304 | pos++; | 415 | |
| 305 | pos_byte += len; | 416 | pos_byte += buf.len_bytes; |
| 417 | pos += buf.len_chars; | ||
| 306 | } | 418 | } |
| 307 | 419 | ||
| 308 | if (PT != opoint) | 420 | if (PT != opoint) |
| 309 | TEMP_SET_PT_BOTH (opoint, CHAR_TO_BYTE (opoint)); | 421 | TEMP_SET_PT_BOTH (opoint, CHAR_TO_BYTE (opoint)); |
| 310 | 422 | ||
| 423 | *startp = first; | ||
| 311 | *endp = last; | 424 | *endp = last; |
| 312 | return first; | 425 | return added; |
| 313 | } | 426 | } |
| 314 | 427 | ||
| 315 | /* flag is CASE_UP, CASE_DOWN or CASE_CAPITALIZE or CASE_CAPITALIZE_UP. | 428 | /* flag is CASE_UP, CASE_DOWN or CASE_CAPITALIZE or CASE_CAPITALIZE_UP. b and |
| 316 | b and e specify range of buffer to operate on. */ | 429 | e specify range of buffer to operate on. Return character position of the |
| 317 | static void | 430 | end of the region after changes. */ |
| 431 | static ptrdiff_t | ||
| 318 | casify_region (enum case_action flag, Lisp_Object b, Lisp_Object e) | 432 | casify_region (enum case_action flag, Lisp_Object b, Lisp_Object e) |
| 319 | { | 433 | { |
| 434 | ptrdiff_t start, end, orig_end, added; | ||
| 320 | struct casing_context ctx; | 435 | struct casing_context ctx; |
| 321 | ptrdiff_t start, end; | ||
| 322 | |||
| 323 | if (EQ (b, e)) | ||
| 324 | /* Not modifying because nothing marked */ | ||
| 325 | return; | ||
| 326 | 436 | ||
| 327 | validate_region (&b, &e); | 437 | validate_region (&b, &e); |
| 328 | start = XFASTINT (b); | 438 | start = XFASTINT (b); |
| 329 | end = XFASTINT (e); | 439 | end = XFASTINT (e); |
| 440 | if (start == end) | ||
| 441 | /* Not modifying because nothing marked */ | ||
| 442 | return end; | ||
| 330 | modify_text (start, end); | 443 | modify_text (start, end); |
| 331 | record_change (start, end - start); | ||
| 332 | prepare_casing_context (&ctx, flag, true); | 444 | prepare_casing_context (&ctx, flag, true); |
| 333 | 445 | ||
| 446 | orig_end = end; | ||
| 447 | record_delete (start, make_buffer_string (start, end, true), false); | ||
| 334 | if (NILP (BVAR (current_buffer, enable_multibyte_characters))) | 448 | if (NILP (BVAR (current_buffer, enable_multibyte_characters))) |
| 335 | start = do_casify_unibyte_region (&ctx, start, &end); | 449 | { |
| 450 | record_insert (start, end - start); | ||
| 451 | added = do_casify_unibyte_region (&ctx, &start, &end); | ||
| 452 | } | ||
| 336 | else | 453 | else |
| 337 | start = do_casify_multibyte_region (&ctx, start, &end); | 454 | { |
| 455 | ptrdiff_t len = end - start, ostart = start; | ||
| 456 | added = do_casify_multibyte_region (&ctx, &start, &end); | ||
| 457 | record_insert (ostart, len + added); | ||
| 458 | } | ||
| 338 | 459 | ||
| 339 | if (start >= 0) | 460 | if (start >= 0) |
| 340 | { | 461 | { |
| 341 | signal_after_change (start, end + 1 - start, end + 1 - start); | 462 | signal_after_change (start, end - start - added, end - start); |
| 342 | update_compositions (start, end + 1, CHECK_ALL); | 463 | update_compositions (start, end, CHECK_ALL); |
| 343 | } | 464 | } |
| 465 | |||
| 466 | return orig_end + added; | ||
| 344 | } | 467 | } |
| 345 | 468 | ||
| 346 | DEFUN ("upcase-region", Fupcase_region, Supcase_region, 2, 3, | 469 | DEFUN ("upcase-region", Fupcase_region, Supcase_region, 2, 3, |
| @@ -432,9 +555,7 @@ casify_word (enum case_action flag, Lisp_Object arg) | |||
| 432 | ptrdiff_t farend = scan_words (PT, XINT (arg)); | 555 | ptrdiff_t farend = scan_words (PT, XINT (arg)); |
| 433 | if (!farend) | 556 | if (!farend) |
| 434 | farend = XINT (arg) <= 0 ? BEGV : ZV; | 557 | farend = XINT (arg) <= 0 ? BEGV : ZV; |
| 435 | ptrdiff_t newpoint = max (PT, farend); | 558 | SET_PT (casify_region (flag, make_number (PT), make_number (farend))); |
| 436 | casify_region (flag, make_number (PT), make_number (farend)); | ||
| 437 | SET_PT (newpoint); | ||
| 438 | return Qnil; | 559 | return Qnil; |
| 439 | } | 560 | } |
| 440 | 561 | ||