diff options
| author | Michal Nazarewicz | 2016-09-19 00:52:47 +0200 |
|---|---|---|
| committer | Michal Nazarewicz | 2017-04-06 20:54:58 +0200 |
| commit | c1fa07222e9c76964d1261c31b50f1e399554fa2 (patch) | |
| tree | d1d48a5fd44b2d2abd049a70d17984a76c022c4f /src/casefiddle.c | |
| parent | b3b9b258c4026baa1cad3f2e617f1a637fc8d205 (diff) | |
| download | emacs-c1fa07222e9c76964d1261c31b50f1e399554fa2.tar.gz emacs-c1fa07222e9c76964d1261c31b50f1e399554fa2.zip | |
Implement special sigma casing rule (bug#24603)
In Greek, the lower case sigma has two forms, and which one is used depends
on its position in the word. Implement the logic that selects the right form.
* src/casefiddle.c (struct casing_context, case_character_impl): Don’t
force inword to be false when flag is CASE_UP and true when flag is
CASE_DOWN. For final sigma detection we need this information tracked
reliably.
(CAPITAL_SIGMA, SMALL_SIGMA, SMALL_FINAL_SIGMA): New macros defining
Unicode code point of different forms of sigma letter.
(case_character): Implement support for final sigma casing.
(do_casify_multibyte_string, do_casify_multibyte_region): Update after
changes to case_character.
* test/src/casefiddle-tests.el (casefiddle-tests-casing): Add test
cases for final sigma.
Diffstat (limited to 'src/casefiddle.c')
| -rw-r--r-- | src/casefiddle.c | 73 |
1 files changed, 55 insertions, 18 deletions
diff --git a/src/casefiddle.c b/src/casefiddle.c index 10674d963ec..6fe584b8302 100644 --- a/src/casefiddle.c +++ b/src/casefiddle.c | |||
| @@ -46,9 +46,7 @@ struct casing_context { | |||
| 46 | When run on a buffer, syntax_prefix_flag_p is taken into account when | 46 | When run on a buffer, syntax_prefix_flag_p is taken into account when |
| 47 | determined inword flag. */ | 47 | determined inword flag. */ |
| 48 | bool inbuffer; | 48 | bool inbuffer; |
| 49 | /* Conceptually, this denotes whether we are inside of a word except | 49 | /* Whether we are inside of a word. */ |
| 50 | that if flag is CASE_UP it’s always false and if flag is CASE_DOWN | ||
| 51 | this is always true. */ | ||
| 52 | bool inword; | 50 | bool inword; |
| 53 | }; | 51 | }; |
| 54 | 52 | ||
| @@ -59,7 +57,7 @@ prepare_casing_context (struct casing_context *ctx, | |||
| 59 | { | 57 | { |
| 60 | ctx->flag = flag; | 58 | ctx->flag = flag; |
| 61 | ctx->inbuffer = inbuffer; | 59 | ctx->inbuffer = inbuffer; |
| 62 | ctx->inword = flag == CASE_DOWN; | 60 | ctx->inword = false; |
| 63 | ctx->titlecase_char_table = (int)flag < (int)CASE_CAPITALIZE ? Qnil : | 61 | ctx->titlecase_char_table = (int)flag < (int)CASE_CAPITALIZE ? Qnil : |
| 64 | uniprop_table (intern_c_string ("titlecase")); | 62 | uniprop_table (intern_c_string ("titlecase")); |
| 65 | ctx->specialcase_char_tables[CASE_UP] = flag == CASE_DOWN ? Qnil : | 63 | ctx->specialcase_char_tables[CASE_UP] = flag == CASE_DOWN ? Qnil : |
| @@ -101,15 +99,16 @@ case_character_impl (struct casing_str_buf *buf, | |||
| 101 | 99 | ||
| 102 | /* Update inword state */ | 100 | /* Update inword state */ |
| 103 | was_inword = ctx->inword; | 101 | was_inword = ctx->inword; |
| 104 | if ((int) ctx->flag >= (int) CASE_CAPITALIZE) | 102 | ctx->inword = SYNTAX (ch) == Sword && |
| 105 | ctx->inword = SYNTAX (ch) == Sword && | 103 | (!ctx->inbuffer || was_inword || !syntax_prefix_flag_p (ch)); |
| 106 | (!ctx->inbuffer || was_inword || !syntax_prefix_flag_p (ch)); | ||
| 107 | 104 | ||
| 108 | /* Normalise flag so its one of CASE_UP, CASE_DOWN or CASE_CAPITALIZE. */ | 105 | /* Normalise flag so its one of CASE_UP, CASE_DOWN or CASE_CAPITALIZE. */ |
| 109 | if (!was_inword) | 106 | if (ctx->flag == CASE_CAPITALIZE) |
| 110 | flag = ctx->flag == CASE_UP ? CASE_UP : CASE_CAPITALIZE; | 107 | flag = (enum case_action)((int)ctx->flag - was_inword); |
| 111 | else if (ctx->flag != CASE_CAPITALIZE_UP) | 108 | else if (ctx->flag != CASE_CAPITALIZE_UP) |
| 112 | flag = CASE_DOWN; | 109 | flag = ctx->flag; |
| 110 | else if (!was_inword) | ||
| 111 | flag = CASE_CAPITALIZE; | ||
| 113 | else | 112 | else |
| 114 | { | 113 | { |
| 115 | cased = ch; | 114 | cased = ch; |
| @@ -150,7 +149,18 @@ case_character_impl (struct casing_str_buf *buf, | |||
| 150 | buf->len_bytes = CHAR_STRING (cased, buf->data); | 149 | buf->len_bytes = CHAR_STRING (cased, buf->data); |
| 151 | return cased != ch; | 150 | return cased != ch; |
| 152 | } | 151 | } |
| 152 | |||
| 153 | /* In Greek, lower case sigma has two forms: one when used in the middle and one | ||
| 154 | when used at the end of a word. Below is to help handle those cases when | ||
| 155 | casing. | ||
| 156 | |||
| 157 | The rule does not conflict with any other casing rules so while it is | ||
| 158 | a conditional one, it is independent on language. */ | ||
| 153 | 159 | ||
| 160 | #define CAPITAL_SIGMA 0x03A3 | ||
| 161 | #define SMALL_SIGMA 0x03C3 | ||
| 162 | #define SMALL_FINAL_SIGMA 0x03C2 | ||
| 163 | |||
| 154 | /* Based on CTX, case character CH accordingly. Update CTX as necessary. | 164 | /* Based on CTX, case character CH accordingly. Update CTX as necessary. |
| 155 | Return cased character. | 165 | Return cased character. |
| 156 | 166 | ||
| @@ -164,12 +174,34 @@ case_single_character (struct casing_context *ctx, int ch) | |||
| 164 | } | 174 | } |
| 165 | 175 | ||
| 166 | /* Save in BUF result of casing character CH. Return whether casing changed the | 176 | /* Save in BUF result of casing character CH. Return whether casing changed the |
| 167 | character. This is like case_single_character but also handles one-to-many | 177 | character. |
| 168 | casing rules. */ | 178 | |
| 169 | static inline bool | 179 | If not-NULL, NEXT points to the next character in the cased string. If NULL, |
| 170 | case_character (struct casing_str_buf *buf, struct casing_context *ctx, int ch) | 180 | it is assumed current character is the last one being cased. This is used to |
| 181 | apply some rules which depend on proceeding state. | ||
| 182 | |||
| 183 | This is like case_single_character but also handles one-to-many casing | ||
| 184 | rules. */ | ||
| 185 | static bool | ||
| 186 | case_character (struct casing_str_buf *buf, struct casing_context *ctx, | ||
| 187 | int ch, const unsigned char *next) | ||
| 171 | { | 188 | { |
| 172 | return case_character_impl (buf, ctx, ch); | 189 | bool changed, was_inword; |
| 190 | |||
| 191 | was_inword = ctx->inword; | ||
| 192 | changed = case_character_impl (buf, ctx, ch); | ||
| 193 | |||
| 194 | /* If we have just down-cased a capital sigma and the next character no longer | ||
| 195 | has a word syntax (i.e. current character is end of word), use final | ||
| 196 | sigma. */ | ||
| 197 | if (was_inword && ch == CAPITAL_SIGMA && changed && | ||
| 198 | (!next || SYNTAX (STRING_CHAR (next)) != Sword)) | ||
| 199 | { | ||
| 200 | buf->len_bytes = CHAR_STRING (SMALL_FINAL_SIGMA, buf->data); | ||
| 201 | buf->len_chars = 1; | ||
| 202 | } | ||
| 203 | |||
| 204 | return changed; | ||
| 173 | } | 205 | } |
| 174 | 206 | ||
| 175 | static Lisp_Object | 207 | static Lisp_Object |
| @@ -231,7 +263,7 @@ do_casify_multibyte_string (struct casing_context *ctx, Lisp_Object obj) | |||
| 231 | if (dst_end - o < sizeof(struct casing_str_buf)) | 263 | if (dst_end - o < sizeof(struct casing_str_buf)) |
| 232 | string_overflow (); | 264 | string_overflow (); |
| 233 | ch = STRING_CHAR_ADVANCE (src); | 265 | ch = STRING_CHAR_ADVANCE (src); |
| 234 | case_character ((void *)o, ctx, ch); | 266 | case_character ((void *)o, ctx, ch, size > 1 ? src : NULL); |
| 235 | n += ((struct casing_str_buf *)o)->len_chars; | 267 | n += ((struct casing_str_buf *)o)->len_chars; |
| 236 | o += ((struct casing_str_buf *)o)->len_bytes; | 268 | o += ((struct casing_str_buf *)o)->len_bytes; |
| 237 | } | 269 | } |
| @@ -382,12 +414,17 @@ do_casify_multibyte_region (struct casing_context *ctx, | |||
| 382 | ptrdiff_t pos = *startp, pos_byte = CHAR_TO_BYTE (pos), size = *endp - pos; | 414 | ptrdiff_t pos = *startp, pos_byte = CHAR_TO_BYTE (pos), size = *endp - pos; |
| 383 | ptrdiff_t opoint = PT, added = 0; | 415 | ptrdiff_t opoint = PT, added = 0; |
| 384 | struct casing_str_buf buf; | 416 | struct casing_str_buf buf; |
| 385 | int ch, cased, len; | 417 | bool changed; |
| 418 | int ch, len; | ||
| 386 | 419 | ||
| 387 | for (; size; --size) | 420 | for (; size; --size) |
| 388 | { | 421 | { |
| 389 | ch = STRING_CHAR_AND_LENGTH (BYTE_POS_ADDR (pos_byte), len); | 422 | ch = STRING_CHAR_AND_LENGTH (BYTE_POS_ADDR (pos_byte), len); |
| 390 | if (!case_character (&buf, ctx, ch)) | 423 | changed = case_character ( |
| 424 | &buf, ctx, ch, | ||
| 425 | size > 1 ? BYTE_POS_ADDR (pos_byte + len) : NULL); | ||
| 426 | |||
| 427 | if (!changed) | ||
| 391 | { | 428 | { |
| 392 | pos_byte += len; | 429 | pos_byte += len; |
| 393 | ++pos; | 430 | ++pos; |