aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMichal Nazarewicz2016-09-19 00:52:47 +0200
committerMichal Nazarewicz2017-04-06 20:54:58 +0200
commitc1fa07222e9c76964d1261c31b50f1e399554fa2 (patch)
treed1d48a5fd44b2d2abd049a70d17984a76c022c4f
parentb3b9b258c4026baa1cad3f2e617f1a637fc8d205 (diff)
downloademacs-c1fa07222e9c76964d1261c31b50f1e399554fa2.tar.gz
emacs-c1fa07222e9c76964d1261c31b50f1e399554fa2.zip
Implement special sigma casing rule (bug#24603)
In Greek, a sigma character has two lower case forms which depend on their position in the word. Implement the logic that determines which form to use. * src/casefiddle.c (struct casing_context, case_character_impl): Don’t assume inword is false when flag is CASE_UP and true when flag is CASE_DOWN. For final sigma detection we need this information tracked reliably. (CAPITAL_SIGMA, SMALL_SIGMA, SMALL_FINAL_SIGMA): New macros defining the Unicode code points of the different forms of the letter sigma. (case_character): Implement support for final sigma casing. (do_casify_multibyte_string, do_casify_multibyte_region): Update after changes to case_character. * test/src/casefiddle-tests.el (casefiddle-tests-casing): Add test cases for final sigma.
-rw-r--r--etc/NEWS5
-rw-r--r--src/casefiddle.c73
-rw-r--r--test/src/casefiddle-tests.el15
3 files changed, 69 insertions, 24 deletions
diff --git a/etc/NEWS b/etc/NEWS
index cc02e07f562..3574d0fb694 100644
--- a/etc/NEWS
+++ b/etc/NEWS
@@ -367,6 +367,11 @@ of incorrect DŽungla).
367*** Characters which turn into multiple ones when cased are correctly handled. 367*** Characters which turn into multiple ones when cased are correctly handled.
368For example, fi ligature is converted to FI when upper cased. 368For example, fi ligature is converted to FI when upper cased.
369 369
370*** Greek small sigma is correctly handled when at the end of the word.
371Strings such as ΌΣΟΣ are now correctly converted to Όσος when
372capitalized instead of incorrect Όσοσ (compare lowercase sigma at the
373end of the word).
374
370 375
371* Changes in Specialized Modes and Packages in Emacs 26.1 376* Changes in Specialized Modes and Packages in Emacs 26.1
372 377
diff --git a/src/casefiddle.c b/src/casefiddle.c
index 10674d963ec..6fe584b8302 100644
--- a/src/casefiddle.c
+++ b/src/casefiddle.c
@@ -46,9 +46,7 @@ struct casing_context {
46 When run on a buffer, syntax_prefix_flag_p is taken into account when 46 When run on a buffer, syntax_prefix_flag_p is taken into account when
47 determining the inword flag. */ 47 determining the inword flag. */
48 bool inbuffer; 48 bool inbuffer;
49 /* Conceptually, this denotes whether we are inside of a word except 49 /* Whether we are inside of a word. */
50 that if flag is CASE_UP it’s always false and if flag is CASE_DOWN
51 this is always true. */
52 bool inword; 50 bool inword;
53}; 51};
54 52
@@ -59,7 +57,7 @@ prepare_casing_context (struct casing_context *ctx,
59{ 57{
60 ctx->flag = flag; 58 ctx->flag = flag;
61 ctx->inbuffer = inbuffer; 59 ctx->inbuffer = inbuffer;
62 ctx->inword = flag == CASE_DOWN; 60 ctx->inword = false;
63 ctx->titlecase_char_table = (int)flag < (int)CASE_CAPITALIZE ? Qnil : 61 ctx->titlecase_char_table = (int)flag < (int)CASE_CAPITALIZE ? Qnil :
64 uniprop_table (intern_c_string ("titlecase")); 62 uniprop_table (intern_c_string ("titlecase"));
65 ctx->specialcase_char_tables[CASE_UP] = flag == CASE_DOWN ? Qnil : 63 ctx->specialcase_char_tables[CASE_UP] = flag == CASE_DOWN ? Qnil :
@@ -101,15 +99,16 @@ case_character_impl (struct casing_str_buf *buf,
101 99
102 /* Update inword state */ 100 /* Update inword state */
103 was_inword = ctx->inword; 101 was_inword = ctx->inword;
104 if ((int) ctx->flag >= (int) CASE_CAPITALIZE) 102 ctx->inword = SYNTAX (ch) == Sword &&
105 ctx->inword = SYNTAX (ch) == Sword && 103 (!ctx->inbuffer || was_inword || !syntax_prefix_flag_p (ch));
106 (!ctx->inbuffer || was_inword || !syntax_prefix_flag_p (ch));
107 104
108 /* Normalise flag so it is one of CASE_UP, CASE_DOWN or CASE_CAPITALIZE. */ 105 /* Normalise flag so it is one of CASE_UP, CASE_DOWN or CASE_CAPITALIZE. */
109 if (!was_inword) 106 if (ctx->flag == CASE_CAPITALIZE)
110 flag = ctx->flag == CASE_UP ? CASE_UP : CASE_CAPITALIZE; 107 flag = (enum case_action)((int)ctx->flag - was_inword);
111 else if (ctx->flag != CASE_CAPITALIZE_UP) 108 else if (ctx->flag != CASE_CAPITALIZE_UP)
112 flag = CASE_DOWN; 109 flag = ctx->flag;
110 else if (!was_inword)
111 flag = CASE_CAPITALIZE;
113 else 112 else
114 { 113 {
115 cased = ch; 114 cased = ch;
@@ -150,7 +149,18 @@ case_character_impl (struct casing_str_buf *buf,
150 buf->len_bytes = CHAR_STRING (cased, buf->data); 149 buf->len_bytes = CHAR_STRING (cased, buf->data);
151 return cased != ch; 150 return cased != ch;
152} 151}
152
153/* In Greek, lower case sigma has two forms: one when used in the middle and one
154 when used at the end of a word. Below is to help handle those cases when
155 casing.
156
157 The rule does not conflict with any other casing rules so while it is
158 a conditional one, it is independent of language. */
153 159
160#define CAPITAL_SIGMA 0x03A3
161#define SMALL_SIGMA 0x03C3
162#define SMALL_FINAL_SIGMA 0x03C2
163
154/* Based on CTX, case character CH accordingly. Update CTX as necessary. 164/* Based on CTX, case character CH accordingly. Update CTX as necessary.
155 Return cased character. 165 Return cased character.
156 166
@@ -164,12 +174,34 @@ case_single_character (struct casing_context *ctx, int ch)
164} 174}
165 175
166/* Save in BUF result of casing character CH. Return whether casing changed the 176/* Save in BUF result of casing character CH. Return whether casing changed the
167 character. This is like case_single_character but also handles one-to-many 177 character.
168 casing rules. */ 178
169static inline bool 179 If non-NULL, NEXT points to the next character in the string being cased. If NULL,
170case_character (struct casing_str_buf *buf, struct casing_context *ctx, int ch) 180 the current character is assumed to be the last one being cased. This is used to
181 apply rules which depend on the character that follows.
182
183 This is like case_single_character but also handles one-to-many casing
184 rules. */
185static bool
186case_character (struct casing_str_buf *buf, struct casing_context *ctx,
187 int ch, const unsigned char *next)
171{ 188{
172 return case_character_impl (buf, ctx, ch); 189 bool changed, was_inword;
190
191 was_inword = ctx->inword;
192 changed = case_character_impl (buf, ctx, ch);
193
194 /* If we have just down-cased a capital sigma and the next character no longer
195 has a word syntax (i.e. current character is end of word), use final
196 sigma. */
197 if (was_inword && ch == CAPITAL_SIGMA && changed &&
198 (!next || SYNTAX (STRING_CHAR (next)) != Sword))
199 {
200 buf->len_bytes = CHAR_STRING (SMALL_FINAL_SIGMA, buf->data);
201 buf->len_chars = 1;
202 }
203
204 return changed;
173} 205}
174 206
175static Lisp_Object 207static Lisp_Object
@@ -231,7 +263,7 @@ do_casify_multibyte_string (struct casing_context *ctx, Lisp_Object obj)
231 if (dst_end - o < sizeof(struct casing_str_buf)) 263 if (dst_end - o < sizeof(struct casing_str_buf))
232 string_overflow (); 264 string_overflow ();
233 ch = STRING_CHAR_ADVANCE (src); 265 ch = STRING_CHAR_ADVANCE (src);
234 case_character ((void *)o, ctx, ch); 266 case_character ((void *)o, ctx, ch, size > 1 ? src : NULL);
235 n += ((struct casing_str_buf *)o)->len_chars; 267 n += ((struct casing_str_buf *)o)->len_chars;
236 o += ((struct casing_str_buf *)o)->len_bytes; 268 o += ((struct casing_str_buf *)o)->len_bytes;
237 } 269 }
@@ -382,12 +414,17 @@ do_casify_multibyte_region (struct casing_context *ctx,
382 ptrdiff_t pos = *startp, pos_byte = CHAR_TO_BYTE (pos), size = *endp - pos; 414 ptrdiff_t pos = *startp, pos_byte = CHAR_TO_BYTE (pos), size = *endp - pos;
383 ptrdiff_t opoint = PT, added = 0; 415 ptrdiff_t opoint = PT, added = 0;
384 struct casing_str_buf buf; 416 struct casing_str_buf buf;
385 int ch, cased, len; 417 bool changed;
418 int ch, len;
386 419
387 for (; size; --size) 420 for (; size; --size)
388 { 421 {
389 ch = STRING_CHAR_AND_LENGTH (BYTE_POS_ADDR (pos_byte), len); 422 ch = STRING_CHAR_AND_LENGTH (BYTE_POS_ADDR (pos_byte), len);
390 if (!case_character (&buf, ctx, ch)) 423 changed = case_character (
424 &buf, ctx, ch,
425 size > 1 ? BYTE_POS_ADDR (pos_byte + len) : NULL);
426
427 if (!changed)
391 { 428 {
392 pos_byte += len; 429 pos_byte += len;
393 ++pos; 430 ++pos;
diff --git a/test/src/casefiddle-tests.el b/test/src/casefiddle-tests.el
index dd260633f4c..234d233c71a 100644
--- a/test/src/casefiddle-tests.el
+++ b/test/src/casefiddle-tests.el
@@ -195,13 +195,16 @@
195 ("define" "DEFINE" "define" "Define" "Define") 195 ("define" "DEFINE" "define" "Define" "Define")
196 ("fish" "FISH" "fish" "Fish" "Fish") 196 ("fish" "FISH" "fish" "Fish" "Fish")
197 ("Straße" "STRASSE" "straße" "Straße" "Straße") 197 ("Straße" "STRASSE" "straße" "Straße" "Straße")
198 ;; FIXME(bug#24603): Everything below is broken at the moment.
199 ;; Here’s what should happen:
200 ;;("ΌΣΟΣ" "ΌΣΟΣ" "όσος" "Όσος" "Όσος")
201 ;; And here’s what is actually happening:
202 ("ΌΣΟΣ" "ΌΣΟΣ" "όσοσ" "Όσοσ" "ΌΣΟΣ")
203 198
204 ("όσος" "ΌΣΟΣ" "όσος" "Όσος" "Όσος")))))) 199 ;; The word repeated twice to test behaviour at the end of a word
200 ;; inside of an input string as well as at the end of the string.
201 ("ΌΣΟΣ ΌΣΟΣ" "ΌΣΟΣ ΌΣΟΣ" "όσος όσος" "Όσος Όσος" "ΌΣΟΣ ΌΣΟΣ")
202 ;; What should be done with sole sigma? It is ‘final’ but on the
203 ;; other hand it does not form a word. We’re using regular sigma.
204 ("Σ Σ" "Σ Σ" "σ σ" "Σ Σ" "Σ Σ")
205 ("όσος" "ΌΣΟΣ" "όσος" "Όσος" "Όσος")
206 ;; If sigma is already lower case, we don’t want to change it.
207 ("όσοσ" "ΌΣΟΣ" "όσοσ" "Όσοσ" "Όσοσ"))))))
205 208
206(ert-deftest casefiddle-tests-casing-byte8 () 209(ert-deftest casefiddle-tests-casing-byte8 ()
207 (should-not 210 (should-not