Implement special sigma casing rule (bug#24603)

In Greek, a sigma character has two lower case forms which depend on their position in the word. Implement the logic that determines which form to use. * src/casefiddle.c (struct casing_context, case_character_impl): Don’t assume inword is true when flag is CASE_UP and false when flag is CASE_DOWN. For final sigma detection we need this information tracked reliably. (CAPITAL_SIGMA, SMALL_SIGMA, SMALL_FINAL_SIGMA): New macros defining the Unicode code points of the different forms of the letter sigma. (case_character): Implement support for final sigma casing. (do_casify_multibyte_string, do_casify_multibyte_region): Update after changes to case_character. * test/src/casefiddle-tests.el (casefiddle-tests-casing): Add test cases for final sigma.
author: Michal Nazarewicz 2016-09-19 00:52:47 +0200
committer: Michal Nazarewicz 2017-04-06 20:54:58 +0200
commit: c1fa07222e9c76964d1261c31b50f1e399554fa2 (patch)
tree: d1d48a5fd44b2d2abd049a70d17984a76c022c4f
parent: b3b9b258c4026baa1cad3f2e617f1a637fc8d205 (diff)
download: emacs-c1fa07222e9c76964d1261c31b50f1e399554fa2.tar.gz
emacs-c1fa07222e9c76964d1261c31b50f1e399554fa2.zip
3 files changed, 69 insertions, 24 deletions
diff --git a/etc/NEWS b/etc/NEWS
index cc02e07f562..3574d0fb694 100644
--- a/etc/NEWS
+++ b/etc/NEWS
@@ -367,6 +367,11 @@ of incorrect Ǆungla).
 *** Characters which turn into multiple ones when cased are correctly handled.
 For example, ﬁ ligature is converted to FI when upper cased.
+*** Greek small sigma is correctly handled when at the end of the word.
+Strings such as ΌΣΟΣ are now correctly converted to Όσος when
+capitalized instead of incorrect Όσοσ (compare lowercase sigma at the
+end of the word).
 * Changes in Specialized Modes and Packages in Emacs 26.1
diff --git a/src/casefiddle.c b/src/casefiddle.c
index 10674d963ec..6fe584b8302 100644
--- a/src/casefiddle.c
+++ b/src/casefiddle.c
@@ -46,9 +46,7 @@ struct casing_context {
     When run on a buffer, syntax_prefix_flag_p is taken into account when
     determined inword flag. */
  bool inbuffer;
-  /* Conceptually, this denotes whether we are inside of a word except
+  /* Whether we are inside of a word. */
-     that if flag is CASE_UP it’s always false and if flag is CASE_DOWN
-     this is always true. */
  bool inword;
 };
@@ -59,7 +57,7 @@ prepare_casing_context (struct casing_context *ctx,
 {
  ctx->flag = flag;
  ctx->inbuffer = inbuffer;
-  ctx->inword = flag == CASE_DOWN;
+  ctx->inword = false;
  ctx->titlecase_char_table = (int)flag < (int)CASE_CAPITALIZE ? Qnil :
    uniprop_table (intern_c_string ("titlecase"));
  ctx->specialcase_char_tables[CASE_UP] = flag == CASE_DOWN ? Qnil :
@@ -101,15 +99,16 @@ case_character_impl (struct casing_str_buf *buf,
  /* Update inword state */
  was_inword = ctx->inword;
-  if ((int) ctx->flag >= (int) CASE_CAPITALIZE)
+  ctx->inword = SYNTAX (ch) == Sword &&
-    ctx->inword = SYNTAX (ch) == Sword &&
+    (!ctx->inbuffer || was_inword || !syntax_prefix_flag_p (ch));
-      (!ctx->inbuffer || was_inword || !syntax_prefix_flag_p (ch));
  /* Normalise flag so its one of CASE_UP, CASE_DOWN or CASE_CAPITALIZE. */
-  if (!was_inword)
+  if (ctx->flag == CASE_CAPITALIZE)
-    flag = ctx->flag == CASE_UP ? CASE_UP : CASE_CAPITALIZE;
+    flag = (enum case_action)((int)ctx->flag - was_inword);
  else if (ctx->flag != CASE_CAPITALIZE_UP)
-    flag = CASE_DOWN;
+    flag = ctx->flag;
+  else if (!was_inword)
+    flag = CASE_CAPITALIZE;
  else
    {
      cased = ch;
@@ -150,7 +149,18 @@ case_character_impl (struct casing_str_buf *buf,
  buf->len_bytes = CHAR_STRING (cased, buf->data);
  return cased != ch;
 }
+/* In Greek, lower case sigma has two forms: one when used in the middle and one
+   when used at the end of a word.  Below is to help handle those cases when
+   casing.
+   The rule does not conflict with any other casing rules so while it is
+   a conditional one, it is independent on language. */
+#define CAPITAL_SIGMA     0x03A3
+#define SMALL_SIGMA       0x03C3
+#define SMALL_FINAL_SIGMA 0x03C2
 /* Based on CTX, case character CH accordingly.  Update CTX as necessary.
   Return cased character.
@@ -164,12 +174,34 @@ case_single_character (struct casing_context *ctx, int ch)
 }
 /* Save in BUF result of casing character CH.  Return whether casing changed the
-   character.  This is like case_single_character but also handles one-to-many
+   character.
-   casing rules. */
-static inline bool
+   If not-NULL, NEXT points to the next character in the cased string.  If NULL,
-case_character (struct casing_str_buf *buf, struct casing_context *ctx, int ch)
+   it is assumed current character is the last one being cased.  This is used to
+   apply some rules which depend on proceeding state.
+   This is like case_single_character but also handles one-to-many casing
+   rules. */
+static bool
+case_character (struct casing_str_buf *buf, struct casing_context *ctx,
+                int ch, const unsigned char *next)
 {
-  return case_character_impl (buf, ctx, ch);
+  bool changed, was_inword;
+  was_inword = ctx->inword;
+  changed = case_character_impl (buf, ctx, ch);
+  /* If we have just down-cased a capital sigma and the next character no longer
+     has a word syntax (i.e. current character is end of word), use final
+     sigma. */
+  if (was_inword && ch == CAPITAL_SIGMA && changed &&
+      (!next || SYNTAX (STRING_CHAR (next)) != Sword))
+    {
+      buf->len_bytes = CHAR_STRING (SMALL_FINAL_SIGMA, buf->data);
+      buf->len_chars = 1;
+    }
+  return changed;
 }
 static Lisp_Object
@@ -231,7 +263,7 @@ do_casify_multibyte_string (struct casing_context *ctx, Lisp_Object obj)
      if (dst_end - o < sizeof(struct casing_str_buf))
        string_overflow ();
      ch = STRING_CHAR_ADVANCE (src);
-      case_character ((void *)o, ctx, ch);
+      case_character ((void *)o, ctx, ch, size > 1 ? src : NULL);
      n += ((struct casing_str_buf *)o)->len_chars;
      o += ((struct casing_str_buf *)o)->len_bytes;
    }
@@ -382,12 +414,17 @@ do_casify_multibyte_region (struct casing_context *ctx,
  ptrdiff_t pos = *startp, pos_byte = CHAR_TO_BYTE (pos), size = *endp - pos;
  ptrdiff_t opoint = PT, added = 0;
  struct casing_str_buf buf;
-  int ch, cased, len;
+  bool changed;
+  int ch, len;
  for (; size; --size)
    {
      ch = STRING_CHAR_AND_LENGTH (BYTE_POS_ADDR (pos_byte), len);
-      if (!case_character (&buf, ctx, ch))
+      changed = case_character (
+          &buf, ctx, ch,
+          size > 1 ? BYTE_POS_ADDR (pos_byte + len) : NULL);
+      if (!changed)
        {
          pos_byte += len;
          ++pos;
diff --git a/test/src/casefiddle-tests.el b/test/src/casefiddle-tests.el
index dd260633f4c..234d233c71a 100644
--- a/test/src/casefiddle-tests.el
+++ b/test/src/casefiddle-tests.el
@@ -195,13 +195,16 @@
        ("deﬁne" "DEFINE" "deﬁne" "Deﬁne" "Deﬁne")
        ("ﬁsh" "FISH" "ﬁsh" "Fish" "Fish")
        ("Straße" "STRASSE" "straße" "Straße" "Straße")
-        ;; FIXME(bug#24603): Everything below is broken at the moment.
-        ;; Here’s what should happen:
-        ;;("ΌΣΟΣ" "ΌΣΟΣ" "όσος" "Όσος" "Όσος")
-        ;; And here’s what is actually happening:
-        ("ΌΣΟΣ" "ΌΣΟΣ" "όσοσ" "Όσοσ" "ΌΣΟΣ")
-        ("όσος" "ΌΣΟΣ" "όσος" "Όσος" "Όσος"))))))
+        ;; The word repeated twice to test behaviour at the end of a word
+        ;; inside of an input string as well as at the end of the string.
+        ("ΌΣΟΣ ΌΣΟΣ" "ΌΣΟΣ ΌΣΟΣ" "όσος όσος" "Όσος Όσος" "ΌΣΟΣ ΌΣΟΣ")
+        ;; What should be done with sole sigma?  It is ‘final’ but on the
+        ;; other hand it does not form a word.  We’re using regular sigma.
+        ("Σ Σ" "Σ Σ" "σ σ" "Σ Σ" "Σ Σ")
+        ("όσος" "ΌΣΟΣ" "όσος" "Όσος" "Όσος")
+        ;; If sigma is already lower case, we don’t want to change it.
+        ("όσοσ" "ΌΣΟΣ" "όσοσ" "Όσοσ" "Όσοσ"))))))
 (ert-deftest casefiddle-tests-casing-byte8 ()
  (should-not
author	Michal Nazarewicz	2016-09-19 00:52:47 +0200
committer	Michal Nazarewicz	2017-04-06 20:54:58 +0200
commit	c1fa07222e9c76964d1261c31b50f1e399554fa2 (patch)
tree	d1d48a5fd44b2d2abd049a70d17984a76c022c4f
parent	b3b9b258c4026baa1cad3f2e617f1a637fc8d205 (diff)
download	emacs-c1fa07222e9c76964d1261c31b50f1e399554fa2.tar.gz emacs-c1fa07222e9c76964d1261c31b50f1e399554fa2.zip

diff --git a/etc/NEWS b/etc/NEWS index cc02e07f562..3574d0fb694 100644 --- a/etc/NEWS +++ b/etc/NEWS
@@ -367,6 +367,11 @@ of incorrect Ǆungla).
367	*** Characters which turn into multiple ones when cased are correctly handled.	367	*** Characters which turn into multiple ones when cased are correctly handled.
368	For example, ﬁ ligature is converted to FI when upper cased.	368	For example, ﬁ ligature is converted to FI when upper cased.
369		369
		370	*** Greek small sigma is correctly handled when at the end of the word.
		371	Strings such as ΌΣΟΣ are now correctly converted to Όσος when
		372	capitalized instead of incorrect Όσοσ (compare lowercase sigma at the
		373	end of the word).
		374
370		375
371	* Changes in Specialized Modes and Packages in Emacs 26.1	376	* Changes in Specialized Modes and Packages in Emacs 26.1
372		377


diff --git a/src/casefiddle.c b/src/casefiddle.c index 10674d963ec..6fe584b8302 100644 --- a/src/casefiddle.c +++ b/src/casefiddle.c
@@ -46,9 +46,7 @@ struct casing_context {
46	When run on a buffer, syntax_prefix_flag_p is taken into account when	46	When run on a buffer, syntax_prefix_flag_p is taken into account when
47	determined inword flag. */	47	determined inword flag. */
48	bool inbuffer;	48	bool inbuffer;
49	/* Conceptually, this denotes whether we are inside of a word except	49	/* Whether we are inside of a word. */
50	that if flag is CASE_UP it’s always false and if flag is CASE_DOWN
51	this is always true. */
52	bool inword;	50	bool inword;
53	};	51	};
54		52
@@ -59,7 +57,7 @@ prepare_casing_context (struct casing_context *ctx,
59	{	57	{
60	ctx->flag = flag;	58	ctx->flag = flag;
61	ctx->inbuffer = inbuffer;	59	ctx->inbuffer = inbuffer;
62	ctx->inword = flag == CASE_DOWN;	60	ctx->inword = false;
63	ctx->titlecase_char_table = (int)flag < (int)CASE_CAPITALIZE ? Qnil :	61	ctx->titlecase_char_table = (int)flag < (int)CASE_CAPITALIZE ? Qnil :
64	uniprop_table (intern_c_string ("titlecase"));	62	uniprop_table (intern_c_string ("titlecase"));
65	ctx->specialcase_char_tables[CASE_UP] = flag == CASE_DOWN ? Qnil :	63	ctx->specialcase_char_tables[CASE_UP] = flag == CASE_DOWN ? Qnil :
@@ -101,15 +99,16 @@ case_character_impl (struct casing_str_buf *buf,
101		99
102	/* Update inword state */	100	/* Update inword state */
103	was_inword = ctx->inword;	101	was_inword = ctx->inword;
104	if ((int) ctx->flag >= (int) CASE_CAPITALIZE)	102	ctx->inword = SYNTAX (ch) == Sword &&
105	ctx->inword = SYNTAX (ch) == Sword &&	103	(!ctx->inbuffer \|\| was_inword \|\| !syntax_prefix_flag_p (ch));
106	(!ctx->inbuffer \|\| was_inword \|\| !syntax_prefix_flag_p (ch));
107		104
108	/* Normalise flag so its one of CASE_UP, CASE_DOWN or CASE_CAPITALIZE. */	105	/* Normalise flag so its one of CASE_UP, CASE_DOWN or CASE_CAPITALIZE. */
109	if (!was_inword)	106	if (ctx->flag == CASE_CAPITALIZE)
110	flag = ctx->flag == CASE_UP ? CASE_UP : CASE_CAPITALIZE;	107	flag = (enum case_action)((int)ctx->flag - was_inword);
111	else if (ctx->flag != CASE_CAPITALIZE_UP)	108	else if (ctx->flag != CASE_CAPITALIZE_UP)
112	flag = CASE_DOWN;	109	flag = ctx->flag;
		110	else if (!was_inword)
		111	flag = CASE_CAPITALIZE;
113	else	112	else
114	{	113	{
115	cased = ch;	114	cased = ch;
@@ -150,7 +149,18 @@ case_character_impl (struct casing_str_buf *buf,
150	buf->len_bytes = CHAR_STRING (cased, buf->data);	149	buf->len_bytes = CHAR_STRING (cased, buf->data);
151	return cased != ch;	150	return cased != ch;
152	}	151	}
		152
		153	/* In Greek, lower case sigma has two forms: one when used in the middle and one
		154	when used at the end of a word. Below is to help handle those cases when
		155	casing.
		156
		157	The rule does not conflict with any other casing rules so while it is
		158	a conditional one, it is independent on language. */
153		159
		160	#define CAPITAL_SIGMA 0x03A3
		161	#define SMALL_SIGMA 0x03C3
		162	#define SMALL_FINAL_SIGMA 0x03C2
		163
154	/* Based on CTX, case character CH accordingly. Update CTX as necessary.	164	/* Based on CTX, case character CH accordingly. Update CTX as necessary.
155	Return cased character.	165	Return cased character.
156		166
@@ -164,12 +174,34 @@ case_single_character (struct casing_context *ctx, int ch)
164	}	174	}
165		175
166	/* Save in BUF result of casing character CH. Return whether casing changed the	176	/* Save in BUF result of casing character CH. Return whether casing changed the
167	character. This is like case_single_character but also handles one-to-many	177	character.
168	casing rules. */	178
169	static inline bool	179	If not-NULL, NEXT points to the next character in the cased string. If NULL,
170	case_character (struct casing_str_buf buf, struct casing_context ctx, int ch)	180	it is assumed current character is the last one being cased. This is used to
		181	apply some rules which depend on proceeding state.
		182
		183	This is like case_single_character but also handles one-to-many casing
		184	rules. */
		185	static bool
		186	case_character (struct casing_str_buf buf, struct casing_context ctx,
		187	int ch, const unsigned char *next)
171	{	188	{
172	return case_character_impl (buf, ctx, ch);	189	bool changed, was_inword;
		190
		191	was_inword = ctx->inword;
		192	changed = case_character_impl (buf, ctx, ch);
		193
		194	/* If we have just down-cased a capital sigma and the next character no longer
		195	has a word syntax (i.e. current character is end of word), use final
		196	sigma. */
		197	if (was_inword && ch == CAPITAL_SIGMA && changed &&
		198	(!next \|\| SYNTAX (STRING_CHAR (next)) != Sword))
		199	{
		200	buf->len_bytes = CHAR_STRING (SMALL_FINAL_SIGMA, buf->data);
		201	buf->len_chars = 1;
		202	}
		203
		204	return changed;
173	}	205	}
174		206
175	static Lisp_Object	207	static Lisp_Object
@@ -231,7 +263,7 @@ do_casify_multibyte_string (struct casing_context *ctx, Lisp_Object obj)
231	if (dst_end - o < sizeof(struct casing_str_buf))	263	if (dst_end - o < sizeof(struct casing_str_buf))
232	string_overflow ();	264	string_overflow ();
233	ch = STRING_CHAR_ADVANCE (src);	265	ch = STRING_CHAR_ADVANCE (src);
234	case_character ((void *)o, ctx, ch);	266	case_character ((void *)o, ctx, ch, size > 1 ? src : NULL);
235	n += ((struct casing_str_buf *)o)->len_chars;	267	n += ((struct casing_str_buf *)o)->len_chars;
236	o += ((struct casing_str_buf *)o)->len_bytes;	268	o += ((struct casing_str_buf *)o)->len_bytes;
237	}	269	}
@@ -382,12 +414,17 @@ do_casify_multibyte_region (struct casing_context *ctx,
382	ptrdiff_t pos = startp, pos_byte = CHAR_TO_BYTE (pos), size = endp - pos;	414	ptrdiff_t pos = startp, pos_byte = CHAR_TO_BYTE (pos), size = endp - pos;
383	ptrdiff_t opoint = PT, added = 0;	415	ptrdiff_t opoint = PT, added = 0;
384	struct casing_str_buf buf;	416	struct casing_str_buf buf;
385	int ch, cased, len;	417	bool changed;
		418	int ch, len;
386		419
387	for (; size; --size)	420	for (; size; --size)
388	{	421	{
389	ch = STRING_CHAR_AND_LENGTH (BYTE_POS_ADDR (pos_byte), len);	422	ch = STRING_CHAR_AND_LENGTH (BYTE_POS_ADDR (pos_byte), len);
390	if (!case_character (&buf, ctx, ch))	423	changed = case_character (
		424	&buf, ctx, ch,
		425	size > 1 ? BYTE_POS_ADDR (pos_byte + len) : NULL);
		426
		427	if (!changed)
391	{	428	{
392	pos_byte += len;	429	pos_byte += len;
393	++pos;	430	++pos;


diff --git a/test/src/casefiddle-tests.el b/test/src/casefiddle-tests.el index dd260633f4c..234d233c71a 100644 --- a/test/src/casefiddle-tests.el +++ b/test/src/casefiddle-tests.el
@@ -195,13 +195,16 @@
195	("deﬁne" "DEFINE" "deﬁne" "Deﬁne" "Deﬁne")	195	("deﬁne" "DEFINE" "deﬁne" "Deﬁne" "Deﬁne")
196	("ﬁsh" "FISH" "ﬁsh" "Fish" "Fish")	196	("ﬁsh" "FISH" "ﬁsh" "Fish" "Fish")
197	("Straße" "STRASSE" "straße" "Straße" "Straße")	197	("Straße" "STRASSE" "straße" "Straße" "Straße")
198	;; FIXME(bug#24603): Everything below is broken at the moment.
199	;; Here’s what should happen:
200	;;("ΌΣΟΣ" "ΌΣΟΣ" "όσος" "Όσος" "Όσος")
201	;; And here’s what is actually happening:
202	("ΌΣΟΣ" "ΌΣΟΣ" "όσοσ" "Όσοσ" "ΌΣΟΣ")
203		198
204	("όσος" "ΌΣΟΣ" "όσος" "Όσος" "Όσος"))))))	199	;; The word repeated twice to test behaviour at the end of a word
		200	;; inside of an input string as well as at the end of the string.
		201	("ΌΣΟΣ ΌΣΟΣ" "ΌΣΟΣ ΌΣΟΣ" "όσος όσος" "Όσος Όσος" "ΌΣΟΣ ΌΣΟΣ")
		202	;; What should be done with sole sigma? It is ‘final’ but on the
		203	;; other hand it does not form a word. We’re using regular sigma.
		204	("Σ Σ" "Σ Σ" "σ σ" "Σ Σ" "Σ Σ")
		205	("όσος" "ΌΣΟΣ" "όσος" "Όσος" "Όσος")
		206	;; If sigma is already lower case, we don’t want to change it.
		207	("όσοσ" "ΌΣΟΣ" "όσοσ" "Όσοσ" "Όσοσ"))))))
205		208
206	(ert-deftest casefiddle-tests-casing-byte8 ()	209	(ert-deftest casefiddle-tests-casing-byte8 ()
207	(should-not	210	(should-not