Support casing characters which map into multiple code points (bug#24603)

Implement unconditional special casing rules defined in the Unicode standard. Among other things, they deal with cases when a single code point is replaced by multiple ones because a single character does not exist (e.g. ‘ﬁ’ ligature turning into ‘FI’) or is not commonly used (e.g. ß turning into SS). * admin/unidata/SpecialCasing.txt: New data file pulled from Unicode standard distribution. * admin/unidata/README: Mention SpecialCasing.txt. * admin/unidata/unidata-gen.el (unidata-gen-table-special-casing, unidata-gen-table-special-casing--do-load): New functions generating ‘special-uppercase’, ‘special-lowercase’ and ‘special-titlecase’ character Unicode properties built from the SpecialCasing.txt Unicode data file. * src/casefiddle.c (struct casing_str_buf): New structure for representing short strings used to handle one-to-many character mappings. (case_character_impl): New function which can handle one-to-many character mappings. (case_character, case_single_character): Wrappers for the above function. The former may map one character to multiple (or no) code points while the latter does what the former used to do (i.e. handles one-to-one mappings only). (do_casify_natnum, do_casify_unibyte_string, do_casify_unibyte_region): Use case_single_character. (do_casify_multibyte_string, do_casify_multibyte_region): Support new features of case_character. (do_casify_region): Updated to reflect do_casify_multibyte_string changes. (casify_word): Handle the situation where the character length of a word can change, affecting where the end of the word is. (upcase, capitalize, upcase-initials): Update documentation to mention limitations when working on characters. * test/src/casefiddle-tests.el (casefiddle-tests-char-properties): Add test cases for the newly introduced character properties. (casefiddle-tests-casing): Update test cases which are now passing. 
* test/lisp/char-fold-tests.el (char-fold--ascii-upcase, char-fold--ascii-downcase): New functions which behave like old ‘upcase’ and ‘downcase’. (char-fold--test-match-exactly): Use the new functions. This is needed because otherwise ﬁ and similar characters are turned into their multi-character representation. * doc/lispref/strings.texi: Describe issue with casing characters versus strings. * doc/lispref/nonascii.texi: Describe the new character properties.
author: Michal Nazarewicz 2016-10-05 00:06:01 +0200
committer: Michal Nazarewicz 2017-04-06 20:54:58 +0200
commit: b3b9b258c4026baa1cad3f2e617f1a637fc8d205 (patch)
tree: 1520ef9f5a3204784c597fcf2bf7a7c7fc1b8d7c /src
parent: 2c87dabd0460cce83d2345b4ddff159969674fef (diff)
download: emacs-b3b9b258c4026baa1cad3f2e617f1a637fc8d205.tar.gz
emacs-b3b9b258c4026baa1cad3f2e617f1a637fc8d205.zip
1 files changed, 205 insertions, 84 deletions
diff --git a/src/casefiddle.c b/src/casefiddle.c
index b1a5f8e236e..10674d963ec 100644
--- a/src/casefiddle.c
+++ b/src/casefiddle.c
@@ -1,3 +1,4 @@
+/* -*- coding: utf-8 -*- */
 /* GNU Emacs case conversion functions.
 Copyright (C) 1985, 1994, 1997-1999, 2001-2017 Free Software Foundation,
@@ -36,6 +37,9 @@ struct casing_context {
  /* A char-table with title-case character mappings or nil.  Non-nil implies
     flag is CASE_CAPITALIZE or CASE_CAPITALIZE_UP.  */
  Lisp_Object titlecase_char_table;
+  /* The unconditional special-casing Unicode property char tables for upper
+     casing, lower casing and title casing respectively. */
+  Lisp_Object specialcase_char_tables[3];
  /* User-requested action. */
  enum case_action flag;
  /* If true, function operates on a buffer as opposed to a string or character.
@@ -58,6 +62,13 @@ prepare_casing_context (struct casing_context *ctx,
  ctx->inword = flag == CASE_DOWN;
  ctx->titlecase_char_table = (int)flag < (int)CASE_CAPITALIZE ? Qnil :
    uniprop_table (intern_c_string ("titlecase"));
+  ctx->specialcase_char_tables[CASE_UP] = flag == CASE_DOWN ? Qnil :
+    uniprop_table (intern_c_string ("special-uppercase"));
+  ctx->specialcase_char_tables[CASE_DOWN] = flag == CASE_UP ? Qnil :
+    uniprop_table (intern_c_string ("special-lowercase"));
+  ctx->specialcase_char_tables[CASE_CAPITALIZE] =
+    (int)flag < (int)CASE_CAPITALIZE ? Qnil :
+    uniprop_table (intern_c_string ("special-titlecase"));
  /* If the case table is flagged as modified, rescan it.  */
  if (NILP (XCHAR_TABLE (BVAR (current_buffer, downcase_table))->extras[1]))
@@ -67,25 +78,98 @@ prepare_casing_context (struct casing_context *ctx,
    SETUP_BUFFER_SYNTAX_TABLE ();       /* For syntax_prefix_flag_p.  */
 }
-/* Based on CTX, case character CH accordingly.  Update CTX as necessary.
+struct casing_str_buf {
-   Return cased character. */
+  unsigned char data[MAX_MULTIBYTE_LENGTH > 6 ? MAX_MULTIBYTE_LENGTH : 6];
+  unsigned char len_chars;
+  unsigned char len_bytes;
+};
+/* Based on CTX, case character CH.  If BUF is NULL, return cased character.
+   Otherwise, if BUF is non-NULL, save result in it and return whether the
+   character has been changed.
+   Since meaning of return value depends on arguments, it’s more convenient to
+   use case_single_character or case_character instead. */
 static int
-case_character (struct casing_context *ctx, int ch)
+case_character_impl (struct casing_str_buf *buf,
+                     struct casing_context *ctx, int ch)
 {
+  enum case_action flag;
  Lisp_Object prop;
+  bool was_inword;
+  int cased;
+  /* Update inword state */
+  was_inword = ctx->inword;
+  if ((int) ctx->flag >= (int) CASE_CAPITALIZE)
+    ctx->inword = SYNTAX (ch) == Sword &&
+      (!ctx->inbuffer || was_inword || !syntax_prefix_flag_p (ch));
+  /* Normalise flag so its one of CASE_UP, CASE_DOWN or CASE_CAPITALIZE. */
+  if (!was_inword)
+    flag = ctx->flag == CASE_UP ? CASE_UP : CASE_CAPITALIZE;
+  else if (ctx->flag != CASE_CAPITALIZE_UP)
+    flag = CASE_DOWN;
+  else
+    {
+      cased = ch;
+      goto done;
+    }
+  /* Look through the special casing entries. */
+  if (buf && !NILP(ctx->specialcase_char_tables[(int)flag]))
+    {
+      prop = CHAR_TABLE_REF(ctx->specialcase_char_tables[(int)flag], ch);
+      if (STRINGP(prop))
+        {
+          struct Lisp_String *str = XSTRING(prop);
+          if (STRING_BYTES(str) <= sizeof buf->data)
+            {
+              buf->len_chars = str->size;
+              buf->len_bytes = STRING_BYTES(str);
+              memcpy(buf->data, str->data, buf->len_bytes);
+              return 1;
+            }
+        }
+    }
-  if (ctx->inword)
+  /* Handle simple, one-to-one case. */
-    ch = ctx->flag == CASE_CAPITALIZE_UP ? ch : downcase (ch);
+  if (flag == CASE_DOWN)
+    cased = downcase (ch);
  else if (!NILP (ctx->titlecase_char_table) &&
           CHARACTERP (prop = CHAR_TABLE_REF (ctx->titlecase_char_table, ch)))
-    ch = XFASTINT (prop);
+    cased = XFASTINT (prop);
  else
-    ch = upcase(ch);
+    cased = upcase(ch);
+  /* And we’re done. */
+ done:
+  if (!buf)
+    return cased;
+  buf->len_chars = 1;
+  buf->len_bytes = CHAR_STRING (cased, buf->data);
+  return cased != ch;
+}
-  if ((int) ctx->flag >= (int) CASE_CAPITALIZE)
+/* Based on CTX, case character CH accordingly.  Update CTX as necessary.
-    ctx->inword = SYNTAX (ch) == Sword &&
+   Return cased character.
-      (!ctx->inbuffer || ctx->inword || !syntax_prefix_flag_p (ch));
-  return ch;
+   Special casing rules (such as upcase(ﬁ) = FI) are not handled.  For
+   characters whose casing results in multiple code points, the character is
+   returned unchanged. */
+static inline int
+case_single_character (struct casing_context *ctx, int ch)
+{
+  return case_character_impl (NULL, ctx, ch);
+}
+/* Save in BUF result of casing character CH.  Return whether casing changed the
+   character.  This is like case_single_character but also handles one-to-many
+   casing rules. */
+static inline bool
+case_character (struct casing_str_buf *buf, struct casing_context *ctx, int ch)
+{
+  return case_character_impl (buf, ctx, ch);
 }
 static Lisp_Object
@@ -112,7 +196,7 @@ do_casify_natnum (struct casing_context *ctx, Lisp_Object obj)
    || !NILP (BVAR (current_buffer, enable_multibyte_characters));
  if (! multibyte)
    MAKE_CHAR_MULTIBYTE (ch);
-  cased = case_character (ctx, ch);
+  cased = case_single_character (ctx, ch);
  if (cased == ch)
    return obj;
@@ -125,25 +209,34 @@ do_casify_natnum (struct casing_context *ctx, Lisp_Object obj)
 static Lisp_Object
 do_casify_multibyte_string (struct casing_context *ctx, Lisp_Object obj)
 {
-  ptrdiff_t i, i_byte, size = SCHARS (obj);
+  /* We assume data is the first member of casing_str_buf structure so that if
-  int len, ch, cased;
+     we cast a (char *) into (struct casing_str_buf *) the representation of the
+     character is at the beginning of the buffer.  This is why we don’t need
+     separate struct casing_str_buf object but rather write directly to o. */
+  typedef char static_assertion[offsetof(struct casing_str_buf, data) ? -1 : 1];
+  ptrdiff_t size = SCHARS (obj), n;
+  int ch;
  USE_SAFE_ALLOCA;
-  ptrdiff_t o_size;
+  if (INT_MULTIPLY_WRAPV (size, MAX_MULTIBYTE_LENGTH, &n) ||
-  if (INT_MULTIPLY_WRAPV (size, MAX_MULTIBYTE_LENGTH, &o_size))
+      INT_ADD_WRAPV (n, sizeof(struct casing_str_buf), &n))
-    o_size = PTRDIFF_MAX;
+    n = PTRDIFF_MAX;
-  unsigned char *dst = SAFE_ALLOCA (o_size);
+  unsigned char *const dst = SAFE_ALLOCA (n), *const dst_end = dst + n;
  unsigned char *o = dst;
-  for (i = i_byte = 0; i < size; i++, i_byte += len)
+  const unsigned char *src = SDATA (obj);
+  for (n = 0; size; --size)
    {
-      if (o_size - MAX_MULTIBYTE_LENGTH < o - dst)
+      if (dst_end - o < sizeof(struct casing_str_buf))
        string_overflow ();
-      ch = STRING_CHAR_AND_LENGTH (SDATA (obj) + i_byte, len);
+      ch = STRING_CHAR_ADVANCE (src);
-      cased = case_character (ctx, ch);
+      case_character ((void *)o, ctx, ch);
-      o += CHAR_STRING (cased, o);
+      n += ((struct casing_str_buf *)o)->len_chars;
+      o += ((struct casing_str_buf *)o)->len_bytes;
    }
-  eassert (o - dst <= o_size);
+  eassert (o <= dst_end);
-  obj = make_multibyte_string ((char *) dst, size, o - dst);
+  obj = make_multibyte_string ((char *) dst, n, o - dst);
  SAFE_FREE ();
  return obj;
 }
@@ -159,7 +252,7 @@ do_casify_unibyte_string (struct casing_context *ctx, Lisp_Object obj)
    {
      ch = SREF (obj, i);
      MAKE_CHAR_MULTIBYTE (ch);
-      cased = case_character (ctx, ch);
+      cased = case_single_character (ctx, ch);
      if (ch == cased)
        continue;
      MAKE_CHAR_UNIBYTE (cased);
@@ -191,7 +284,9 @@ casify_object (enum case_action flag, Lisp_Object obj)
 DEFUN ("upcase", Fupcase, Supcase, 1, 1, 0,
       doc: /* Convert argument to upper case and return that.
 The argument may be a character or string.  The result has the same type.
-The argument object is not altered--the value is a copy.
+The argument object is not altered--the value is a copy.  If argument
+is a character, characters which map to multiple code points when
+cased, e.g. ﬁ, are returned unchanged.
 See also `capitalize', `downcase' and `upcase-initials'.  */)
  (Lisp_Object obj)
 {
@@ -212,7 +307,9 @@ DEFUN ("capitalize", Fcapitalize, Scapitalize, 1, 1, 0,
 This means that each word's first character is converted to either
 title case or upper case, and the rest to lower case.
 The argument may be a character or string.  The result has the same type.
-The argument object is not altered--the value is a copy.  */)
+The argument object is not altered--the value is a copy.  If argument
+is a character, characters which map to multiple code points when
+cased, e.g. ﬁ, are returned unchanged.  */)
  (Lisp_Object obj)
 {
  return casify_object (CASE_CAPITALIZE, obj);
@@ -225,21 +322,28 @@ DEFUN ("upcase-initials", Fupcase_initials, Supcase_initials, 1, 1, 0,
 This means that each word's first character is converted to either
 title case or upper case, and the rest are left unchanged.
 The argument may be a character or string.  The result has the same type.
-The argument object is not altered--the value is a copy.  */)
+The argument object is not altered--the value is a copy.  If argument
+is a character, characters which map to multiple code points when
+cased, e.g. ﬁ, are returned unchanged.  */)
  (Lisp_Object obj)
 {
  return casify_object (CASE_CAPITALIZE_UP, obj);
 }
-/* Based on CTX, case region in a unibyte buffer from POS to *ENDP.  Return
+/* Based on CTX, case region in a unibyte buffer from *STARTP to *ENDP.
-   first position that has changed and save last position in *ENDP.  If no
-   characters were changed, return -1 and *ENDP is unspecified. */
+   Save first and last positions that has changed in *STARTP and *ENDP
+   respectively.  If no characters were changed, save -1 to *STARTP and leave
+   *ENDP unspecified.
+   Always return 0.  This is so that interface of this function is the same as
+   do_casify_multibyte_region. */
 static ptrdiff_t
 do_casify_unibyte_region (struct casing_context *ctx,
-                          ptrdiff_t pos, ptrdiff_t *endp)
+                          ptrdiff_t *startp, ptrdiff_t *endp)
 {
  ptrdiff_t first = -1, last = -1;  /* Position of first and last changes. */
-  ptrdiff_t end = *endp;
+  ptrdiff_t pos = *startp, end = *endp;
  int ch, cased;
  for (; pos < end; ++pos)
@@ -247,11 +351,11 @@ do_casify_unibyte_region (struct casing_context *ctx,
      ch = FETCH_BYTE (pos);
      MAKE_CHAR_MULTIBYTE (ch);
-      cased = case_character (ctx, ch);
+      cased = case_single_character (ctx, ch);
      if (cased == ch)
        continue;
-      last = pos;
+      last = pos + 1;
      if (first < 0)
        first = pos;
@@ -259,88 +363,107 @@ do_casify_unibyte_region (struct casing_context *ctx,
      FETCH_BYTE (pos) = cased;
    }
-  *endp = last + 1;
+  *startp = first;
-  return first;
+  *endp = last;
+  return 0;
 }
-/* Based on CTX, case region in a multibyte buffer from POS to *ENDP.  Return
+/* Based on CTX, case region in a multibyte buffer from *STARTP to *ENDP.
-   first position that has changed and save last position in *ENDP.  If no
-   characters were changed, return -1 and *ENDP is unspecified. */
+   Return number of added characters (may be negative if more characters were
+   deleted then inserted), save first and last positions that has changed in
+   *STARTP and *ENDP respectively.  If no characters were changed, return 0,
+   save -1 to *STARTP and leave *ENDP unspecified. */
 static ptrdiff_t
 do_casify_multibyte_region (struct casing_context *ctx,
-                           ptrdiff_t pos, ptrdiff_t *endp)
+                            ptrdiff_t *startp, ptrdiff_t *endp)
 {
  ptrdiff_t first = -1, last = -1;  /* Position of first and last changes. */
-  ptrdiff_t pos_byte = CHAR_TO_BYTE (pos), end = *endp;
+  ptrdiff_t pos = *startp, pos_byte = CHAR_TO_BYTE (pos), size = *endp - pos;
-  ptrdiff_t opoint = PT;
+  ptrdiff_t opoint = PT, added = 0;
+  struct casing_str_buf buf;
  int ch, cased, len;
-  while (pos < end)
+  for (; size; --size)
    {
      ch = STRING_CHAR_AND_LENGTH (BYTE_POS_ADDR (pos_byte), len);
-      cased = case_character (ctx, ch);
+      if (!case_character (&buf, ctx, ch))
-      if (cased != ch)
        {
-          last = pos;
+          pos_byte += len;
-          if (first < 0)
+          ++pos;
-            first = pos;
+          continue;
+        }
-          if (ASCII_CHAR_P (cased) && ASCII_CHAR_P (ch))
+      last = pos + buf.len_chars;
-            FETCH_BYTE (pos_byte) = cased;
+      if (first < 0)
-          else
+        first = pos;
-            {
-              unsigned char str[MAX_MULTIBYTE_LENGTH];
+      if (buf.len_chars == 1 && buf.len_bytes == len)
-              int totlen = CHAR_STRING (cased, str);
+        memcpy (BYTE_POS_ADDR (pos_byte), buf.data, len);
-              if (len == totlen)
+      else
-                memcpy (BYTE_POS_ADDR (pos_byte), str, len);
+        {
-              else
+          /* Replace one character with the other(s), keeping text
-                /* Replace one character with the other(s), keeping text
+             properties the same.  */
-                   properties the same.  */
+          replace_range_2 (pos, pos_byte, pos + 1, pos_byte + len,
-                replace_range_2 (pos, pos_byte, pos + 1, pos_byte + len,
+                           (const char *) buf.data, buf.len_chars,
-                                 (char *) str, 9, totlen, 0);
+                           buf.len_bytes,
-              len = totlen;
+                           0);
-            }
+          added += (ptrdiff_t) buf.len_chars - 1;
+          if (opoint > pos)
+            opoint += (ptrdiff_t) buf.len_chars - 1;
        }
-      pos++;
-      pos_byte += len;
+      pos_byte += buf.len_bytes;
+      pos += buf.len_chars;
    }
  if (PT != opoint)
    TEMP_SET_PT_BOTH (opoint, CHAR_TO_BYTE (opoint));
+  *startp = first;
  *endp = last;
-  return first;
+  return added;
 }
-/* flag is CASE_UP, CASE_DOWN or CASE_CAPITALIZE or CASE_CAPITALIZE_UP.
+/* flag is CASE_UP, CASE_DOWN or CASE_CAPITALIZE or CASE_CAPITALIZE_UP.  b and
-   b and e specify range of buffer to operate on. */
+   e specify range of buffer to operate on.  Return character position of the
-static void
+   end of the region after changes.  */
+static ptrdiff_t
 casify_region (enum case_action flag, Lisp_Object b, Lisp_Object e)
 {
+  ptrdiff_t start, end, orig_end, added;
  struct casing_context ctx;
-  ptrdiff_t start, end;
-  if (EQ (b, e))
-    /* Not modifying because nothing marked */
-    return;
  validate_region (&b, &e);
  start = XFASTINT (b);
  end = XFASTINT (e);
+  if (start == end)
+    /* Not modifying because nothing marked */
+    return end;
  modify_text (start, end);
-  record_change (start, end - start);
  prepare_casing_context (&ctx, flag, true);
+  orig_end = end;
+  record_delete (start, make_buffer_string (start, end, true), false);
  if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
-    start = do_casify_unibyte_region (&ctx, start, &end);
+    {
+      record_insert (start, end - start);
+      added = do_casify_unibyte_region (&ctx, &start, &end);
+    }
  else
-    start = do_casify_multibyte_region (&ctx, start, &end);
+    {
+      ptrdiff_t len = end - start, ostart = start;
+      added = do_casify_multibyte_region (&ctx, &start, &end);
+      record_insert (ostart, len + added);
+    }
  if (start >= 0)
    {
-      signal_after_change (start, end + 1 - start, end + 1 - start);
+      signal_after_change (start, end - start - added, end - start);
-      update_compositions (start, end + 1, CHECK_ALL);
+      update_compositions (start, end, CHECK_ALL);
    }
+  return orig_end + added;
 }
 DEFUN ("upcase-region", Fupcase_region, Supcase_region, 2, 3,
@@ -432,9 +555,7 @@ casify_word (enum case_action flag, Lisp_Object arg)
  ptrdiff_t farend = scan_words (PT, XINT (arg));
  if (!farend)
    farend = XINT (arg) <= 0 ? BEGV : ZV;
-  ptrdiff_t newpoint = max (PT, farend);
+  SET_PT (casify_region (flag, make_number (PT), make_number (farend)));
-  casify_region (flag, make_number (PT), make_number (farend));
-  SET_PT (newpoint);
  return Qnil;
 }
author	Michal Nazarewicz	2016-10-05 00:06:01 +0200
committer	Michal Nazarewicz	2017-04-06 20:54:58 +0200
commit	b3b9b258c4026baa1cad3f2e617f1a637fc8d205 (patch)
tree	1520ef9f5a3204784c597fcf2bf7a7c7fc1b8d7c /src
parent	2c87dabd0460cce83d2345b4ddff159969674fef (diff)
download	emacs-b3b9b258c4026baa1cad3f2e617f1a637fc8d205.tar.gz emacs-b3b9b258c4026baa1cad3f2e617f1a637fc8d205.zip

diff --git a/src/casefiddle.c b/src/casefiddle.c index b1a5f8e236e..10674d963ec 100644 --- a/src/casefiddle.c +++ b/src/casefiddle.c
@@ -1,3 +1,4 @@
		1	/* -*- coding: utf-8 -*- */
1	/* GNU Emacs case conversion functions.	2	/* GNU Emacs case conversion functions.
2		3
3	Copyright (C) 1985, 1994, 1997-1999, 2001-2017 Free Software Foundation,	4	Copyright (C) 1985, 1994, 1997-1999, 2001-2017 Free Software Foundation,
@@ -36,6 +37,9 @@ struct casing_context {
36	/* A char-table with title-case character mappings or nil. Non-nil implies	37	/* A char-table with title-case character mappings or nil. Non-nil implies
37	flag is CASE_CAPITALIZE or CASE_CAPITALIZE_UP. */	38	flag is CASE_CAPITALIZE or CASE_CAPITALIZE_UP. */
38	Lisp_Object titlecase_char_table;	39	Lisp_Object titlecase_char_table;
		40	/* The unconditional special-casing Unicode property char tables for upper
		41	casing, lower casing and title casing respectively. */
		42	Lisp_Object specialcase_char_tables[3];
39	/* User-requested action. */	43	/* User-requested action. */
40	enum case_action flag;	44	enum case_action flag;
41	/* If true, function operates on a buffer as opposed to a string or character.	45	/* If true, function operates on a buffer as opposed to a string or character.
@@ -58,6 +62,13 @@ prepare_casing_context (struct casing_context *ctx,
58	ctx->inword = flag == CASE_DOWN;	62	ctx->inword = flag == CASE_DOWN;
59	ctx->titlecase_char_table = (int)flag < (int)CASE_CAPITALIZE ? Qnil :	63	ctx->titlecase_char_table = (int)flag < (int)CASE_CAPITALIZE ? Qnil :
60	uniprop_table (intern_c_string ("titlecase"));	64	uniprop_table (intern_c_string ("titlecase"));
		65	ctx->specialcase_char_tables[CASE_UP] = flag == CASE_DOWN ? Qnil :
		66	uniprop_table (intern_c_string ("special-uppercase"));
		67	ctx->specialcase_char_tables[CASE_DOWN] = flag == CASE_UP ? Qnil :
		68	uniprop_table (intern_c_string ("special-lowercase"));
		69	ctx->specialcase_char_tables[CASE_CAPITALIZE] =
		70	(int)flag < (int)CASE_CAPITALIZE ? Qnil :
		71	uniprop_table (intern_c_string ("special-titlecase"));
61		72
62	/* If the case table is flagged as modified, rescan it. */	73	/* If the case table is flagged as modified, rescan it. */
63	if (NILP (XCHAR_TABLE (BVAR (current_buffer, downcase_table))->extras[1]))	74	if (NILP (XCHAR_TABLE (BVAR (current_buffer, downcase_table))->extras[1]))
@@ -67,25 +78,98 @@ prepare_casing_context (struct casing_context *ctx,
67	SETUP_BUFFER_SYNTAX_TABLE (); /* For syntax_prefix_flag_p. */	78	SETUP_BUFFER_SYNTAX_TABLE (); /* For syntax_prefix_flag_p. */
68	}	79	}
69		80
70	/* Based on CTX, case character CH accordingly. Update CTX as necessary.	81	struct casing_str_buf {
71	Return cased character. */	82	unsigned char data[MAX_MULTIBYTE_LENGTH > 6 ? MAX_MULTIBYTE_LENGTH : 6];
		83	unsigned char len_chars;
		84	unsigned char len_bytes;
		85	};
		86
		87	/* Based on CTX, case character CH. If BUF is NULL, return cased character.
		88	Otherwise, if BUF is non-NULL, save result in it and return whether the
		89	character has been changed.
		90
		91	Since meaning of return value depends on arguments, it’s more convenient to
		92	use case_single_character or case_character instead. */
72	static int	93	static int
73	case_character (struct casing_context *ctx, int ch)	94	case_character_impl (struct casing_str_buf *buf,
		95	struct casing_context *ctx, int ch)
74	{	96	{
		97	enum case_action flag;
75	Lisp_Object prop;	98	Lisp_Object prop;
		99	bool was_inword;
		100	int cased;
		101
		102	/* Update inword state */
		103	was_inword = ctx->inword;
		104	if ((int) ctx->flag >= (int) CASE_CAPITALIZE)
		105	ctx->inword = SYNTAX (ch) == Sword &&
		106	(!ctx->inbuffer || was_inword || !syntax_prefix_flag_p (ch));
		107
		108	/* Normalise flag so its one of CASE_UP, CASE_DOWN or CASE_CAPITALIZE. */
		109	if (!was_inword)
		110	flag = ctx->flag == CASE_UP ? CASE_UP : CASE_CAPITALIZE;
		111	else if (ctx->flag != CASE_CAPITALIZE_UP)
		112	flag = CASE_DOWN;
		113	else
		114	{
		115	cased = ch;
		116	goto done;
		117	}
		118
		119	/* Look through the special casing entries. */
		120	if (buf && !NILP(ctx->specialcase_char_tables[(int)flag]))
		121	{
		122	prop = CHAR_TABLE_REF(ctx->specialcase_char_tables[(int)flag], ch);
		123	if (STRINGP(prop))
		124	{
		125	struct Lisp_String *str = XSTRING(prop);
		126	if (STRING_BYTES(str) <= sizeof buf->data)
		127	{
		128	buf->len_chars = str->size;
		129	buf->len_bytes = STRING_BYTES(str);
		130	memcpy(buf->data, str->data, buf->len_bytes);
		131	return 1;
		132	}
		133	}
		134	}
76		135
77	if (ctx->inword)	136	/* Handle simple, one-to-one case. */
78	ch = ctx->flag == CASE_CAPITALIZE_UP ? ch : downcase (ch);	137	if (flag == CASE_DOWN)
		138	cased = downcase (ch);
79	else if (!NILP (ctx->titlecase_char_table) &&	139	else if (!NILP (ctx->titlecase_char_table) &&
80	CHARACTERP (prop = CHAR_TABLE_REF (ctx->titlecase_char_table, ch)))	140	CHARACTERP (prop = CHAR_TABLE_REF (ctx->titlecase_char_table, ch)))
81	ch = XFASTINT (prop);	141	cased = XFASTINT (prop);
82	else	142	else
83	ch = upcase(ch);	143	cased = upcase(ch);
		144
		145	/* And we’re done. */
		146	done:
		147	if (!buf)
		148	return cased;
		149	buf->len_chars = 1;
		150	buf->len_bytes = CHAR_STRING (cased, buf->data);
		151	return cased != ch;
		152	}
84		153
85	if ((int) ctx->flag >= (int) CASE_CAPITALIZE)	154	/* Based on CTX, case character CH accordingly. Update CTX as necessary.
86	ctx->inword = SYNTAX (ch) == Sword &&	155	Return cased character.
87	(!ctx->inbuffer || ctx->inword || !syntax_prefix_flag_p (ch));	156
88	return ch;	157	Special casing rules (such as upcase(ﬁ) = FI) are not handled. For
		158	characters whose casing results in multiple code points, the character is
		159	returned unchanged. */
		160	static inline int
		161	case_single_character (struct casing_context *ctx, int ch)
		162	{
		163	return case_character_impl (NULL, ctx, ch);
		164	}
		165
		166	/* Save in BUF result of casing character CH. Return whether casing changed the
		167	character. This is like case_single_character but also handles one-to-many
		168	casing rules. */
		169	static inline bool
		170	case_character (struct casing_str_buf *buf, struct casing_context *ctx, int ch)
		171	{
		172	return case_character_impl (buf, ctx, ch);
89	}	173	}
90		174
91	static Lisp_Object	175	static Lisp_Object
@@ -112,7 +196,7 @@ do_casify_natnum (struct casing_context *ctx, Lisp_Object obj)
112	|| !NILP (BVAR (current_buffer, enable_multibyte_characters));	196	|| !NILP (BVAR (current_buffer, enable_multibyte_characters));
113	if (! multibyte)	197	if (! multibyte)
114	MAKE_CHAR_MULTIBYTE (ch);	198	MAKE_CHAR_MULTIBYTE (ch);
115	cased = case_character (ctx, ch);	199	cased = case_single_character (ctx, ch);
116	if (cased == ch)	200	if (cased == ch)
117	return obj;	201	return obj;
118		202
@@ -125,25 +209,34 @@ do_casify_natnum (struct casing_context *ctx, Lisp_Object obj)
125	static Lisp_Object	209	static Lisp_Object
126	do_casify_multibyte_string (struct casing_context *ctx, Lisp_Object obj)	210	do_casify_multibyte_string (struct casing_context *ctx, Lisp_Object obj)
127	{	211	{
128	ptrdiff_t i, i_byte, size = SCHARS (obj);	212	/* We assume data is the first member of casing_str_buf structure so that if
129	int len, ch, cased;	213	we cast a (char *) into (struct casing_str_buf *) the representation of the
		214	character is at the beginning of the buffer. This is why we don’t need
		215	separate struct casing_str_buf object but rather write directly to o. */
		216	typedef char static_assertion[offsetof(struct casing_str_buf, data) ? -1 : 1];
		217
		218	ptrdiff_t size = SCHARS (obj), n;
		219	int ch;
130	USE_SAFE_ALLOCA;	220	USE_SAFE_ALLOCA;
131	ptrdiff_t o_size;	221	if (INT_MULTIPLY_WRAPV (size, MAX_MULTIBYTE_LENGTH, &n) ||
132	if (INT_MULTIPLY_WRAPV (size, MAX_MULTIBYTE_LENGTH, &o_size))	222	INT_ADD_WRAPV (n, sizeof(struct casing_str_buf), &n))
133	o_size = PTRDIFF_MAX;	223	n = PTRDIFF_MAX;
134	unsigned char *dst = SAFE_ALLOCA (o_size);	224	unsigned char *const dst = SAFE_ALLOCA (n), *const dst_end = dst + n;
135	unsigned char *o = dst;	225	unsigned char *o = dst;
136		226
137	for (i = i_byte = 0; i < size; i++, i_byte += len)	227	const unsigned char *src = SDATA (obj);
		228
		229	for (n = 0; size; --size)
138	{	230	{
139	if (o_size - MAX_MULTIBYTE_LENGTH < o - dst)	231	if (dst_end - o < sizeof(struct casing_str_buf))
140	string_overflow ();	232	string_overflow ();
141	ch = STRING_CHAR_AND_LENGTH (SDATA (obj) + i_byte, len);	233	ch = STRING_CHAR_ADVANCE (src);
142	cased = case_character (ctx, ch);	234	case_character ((void *)o, ctx, ch);
143	o += CHAR_STRING (cased, o);	235	n += ((struct casing_str_buf *)o)->len_chars;
		236	o += ((struct casing_str_buf *)o)->len_bytes;
144	}	237	}
145	eassert (o - dst <= o_size);	238	eassert (o <= dst_end);
146	obj = make_multibyte_string ((char *) dst, size, o - dst);	239	obj = make_multibyte_string ((char *) dst, n, o - dst);
147	SAFE_FREE ();	240	SAFE_FREE ();
148	return obj;	241	return obj;
149	}	242	}
@@ -159,7 +252,7 @@ do_casify_unibyte_string (struct casing_context *ctx, Lisp_Object obj)
159	{	252	{
160	ch = SREF (obj, i);	253	ch = SREF (obj, i);
161	MAKE_CHAR_MULTIBYTE (ch);	254	MAKE_CHAR_MULTIBYTE (ch);
162	cased = case_character (ctx, ch);	255	cased = case_single_character (ctx, ch);
163	if (ch == cased)	256	if (ch == cased)
164	continue;	257	continue;
165	MAKE_CHAR_UNIBYTE (cased);	258	MAKE_CHAR_UNIBYTE (cased);
@@ -191,7 +284,9 @@ casify_object (enum case_action flag, Lisp_Object obj)
191	DEFUN ("upcase", Fupcase, Supcase, 1, 1, 0,	284	DEFUN ("upcase", Fupcase, Supcase, 1, 1, 0,
192	doc: /* Convert argument to upper case and return that.	285	doc: /* Convert argument to upper case and return that.
193	The argument may be a character or string. The result has the same type.	286	The argument may be a character or string. The result has the same type.
194	The argument object is not altered--the value is a copy.	287	The argument object is not altered--the value is a copy. If argument
		288	is a character, characters which map to multiple code points when
		289	cased, e.g. ﬁ, are returned unchanged.
195	See also `capitalize', `downcase' and `upcase-initials'. */)	290	See also `capitalize', `downcase' and `upcase-initials'. */)
196	(Lisp_Object obj)	291	(Lisp_Object obj)
197	{	292	{
@@ -212,7 +307,9 @@ DEFUN ("capitalize", Fcapitalize, Scapitalize, 1, 1, 0,
212	This means that each word's first character is converted to either	307	This means that each word's first character is converted to either
213	title case or upper case, and the rest to lower case.	308	title case or upper case, and the rest to lower case.
214	The argument may be a character or string. The result has the same type.	309	The argument may be a character or string. The result has the same type.
215	The argument object is not altered--the value is a copy. */)	310	The argument object is not altered--the value is a copy. If argument
		311	is a character, characters which map to multiple code points when
		312	cased, e.g. ﬁ, are returned unchanged. */)
216	(Lisp_Object obj)	313	(Lisp_Object obj)
217	{	314	{
218	return casify_object (CASE_CAPITALIZE, obj);	315	return casify_object (CASE_CAPITALIZE, obj);
@@ -225,21 +322,28 @@ DEFUN ("upcase-initials", Fupcase_initials, Supcase_initials, 1, 1, 0,
225	This means that each word's first character is converted to either	322	This means that each word's first character is converted to either
226	title case or upper case, and the rest are left unchanged.	323	title case or upper case, and the rest are left unchanged.
227	The argument may be a character or string. The result has the same type.	324	The argument may be a character or string. The result has the same type.
228	The argument object is not altered--the value is a copy. */)	325	The argument object is not altered--the value is a copy. If argument
		326	is a character, characters which map to multiple code points when
		327	cased, e.g. ﬁ, are returned unchanged. */)
229	(Lisp_Object obj)	328	(Lisp_Object obj)
230	{	329	{
231	return casify_object (CASE_CAPITALIZE_UP, obj);	330	return casify_object (CASE_CAPITALIZE_UP, obj);
232	}	331	}
233		332
234	/* Based on CTX, case region in a unibyte buffer from POS to *ENDP. Return	333	/* Based on CTX, case region in a unibyte buffer from *STARTP to *ENDP.
235	first position that has changed and save last position in *ENDP. If no	334
236	characters were changed, return -1 and *ENDP is unspecified. */	335	Save first and last positions that has changed in *STARTP and *ENDP
		336	respectively. If no characters were changed, save -1 to *STARTP and leave
		337	*ENDP unspecified.
		338
		339	Always return 0. This is so that interface of this function is the same as
		340	do_casify_multibyte_region. */
237	static ptrdiff_t	341	static ptrdiff_t
238	do_casify_unibyte_region (struct casing_context *ctx,	342	do_casify_unibyte_region (struct casing_context *ctx,
239	ptrdiff_t pos, ptrdiff_t *endp)	343	ptrdiff_t *startp, ptrdiff_t *endp)
240	{	344	{
241	ptrdiff_t first = -1, last = -1; /* Position of first and last changes. */	345	ptrdiff_t first = -1, last = -1; /* Position of first and last changes. */
242	ptrdiff_t end = *endp;	346	ptrdiff_t pos = *startp, end = *endp;
243	int ch, cased;	347	int ch, cased;
244		348
245	for (; pos < end; ++pos)	349	for (; pos < end; ++pos)
@@ -247,11 +351,11 @@ do_casify_unibyte_region (struct casing_context *ctx,
247	ch = FETCH_BYTE (pos);	351	ch = FETCH_BYTE (pos);
248	MAKE_CHAR_MULTIBYTE (ch);	352	MAKE_CHAR_MULTIBYTE (ch);
249		353
250	cased = case_character (ctx, ch);	354	cased = case_single_character (ctx, ch);
251	if (cased == ch)	355	if (cased == ch)
252	continue;	356	continue;
253		357
254	last = pos;	358	last = pos + 1;
255	if (first < 0)	359	if (first < 0)
256	first = pos;	360	first = pos;
257		361
@@ -259,88 +363,107 @@ do_casify_unibyte_region (struct casing_context *ctx,
259	FETCH_BYTE (pos) = cased;	363	FETCH_BYTE (pos) = cased;
260	}	364	}
261		365
262	*endp = last + 1;	366	*startp = first;
263	return first;	367	*endp = last;
		368	return 0;
264	}	369	}
265		370
266	/* Based on CTX, case region in a multibyte buffer from POS to *ENDP. Return	371	/* Based on CTX, case region in a multibyte buffer from *STARTP to *ENDP.
267	first position that has changed and save last position in *ENDP. If no	372
268	characters were changed, return -1 and *ENDP is unspecified. */	373	Return number of added characters (may be negative if more characters were
		374	deleted then inserted), save first and last positions that has changed in
		375	*STARTP and *ENDP respectively. If no characters were changed, return 0,
		376	save -1 to *STARTP and leave *ENDP unspecified. */
269	static ptrdiff_t	377	static ptrdiff_t
270	do_casify_multibyte_region (struct casing_context *ctx,	378	do_casify_multibyte_region (struct casing_context *ctx,
271	ptrdiff_t pos, ptrdiff_t *endp)	379	ptrdiff_t *startp, ptrdiff_t *endp)
272	{	380	{
273	ptrdiff_t first = -1, last = -1; /* Position of first and last changes. */	381	ptrdiff_t first = -1, last = -1; /* Position of first and last changes. */
274	ptrdiff_t pos_byte = CHAR_TO_BYTE (pos), end = *endp;	382	ptrdiff_t pos = *startp, pos_byte = CHAR_TO_BYTE (pos), size = *endp - pos;
275	ptrdiff_t opoint = PT;	383	ptrdiff_t opoint = PT, added = 0;
		384	struct casing_str_buf buf;
276	int ch, cased, len;	385	int ch, cased, len;
277		386
278	while (pos < end)	387	for (; size; --size)
279	{	388	{
280	ch = STRING_CHAR_AND_LENGTH (BYTE_POS_ADDR (pos_byte), len);	389	ch = STRING_CHAR_AND_LENGTH (BYTE_POS_ADDR (pos_byte), len);
281	cased = case_character (ctx, ch);	390	if (!case_character (&buf, ctx, ch))
282	if (cased != ch)
283	{	391	{
284	last = pos;	392	pos_byte += len;
285	if (first < 0)	393	++pos;
286	first = pos;	394	continue;
		395	}
287		396
288	if (ASCII_CHAR_P (cased) && ASCII_CHAR_P (ch))	397	last = pos + buf.len_chars;
289	FETCH_BYTE (pos_byte) = cased;	398	if (first < 0)
290	else	399	first = pos;
291	{	400
292	unsigned char str[MAX_MULTIBYTE_LENGTH];	401	if (buf.len_chars == 1 && buf.len_bytes == len)
293	int totlen = CHAR_STRING (cased, str);	402	memcpy (BYTE_POS_ADDR (pos_byte), buf.data, len);
294	if (len == totlen)	403	else
295	memcpy (BYTE_POS_ADDR (pos_byte), str, len);	404	{
296	else	405	/* Replace one character with the other(s), keeping text
297	/* Replace one character with the other(s), keeping text	406	properties the same. */
298	properties the same. */	407	replace_range_2 (pos, pos_byte, pos + 1, pos_byte + len,
299	replace_range_2 (pos, pos_byte, pos + 1, pos_byte + len,	408	(const char *) buf.data, buf.len_chars,
300	(char *) str, 9, totlen, 0);	409	buf.len_bytes,
301	len = totlen;	410	0);
302	}	411	added += (ptrdiff_t) buf.len_chars - 1;
		412	if (opoint > pos)
		413	opoint += (ptrdiff_t) buf.len_chars - 1;
303	}	414	}
304	pos++;	415
305	pos_byte += len;	416	pos_byte += buf.len_bytes;
		417	pos += buf.len_chars;
306	}	418	}
307		419
308	if (PT != opoint)	420	if (PT != opoint)
309	TEMP_SET_PT_BOTH (opoint, CHAR_TO_BYTE (opoint));	421	TEMP_SET_PT_BOTH (opoint, CHAR_TO_BYTE (opoint));
310		422
		423	*startp = first;
311	*endp = last;	424	*endp = last;
312	return first;	425	return added;
313	}	426	}
314		427
315	/* flag is CASE_UP, CASE_DOWN or CASE_CAPITALIZE or CASE_CAPITALIZE_UP.	428	/* flag is CASE_UP, CASE_DOWN or CASE_CAPITALIZE or CASE_CAPITALIZE_UP. b and
316	b and e specify range of buffer to operate on. */	429	e specify range of buffer to operate on. Return character position of the
317	static void	430	end of the region after changes. */
		431	static ptrdiff_t
318	casify_region (enum case_action flag, Lisp_Object b, Lisp_Object e)	432	casify_region (enum case_action flag, Lisp_Object b, Lisp_Object e)
319	{	433	{
		434	ptrdiff_t start, end, orig_end, added;
320	struct casing_context ctx;	435	struct casing_context ctx;
321	ptrdiff_t start, end;
322
323	if (EQ (b, e))
324	/* Not modifying because nothing marked */
325	return;
326		436
327	validate_region (&b, &e);	437	validate_region (&b, &e);
328	start = XFASTINT (b);	438	start = XFASTINT (b);
329	end = XFASTINT (e);	439	end = XFASTINT (e);
		440	if (start == end)
		441	/* Not modifying because nothing marked */
		442	return end;
330	modify_text (start, end);	443	modify_text (start, end);
331	record_change (start, end - start);
332	prepare_casing_context (&ctx, flag, true);	444	prepare_casing_context (&ctx, flag, true);
333		445
		446	orig_end = end;
		447	record_delete (start, make_buffer_string (start, end, true), false);
334	if (NILP (BVAR (current_buffer, enable_multibyte_characters)))	448	if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
335	start = do_casify_unibyte_region (&ctx, start, &end);	449	{
		450	record_insert (start, end - start);
		451	added = do_casify_unibyte_region (&ctx, &start, &end);
		452	}
336	else	453	else
337	start = do_casify_multibyte_region (&ctx, start, &end);	454	{
		455	ptrdiff_t len = end - start, ostart = start;
		456	added = do_casify_multibyte_region (&ctx, &start, &end);
		457	record_insert (ostart, len + added);
		458	}
338		459
339	if (start >= 0)	460	if (start >= 0)
340	{	461	{
341	signal_after_change (start, end + 1 - start, end + 1 - start);	462	signal_after_change (start, end - start - added, end - start);
342	update_compositions (start, end + 1, CHECK_ALL);	463	update_compositions (start, end, CHECK_ALL);
343	}	464	}
		465
		466	return orig_end + added;
344	}	467	}
345		468
346	DEFUN ("upcase-region", Fupcase_region, Supcase_region, 2, 3,	469	DEFUN ("upcase-region", Fupcase_region, Supcase_region, 2, 3,
@@ -432,9 +555,7 @@ casify_word (enum case_action flag, Lisp_Object arg)
432	ptrdiff_t farend = scan_words (PT, XINT (arg));	555	ptrdiff_t farend = scan_words (PT, XINT (arg));
433	if (!farend)	556	if (!farend)
434	farend = XINT (arg) <= 0 ? BEGV : ZV;	557	farend = XINT (arg) <= 0 ? BEGV : ZV;
435	ptrdiff_t newpoint = max (PT, farend);	558	SET_PT (casify_region (flag, make_number (PT), make_number (farend)));
436	casify_region (flag, make_number (PT), make_number (farend));
437	SET_PT (newpoint);
438	return Qnil;	559	return Qnil;
439	}	560	}
440		561