diff options
| author | Lars Ingebrigtsen | 2020-09-27 02:01:03 +0200 |
|---|---|---|
| committer | Lars Ingebrigtsen | 2020-09-27 02:01:03 +0200 |
| commit | 8c569683f2ee5d14040f5605fd0570b2eb009c05 (patch) | |
| tree | 913a8be96d0b7057797ab7b394fba184cdd1b2f8 /src | |
| parent | 8a148c5976e3fad53d540ce5aa52a36c6b658f85 (diff) | |
| download | emacs-8c569683f2ee5d14040f5605fd0570b2eb009c05.tar.gz emacs-8c569683f2ee5d14040f5605fd0570b2eb009c05.zip | |
Fix searching for multibyte needles in unibyte haystacks
* src/fns.c (Fstring_search): Make this work better when searching
unibyte haystacks for multibyte needles (bug#43598).
Diffstat (limited to 'src')
| -rw-r--r-- | src/fns.c | 45 |
1 file changed, 41 insertions, 4 deletions
| @@ -5454,6 +5454,21 @@ It should not be used for anything security-related. See | |||
| 5454 | return make_digest_string (digest, SHA1_DIGEST_SIZE); | 5454 | return make_digest_string (digest, SHA1_DIGEST_SIZE); |
| 5455 | } | 5455 | } |
| 5456 | 5456 | ||
| 5457 | static bool | ||
| 5458 | string_ascii_p (Lisp_Object string) | ||
| 5459 | { | ||
| 5460 | if (STRING_MULTIBYTE (string)) | ||
| 5461 | return SBYTES (string) == SCHARS (string); | ||
| 5462 | else | ||
| 5463 | { | ||
| 5464 | ptrdiff_t nbytes = SBYTES (string); | ||
| 5465 | for (ptrdiff_t i = 0; i < nbytes; i++) | ||
| 5466 | if (SREF (string, i) > 127) | ||
| 5467 | return false; | ||
| 5468 | return true; | ||
| 5469 | } | ||
| 5470 | } | ||
| 5471 | |||
| 5457 | DEFUN ("string-search", Fstring_search, Sstring_search, 2, 3, 0, | 5472 | DEFUN ("string-search", Fstring_search, Sstring_search, 2, 3, 0, |
| 5458 | doc: /* Search for the string NEEDLE in the string HAYSTACK. | 5473 | doc: /* Search for the string NEEDLE in the string HAYSTACK. |
| 5459 | The return value is the position of the first occurrence of NEEDLE in | 5474 | The return value is the position of the first occurrence of NEEDLE in |
| @@ -5490,7 +5505,9 @@ Case is always significant and text properties are ignored. */) | |||
| 5490 | haystart = SSDATA (haystack) + start_byte; | 5505 | haystart = SSDATA (haystack) + start_byte; |
| 5491 | haybytes = SBYTES (haystack) - start_byte; | 5506 | haybytes = SBYTES (haystack) - start_byte; |
| 5492 | 5507 | ||
| 5493 | if (STRING_MULTIBYTE (haystack) == STRING_MULTIBYTE (needle)) | 5508 | if (STRING_MULTIBYTE (haystack) == STRING_MULTIBYTE (needle) |
| 5509 | || string_ascii_p (needle) | ||
| 5510 | || string_ascii_p (haystack)) | ||
| 5494 | res = memmem (haystart, haybytes, | 5511 | res = memmem (haystart, haybytes, |
| 5495 | SSDATA (needle), SBYTES (needle)); | 5512 | SSDATA (needle), SBYTES (needle)); |
| 5496 | else if (STRING_MULTIBYTE (haystack)) /* unibyte needle */ | 5513 | else if (STRING_MULTIBYTE (haystack)) /* unibyte needle */ |
| @@ -5501,9 +5518,29 @@ Case is always significant and text properties are ignored. */) | |||
| 5501 | } | 5518 | } |
| 5502 | else /* unibyte haystack, multibyte needle */ | 5519 | else /* unibyte haystack, multibyte needle */ |
| 5503 | { | 5520 | { |
| 5504 | Lisp_Object uni_needle = Fstring_as_unibyte (needle); | 5521 | /* The only possible way we can find the multibyte needle in the |
| 5505 | res = memmem (haystart, haybytes, | 5522 | unibyte haystack (since we know that neither are pure-ASCII) is |
| 5506 | SSDATA (uni_needle), SBYTES (uni_needle)); | 5523 | if they contain "raw bytes" (and no other non-ASCII chars.) */ |
| 5524 | ptrdiff_t chars = SCHARS (needle); | ||
| 5525 | const unsigned char *src = SDATA (needle); | ||
| 5526 | |||
| 5527 | for (ptrdiff_t i = 0; i < chars; i++) | ||
| 5528 | { | ||
| 5529 | int c = string_char_advance (&src); | ||
| 5530 | |||
| 5531 | if (!CHAR_BYTE8_P (c) | ||
| 5532 | && !ASCII_CHAR_P (c)) | ||
| 5533 | /* Found a char that can't be in the haystack. */ | ||
| 5534 | return Qnil; | ||
| 5535 | } | ||
| 5536 | |||
| 5537 | { | ||
| 5538 | /* "Raw bytes" (aka eighth-bit) are represented differently in | ||
| 5539 | multibyte and unibyte strings. */ | ||
| 5540 | Lisp_Object uni_needle = Fstring_to_unibyte (needle); | ||
| 5541 | res = memmem (haystart, haybytes, | ||
| 5542 | SSDATA (uni_needle), SBYTES (uni_needle)); | ||
| 5543 | } | ||
| 5507 | } | 5544 | } |
| 5508 | 5545 | ||
| 5509 | if (! res) | 5546 | if (! res) |