diff options
| author | Michal Nazarewicz | 2016-09-19 00:23:40 +0200 |
|---|---|---|
| committer | Michal Nazarewicz | 2017-02-15 16:54:06 +0100 |
| commit | 5ec3a58462e99533ea5200de356302181d634d0b (patch) | |
| tree | e0149bb01ac5e087cfaf3429880deebadb703b56 | |
| parent | 0d4290650d9ec635a657ed8537cfc960b41381b9 (diff) | |
| download | emacs-5ec3a58462e99533ea5200de356302181d634d0b.tar.gz emacs-5ec3a58462e99533ea5200de356302181d634d0b.zip | |
Generate upcase and downcase tables from Unicode data (bug#24603)
Use Unicode data to generate case tables instead of mostly repeating
them in lisp code. Do that in a way which maps ‘Dz’ (and similar)
digraph to ‘dz’ when down- and ‘DZ’ when upcasing.
https://debbugs.gnu.org/cgi/bugreport.cgi?msg=89;bug=24603 lists all
changes to syntax table and case tables introduced by this commit.
* lisp/international/characters.el: Remove case-pairs defined with
explicit Lisp code and instead use Unicode character properties.
* test/src/casefiddle-tests.el (casefiddle-tests--characters,
casefiddle-tests-casing): Update test cases which are now working
as they should.
| -rw-r--r-- | lisp/international/characters.el | 345 | ||||
| -rw-r--r-- | test/src/casefiddle-tests.el | 7 |
2 files changed, 73 insertions, 279 deletions
diff --git a/lisp/international/characters.el b/lisp/international/characters.el index 2b9711aec6b..b2c0e39741a 100644 --- a/lisp/international/characters.el +++ b/lisp/international/characters.el | |||
| @@ -543,10 +543,6 @@ with L, LRE, or LRO Unicode bidi character type.") | |||
| 543 | (set-case-syntax ?½ "_" tbl) | 543 | (set-case-syntax ?½ "_" tbl) |
| 544 | (set-case-syntax ?¾ "_" tbl) | 544 | (set-case-syntax ?¾ "_" tbl) |
| 545 | (set-case-syntax ?¿ "." tbl) | 545 | (set-case-syntax ?¿ "." tbl) |
| 546 | (let ((c 192)) | ||
| 547 | (while (<= c 222) | ||
| 548 | (set-case-syntax-pair c (+ c 32) tbl) | ||
| 549 | (setq c (1+ c)))) | ||
| 550 | (set-case-syntax ?× "_" tbl) | 546 | (set-case-syntax ?× "_" tbl) |
| 551 | (set-case-syntax ?ß "w" tbl) | 547 | (set-case-syntax ?ß "w" tbl) |
| 552 | (set-case-syntax ?÷ "_" tbl) | 548 | (set-case-syntax ?÷ "_" tbl) |
| @@ -558,101 +554,8 @@ with L, LRE, or LRO Unicode bidi character type.") | |||
| 558 | (modify-category-entry c ?l) | 554 | (modify-category-entry c ?l) |
| 559 | (setq c (1+ c))) | 555 | (setq c (1+ c))) |
| 560 | 556 | ||
| 561 | (let ((pair-ranges '((#x0100 . #x012F) | ||
| 562 | (#x0132 . #x0137) | ||
| 563 | (#x0139 . #x0148) | ||
| 564 | (#x014a . #x0177) | ||
| 565 | (#x0179 . #x017E) | ||
| 566 | (#x0182 . #x0185) | ||
| 567 | (#x0187 . #x0188) | ||
| 568 | (#x018B . #x018C) | ||
| 569 | (#x0191 . #x0192) | ||
| 570 | (#x0198 . #x0199) | ||
| 571 | (#x01A0 . #x01A5) | ||
| 572 | (#x01A7 . #x01A8) | ||
| 573 | (#x01AC . #x01AD) | ||
| 574 | (#x01AF . #x01B0) | ||
| 575 | (#x01B3 . #x01B6) | ||
| 576 | (#x01B8 . #x01B9) | ||
| 577 | (#x01BC . #x01BD) | ||
| 578 | (#x01CD . #x01DC) | ||
| 579 | (#x01DE . #x01EF) | ||
| 580 | (#x01F4 . #x01F5) | ||
| 581 | (#x01F8 . #x021F) | ||
| 582 | (#x0222 . #x0233) | ||
| 583 | (#x023B . #x023C) | ||
| 584 | (#x0241 . #x0242) | ||
| 585 | (#x0246 . #x024F)))) | ||
| 586 | (dolist (elt pair-ranges) | ||
| 587 | (let ((from (car elt)) (to (cdr elt))) | ||
| 588 | (while (< from to) | ||
| 589 | (set-case-syntax-pair from (1+ from) tbl) | ||
| 590 | (setq from (+ from 2)))))) | ||
| 591 | |||
| 592 | (set-case-syntax-pair ?Ÿ ?ÿ tbl) | ||
| 593 | |||
| 594 | ;; In some languages, such as Turkish, U+0049 LATIN CAPITAL LETTER I | ||
| 595 | ;; and U+0131 LATIN SMALL LETTER DOTLESS I make a case pair, and so | ||
| 596 | ;; do U+0130 LATIN CAPITAL LETTER I WITH DOT ABOVE and U+0069 LATIN | ||
| 597 | ;; SMALL LETTER I. | ||
| 598 | |||
| 599 | ;; We used to set up half of those correspondence unconditionally, | ||
| 600 | ;; but that makes searches slow. So now we don't set up either half | ||
| 601 | ;; of these correspondences by default. | ||
| 602 | |||
| 603 | ;; (set-downcase-syntax ?İ ?i tbl) | ||
| 604 | ;; (set-upcase-syntax ?I ?ı tbl) | ||
| 605 | |||
| 606 | (set-case-syntax-pair ?Ɓ ?ɓ tbl) | ||
| 607 | (set-case-syntax-pair ?Ɔ ?ɔ tbl) | ||
| 608 | (set-case-syntax-pair ?Ɖ ?ɖ tbl) | ||
| 609 | (set-case-syntax-pair ?Ɗ ?ɗ tbl) | ||
| 610 | (set-case-syntax-pair ?Ǝ ?ǝ tbl) | ||
| 611 | (set-case-syntax-pair ?Ə ?ə tbl) | ||
| 612 | (set-case-syntax-pair ?Ɛ ?ɛ tbl) | ||
| 613 | (set-case-syntax-pair ?Ɠ ?ɠ tbl) | ||
| 614 | (set-case-syntax-pair ?Ɣ ?ɣ tbl) | ||
| 615 | (set-case-syntax-pair ?Ɩ ?ɩ tbl) | ||
| 616 | (set-case-syntax-pair ?Ɨ ?ɨ tbl) | ||
| 617 | (set-case-syntax-pair ?Ɯ ?ɯ tbl) | ||
| 618 | (set-case-syntax-pair ?Ɲ ?ɲ tbl) | ||
| 619 | (set-case-syntax-pair ?Ɵ ?ɵ tbl) | ||
| 620 | (set-case-syntax-pair ?Ʀ ?ʀ tbl) | ||
| 621 | (set-case-syntax-pair ?Ʃ ?ʃ tbl) | ||
| 622 | (set-case-syntax-pair ?Ʈ ?ʈ tbl) | ||
| 623 | (set-case-syntax-pair ?Ʊ ?ʊ tbl) | ||
| 624 | (set-case-syntax-pair ?Ʋ ?ʋ tbl) | ||
| 625 | (set-case-syntax-pair ?Ʒ ?ʒ tbl) | ||
| 626 | ;; We use set-downcase-syntax below, since we want upcase of dž | ||
| 627 | ;; return DŽ, not Dž, and the same for the rest. | ||
| 628 | (set-case-syntax-pair ?DŽ ?dž tbl) | ||
| 629 | (set-downcase-syntax ?Dž ?dž tbl) | ||
| 630 | (set-case-syntax-pair ?LJ ?lj tbl) | ||
| 631 | (set-downcase-syntax ?Lj ?lj tbl) | ||
| 632 | (set-case-syntax-pair ?NJ ?nj tbl) | ||
| 633 | (set-downcase-syntax ?Nj ?nj tbl) | ||
| 634 | |||
| 635 | ;; 01F0; F; 006A 030C; # LATIN SMALL LETTER J WITH CARON | ||
| 636 | |||
| 637 | (set-case-syntax-pair ?DZ ?dz tbl) | ||
| 638 | (set-downcase-syntax ?Dz ?dz tbl) | ||
| 639 | (set-case-syntax-pair ?Ƕ ?ƕ tbl) | ||
| 640 | (set-case-syntax-pair ?Ƿ ?ƿ tbl) | ||
| 641 | (set-case-syntax-pair ?Ⱥ ?ⱥ tbl) | ||
| 642 | (set-case-syntax-pair ?Ƚ ?ƚ tbl) | ||
| 643 | (set-case-syntax-pair ?Ⱦ ?ⱦ tbl) | ||
| 644 | (set-case-syntax-pair ?Ƀ ?ƀ tbl) | ||
| 645 | (set-case-syntax-pair ?Ʉ ?ʉ tbl) | ||
| 646 | (set-case-syntax-pair ?Ʌ ?ʌ tbl) | ||
| 647 | |||
| 648 | ;; Latin Extended Additional | 557 | ;; Latin Extended Additional |
| 649 | (modify-category-entry '(#x1e00 . #x1ef9) ?l) | 558 | (modify-category-entry '(#x1e00 . #x1ef9) ?l) |
| 650 | (setq c #x1e00) | ||
| 651 | (while (<= c #x1ef9) | ||
| 652 | (and (zerop (% c 2)) | ||
| 653 | (or (<= c #x1e94) (>= c #x1ea0)) | ||
| 654 | (set-case-syntax-pair c (1+ c) tbl)) | ||
| 655 | (setq c (1+ c))) | ||
| 656 | 559 | ||
| 657 | ;; Latin Extended-C | 560 | ;; Latin Extended-C |
| 658 | (setq c #x2C60) | 561 | (setq c #x2C60) |
| @@ -660,57 +563,12 @@ with L, LRE, or LRO Unicode bidi character type.") | |||
| 660 | (modify-category-entry c ?l) | 563 | (modify-category-entry c ?l) |
| 661 | (setq c (1+ c))) | 564 | (setq c (1+ c))) |
| 662 | 565 | ||
| 663 | (let ((pair-ranges '((#x2C60 . #x2C61) | ||
| 664 | (#x2C67 . #x2C6C) | ||
| 665 | (#x2C72 . #x2C73) | ||
| 666 | (#x2C75 . #x2C76)))) | ||
| 667 | (dolist (elt pair-ranges) | ||
| 668 | (let ((from (car elt)) (to (cdr elt))) | ||
| 669 | (while (< from to) | ||
| 670 | (set-case-syntax-pair from (1+ from) tbl) | ||
| 671 | (setq from (+ from 2)))))) | ||
| 672 | |||
| 673 | (set-case-syntax-pair ?Ɫ ?ɫ tbl) | ||
| 674 | (set-case-syntax-pair ?Ᵽ ?ᵽ tbl) | ||
| 675 | (set-case-syntax-pair ?Ɽ ?ɽ tbl) | ||
| 676 | (set-case-syntax-pair ?Ɑ ?ɑ tbl) | ||
| 677 | (set-case-syntax-pair ?Ɱ ?ɱ tbl) | ||
| 678 | (set-case-syntax-pair ?Ɐ ?ɐ tbl) | ||
| 679 | (set-case-syntax-pair ?Ɒ ?ɒ tbl) | ||
| 680 | (set-case-syntax-pair ?Ȿ ?ȿ tbl) | ||
| 681 | (set-case-syntax-pair ?Ɀ ?ɀ tbl) | ||
| 682 | |||
| 683 | ;; Latin Extended-D | 566 | ;; Latin Extended-D |
| 684 | (setq c #xA720) | 567 | (setq c #xA720) |
| 685 | (while (<= c #xA7FF) | 568 | (while (<= c #xA7FF) |
| 686 | (modify-category-entry c ?l) | 569 | (modify-category-entry c ?l) |
| 687 | (setq c (1+ c))) | 570 | (setq c (1+ c))) |
| 688 | 571 | ||
| 689 | (let ((pair-ranges '((#xA722 . #xA72F) | ||
| 690 | (#xA732 . #xA76F) | ||
| 691 | (#xA779 . #xA77C) | ||
| 692 | (#xA77E . #xA787) | ||
| 693 | (#xA78B . #xA78E) | ||
| 694 | (#xA790 . #xA793) | ||
| 695 | (#xA796 . #xA7A9) | ||
| 696 | (#xA7B4 . #xA7B7)))) | ||
| 697 | (dolist (elt pair-ranges) | ||
| 698 | (let ((from (car elt)) (to (cdr elt))) | ||
| 699 | (while (< from to) | ||
| 700 | (set-case-syntax-pair from (1+ from) tbl) | ||
| 701 | (setq from (+ from 2)))))) | ||
| 702 | |||
| 703 | (set-case-syntax-pair ?Ᵹ ?ᵹ tbl) | ||
| 704 | (set-case-syntax-pair ?Ɦ ?ɦ tbl) | ||
| 705 | (set-case-syntax-pair ?Ɜ ?ɜ tbl) | ||
| 706 | (set-case-syntax-pair ?Ɡ ?ɡ tbl) | ||
| 707 | (set-case-syntax-pair ?Ɬ ?ɬ tbl) | ||
| 708 | (set-case-syntax-pair ?Ɪ ?ɪ tbl) | ||
| 709 | (set-case-syntax-pair ?Ʞ ?ʞ tbl) | ||
| 710 | (set-case-syntax-pair ?Ʇ ?ʇ tbl) | ||
| 711 | (set-case-syntax-pair ?Ʝ ?ʝ tbl) | ||
| 712 | (set-case-syntax-pair ?Ꭓ ?ꭓ tbl) | ||
| 713 | |||
| 714 | ;; Latin Extended-E | 572 | ;; Latin Extended-E |
| 715 | (setq c #xAB30) | 573 | (setq c #xAB30) |
| 716 | (while (<= c #xAB64) | 574 | (while (<= c #xAB64) |
| @@ -719,102 +577,19 @@ with L, LRE, or LRO Unicode bidi character type.") | |||
| 719 | 577 | ||
| 720 | ;; Greek | 578 | ;; Greek |
| 721 | (modify-category-entry '(#x0370 . #x03ff) ?g) | 579 | (modify-category-entry '(#x0370 . #x03ff) ?g) |
| 722 | (setq c #x0370) | ||
| 723 | (while (<= c #x03ff) | ||
| 724 | (if (or (and (>= c #x0391) (<= c #x03a1)) | ||
| 725 | (and (>= c #x03a3) (<= c #x03ab))) | ||
| 726 | (set-case-syntax-pair c (+ c 32) tbl)) | ||
| 727 | (and (>= c #x03da) | ||
| 728 | (<= c #x03ee) | ||
| 729 | (zerop (% c 2)) | ||
| 730 | (set-case-syntax-pair c (1+ c) tbl)) | ||
| 731 | (setq c (1+ c))) | ||
| 732 | (set-case-syntax-pair ?Ά ?ά tbl) | ||
| 733 | (set-case-syntax-pair ?Έ ?έ tbl) | ||
| 734 | (set-case-syntax-pair ?Ή ?ή tbl) | ||
| 735 | (set-case-syntax-pair ?Ί ?ί tbl) | ||
| 736 | (set-case-syntax-pair ?Ό ?ό tbl) | ||
| 737 | (set-case-syntax-pair ?Ύ ?ύ tbl) | ||
| 738 | (set-case-syntax-pair ?Ώ ?ώ tbl) | ||
| 739 | 580 | ||
| 740 | ;; Armenian | 581 | ;; Armenian |
| 741 | (setq c #x531) | 582 | (setq c #x531) |
| 742 | (while (<= c #x556) | ||
| 743 | (set-case-syntax-pair c (+ c #x30) tbl) | ||
| 744 | (setq c (1+ c))) | ||
| 745 | 583 | ||
| 746 | ;; Greek Extended | 584 | ;; Greek Extended |
| 747 | (modify-category-entry '(#x1f00 . #x1fff) ?g) | 585 | (modify-category-entry '(#x1f00 . #x1fff) ?g) |
| 748 | (setq c #x1f00) | ||
| 749 | (while (<= c #x1fff) | ||
| 750 | (and (<= (logand c #x000f) 7) | ||
| 751 | (<= c #x1fa7) | ||
| 752 | (not (memq c '(#x1f16 #x1f17 #x1f56 #x1f57 | ||
| 753 | #x1f50 #x1f52 #x1f54 #x1f56))) | ||
| 754 | (/= (logand c #x00f0) #x70) | ||
| 755 | (set-case-syntax-pair (+ c 8) c tbl)) | ||
| 756 | (setq c (1+ c))) | ||
| 757 | (set-case-syntax-pair ?Ᾰ ?ᾰ tbl) | ||
| 758 | (set-case-syntax-pair ?Ᾱ ?ᾱ tbl) | ||
| 759 | (set-case-syntax-pair ?Ὰ ?ὰ tbl) | ||
| 760 | (set-case-syntax-pair ?Ά ?ά tbl) | ||
| 761 | (set-case-syntax-pair ?ᾼ ?ᾳ tbl) | ||
| 762 | (set-case-syntax-pair ?Ὲ ?ὲ tbl) | ||
| 763 | (set-case-syntax-pair ?Έ ?έ tbl) | ||
| 764 | (set-case-syntax-pair ?Ὴ ?ὴ tbl) | ||
| 765 | (set-case-syntax-pair ?Ή ?ή tbl) | ||
| 766 | (set-case-syntax-pair ?ῌ ?ῃ tbl) | ||
| 767 | (set-case-syntax-pair ?Ῐ ?ῐ tbl) | ||
| 768 | (set-case-syntax-pair ?Ῑ ?ῑ tbl) | ||
| 769 | (set-case-syntax-pair ?Ὶ ?ὶ tbl) | ||
| 770 | (set-case-syntax-pair ?Ί ?ί tbl) | ||
| 771 | (set-case-syntax-pair ?Ῠ ?ῠ tbl) | ||
| 772 | (set-case-syntax-pair ?Ῡ ?ῡ tbl) | ||
| 773 | (set-case-syntax-pair ?Ὺ ?ὺ tbl) | ||
| 774 | (set-case-syntax-pair ?Ύ ?ύ tbl) | ||
| 775 | (set-case-syntax-pair ?Ῥ ?ῥ tbl) | ||
| 776 | (set-case-syntax-pair ?Ὸ ?ὸ tbl) | ||
| 777 | (set-case-syntax-pair ?Ό ?ό tbl) | ||
| 778 | (set-case-syntax-pair ?Ὼ ?ὼ tbl) | ||
| 779 | (set-case-syntax-pair ?Ώ ?ώ tbl) | ||
| 780 | (set-case-syntax-pair ?ῼ ?ῳ tbl) | ||
| 781 | 586 | ||
| 782 | ;; cyrillic | 587 | ;; cyrillic |
| 783 | (modify-category-entry '(#x0400 . #x04FF) ?y) | 588 | (modify-category-entry '(#x0400 . #x04FF) ?y) |
| 784 | (setq c #x0400) | ||
| 785 | (while (<= c #x04ff) | ||
| 786 | (and (>= c #x0400) | ||
| 787 | (<= c #x040f) | ||
| 788 | (set-case-syntax-pair c (+ c 80) tbl)) | ||
| 789 | (and (>= c #x0410) | ||
| 790 | (<= c #x042f) | ||
| 791 | (set-case-syntax-pair c (+ c 32) tbl)) | ||
| 792 | (and (zerop (% c 2)) | ||
| 793 | (or (and (>= c #x0460) (<= c #x0480)) | ||
| 794 | (and (>= c #x048c) (<= c #x04be)) | ||
| 795 | (and (>= c #x04d0) (<= c #x052e))) | ||
| 796 | (set-case-syntax-pair c (1+ c) tbl)) | ||
| 797 | (setq c (1+ c))) | ||
| 798 | (set-case-syntax-pair ?Ӂ ?ӂ tbl) | ||
| 799 | (set-case-syntax-pair ?Ӄ ?ӄ tbl) | ||
| 800 | (set-case-syntax-pair ?Ӈ ?ӈ tbl) | ||
| 801 | (set-case-syntax-pair ?Ӌ ?ӌ tbl) | ||
| 802 | |||
| 803 | (modify-category-entry '(#xA640 . #xA69F) ?y) | 589 | (modify-category-entry '(#xA640 . #xA69F) ?y) |
| 804 | (setq c #xA640) | ||
| 805 | (while (<= c #xA66C) | ||
| 806 | (set-case-syntax-pair c (+ c 1) tbl) | ||
| 807 | (setq c (+ c 2))) | ||
| 808 | (setq c #xA680) | ||
| 809 | (while (<= c #xA69A) | ||
| 810 | (set-case-syntax-pair c (+ c 1) tbl) | ||
| 811 | (setq c (+ c 2))) | ||
| 812 | 590 | ||
| 813 | ;; Georgian | 591 | ;; Georgian |
| 814 | (setq c #x10A0) | 592 | (setq c #x10A0) |
| 815 | (while (<= c #x10CD) | ||
| 816 | (set-case-syntax-pair c (+ c #x1C60) tbl) | ||
| 817 | (setq c (1+ c))) | ||
| 818 | 593 | ||
| 819 | ;; Cyrillic Extended-C | 594 | ;; Cyrillic Extended-C |
| 820 | (modify-category-entry '(#x1C80 . #x1C8F) ?y) | 595 | (modify-category-entry '(#x1C80 . #x1C8F) ?y) |
| @@ -844,12 +619,6 @@ with L, LRE, or LRO Unicode bidi character type.") | |||
| 844 | (set-case-syntax c "." tbl) | 619 | (set-case-syntax c "." tbl) |
| 845 | (setq c (1+ c))) | 620 | (setq c (1+ c))) |
| 846 | 621 | ||
| 847 | ;; Roman numerals | ||
| 848 | (setq c #x2160) | ||
| 849 | (while (<= c #x216f) | ||
| 850 | (set-case-syntax-pair c (+ c #x10) tbl) | ||
| 851 | (setq c (1+ c))) | ||
| 852 | |||
| 853 | ;; Fixme: The following blocks might be better as symbol rather than | 622 | ;; Fixme: The following blocks might be better as symbol rather than |
| 854 | ;; punctuation. | 623 | ;; punctuation. |
| 855 | ;; Arrows | 624 | ;; Arrows |
| @@ -873,25 +642,11 @@ with L, LRE, or LRO Unicode bidi character type.") | |||
| 873 | ;; Circled Latin | 642 | ;; Circled Latin |
| 874 | (setq c #x24b6) | 643 | (setq c #x24b6) |
| 875 | (while (<= c #x24cf) | 644 | (while (<= c #x24cf) |
| 876 | (set-case-syntax-pair c (+ c 26) tbl) | ||
| 877 | (modify-category-entry c ?l) | 645 | (modify-category-entry c ?l) |
| 878 | (modify-category-entry (+ c 26) ?l) | 646 | (modify-category-entry (+ c 26) ?l) |
| 879 | (setq c (1+ c))) | 647 | (setq c (1+ c))) |
| 880 | 648 | ||
| 881 | ;; Glagolitic | ||
| 882 | (setq c #x2C00) | ||
| 883 | (while (<= c #x2C2E) | ||
| 884 | (set-case-syntax-pair c (+ c 48) tbl) | ||
| 885 | (setq c (1+ c))) | ||
| 886 | |||
| 887 | ;; Coptic | 649 | ;; Coptic |
| 888 | (let ((pair-ranges '((#x2C80 . #x2CE2) | ||
| 889 | (#x2CEB . #x2CF2)))) | ||
| 890 | (dolist (elt pair-ranges) | ||
| 891 | (let ((from (car elt)) (to (cdr elt))) | ||
| 892 | (while (< from to) | ||
| 893 | (set-case-syntax-pair from (1+ from) tbl) | ||
| 894 | (setq from (+ from 2)))))) | ||
| 895 | ;; There's no Coptic category. However, Coptic letters that are | 650 | ;; There's no Coptic category. However, Coptic letters that are |
| 896 | ;; part of the Greek block above get the Greek category, and those | 651 | ;; part of the Greek block above get the Greek category, and those |
| 897 | ;; in this block are derived from Greek letters, so let's be | 652 | ;; in this block are derived from Greek letters, so let's be |
| @@ -901,45 +656,85 @@ with L, LRE, or LRO Unicode bidi character type.") | |||
| 901 | ;; Fullwidth Latin | 656 | ;; Fullwidth Latin |
| 902 | (setq c #xff21) | 657 | (setq c #xff21) |
| 903 | (while (<= c #xff3a) | 658 | (while (<= c #xff3a) |
| 904 | (set-case-syntax-pair c (+ c #x20) tbl) | ||
| 905 | (modify-category-entry c ?l) | 659 | (modify-category-entry c ?l) |
| 906 | (modify-category-entry (+ c #x20) ?l) | 660 | (modify-category-entry (+ c #x20) ?l) |
| 907 | (setq c (1+ c))) | 661 | (setq c (1+ c))) |
| 908 | 662 | ||
| 909 | ;; Deseret | 663 | ;; Combining diacritics |
| 910 | (setq c #x10400) | 664 | (modify-category-entry '(#x300 . #x362) ?^) |
| 911 | (while (<= c #x10427) | 665 | ;; Combining marks |
| 912 | (set-case-syntax-pair c (+ c 28) tbl) | 666 | (modify-category-entry '(#x20d0 . #x20ff) ?^) |
| 913 | (setq c (1+ c))) | ||
| 914 | 667 | ||
| 915 | ;; Osage | 668 | ;; Set all Letter, uppercase; Letter, lowercase and Letter, titlecase syntax |
| 916 | (setq c #x104B0) | 669 | ;; to word. |
| 917 | (while (<= c #x104D3) | 670 | (let ((syn-tab (standard-syntax-table))) |
| 918 | (set-case-syntax-pair c (+ c 40) tbl) | 671 | (map-char-table |
| 919 | (setq c (1+ c))) | 672 | (lambda (ch cat) |
| 673 | (when (memq cat '(Lu Ll Lt)) | ||
| 674 | (modify-syntax-entry ch "w " syn-tab))) | ||
| 675 | (unicode-property-table-internal 'general-category)) | ||
| 920 | 676 | ||
| 921 | ;; Old Hungarian | 677 | ;; Ⅰ through Ⅻ had word syntax in the past so set it here as well. |
| 922 | (setq c #x10c80) | 678 | ;; General category of those characers is Number, Letter. |
| 923 | (while (<= c #x10cb2) | 679 | (modify-syntax-entry '(#x2160 . #x216b) "w " syn-tab) |
| 924 | (set-case-syntax-pair c (+ c #x40) tbl) | ||
| 925 | (setq c (1+ c))) | ||
| 926 | 680 | ||
| 927 | ;; Warang Citi | 681 | ;; ⓐ thourgh ⓩ are symbols, other according to Unicode but Emacs set |
| 928 | (setq c #x118a0) | 682 | ;; their syntax to word in the past so keep backwards compatibility. |
| 929 | (while (<= c #x118bf) | 683 | (modify-syntax-entry '(#x24D0 . #x24E9) "w " syn-tab)) |
| 930 | (set-case-syntax-pair c (+ c #x20) tbl) | ||
| 931 | (setq c (1+ c))) | ||
| 932 | 684 | ||
| 933 | ;; Adlam | 685 | ;; Set downcase and upcase from Unicode properties |
| 934 | (setq c #x1e900) | ||
| 935 | (while (<= c #x1e921) | ||
| 936 | (set-case-syntax-pair c (+ c #x22) tbl) | ||
| 937 | (setq c (1+ c))) | ||
| 938 | 686 | ||
| 939 | ;; Combining diacritics | 687 | ;; In some languages, such as Turkish, U+0049 LATIN CAPITAL LETTER I and |
| 940 | (modify-category-entry '(#x300 . #x362) ?^) | 688 | ;; U+0131 LATIN SMALL LETTER DOTLESS I make a case pair, and so do U+0130 |
| 941 | ;; Combining marks | 689 | ;; LATIN CAPITAL LETTER I WITH DOT ABOVE and U+0069 LATIN SMALL LETTER I. |
| 942 | (modify-category-entry '(#x20d0 . #x20ff) ?^) | 690 | |
| 691 | ;; We used to set up half of those correspondence unconditionally, but that | ||
| 692 | ;; makes searches slow. So now we don't set up either half of these | ||
| 693 | ;; correspondences by default. | ||
| 694 | |||
| 695 | ;; (set-downcase-syntax ?İ ?i tbl) | ||
| 696 | ;; (set-upcase-syntax ?I ?ı tbl) | ||
| 697 | |||
| 698 | (let ((map-unicode-property | ||
| 699 | (lambda (property func) | ||
| 700 | (map-char-table | ||
| 701 | (lambda (ch cased) | ||
| 702 | ;; ASCII characters skipped due to reasons outlined above. As of | ||
| 703 | ;; Unicode 9.0, this exception affects the following: | ||
| 704 | ;; lc(U+0130 İ) = i | ||
| 705 | ;; uc(U+0131 ı) = I | ||
| 706 | ;; uc(U+017F ſ) = S | ||
| 707 | ;; uc(U+212A K) = k | ||
| 708 | (when (> cased 127) | ||
| 709 | (let ((end (if (consp ch) (cdr ch) ch))) | ||
| 710 | (setq ch (max 128 (if (consp ch) (car ch) ch))) | ||
| 711 | (while (<= ch end) | ||
| 712 | (funcall func ch cased) | ||
| 713 | (setq ch (1+ ch)))))) | ||
| 714 | (unicode-property-table-internal property)))) | ||
| 715 | (down tbl) | ||
| 716 | (up (case-table-get-table tbl 'up))) | ||
| 717 | |||
| 718 | ;; This works on an assumption that if toUpper(x) != x then toLower(x) == | ||
| 719 | ;; x (and the opposite for toLower/toUpper). This doesn’t hold for title | ||
| 720 | ;; case characters but those incorrect mappings will be overwritten later. | ||
| 721 | (funcall map-unicode-property 'uppercase | ||
| 722 | (lambda (lc uc) (aset down lc lc) (aset up uc uc))) | ||
| 723 | (funcall map-unicode-property 'lowercase | ||
| 724 | (lambda (uc lc) (aset down lc lc) (aset up uc uc))) | ||
| 725 | |||
| 726 | ;; Now deal with the actual mapping. This will correctly assign casing for | ||
| 727 | ;; title-case characters. | ||
| 728 | (funcall map-unicode-property 'uppercase | ||
| 729 | (lambda (lc uc) (aset up lc uc) (aset up uc uc))) | ||
| 730 | (funcall map-unicode-property 'lowercase | ||
| 731 | (lambda (uc lc) (aset down uc lc) (aset down lc lc)))) | ||
| 732 | |||
| 733 | ;; Clear out the extra slots so that they will be recomputed from the main | ||
| 734 | ;; (downcase) table and upcase table. Since we’re side-stepping the usual | ||
| 735 | ;; set-case-syntax-* functions, we need to do it explicitly. | ||
| 736 | (set-char-table-extra-slot tbl 1 nil) | ||
| 737 | (set-char-table-extra-slot tbl 2 nil) | ||
| 943 | 738 | ||
| 944 | ;; Fixme: syntax for symbols &c | 739 | ;; Fixme: syntax for symbols &c |
| 945 | ) | 740 | ) |
diff --git a/test/src/casefiddle-tests.el b/test/src/casefiddle-tests.el index 8d9cf34ee50..c752bb09172 100644 --- a/test/src/casefiddle-tests.el +++ b/test/src/casefiddle-tests.el | |||
| @@ -73,8 +73,7 @@ | |||
| 73 | 73 | ||
| 74 | (?Σ ?Σ ?σ ?Σ) | 74 | (?Σ ?Σ ?σ ?Σ) |
| 75 | (?σ ?Σ ?σ ?Σ) | 75 | (?σ ?Σ ?σ ?Σ) |
| 76 | ;; FIXME(bug#24603): Another broken one: | 76 | (?ς ?Σ ?ς ?Σ) |
| 77 | ;;(?ς ?Σ ?ς ?Σ) | ||
| 78 | 77 | ||
| 79 | (?Ⅷ ?Ⅷ ?ⅷ ?Ⅷ) | 78 | (?Ⅷ ?Ⅷ ?ⅷ ?Ⅷ) |
| 80 | (?ⅷ ?Ⅷ ?ⅷ ?Ⅷ))) | 79 | (?ⅷ ?Ⅷ ?ⅷ ?Ⅷ))) |
| @@ -196,7 +195,6 @@ | |||
| 196 | ;;("fish" "FIsh" "fish" "Fish" "Fish") | 195 | ;;("fish" "FIsh" "fish" "Fish" "Fish") |
| 197 | ;;("Straße" "STRASSE" "straße" "Straße" "Straße") | 196 | ;;("Straße" "STRASSE" "straße" "Straße" "Straße") |
| 198 | ;;("ΌΣΟΣ" "ΌΣΟΣ" "όσος" "Όσος" "Όσος") | 197 | ;;("ΌΣΟΣ" "ΌΣΟΣ" "όσος" "Όσος" "Όσος") |
| 199 | ;;("όσος" "ΌΣΟΣ" "όσος" "Όσος" "Όσος") | ||
| 200 | ;; And here’s what is actually happening: | 198 | ;; And here’s what is actually happening: |
| 201 | ("DŽUNGLA" "DŽUNGLA" "džungla" "DŽungla" "DŽUNGLA") | 199 | ("DŽUNGLA" "DŽUNGLA" "džungla" "DŽungla" "DŽUNGLA") |
| 202 | ("Džungla" "DžUNGLA" "džungla" "Džungla" "Džungla") | 200 | ("Džungla" "DžUNGLA" "džungla" "Džungla" "Džungla") |
| @@ -205,7 +203,8 @@ | |||
| 205 | ("fish" "fiSH" "fish" "fish" "fish") | 203 | ("fish" "fiSH" "fish" "fish" "fish") |
| 206 | ("Straße" "STRAßE" "straße" "Straße" "Straße") | 204 | ("Straße" "STRAßE" "straße" "Straße" "Straße") |
| 207 | ("ΌΣΟΣ" "ΌΣΟΣ" "όσοσ" "Όσοσ" "ΌΣΟΣ") | 205 | ("ΌΣΟΣ" "ΌΣΟΣ" "όσοσ" "Όσοσ" "ΌΣΟΣ") |
| 208 | ("όσος" "ΌΣΟς" "όσος" "Όσος" "Όσος")))))) | 206 | |
| 207 | ("όσος" "ΌΣΟΣ" "όσος" "Όσος" "Όσος")))))) | ||
| 209 | 208 | ||
| 210 | (ert-deftest casefiddle-tests-casing-byte8 () | 209 | (ert-deftest casefiddle-tests-casing-byte8 () |
| 211 | (should-not | 210 | (should-not |