[latex3-commits] [git/LaTeX3-latex3-latex3] main: Convert all accents to Unicode in supporting engines (60e284652)

Joseph Wright joseph.wright at morningstar2.co.uk
Sun Feb 5 18:02:31 CET 2023


Repository : https://github.com/latex3/latex3
On branch  : main
Link       : https://github.com/latex3/latex3/commit/60e284652c21844303f80df31eee59180795eb74

>---------------------------------------------------------------

commit 60e284652c21844303f80df31eee59180795eb74
Author: Joseph Wright <joseph.wright at morningstar2.co.uk>
Date:   Sun Feb 5 17:02:31 2023 +0000

    Convert all accents to Unicode in supporting engines


>---------------------------------------------------------------

60e284652c21844303f80df31eee59180795eb74
 l3kernel/l3text-purify.dtx              | 83 +++++++++++++++++----------------
 l3kernel/l3text.dtx                     | 57 +++++++++++++++++++---
 l3kernel/testfiles/m3text001.luatex.tlg |  2 +-
 l3kernel/testfiles/m3text001.xetex.tlg  |  2 +-
 l3kernel/testfiles/m3text002.luatex.tlg |  8 ++--
 l3kernel/testfiles/m3text002.xetex.tlg  |  8 ++--
 6 files changed, 105 insertions(+), 55 deletions(-)

diff --git a/l3kernel/l3text-purify.dtx b/l3kernel/l3text-purify.dtx
index c547cd14b..0599f7f8c 100644
--- a/l3kernel/l3text-purify.dtx
+++ b/l3kernel/l3text-purify.dtx
@@ -459,51 +459,56 @@
 %   the $8$-bit engines need these leaving as commands for typesetting.
 %   However, we now need to make those into real Unicode codepoints: they
 %   come after the letter, so there is a shuffle as well as a change of
-%   data.
+%   data. This only applies to non-Unicode engines.
 %    \begin{macrocode}
-\cs_new:Npn \@@_purify_accent:NN #1#2
+\bool_lazy_or:nnF
+  { \sys_if_engine_luatex_p: }
+  { \sys_if_engine_xetex_p: }
   {
-    \cs_if_exist:cTF
-      { c_@@_purify_ \token_to_str:N #1 _ \token_to_str:N #2 _tl }
+    \cs_new:Npn \@@_purify_accent:NN #1#2
       {
-        \exp_not:v
+        \cs_if_exist:cTF
           { c_@@_purify_ \token_to_str:N #1 _ \token_to_str:N #2 _tl }
+          {
+            \exp_not:v
+              { c_@@_purify_ \token_to_str:N #1 _ \token_to_str:N #2 _tl }
+          }
+          {
+            \exp_not:n {#2}
+            \exp_not:v { c_@@_purify_ \token_to_str:N #1 _tl }
+          }
       }
-      {
-        \exp_not:n {#2}
-        \exp_not:v { c_@@_purify_ \token_to_str:N #1 _tl }
-      }
-  }
-\tl_map_inline:nn { \` \' \^ \~ \= \u \. \" \r \H \v \d \c \k \b \t }
-  { \text_declare_purify_equivalent:Nn #1 { \@@_purify_accent:NN #1 } }
-\group_begin:
-  \cs_set_protected:Npn \@@_loop:Nn #1#2
-    {
-      \quark_if_recursion_tail_stop:N #1
-      \tl_const:cx { c_@@_purify_ \token_to_str:N #1 _tl }
-        { \codepoint_generate:nn {"#2} { \char_value_catcode:n { "#2 } } }
+    \tl_map_inline:nn { \` \' \^ \~ \= \u \. \" \r \H \v \d \c \k \b \t }
+      { \text_declare_purify_equivalent:Nn #1 { \@@_purify_accent:NN #1 } }
+    \group_begin:
+      \cs_set_protected:Npn \@@_loop:Nn #1#2
+        {
+          \quark_if_recursion_tail_stop:N #1
+          \tl_const:cx { c_@@_purify_ \token_to_str:N #1 _tl }
+            { \codepoint_generate:nn {"#2} { \char_value_catcode:n { "#2 } } }
+          \@@_loop:Nn
+        }
       \@@_loop:Nn
-    }
-  \@@_loop:Nn
-    \` { 0300 }
-    \' { 0301 }
-    \^ { 0302 }
-    \~ { 0303 }
-    \= { 0304 }
-    \u { 0306 }
-    \. { 0307 }
-    \" { 0308 }
-    \r { 030A }
-    \H { 030B }
-    \v { 030C }
-    \d { 0323 }
-    \c { 0327 }
-    \k { 0328 }
-    \b { 0331 }
-    \t { 0361 }
-    \q_recursion_tail { }
-    \q_recursion_stop
-\group_end:
+        \` { 0300 }
+        \' { 0301 }
+        \^ { 0302 }
+        \~ { 0303 }
+        \= { 0304 }
+        \u { 0306 }
+        \. { 0307 }
+        \" { 0308 }
+        \r { 030A }
+        \H { 030B }
+        \v { 030C }
+        \d { 0323 }
+        \c { 0327 }
+        \k { 0328 }
+        \b { 0331 }
+        \t { 0361 }
+        \q_recursion_tail { }
+        \q_recursion_stop
+    \group_end:
+  }
 %    \end{macrocode}
 % \end{macro}
 %
diff --git a/l3kernel/l3text.dtx b/l3kernel/l3text.dtx
index 4d7d2f2a1..9655809f7 100644
--- a/l3kernel/l3text.dtx
+++ b/l3kernel/l3text.dtx
@@ -73,6 +73,8 @@
 %   Any commands listed in \cs{l_text_expand_exclude_tl} are excluded from
 %   expansion. Letter-like commands, e.g.~\cs{ae}, and accent commands creating
 %   pre-composed UTF-8 codepoints are converted to the UTF-8 equivalents.
+%   For Unicode engines, all accents are converted to the appropriate combining
+%   characters.
 % \end{function}
 %
 % \begin{function}[added = 2020-01-22, updated = 2023-02-05]
@@ -1432,15 +1434,16 @@
 %
 % \subsection{Accent and letter-like data for expansion}
 %
-% \begin{macro}[rEXP]{\@@_expand_accent:NN}
+% \begin{macro}[rEXP]{\@@_expand_accent:Nn, \@@_expand_accent_aux:Nn}
 %   Accent \textsc{licr} handling is a little more complex. Accents may exist
 %   as pre-composed codepoints or as independent glyphs. At the expansion
 %   stage, only pre-composed glyphs can be processed: for \emph{ad hoc} accents,
-%   we have to retain the command version for $8$-bit typesetting. So at this
-%   stage we do not remove everything (see the purification code for the
-%   action once we are definitely past typesetting).
+%   we have to retain the command version for $8$-bit typesetting. On the
+%   other hand, some text processing will not work properly with such
+%   command-based accents, so we convert to combining characters if there is
+%   engine support.
 %    \begin{macrocode}
-\cs_new:Npn \@@_expand_accent:NN #1#2
+\cs_new:Npn \@@_expand_accent:Nn #1#2
   {
     \cs_if_exist:cTF
       { c_@@_expand_ \token_to_str:N #1 _ \token_to_str:N #2 _tl }
@@ -1448,10 +1451,52 @@
         \exp_not:v
           { c_@@_expand_ \token_to_str:N #1 _ \token_to_str:N #2 _tl }
       }
+      { \@@_expand_accent_aux:Nn #1 {#2} }
+  }
+\bool_lazy_or:nnTF
+  { \sys_if_engine_luatex_p: }
+  { \sys_if_engine_xetex_p: }
+  {
+    \cs_new:Npn \@@_expand_accent_aux:Nn #1#2
+      {
+        #2
+        \use:c { c_@@_expand_ \token_to_str:N #1 _tl }
+      }
+    \group_begin:
+      \cs_set_protected:Npn \@@_loop:Nn #1#2
+        {
+          \quark_if_recursion_tail_stop:N #1
+          \tl_const:cx { c_@@_expand_ \token_to_str:N #1 _tl }
+            { \codepoint_generate:nn {"#2} { \char_value_catcode:n { "#2 } } }
+          \@@_loop:Nn
+        }
+      \@@_loop:Nn
+        \` { 0300 }
+        \' { 0301 }
+        \^ { 0302 }
+        \~ { 0303 }
+        \= { 0304 }
+        \u { 0306 }
+        \. { 0307 }
+        \" { 0308 }
+        \r { 030A }
+        \H { 030B }
+        \v { 030C }
+        \d { 0323 }
+        \c { 0327 }
+        \k { 0328 }
+        \b { 0331 }
+        \t { 0361 }
+        \q_recursion_tail { }
+        \q_recursion_stop
+    \group_end:
+  }
+  {
+    \cs_new:Npn \@@_expand_accent_aux:Nn #1#2
       { \exp_not:n {#1} {#2} }
   }
 \tl_map_inline:nn { \` \' \^ \~ \= \u \. \" \r \H \v \d \c \k \b \t }
-  { \text_declare_expand_equivalent:Nn #1 { \@@_expand_accent:NN #1 } }
+  { \text_declare_expand_equivalent:Nn #1 { \@@_expand_accent:Nn #1 } }
 %    \end{macrocode}
 %   The list of pre-composed accent characters here is taken from
 %   \texttt{puenc.def}. All of the pre-composed cases take a single letter
diff --git a/l3kernel/testfiles/m3text001.luatex.tlg b/l3kernel/testfiles/m3text001.luatex.tlg
index aa4607de4..67ae745ae 100644
--- a/l3kernel/testfiles/m3text001.luatex.tlg
+++ b/l3kernel/testfiles/m3text001.luatex.tlg
@@ -30,7 +30,7 @@ TEST 4: Letter-like commands
 ============================================================
 TEST 5: Accents
 ============================================================
-^^e4ĕ\H {i}ǒ\.{u}
+^^e4ĕi̋ǒu̇
 ============================================================
 ============================================================
 TEST 6: Implicit tokens
diff --git a/l3kernel/testfiles/m3text001.xetex.tlg b/l3kernel/testfiles/m3text001.xetex.tlg
index aa4607de4..67ae745ae 100644
--- a/l3kernel/testfiles/m3text001.xetex.tlg
+++ b/l3kernel/testfiles/m3text001.xetex.tlg
@@ -30,7 +30,7 @@ TEST 4: Letter-like commands
 ============================================================
 TEST 5: Accents
 ============================================================
-^^e4ĕ\H {i}ǒ\.{u}
+^^e4ĕi̋ǒu̇
 ============================================================
 ============================================================
 TEST 6: Implicit tokens
diff --git a/l3kernel/testfiles/m3text002.luatex.tlg b/l3kernel/testfiles/m3text002.luatex.tlg
index 10708fb9b..6f4570b0c 100644
--- a/l3kernel/testfiles/m3text002.luatex.tlg
+++ b/l3kernel/testfiles/m3text002.luatex.tlg
@@ -347,10 +347,10 @@ TEST 27: Letter-like commands
 ============================================================
 TEST 28: Accents
 ============================================================
-^^e4ĕ\H {i}ǒ\.{u}
-^^c4Ĕ\H {I}Ǒ\.{U}
-^^c4Ĕ\H {I}ǒ\.{u}
-^^c4Ĕ\H {I}ǒ\.{u}
+^^e4ĕi̋ǒu̇
+^^c4ĔI̋ǑU̇
+^^c4ĔI̋ǒu̇
+^^c4ĔI̋ǒu̇
 ============================================================
 ============================================================
 TEST 29: Active chars
diff --git a/l3kernel/testfiles/m3text002.xetex.tlg b/l3kernel/testfiles/m3text002.xetex.tlg
index 10708fb9b..6f4570b0c 100644
--- a/l3kernel/testfiles/m3text002.xetex.tlg
+++ b/l3kernel/testfiles/m3text002.xetex.tlg
@@ -347,10 +347,10 @@ TEST 27: Letter-like commands
 ============================================================
 TEST 28: Accents
 ============================================================
-^^e4ĕ\H {i}ǒ\.{u}
-^^c4Ĕ\H {I}Ǒ\.{U}
-^^c4Ĕ\H {I}ǒ\.{u}
-^^c4Ĕ\H {I}ǒ\.{u}
+^^e4ĕi̋ǒu̇
+^^c4ĔI̋ǑU̇
+^^c4ĔI̋ǒu̇
+^^c4ĔI̋ǒu̇
 ============================================================
 ============================================================
 TEST 29: Active chars





More information about the latex3-commits mailing list.