[latex3-commits] [git/LaTeX3-latex3-latex3] main: Convert all accents to Unicode in supporting engines (60e284652)
Joseph Wright
joseph.wright at morningstar2.co.uk
Sun Feb 5 18:02:31 CET 2023
Repository : https://github.com/latex3/latex3
On branch : main
Link : https://github.com/latex3/latex3/commit/60e284652c21844303f80df31eee59180795eb74
>---------------------------------------------------------------
commit 60e284652c21844303f80df31eee59180795eb74
Author: Joseph Wright <joseph.wright at morningstar2.co.uk>
Date: Sun Feb 5 17:02:31 2023 +0000
Convert all accents to Unicode in supporting engines
>---------------------------------------------------------------
60e284652c21844303f80df31eee59180795eb74
l3kernel/l3text-purify.dtx | 83 +++++++++++++++++----------------
l3kernel/l3text.dtx | 57 +++++++++++++++++++---
l3kernel/testfiles/m3text001.luatex.tlg | 2 +-
l3kernel/testfiles/m3text001.xetex.tlg | 2 +-
l3kernel/testfiles/m3text002.luatex.tlg | 8 ++--
l3kernel/testfiles/m3text002.xetex.tlg | 8 ++--
6 files changed, 105 insertions(+), 55 deletions(-)
diff --git a/l3kernel/l3text-purify.dtx b/l3kernel/l3text-purify.dtx
index c547cd14b..0599f7f8c 100644
--- a/l3kernel/l3text-purify.dtx
+++ b/l3kernel/l3text-purify.dtx
@@ -459,51 +459,56 @@
% the $8$-bit engines need these leaving as commands for typesetting.
% However, we now need to make those into real Unicode codepoints: they
% come after the letter, so there is a shuffle as well as a change of
-% data.
+% data. This only applies to non-Unicode engines.
% \begin{macrocode}
-\cs_new:Npn \@@_purify_accent:NN #1#2
+\bool_lazy_or:nnF
+ { \sys_if_engine_luatex_p: }
+ { \sys_if_engine_xetex_p: }
{
- \cs_if_exist:cTF
- { c_@@_purify_ \token_to_str:N #1 _ \token_to_str:N #2 _tl }
+ \cs_new:Npn \@@_purify_accent:NN #1#2
{
- \exp_not:v
+ \cs_if_exist:cTF
{ c_@@_purify_ \token_to_str:N #1 _ \token_to_str:N #2 _tl }
+ {
+ \exp_not:v
+ { c_@@_purify_ \token_to_str:N #1 _ \token_to_str:N #2 _tl }
+ }
+ {
+ \exp_not:n {#2}
+ \exp_not:v { c_@@_purify_ \token_to_str:N #1 _tl }
+ }
}
- {
- \exp_not:n {#2}
- \exp_not:v { c_@@_purify_ \token_to_str:N #1 _tl }
- }
- }
-\tl_map_inline:nn { \` \' \^ \~ \= \u \. \" \r \H \v \d \c \k \b \t }
- { \text_declare_purify_equivalent:Nn #1 { \@@_purify_accent:NN #1 } }
-\group_begin:
- \cs_set_protected:Npn \@@_loop:Nn #1#2
- {
- \quark_if_recursion_tail_stop:N #1
- \tl_const:cx { c_@@_purify_ \token_to_str:N #1 _tl }
- { \codepoint_generate:nn {"#2} { \char_value_catcode:n { "#2 } } }
+ \tl_map_inline:nn { \` \' \^ \~ \= \u \. \" \r \H \v \d \c \k \b \t }
+ { \text_declare_purify_equivalent:Nn #1 { \@@_purify_accent:NN #1 } }
+ \group_begin:
+ \cs_set_protected:Npn \@@_loop:Nn #1#2
+ {
+ \quark_if_recursion_tail_stop:N #1
+ \tl_const:cx { c_@@_purify_ \token_to_str:N #1 _tl }
+ { \codepoint_generate:nn {"#2} { \char_value_catcode:n { "#2 } } }
+ \@@_loop:Nn
+ }
\@@_loop:Nn
- }
- \@@_loop:Nn
- \` { 0300 }
- \' { 0301 }
- \^ { 0302 }
- \~ { 0303 }
- \= { 0304 }
- \u { 0306 }
- \. { 0307 }
- \" { 0308 }
- \r { 030A }
- \H { 030B }
- \v { 030C }
- \d { 0323 }
- \c { 0327 }
- \k { 0328 }
- \b { 0331 }
- \t { 0361 }
- \q_recursion_tail { }
- \q_recursion_stop
-\group_end:
+ \` { 0300 }
+ \' { 0301 }
+ \^ { 0302 }
+ \~ { 0303 }
+ \= { 0304 }
+ \u { 0306 }
+ \. { 0307 }
+ \" { 0308 }
+ \r { 030A }
+ \H { 030B }
+ \v { 030C }
+ \d { 0323 }
+ \c { 0327 }
+ \k { 0328 }
+ \b { 0331 }
+ \t { 0361 }
+ \q_recursion_tail { }
+ \q_recursion_stop
+ \group_end:
+ }
% \end{macrocode}
% \end{macro}
%
diff --git a/l3kernel/l3text.dtx b/l3kernel/l3text.dtx
index 4d7d2f2a1..9655809f7 100644
--- a/l3kernel/l3text.dtx
+++ b/l3kernel/l3text.dtx
@@ -73,6 +73,8 @@
% Any commands listed in \cs{l_text_expand_exclude_tl} are excluded from
% expansion. Letter-like commands, e.g.~\cs{ae}, and accent commands creating
% pre-composed UTF-8 codepoints are converted to the UTF-8 equivalents.
+% For Unicode engines, all accents are converted to the appropriate combining
+% characters.
% \end{function}
%
% \begin{function}[added = 2020-01-22, updated = 2023-02-05]
@@ -1432,15 +1434,16 @@
%
% \subsection{Accent and letter-like data for expansion}
%
-% \begin{macro}[rEXP]{\@@_expand_accent:NN}
+% \begin{macro}[rEXP]{\@@_expand_accent:Nn, \@@_expand_accent_aux:Nn}
% Accent \textsc{licr} handling is a little more complex. Accents may exist
% as pre-composed codepoints or as independent glyphs. At the expansion
% stage, only pre-composed glyphs can be processed: for \emph{ad hoc} accents,
-% we have to retain the command version for $8$-bit typesetting. So at this
-% stage we do not remove everything (see the purification code for the
-% action once we are definitely past typesetting).
+% we have to retain the command version for $8$-bit typesetting. On the
+% other hand, some text processing will not work properly with such
+% command-based accents, so we convert to combining characters if there is
+% engine support.
% \begin{macrocode}
-\cs_new:Npn \@@_expand_accent:NN #1#2
+\cs_new:Npn \@@_expand_accent:Nn #1#2
{
\cs_if_exist:cTF
{ c_@@_expand_ \token_to_str:N #1 _ \token_to_str:N #2 _tl }
@@ -1448,10 +1451,52 @@
\exp_not:v
{ c_@@_expand_ \token_to_str:N #1 _ \token_to_str:N #2 _tl }
}
+ { \@@_expand_accent_aux:Nn #1 {#2} }
+ }
+\bool_lazy_or:nnTF
+ { \sys_if_engine_luatex_p: }
+ { \sys_if_engine_xetex_p: }
+ {
+ \cs_new:Npn \@@_expand_accent_aux:Nn #1#2
+ {
+ #2
+ \use:c { c_@@_expand_ \token_to_str:N #1 _tl }
+ }
+ \group_begin:
+ \cs_set_protected:Npn \@@_loop:Nn #1#2
+ {
+ \quark_if_recursion_tail_stop:N #1
+ \tl_const:cx { c_@@_expand_ \token_to_str:N #1 _tl }
+ { \codepoint_generate:nn {"#2} { \char_value_catcode:n { "#2 } } }
+ \@@_loop:Nn
+ }
+ \@@_loop:Nn
+ \` { 0300 }
+ \' { 0301 }
+ \^ { 0302 }
+ \~ { 0303 }
+ \= { 0304 }
+ \u { 0306 }
+ \. { 0307 }
+ \" { 0308 }
+ \r { 030A }
+ \H { 030B }
+ \v { 030C }
+ \d { 0323 }
+ \c { 0327 }
+ \k { 0328 }
+ \b { 0331 }
+ \t { 0361 }
+ \q_recursion_tail { }
+ \q_recursion_stop
+ \group_end:
+ }
+ {
+ \cs_new:Npn \@@_expand_accent_aux:Nn #1#2
{ \exp_not:n {#1} {#2} }
}
\tl_map_inline:nn { \` \' \^ \~ \= \u \. \" \r \H \v \d \c \k \b \t }
- { \text_declare_expand_equivalent:Nn #1 { \@@_expand_accent:NN #1 } }
+ { \text_declare_expand_equivalent:Nn #1 { \@@_expand_accent:Nn #1 } }
% \end{macrocode}
% The list of pre-composed accent characters here is taken from
% \texttt{puenc.def}. All of the pre-composed cases take a single letter
diff --git a/l3kernel/testfiles/m3text001.luatex.tlg b/l3kernel/testfiles/m3text001.luatex.tlg
index aa4607de4..67ae745ae 100644
--- a/l3kernel/testfiles/m3text001.luatex.tlg
+++ b/l3kernel/testfiles/m3text001.luatex.tlg
@@ -30,7 +30,7 @@ TEST 4: Letter-like commands
============================================================
TEST 5: Accents
============================================================
-^^e4ĕ\H {i}ǒ\.{u}
+^^e4ĕi̋ǒu̇
============================================================
============================================================
TEST 6: Implicit tokens
diff --git a/l3kernel/testfiles/m3text001.xetex.tlg b/l3kernel/testfiles/m3text001.xetex.tlg
index aa4607de4..67ae745ae 100644
--- a/l3kernel/testfiles/m3text001.xetex.tlg
+++ b/l3kernel/testfiles/m3text001.xetex.tlg
@@ -30,7 +30,7 @@ TEST 4: Letter-like commands
============================================================
TEST 5: Accents
============================================================
-^^e4ĕ\H {i}ǒ\.{u}
+^^e4ĕi̋ǒu̇
============================================================
============================================================
TEST 6: Implicit tokens
diff --git a/l3kernel/testfiles/m3text002.luatex.tlg b/l3kernel/testfiles/m3text002.luatex.tlg
index 10708fb9b..6f4570b0c 100644
--- a/l3kernel/testfiles/m3text002.luatex.tlg
+++ b/l3kernel/testfiles/m3text002.luatex.tlg
@@ -347,10 +347,10 @@ TEST 27: Letter-like commands
============================================================
TEST 28: Accents
============================================================
-^^e4ĕ\H {i}ǒ\.{u}
-^^c4Ĕ\H {I}Ǒ\.{U}
-^^c4Ĕ\H {I}ǒ\.{u}
-^^c4Ĕ\H {I}ǒ\.{u}
+^^e4ĕi̋ǒu̇
+^^c4ĔI̋ǑU̇
+^^c4ĔI̋ǒu̇
+^^c4ĔI̋ǒu̇
============================================================
============================================================
TEST 29: Active chars
diff --git a/l3kernel/testfiles/m3text002.xetex.tlg b/l3kernel/testfiles/m3text002.xetex.tlg
index 10708fb9b..6f4570b0c 100644
--- a/l3kernel/testfiles/m3text002.xetex.tlg
+++ b/l3kernel/testfiles/m3text002.xetex.tlg
@@ -347,10 +347,10 @@ TEST 27: Letter-like commands
============================================================
TEST 28: Accents
============================================================
-^^e4ĕ\H {i}ǒ\.{u}
-^^c4Ĕ\H {I}Ǒ\.{U}
-^^c4Ĕ\H {I}ǒ\.{u}
-^^c4Ĕ\H {I}ǒ\.{u}
+^^e4ĕi̋ǒu̇
+^^c4ĔI̋ǑU̇
+^^c4ĔI̋ǒu̇
+^^c4ĔI̋ǒu̇
============================================================
============================================================
TEST 29: Active chars
More information about the latex3-commits
mailing list.