[latex3-commits] [git/LaTeX3-latex3-latex3] main: Expand letter-like commands in \text_expand:n (1c6d0275c)

Joseph Wright joseph.wright at morningstar2.co.uk
Sun Feb 5 12:24:40 CET 2023


Repository : https://github.com/latex3/latex3
On branch  : main
Link       : https://github.com/latex3/latex3/commit/1c6d0275c15cb3075e38da858b689e1a3b230b64

>---------------------------------------------------------------

commit 1c6d0275c15cb3075e38da858b689e1a3b230b64
Author: Joseph Wright <joseph.wright at morningstar2.co.uk>
Date:   Sun Feb 5 11:23:39 2023 +0000

    Expand letter-like commands in \text_expand:n
    
    The issues with Greek case changing highlight that
    leaving 'letter like' commands in text is an issue.
    Thus whilst this is formally a breaking change, it
    will make longer-term work easier.
    
    Further commits will address related issues, most
    obviously accents, before Greek and Cyrillic
    letter commands are added to the support.


>---------------------------------------------------------------

1c6d0275c15cb3075e38da858b689e1a3b230b64
 l3kernel/CHANGELOG.md                   |   7 ++
 l3kernel/doc/l3obsolete.txt             |   1 +
 l3kernel/l3text-purify.dtx              |  74 --------------------
 l3kernel/l3text.dtx                     | 119 ++++++++++++++++----------------
 l3kernel/testfiles/m3text001.tlg        |   2 +-
 l3kernel/testfiles/m3text002.luatex.tlg |   8 +--
 l3kernel/testfiles/m3text002.tlg        |   8 +--
 l3kernel/testfiles/m3text002.xetex.tlg  |   8 +--
 8 files changed, 82 insertions(+), 145 deletions(-)

diff --git a/l3kernel/CHANGELOG.md b/l3kernel/CHANGELOG.md
index 9024ea39c..a2c668ce7 100644
--- a/l3kernel/CHANGELOG.md
+++ b/l3kernel/CHANGELOG.md
@@ -8,9 +8,16 @@ this project uses date-based 'snapshot' version identifiers.
 ## [Unreleased]
 
 ### Changed
+- `\text_expand:n` now converts letter-like commands,
+  e.g. `\ae`, to the UTF-8 equivalent: breaking change
+  also removes `\l_text_letterlike_tl` (unused outside of
+  the `expl3` kernel in TeX Live)
 - Swap meaning of `el` and `el-x-iota` when case changing
   Greek: match traditional LaTeX approach
 
+### Removed
+- `\l_text_letterlike_tl` (breaking change)
+
 ## [2023-02-02]
 
 ### Fixed
diff --git a/l3kernel/doc/l3obsolete.txt b/l3kernel/doc/l3obsolete.txt
index 722f807a6..b1a258a4d 100644
--- a/l3kernel/doc/l3obsolete.txt
+++ b/l3kernel/doc/l3obsolete.txt
@@ -342,6 +342,7 @@ Function                                      Date removed
 \KV_process_space_removal_sanitize:NNn          2011-09-08
 \l_iow_line_length_int                          2013-01-08
 \l_last_box                                     2012-05-11
+\l_text_letterlike_tl                           2023-02-05
 \l_tl_replace_toks                              2011-09-08
 \l_tmpa_toks                                    2011-09-08
 \l_tmpb_toks                                    2011-09-08
diff --git a/l3kernel/l3text-purify.dtx b/l3kernel/l3text-purify.dtx
index a28b73fe5..a086dc060 100644
--- a/l3kernel/l3text-purify.dtx
+++ b/l3kernel/l3text-purify.dtx
@@ -454,80 +454,6 @@
 %
 % \subsection{Accent and letter-like data for purifying text}
 %
-% In contrast to case changing, both $8$-bit and Unicode engines need
-% information for text purification to handle accents and letter-like
-% functions: these all need to be removed. However, the results are
-% of course engine-dependent.
-%
-% For the letter-like commands, life is relatively easy: they are all
-% simply added as standard exceptions. The only oddity is \tn{SS}, which
-% gets converted to two letters. (At some stage an alternative version
-% can presumably be added to \pkg{babel} or similar.)
-%    \begin{macrocode}
-\bool_lazy_or:nnTF
-  { \sys_if_engine_luatex_p: }
-  { \sys_if_engine_xetex_p: }
-  {
-    \cs_set_protected:Npn \@@_loop:Nn #1#2
-      {
-        \quark_if_recursion_tail_stop:N #1
-        \text_declare_purify_equivalent:Nx #1
-          {
-            \char_generate:nn { "#2 }
-              { \char_value_catcode:n { "#2 } }
-          }
-        \@@_loop:Nn
-      }
-  }
-  {
-    \cs_set_protected:Npn \@@_loop:Nn #1#2
-      {
-        \quark_if_recursion_tail_stop:N #1
-        \text_declare_purify_equivalent:Nx #1
-          {
-            \exp_args:Ne \@@_tmp:n
-              { \__kernel_codepoint_to_bytes:n { "#2 } }
-          }
-        \@@_loop:Nn
-      }
-    \cs_set:Npn \@@_tmp:n #1 { \@@_tmp:nnnn #1 }
-    \cs_set:Npn \@@_tmp:nnnn #1#2#3#4
-      {
-        \exp_after:wN \exp_after:wN \exp_after:wN
-          \exp_not:N \char_generate:nn {#1} { 13 }
-        \exp_after:wN \exp_after:wN \exp_after:wN
-          \exp_not:N \char_generate:nn {#2} { 13 }
-      }
-  }
-\@@_loop:Nn
-  \AA { 00C5 }
-  \AE { 00C6 }
-  \DH { 00D0 }
-  \DJ { 0110 }
-  \IJ { 0132 }
-  \L  { 0141 }
-  \NG { 014A }
-  \O  { 00D8 }
-  \OE { 0152 }
-  \TH { 00DE }
-  \aa { 00E5 }
-  \ae { 00E6 }
-  \dh { 00F0 }
-  \dj { 0111 }
-  \i  { 0131 }
-  \j  { 0237 }
-  \ij { 0132 }
-  \l  { 0142 }
-  \ng { 014B }
-  \o  { 00F8 }
-  \oe { 0153 }
-  \ss { 00DF }
-  \th { 00FE }
-  \q_recursion_tail ?
-  \q_recursion_stop
-\text_declare_purify_equivalent:Nn \SS { SS }
-%    \end{macrocode}
-%
 % \begin{macro}[rEXP]{\@@_purify_accent:NN}
 %   Accent \textsc{licr} handling is a little more complex. Accents may exist
 %   as pre-composed codepoints or as independent glyphs. The former are all
diff --git a/l3kernel/l3text.dtx b/l3kernel/l3text.dtx
index 6bd33f190..c111068bc 100644
--- a/l3kernel/l3text.dtx
+++ b/l3kernel/l3text.dtx
@@ -58,7 +58,8 @@
 %
 % \section{Expanding text}
 %
-% \begin{function}[EXP, added = 2020-01-02]{\text_expand:n}
+% \begin{function}[EXP, added = 2020-01-02, updated = 2023-02-05]
+%   {\text_expand:n}
 %   \begin{syntax}
 %     \cs{text_expand:n} \Arg{text}
 %   \end{syntax}
@@ -69,15 +70,17 @@
 %   \cs{l_text_math_delims_tl} or as the argument to commands listed
 %   in \cs{l_text_math_arg_tl}). Commands which are neither engine-
 %   nor \LaTeX{} protected are expanded exhaustively.
-%   Any commands listed in \cs{l_text_expand_exclude_tl},
-%   \cs{l_text_accents_tl} and \cs{l_text_letterlike_tl} are excluded from
-%   expansion.
+%   Any commands listed in \cs{l_text_expand_exclude_tl} and
+%   \cs{l_text_accents_tl} are excluded from expansion. Letter-like
+%   commands, e.g.~\cs{ae}, are converted to the UTF-8 equivalent.
 % \end{function}
 %
-% \begin{function}[added = 2020-01-22]
+% \begin{function}[added = 2020-01-22, updated = 2023-02-05]
 %   {
 %     \text_declare_expand_equivalent:Nn ,
-%     \text_declare_expand_equivalent:cn
+%     \text_declare_expand_equivalent:Nx ,
+%     \text_declare_expand_equivalent:cn ,
+%     \text_declare_expand_equivalent:cx
 %   }
 %   \begin{syntax}
 %     \cs{text_declare_expand_equivalent:Nn} \meta{cmd} \Arg{replacement}
@@ -261,11 +264,6 @@
 %   by expansion. (Defined only for the \LaTeXe{} package.)
 % \end{variable}
 %
-% \begin{variable}{\l_text_letterlike_tl}
-%   Lists commands which represent  letters; these are left unchanged by
-%   expansion. (Defined only for the \LaTeXe{} package.)
-% \end{variable}
-%
 % \begin{variable}{\l_text_math_arg_tl}
 %   Lists commands present in the \meta{text} where the argument of the
 %   command should be treated as math mode material. The treatment here is
@@ -794,28 +792,13 @@
 %
 % \subsection{Configuration variables}
 %
-% \begin{variable}{\l_text_accents_tl, \l_text_letterlike_tl}
+% \begin{variable}{\l_text_accents_tl}
 %   Special cases for accents and letter-like symbols, which in some cases will
 %   need to be converted further.
 %    \begin{macrocode}
 \tl_new:N \l_text_accents_tl
 \tl_set:Nn \l_text_accents_tl
   { \` \' \^ \~ \= \u \. \" \r \H \v \d \c \k \b \t }
-\tl_new:N \l_text_letterlike_tl
-\tl_set:Nn \l_text_letterlike_tl
-  {
-    \AA \aa
-    \AE \ae
-    \DH \dh
-    \DJ \dj
-    \IJ \ij
-    \L  \l
-    \NG \ng
-    \O  \o
-    \OE \oe
-    \SS \ss
-    \TH \th
-  }
 %    \end{macrocode}
 % \end{variable}
 %
@@ -953,8 +936,6 @@
 % \begin{macro}[EXP]{\@@_expand_exclude:Nnn}
 % \begin{macro}[EXP]{\@@_expand_accent:N}
 % \begin{macro}[EXP]{\@@_expand_accent:NN}
-% \begin{macro}[EXP]{\@@_expand_letterlike:N}
-% \begin{macro}[EXP]{\@@_expand_letterlike:NN}
 % \begin{macro}[EXP]{\@@_expand_cs:N}
 % \begin{macro}[EXP]{\@@_expand_protect:w}
 % \begin{macro}[EXP]{\@@_expand_protect:N}
@@ -1264,30 +1245,6 @@
       \q_@@_recursion_tail \q_@@_recursion_stop
   }
 \cs_new:Npn \@@_expand_accent:NN #1#2
-  {
-    \@@_if_q_recursion_tail_stop_do:Nn #2
-      { \@@_expand_letterlike:N #1 }
-    \cs_if_eq:NNTF #2 #1
-      {
-        \@@_use_i_delimit_by_q_recursion_stop:nw
-          {
-            \@@_expand_store:n {#1}
-            \@@_expand_loop:w
-          }
-      }
-      { \@@_expand_accent:NN #1 }
-  }
-%    \end{macrocode}
-%   Another list of exceptions: these ones take no arguments so are
-%   easier to handle.
-%    \begin{macrocode}
-\cs_new:Npn \@@_expand_letterlike:N #1
-  {
-    \exp_after:wN \@@_expand_letterlike:NN \exp_after:wN
-      #1 \l_text_letterlike_tl
-      \q_@@_recursion_tail \q_@@_recursion_stop
-  }
-\cs_new:Npn \@@_expand_letterlike:NN #1#2
   {
     \@@_if_q_recursion_tail_stop_do:Nn #2
       { \@@_expand_cs:N #1 }
@@ -1299,7 +1256,7 @@
             \@@_expand_loop:w
           }
       }
-      { \@@_expand_letterlike:NN #1 }
+      { \@@_expand_accent:NN #1 }
   }
 %    \end{macrocode}
 %   \LaTeXe{}'s \cs{protect} makes life interesting. Where possible, we
@@ -1495,13 +1452,13 @@
 % \end{macro}
 % \end{macro}
 % \end{macro}
-% \end{macro}
-% \end{macro}
 %
 % \begin{macro}
 %   {
 %     \text_declare_expand_equivalent:Nn ,
-%     \text_declare_expand_equivalent:cn
+%     \text_declare_expand_equivalent:cn ,
+%     \text_declare_expand_equivalent:Nx ,
+%     \text_declare_expand_equivalent:cx
 %   }
 %  Create equivalents to allow replacement.
 %    \begin{macrocode}
@@ -1510,10 +1467,56 @@
     \tl_clear_new:c { l_@@_expand_ \token_to_str:N #1 _tl }
     \tl_set:cn { l_@@_expand_ \token_to_str:N #1 _tl } {#2}
   }
-\cs_generate_variant:Nn \text_declare_expand_equivalent:Nn { c }
+\cs_generate_variant:Nn \text_declare_expand_equivalent:Nn { Nx }
+\cs_generate_variant:Nn \text_declare_expand_equivalent:Nn { c , cx }
 %    \end{macrocode}
 % \end{macro}
 %
+% \subsection{Accent and letter-like data for expansion}
+%
+% For the letter-like commands, life is relatively easy: they are all
+% simply added as standard exceptions. The only oddity is \tn{SS}, which
+% gets converted to two letters.
+%    \begin{macrocode}
+\cs_set_protected:Npn \@@_loop:Nn #1#2 % #1 = command, #2 = hex codepoint
+  {
+    \quark_if_recursion_tail_stop:N #1 % stop at the end-of-list marker
+    \text_declare_expand_equivalent:Nx #1
+      {
+        \codepoint_generate:nn {"#2}
+          { \char_value_catcode:n {"#2} }
+      }
+    \@@_loop:Nn % continue with the next command/codepoint pair
+  }
+\@@_loop:Nn
+  \AA { 00C5 }
+  \AE { 00C6 }
+  \DH { 00D0 }
+  \DJ { 0110 }
+  \IJ { 0132 }
+  \L  { 0141 }
+  \NG { 014A }
+  \O  { 00D8 }
+  \OE { 0152 }
+  \TH { 00DE }
+  \aa { 00E5 }
+  \ae { 00E6 }
+  \dh { 00F0 }
+  \dj { 0111 }
+  \i  { 0131 }
+  \j  { 0237 }
+  \ij { 0133 } % U+0133 (small ligature); U+0132 is the capital, \IJ
+  \l  { 0142 }
+  \ng { 014B }
+  \o  { 00F8 }
+  \oe { 0153 }
+  \ss { 00DF }
+  \th { 00FE }
+  \q_recursion_tail ?
+  \q_recursion_stop
+\text_declare_expand_equivalent:Nn \SS { SS } % expands to two letters
+%    \end{macrocode}
+%
 %    \begin{macrocode}
 %</package>
 %    \end{macrocode}
diff --git a/l3kernel/testfiles/m3text001.tlg b/l3kernel/testfiles/m3text001.tlg
index aca51bd87..0077f55ca 100644
--- a/l3kernel/testfiles/m3text001.tlg
+++ b/l3kernel/testfiles/m3text001.tlg
@@ -25,7 +25,7 @@ Opps not close token in $y = \sin \theta
 ============================================================
 TEST 4: Letter-like commands
 ============================================================
-\AA \aa \ae \dh \ss \l \O 
+^^c3^^85^^c3^^a5^^c3^^a6^^c3^^b0^^c3^^9f^^c5^^82^^c3^^98
 ============================================================
 ============================================================
 TEST 5: Accents
diff --git a/l3kernel/testfiles/m3text002.luatex.tlg b/l3kernel/testfiles/m3text002.luatex.tlg
index 6256fe866..b026ee684 100644
--- a/l3kernel/testfiles/m3text002.luatex.tlg
+++ b/l3kernel/testfiles/m3text002.luatex.tlg
@@ -339,10 +339,10 @@ FUSSBALL
 ============================================================
 TEST 27: Letter-like commands
 ============================================================
-\aa \aa \ae \dh \ss \l \o 
-\AA \AA \AE \DH \SS \L \O 
-\AA \aa \ae \dh \ss \l \o 
-\AA \aa \ae \dh \ss \l \O 
+^^e5^^e5^^e6^^f0^^dfł^^f8
+^^c5^^c5^^c6^^d0SSŁ^^d8
+^^c5^^c5^^c6^^d0SsŁ^^d8
+^^c5^^c5^^c6^^d0SsŁ^^d8
 ============================================================
 ============================================================
 TEST 28: Accents
diff --git a/l3kernel/testfiles/m3text002.tlg b/l3kernel/testfiles/m3text002.tlg
index 5177226a9..5cbe50b00 100644
--- a/l3kernel/testfiles/m3text002.tlg
+++ b/l3kernel/testfiles/m3text002.tlg
@@ -339,10 +339,10 @@ FUSSBALL
 ============================================================
 TEST 27: Letter-like commands
 ============================================================
-\aa \aa \ae \dh \ss \l \o 
-\AA \AA \AE \DH \SS \L \O 
-\AA \aa \ae \dh \ss \l \o 
-\AA \aa \ae \dh \ss \l \O 
+^^c3^^a5^^c3^^a5^^c3^^a6^^c3^^b0^^c3^^9f^^c5^^82^^c3^^b8
+^^c3^^85^^c3^^85^^c3^^86^^c3^^90SS^^c5^^81^^c3^^98
+^^c3^^85^^c3^^a5^^c3^^a6^^c3^^b0^^c3^^9f^^c5^^82^^c3^^b8
+^^c3^^85^^c3^^a5^^c3^^a6^^c3^^b0^^c3^^9f^^c5^^82^^c3^^98
 ============================================================
 ============================================================
 TEST 28: Accents
diff --git a/l3kernel/testfiles/m3text002.xetex.tlg b/l3kernel/testfiles/m3text002.xetex.tlg
index 6256fe866..b026ee684 100644
--- a/l3kernel/testfiles/m3text002.xetex.tlg
+++ b/l3kernel/testfiles/m3text002.xetex.tlg
@@ -339,10 +339,10 @@ FUSSBALL
 ============================================================
 TEST 27: Letter-like commands
 ============================================================
-\aa \aa \ae \dh \ss \l \o 
-\AA \AA \AE \DH \SS \L \O 
-\AA \aa \ae \dh \ss \l \o 
-\AA \aa \ae \dh \ss \l \O 
+^^e5^^e5^^e6^^f0^^dfł^^f8
+^^c5^^c5^^c6^^d0SSŁ^^d8
+^^c5^^c5^^c6^^d0SsŁ^^d8
+^^c5^^c5^^c6^^d0SsŁ^^d8
 ============================================================
 ============================================================
 TEST 28: Accents





More information about the latex3-commits mailing list.