[latex3-commits] [git/LaTeX3-latex3-latex3] l3text: Add \text_purify:n (253291ebd)

Fri Dec 6 10:22:48 CET 2019

Repository : https://github.com/latex3/latex3
On branch  : l3text
Link       : https://github.com/latex3/latex3/commit/253291ebd9663e08bec2c086b0be59ada3e08eaa

>---------------------------------------------------------------

commit 253291ebd9663e08bec2c086b0be59ada3e08eaa
Author: Joseph Wright <joseph.wright at morningstar2.co.uk>
Date:   Fri Dec 6 07:29:38 2019 +0000

    Add \text_purify:n
    
    This removes *text* formatting: math mode still
    to be considered.


>---------------------------------------------------------------

253291ebd9663e08bec2c086b0be59ada3e08eaa
 l3kernel/CHANGELOG.md                              |   1 +
 l3kernel/l3text.dtx                                | 734 ++++++++++++++++++++-
 .../{m3text001.tlg => m3text003.luatex.tlg}        |  22 +-
 l3kernel/testfiles/m3text003.lvt                   |  62 ++
 .../{m3str002.uptex.tlg => m3text003.ptex.tlg}     |  27 +-
 .../testfiles/{m3text001.tlg => m3text003.tlg}     |  22 +-
 .../{m3text001.tlg => m3text003.uptex.tlg}         |  22 +-
 .../{m3text001.tlg => m3text003.xetex.tlg}         |  22 +-
 8 files changed, 830 insertions(+), 82 deletions(-)

diff --git a/l3kernel/CHANGELOG.md b/l3kernel/CHANGELOG.md
index a711ed31f..1bea4ea51 100644
--- a/l3kernel/CHANGELOG.md
+++ b/l3kernel/CHANGELOG.md
@@ -13,6 +13,7 @@ this project uses date-based 'snapshot' version identifiers.
 - `\str_foldcase:n`
 - `\str_lowercase:n`
 - `\str_uppercase:n`
+- `\text_purify:n`
 - `\text_lowercase:n, `\text_lowercase:nn`
 - `\text_uppercase:n, `\text_uppercase:nn`
 - `\text_titlecase:n, `\text_titlecase:nn`
diff --git a/l3kernel/l3text.dtx b/l3kernel/l3text.dtx
index 700c9a3d9..ab7513970 100644
--- a/l3kernel/l3text.dtx
+++ b/l3kernel/l3text.dtx
@@ -159,6 +159,33 @@
 %   \end{itemize}
 % \end{function}
 %
+% \begin{function}[rEXP, added = 2019-12-05]{\text_purify:n}
+%   \begin{syntax}
+%     \cs{text_purify:n} \Arg{text}
+%   \end{syntax}
+%   Takes user input \meta{text} and expands it as described for
+%   \cs{text_expand:n}, then removes all functions from the resulting
+%   text. Math mode material (as delimited by pairs given in
+%   \cs{l_text_math_delims_tl} or as the argument to commands listed in
+%   \cs{l_text_math_arg_tl}) is not processed: these tokens and functions
+%   are left in place. Non-expandable functions present in the \meta{text}
+%   either have a defined equivalent (see
+%   \cs{text_declare_purify_equivalent:Nn}) or are removed from the result.
+% \end{function}
+%
+% \begin{function}[added = 2019-12-05]
+%   {
+%     \text_declare_purify_equivalent:Nn ,
+%     \text_declare_purify_equivalent:Nx
+%   }
+%   \begin{syntax}
+%     \cs{text_declare_purify_equivalent:Nn} \meta{cmd} \Arg{replacement}
+%   \end{syntax}
+%   Declares that the \meta{replacement} tokens should be used whenever the
+%   \meta{cmd} (a single token) is encountered by \cs{text_purify:n}. The
+%   \meta{replacement} tokens should be expandable.
+% \end{function}
+%
 % \begin{variable}{\l_text_accents_tl}
 %   Lists commands which represent accents, and which are left unchanged
 %   by expansion.
@@ -372,6 +399,29 @@
 %    \end{macrocode}
 % \end{macro}
 %
+% \begin{macro}[EXP, TF]{\@@_if_expandable:N}
+%   Test for tokens that make sense to expand here: this is stricter than the
+%   engine view, as protected macros and \cs{q_recursion_tail} are excluded.
+%    \begin{macrocode}
+\prg_new_conditional:Npnn \@@_if_expandable:N #1 { T , F , TF }
+  {
+    \token_if_expandable:NTF #1
+      {
+        \bool_lazy_any:nTF
+          {
+            { \token_if_protected_macro_p:N      #1 }
+            { \token_if_protected_long_macro_p:N #1 }
+            { \token_if_eq_meaning_p:NN \q_recursion_tail #1 }
+          }
+          { \prg_return_false: }
+          { \prg_return_true: }
+      }
+      { \prg_return_false: }
+  }
+%    \end{macrocode}
+% \end{macro}
+%
+%
 % \subsection{Configuration variables}
 %
 % \begin{variable}{\l_text_accents_tl, \l_text_letterlike_tl}
@@ -482,7 +532,6 @@
 % \begin{macro}[rEXP]{\@@_expand_protect:nN}
 % \begin{macro}[rEXP]{\@@_expand_protect:Nw}
 % \begin{macro}[rEXP]{\@@_expand_cs_expand:N}
-% \begin{macro}[rEXP]{\@@_expand_if_expand:NTF}
 %   After precautions against |&| tokens, start a simple loop: that of
 %   course means that \enquote{text} cannot contain the two recursion
 %   quarks.
@@ -772,28 +821,13 @@
 %    \begin{macrocode}
 \cs_new:Npn \@@_expand_cs_expand:N #1
   {
-    \@@_expand_if_expand:NTF #1
+    \@@_if_expandable:NTF #1
       { \exp_after:wN \@@_expand_loop:w #1 }
       {
         \exp_not:n {#1}
         \@@_expand_loop:w
       }
   }
-\cs_new:Npn \@@_expand_if_expand:NTF #1
-  {
-    \token_if_expandable:NTF #1
-      {
-        \bool_lazy_any:nTF
-          {
-            { \token_if_protected_macro_p:N      #1 }
-            { \token_if_protected_long_macro_p:N #1 }
-            { \token_if_eq_meaning_p:NN \q_recursion_tail #1 }
-          }
-          { \use_ii:nn }
-          { \use_i:nn }
-      }
-      { \use_ii:nn }
-  }
 %    \end{macrocode}
 % \end{macro}
 % \end{macro}
@@ -815,7 +849,6 @@
 % \end{macro}
 % \end{macro}
 % \end{macro}
-% \end{macro}
 %
 % \subsection{Case changing}
 %
@@ -1806,7 +1839,7 @@
 %    \end{macrocode}
 % \end{macro}
 %
-% \subsection{Case changing data}
+% \subsection{Case changing data for $8$-bit engines}
 %
 % \begin{variable}
 %   {
@@ -2064,6 +2097,669 @@
 \group_end:
 %    \end{macrocode}
 %
+% \subsection{Purifying text}
+%
+% \begin{macro}[rEXP]{\text_purify:n, \@@_purify:n}
+% \begin{macro}[rEXP]{\@@_purify_loop:w}
+% \begin{macro}[rEXP]{\@@_purify_N_type:N}
+% \begin{macro}[rEXP]{\@@_purify_group:n}
+% \begin{macro}[rEXP]{\@@_purify_space:w}
+%   As elsewhere in the module, this is a standard \enquote{action} loop:
+%   \cs{text_expand:n} is applied up-front and the result is then walked.
+%    \begin{macrocode}
+\cs_new:Npn \text_purify:n #1
+  {
+    \group_align_safe_begin:
+      \exp_args:Ne \@@_purify:n
+        { \text_expand:n {#1} }
+    \group_align_safe_end:
+  }
+\cs_new:Npn \@@_purify:n #1
+  { \@@_purify_loop:w #1 \q_recursion_tail \q_recursion_stop }
+%    \end{macrocode}
+%   The main loop is a standard \enquote{tl action}. Unlike expansion or
+%   case changing, groups are unbraced and looped over inline, and spaces
+%   are passed through. The real work is in \texttt{N}-type processing.
+%    \begin{macrocode}
+\cs_new:Npn \@@_purify_loop:w #1 \q_recursion_stop
+  {
+    \tl_if_head_is_N_type:nTF {#1}
+      { \@@_purify_N_type:N }
+      {
+        \tl_if_head_is_group:nTF {#1}
+          { \@@_purify_group:n }
+          { \@@_purify_space:w }
+      }
+    #1 \q_recursion_stop
+  }
+\cs_new:Npn \@@_purify_group:n #1 { \@@_purify_loop:w #1 }
+\exp_last_unbraced:NNo \cs_new:Npn \@@_purify_space:w \c_space_tl
+  {
+    \c_space_tl
+    \@@_purify_loop:w
+  }
+%    \end{macrocode}
+%   Math mode handling starts as in the other functions: each token is
+%   compared with the openers in \cs{l_text_math_delims_tl} and on a match
+%   an inner loop copies tokens unchanged up to the paired closing token.
+%    \begin{macrocode}
+\cs_new:Npn \@@_purify_N_type:N #1
+  {
+    \quark_if_recursion_tail_stop:N #1
+    \@@_purify_N_type_aux:N #1
+  }
+\cs_new:Npn \@@_purify_N_type_aux:N #1
+  {
+    \exp_after:wN \@@_purify_math_search:NNN
+      \exp_after:wN #1 \l_text_math_delims_tl
+      \q_recursion_tail ?
+      \q_recursion_stop
+  }
+\cs_new:Npn \@@_purify_math_search:NNN #1#2#3
+  {
+    \quark_if_recursion_tail_stop_do:Nn #2
+      { \@@_purify_math_cmd:N #1 }
+    \token_if_eq_meaning:NNTF #1 #2
+      {
+        \use_i_delimit_by_q_recursion_stop:nw
+           {
+             \exp_not:n {#1}
+             \@@_purify_math_loop:Nw #3
+           }
+      }
+      { \@@_purify_math_search:NNN #1 }
+  }
+\cs_new:Npn \@@_purify_math_loop:Nw #1#2 \q_recursion_stop
+  {
+    \tl_if_head_is_N_type:nTF {#2}
+      { \@@_purify_math_N_type:NN }
+      {
+        \tl_if_head_is_group:nTF {#2}
+          { \@@_purify_math_group:Nn }
+          { \@@_purify_math_space:Nw }
+      }
+    #1#2 \q_recursion_stop
+  }
+\cs_new:Npn \@@_purify_math_N_type:NN #1#2
+  {
+    \quark_if_recursion_tail_stop:N #2
+    \exp_not:n {#2}
+    \token_if_eq_meaning:NNTF #2 #1
+      { \@@_purify_loop:w }
+      { \@@_purify_math_loop:Nw #1 }
+  }
+\cs_new:Npn \@@_purify_math_group:Nn #1#2
+  {
+    { \exp_not:n {#2} }
+    \@@_purify_math_loop:Nw #1
+  }
+\exp_after:wN \cs_new:Npn \exp_after:wN \@@_purify_math_space:Nw 
+  \exp_after:wN # \exp_after:wN 1 \c_space_tl
+  {
+    \c_space_tl
+    \@@_purify_math_loop:Nw #1
+  }
+%    \end{macrocode}
+%   Then handle math mode given as the argument of a command listed in
+%   \cs{l_text_math_arg_tl}: the command and braced argument are kept as-is.
+%    \begin{macrocode}
+\cs_new:Npn \@@_purify_math_cmd:N #1
+  {
+    \exp_after:wN \@@_purify_math_cmd:NN \exp_after:wN #1
+      \l_text_math_arg_tl \q_recursion_tail \q_recursion_stop
+  }
+\cs_new:Npn \@@_purify_math_cmd:NN #1#2
+  {
+    \quark_if_recursion_tail_stop_do:Nn #2
+      { \@@_purify_replace:N #1 }
+    \cs_if_eq:NNTF #2 #1
+      {
+        \use_i_delimit_by_q_recursion_stop:nw
+          { \@@_purify_math_cmd:Nn #1 }
+      }
+      { \@@_purify_math_cmd:NN #1 }
+  }
+\cs_new:Npn \@@_purify_math_cmd:Nn #1#2
+  {
+    \exp_not:n { #1 {#2} }
+    \@@_purify_loop:w
+  }
+%    \end{macrocode}
+%   For \texttt{N}-type tokens, we first look for a declared purification
+%   equivalent before anything else: this can therefore cover anything. If
+%   there is none, control sequences are expanded where possible and dropped
+%   otherwise, whilst other tokens are kept. We also allow for \LaTeXe{}
+%   \tn{protect}, which is removed along with the token following it: this
+%   assumes no |\protect { \oops }| or similar, as in the expansion code.
+%    \begin{macrocode}
+\cs_new:Npn \@@_purify_replace:N #1
+  {
+    \cs_if_exist:cTF { l_@@_purify_ \token_to_str:N #1 _tl }
+      {
+        \exp_args:Nv \@@_purify_replace:n
+          { l_@@_purify_ \token_to_str:N #1 _tl }
+      }
+      {
+        \token_if_cs:NTF #1
+          { \@@_purify_expand:N #1 }
+          {
+            \exp_not:n {#1}
+            \@@_purify_loop:w
+          }
+      }
+  }
+\cs_new:Npn \@@_purify_replace:n #1 { \@@_purify_loop:w #1 }
+\cs_new:Npn \@@_purify_expand:N #1
+  {
+    \str_if_eq:nnTF {#1} { \protect }
+      { \@@_purify_protect:N }
+      {
+        \@@_if_expandable:NTF #1
+          { \exp_after:wN \@@_purify_loop:w #1 }
+          { \@@_purify_loop:w }
+      }
+  }
+\cs_new:Npn \@@_purify_protect:N #1
+  {
+    \quark_if_recursion_tail_stop:N #1
+    \@@_purify_loop:w
+  }
+%    \end{macrocode}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+%
+% \begin{macro}
+%   {
+%     \text_declare_purify_equivalent:Nn,
+%     \text_declare_purify_equivalent:Nx
+%   }
+%    \begin{macrocode}
+\cs_new_protected:Npn \text_declare_purify_equivalent:Nn #1#2
+  {
+    \tl_clear_new:c { l_@@_purify_ \token_to_str:N #1 _tl } % named after the command
+    \tl_set:cn { l_@@_purify_ \token_to_str:N #1 _tl } {#2} % read by \@@_purify_replace:N
+  }
+\cs_generate_variant:Nn \text_declare_purify_equivalent:Nn { Nx }
+%    \end{macrocode}
+% \end{macro}
+%
+% Now pre-define a range of standard commands that need dedicated definitions
+% in purified text. First the font-related commands: those taking arguments
+% have them discarded or unbraced, whilst switches are simply removed.
+%    \begin{macrocode}
+\tl_map_inline:nn
+  {
+    \fontencoding
+    \fontfamily
+    \fontseries
+    \fontshape
+  }
+  { \text_declare_purify_equivalent:Nn #1 { \use_none:n } }
+\text_declare_purify_equivalent:Nn \fontsize { \use_none:nn }
+\text_declare_purify_equivalent:Nn \selectfont { }
+\text_declare_purify_equivalent:Nn \usefont { \use_none:nnnn }
+\tl_map_inline:nn
+  {
+    \emph
+    \text
+    \textnormal
+    \textrm
+    \textsf
+    \texttt
+    \textbf
+    \textmd
+    \textit
+    \textsl
+    \textup
+    \textsc
+    \textulc
+  }
+  { \text_declare_purify_equivalent:Nn #1 { \use:n } }
+\tl_map_inline:nn
+  {
+    \normalfont
+    \rmfamily
+    \sffamily
+    \ttfamily
+    \bfseries
+    \mdseries
+    \itshape
+    \scshape
+    \slshape
+    \upshape
+    \em
+    \Huge
+    \LARGE
+    \Large
+    \footnotesize
+    \huge
+    \large
+    \normalsize
+    \scriptsize
+    \small
+    \tiny
+  }
+  { \text_declare_purify_equivalent:Nn #1 { } }
+%    \end{macrocode}
+% Environments have to be handled by pure expansion: \tn{begin} becomes the
+% environment command itself, whilst \tn{end} has to become the matching
+% \texttt{end...} command, which is built expandably by a small auxiliary.
+%    \begin{macrocode}
+\text_declare_purify_equivalent:Nn \begin { \use:c }
+\text_declare_purify_equivalent:Nn \end { \@@_purify_end_env:n }
+\cs_new:Npn \@@_purify_end_env:n #1 { \cs:w end #1 \cs_end: }
+%    \end{macrocode}
+% Line breaks are removed; escaped symbols become the character itself.
+%    \begin{macrocode}
+\text_declare_purify_equivalent:Nn \\ { }
+\tl_map_inline:nn
+  { \{ \} \# \$ \% \_ }
+  { \text_declare_purify_equivalent:Nx #1 { \cs_to_str:N #1 } }
+%    \end{macrocode}
+% Cross-referencing: \tn{label} is removed along with its argument.
+%    \begin{macrocode}
+\text_declare_purify_equivalent:Nn \label { \use_none:n }
+%    \end{macrocode}
+%
+% \subsection{Accent and letter-like data for purifying text}
+%
+% In contrast to case changing, both $8$-bit and Unicode engines need
+% information for text purification to handle accents and letter-like
+% functions: these all need to be removed. The results are engine-dependent:
+% Lua\TeX{} and Xe\TeX{} use one character per codepoint, other engines the
+% two \textsc{utf}-8 bytes as active characters.
+%
+% The letter-like commands are all simply added as standard exceptions,
+% given as hexadecimal codepoints. The only oddity is \tn{SS}, which becomes
+% two letters. (An alternative can presumably be added to \pkg{babel} later.)
+%    \begin{macrocode}
+\bool_lazy_or:nnTF
+  { \sys_if_engine_luatex_p: }
+  { \sys_if_engine_xetex_p: }
+  {
+    \cs_set_protected:Npn \@@_loop:Nn #1#2
+      {
+        \quark_if_recursion_tail_stop:N #1
+        \text_declare_purify_equivalent:Nx #1
+          {
+            \char_generate:nn { "#2 }
+              { \char_value_catcode:n { "#2 } }
+          }
+        \@@_loop:Nn
+      }
+  }
+  {
+    \cs_set_protected:Npn \@@_loop:Nn #1#2
+      {
+        \quark_if_recursion_tail_stop:N #1
+        \text_declare_purify_equivalent:Nx #1
+          {
+            \exp_args:Ne \@@_tmp:n
+              { \char_codepoint_to_bytes:n { "#2 } }
+          }
+        \@@_loop:Nn
+      }
+    \cs_set:Npn \@@_tmp:n #1 { \@@_tmp:nnnn #1 }
+    \cs_set:Npn \@@_tmp:nnnn #1#2#3#4
+      {
+        \exp_after:wN \exp_after:wN \exp_after:wN
+          \exp_not:N \char_generate:nn {#1} { 13 }
+        \exp_after:wN \exp_after:wN \exp_after:wN
+          \exp_not:N \char_generate:nn {#2} { 13 }
+      }
+  }
+\@@_loop:Nn
+  \AA { 00C5 }
+  \AE { 00C6 }
+  \DH { 00D0 }
+  \DJ { 0110 }
+  \IJ { 0132 }
+  \L  { 0141 }
+  \NG { 014A }
+  \O  { 00D8 }
+  \OE { 0152 }
+  \TH { 00DE }
+  \aa { 00E5 }
+  \ae { 00E6 }
+  \dh { 00F0 }
+  \dj { 0111 }
+  \i  { 0131 }
+  \j  { 0237 }
+  \ij { 0133 }
+  \l  { 0142 }
+  \ng { 014B }
+  \o  { 00F8 }
+  \oe { 0153 }
+  \ss { 00DF }
+  \th { 00FE }
+  \q_recursion_tail ?
+  \q_recursion_stop
+\text_declare_purify_equivalent:Nn \SS { SS }
+%    \end{macrocode}
+%
+% \begin{macro}[rEXP]{\@@_purify_accent:NN}
+%   Accent \textsc{licr} handling is a little more complex. Accents may exist
+%   as pre-composed codepoints or as independent glyphs. The former are looked
+%   up by accent and base in one token list; failing that, the base is output
+%   followed by the combining accent, i.e.~re-ordered compared to the input.
+%    \begin{macrocode}
+\cs_new:Npn \@@_purify_accent:NN #1#2
+  {
+    \cs_if_exist:cTF
+      { c_@@_purify_ \token_to_str:N #1 _ \token_to_str:N #2 _tl }
+      {
+        \exp_not:v
+          { c_@@_purify_ \token_to_str:N #1 _ \token_to_str:N #2 _tl }
+      }
+      {
+        \exp_not:n {#2}
+        \exp_not:v { c_@@_purify_ \token_to_str:N #1 _tl }
+      }
+  }
+\tl_map_inline:Nn \l_text_accents_tl
+  { \text_declare_purify_equivalent:Nn #1 { \@@_purify_accent:NN #1 } }
+%    \end{macrocode}
+%   First set up the combining accents, one constant token list per accent.
+%    \begin{macrocode}
+\group_begin:
+  \cs_set_protected:Npn \@@_loop:Nn #1#2
+    {
+      \quark_if_recursion_tail_stop:N #1
+      \tl_const:cx { c_@@_purify_ \token_to_str:N #1 _tl }
+        { \@@_tmp:n {#2} }
+      \@@_loop:Nn
+    }
+  \bool_lazy_or:nnTF
+    { \sys_if_engine_luatex_p: }
+    { \sys_if_engine_xetex_p: }
+    {
+      \cs_set:Npn \@@_tmp:n #1
+        {
+          \char_generate:nn { "#1 }
+            { \char_value_catcode:n { "#1 } }
+        }
+    }
+    {
+      \cs_set:Npn \@@_tmp:n #1
+        {
+          \exp_args:Ne \@@_tmp_aux:n
+            { \char_codepoint_to_bytes:n { "#1 } }
+        }
+      \cs_set:Npn \@@_tmp_aux:n #1 { \@@_tmp:nnnn #1 }
+      \cs_set:Npn \@@_tmp:nnnn #1#2#3#4
+        {
+          \exp_after:wN \exp_after:wN \exp_after:wN
+            \exp_not:N \char_generate:nn {#1} { 13 }
+          \exp_after:wN \exp_after:wN \exp_after:wN
+            \exp_not:N \char_generate:nn {#2} { 13 }
+        }
+    }
+  \@@_loop:Nn
+    \` { 0300 }
+    \' { 0301 }
+    \^ { 0302 }
+    \~ { 0303 }
+    \= { 0304 }
+    \u { 0306 }
+    \U { 0306 }
+    \. { 0307 }
+    \" { 0308 }
+    \r { 030A }
+    \H { 030B }
+    \v { 030C }
+    \G { 030F }
+    \C { 030F }
+    \d { 0323 }
+    \c { 0327 }
+    \k { 0328 }
+    \b { 0331 }
+    \t { 0361 }
+    \q_recursion_tail ?
+    \q_recursion_stop
+%    \end{macrocode}
+%   Now we handle the pre-composed accents: the list here is taken from
+%   \texttt{puenc.def}. All of the pre-composed cases take a single token (a
+%   letter or letter-like command) after the accent. We do not try to cover
+%   an accent added to a \enquote{real} dotless-i or -j, or a \ae/\AE. Rather,
+%   we assume that if the \textsc{utf}-8 character is used, it will have the
+%   real accent character too.
+%    \begin{macrocode}
+  \cs_set_protected:Npn \@@_loop:NNn #1#2#3
+    {
+      \quark_if_recursion_tail_stop:N #1
+      \tl_const:cx
+        { c_@@_purify_ \token_to_str:N #1 _ \token_to_str:N #2 _tl }
+        { \@@_tmp:n {#3} }
+      \@@_loop:NNn
+    }
+  \@@_loop:NNn
+    \` A   { 00C0 }
+    \' A   { 00C1 }
+    \^ A   { 00C2 }
+    \~ A   { 00C3 }
+    \" A   { 00C4 }
+    \r A   { 00C5 }
+    \c C   { 00C7 }
+    \` E   { 00C8 }
+    \' E   { 00C9 }
+    \^ E   { 00CA }
+    \" E   { 00CB }
+    \` I   { 00CC }
+    \' I   { 00CD }
+    \^ I   { 00CE }
+    \" I   { 00CF }
+    \~ N   { 00D1 }
+    \` O   { 00D2 }
+    \' O   { 00D3 }
+    \^ O   { 00D4 }
+    \~ O   { 00D5 }
+    \" O   { 00D6 }
+    \` U   { 00D9 }
+    \' U   { 00DA }
+    \^ U   { 00DB }
+    \" U   { 00DC }
+    \' Y   { 00DD }
+    \` a   { 00E0 }
+    \' a   { 00E1 }
+    \^ a   { 00E2 }
+    \~ a   { 00E3 }
+    \" a   { 00E4 }
+    \r a   { 00E5 }
+    \c c   { 00E7 }
+    \` e   { 00E8 }
+    \' e   { 00E9 }
+    \^ e   { 00EA }
+    \" e   { 00EB }
+    \` i   { 00EC }
+    \` \i  { 00EC }
+    \' i   { 00ED }
+    \' \i  { 00ED }
+    \^ i   { 00EE }
+    \^ \i  { 00EE }
+    \" i   { 00EF }
+    \" \i  { 00EF }
+    \~ n   { 00F1 }
+    \` o   { 00F2 }
+    \' o   { 00F3 }
+    \^ o   { 00F4 }
+    \~ o   { 00F5 }
+    \" o   { 00F6 }
+    \` u   { 00F9 }
+    \' u   { 00FA }
+    \^ u   { 00FB }
+    \" u   { 00FC }
+    \' y   { 00FD }
+    \" y   { 00FF }
+    \= A   { 0100 }
+    \= a   { 0101 }
+    \u A   { 0102 }
+    \u a   { 0103 }
+    \k A   { 0104 }
+    \k a   { 0105 }
+    \' C   { 0106 }
+    \' c   { 0107 }
+    \^ C   { 0108 }
+    \^ c   { 0109 }
+    \. C   { 010A }
+    \. c   { 010B }
+    \v C   { 010C }
+    \v c   { 010D }
+    \v D   { 010E }
+    \v d   { 010F }
+    \= E   { 0112 }
+    \= e   { 0113 }
+    \u E   { 0114 }
+    \u e   { 0115 }
+    \. E   { 0116 }
+    \. e   { 0117 }
+    \k E   { 0118 }
+    \k e   { 0119 }
+    \v E   { 011A }
+    \v e   { 011B }
+    \^ G   { 011C }
+    \^ g   { 011D }
+    \u G   { 011E }
+    \u g   { 011F }
+    \. G   { 0120 }
+    \. g   { 0121 }
+    \c G   { 0122 }
+    \c g   { 0123 }
+    \^ H   { 0124 }
+    \^ h   { 0125 }
+    \~ I   { 0128 }
+    \~ i   { 0129 }
+    \~ \i  { 0129 }
+    \= I   { 012A }
+    \= i   { 012B }
+    \= \i  { 012B }
+    \u I   { 012C }
+    \u i   { 012D }
+    \u \i  { 012D }
+    \k I   { 012E }
+    \k i   { 012F }
+    \k \i  { 012F }
+    \. I   { 0130 }
+    \^ J   { 0134 }
+    \^ j   { 0135 }
+    \^ \j  { 0135 }
+    \c K   { 0136 }
+    \c k   { 0137 }
+    \' L   { 0139 }
+    \' l   { 013A }
+    \c L   { 013B }
+    \c l   { 013C }
+    \v L   { 013D }
+    \v l   { 013E }
+    \. L   { 013F }
+    \. l   { 0140 }
+    \' N   { 0143 }
+    \' n   { 0144 }
+    \c N   { 0145 }
+    \c n   { 0146 }
+    \v N   { 0147 }
+    \v n   { 0148 }
+    \= O   { 014C }
+    \= o   { 014D }
+    \u O   { 014E }
+    \u o   { 014F }
+    \H O   { 0150 }
+    \H o   { 0151 }
+    \' R   { 0154 }
+    \' r   { 0155 }
+    \c R   { 0156 }
+    \c r   { 0157 }
+    \v R   { 0158 }
+    \v r   { 0159 }
+    \' S   { 015A }
+    \' s   { 015B }
+    \^ S   { 015C }
+    \^ s   { 015D }
+    \c S   { 015E }
+    \c s   { 015F }
+    \v S   { 0160 }
+    \v s   { 0161 }
+    \c T   { 0162 }
+    \c t   { 0163 }
+    \v T   { 0164 }
+    \v t   { 0165 }
+    \~ U   { 0168 }
+    \~ u   { 0169 }
+    \= U   { 016A }
+    \= u   { 016B }
+    \u U   { 016C }
+    \u u   { 016D }
+    \r U   { 016E }
+    \r u   { 016F }
+    \H U   { 0170 }
+    \H u   { 0171 }
+    \k U   { 0172 }
+    \k u   { 0173 }
+    \^ W   { 0174 }
+    \^ w   { 0175 }
+    \^ Y   { 0176 }
+    \^ y   { 0177 }
+    \" Y   { 0178 }
+    \' Z   { 0179 }
+    \' z   { 017A }
+    \. Z   { 017B }
+    \. z   { 017C }
+    \v Z   { 017D }
+    \v z   { 017E }
+    \v A   { 01CD }
+    \v a   { 01CE }
+    \v I   { 01CF }
+    \v \i  { 01D0 }
+    \v i   { 01D0 }
+    \v O   { 01D1 }
+    \v o   { 01D2 }
+    \v U   { 01D3 }
+    \v u   { 01D4 }
+    \v G   { 01E6 }
+    \v g   { 01E7 }
+    \v K   { 01E8 }
+    \v k   { 01E9 }
+    \k O   { 01EA }
+    \k o   { 01EB }
+    \v \j  { 01F0 }
+    \v j   { 01F0 }
+    \' G   { 01F4 }
+    \' g   { 01F5 }
+    \` N   { 01F8 }
+    \` n   { 01F9 }
+    \' \AE { 01FC }
+    \' \ae { 01FD }
+    \' \O  { 01FE }
+    \' \o  { 01FF }
+    \G A   { 0200 }
+    \G a   { 0201 }
+    \G E   { 0204 }
+    \G e   { 0205 }
+    \G I   { 0208 }
+    \G \i  { 0209 }
+    \G i   { 0209 }
+    \G O   { 020C }
+    \G o   { 020D }
+    \G R   { 0210 }
+    \G r   { 0211 }
+    \G U   { 0214 }
+    \G u   { 0215 }
+    \v H   { 021E }
+    \v h   { 021F }
+    \. A   { 0226 }
+    \. a   { 0227 }
+    \c E   { 0228 }
+    \c e   { 0229 }
+    \. O   { 022E }
+    \. o   { 022F }
+    \= Y   { 0232 }
+    \= y   { 0233 }
+    \q_recursion_tail ? ?
+    \q_recursion_stop
+\group_end:
+%    \end{macrocode}
+%
+% \end{macro}
+%
 %    \begin{macrocode}
 %</initex|package>
 %    \end{macrocode}
diff --git a/l3kernel/testfiles/m3text001.tlg b/l3kernel/testfiles/m3text003.luatex.tlg
similarity index 82%
copy from l3kernel/testfiles/m3text001.tlg
copy to l3kernel/testfiles/m3text003.luatex.tlg
index 66b9aba64..165d69b57 100644
--- a/l3kernel/testfiles/m3text001.tlg
+++ b/l3kernel/testfiles/m3text003.luatex.tlg
@@ -2,39 +2,37 @@ This is a generated file for the LaTeX (2e + expl3) validation system.
 Don't change this file in any respect.
 Author: Joseph Wright
 ============================================================
-TEST 1: Expanding content
+TEST 1: Purify content
 ============================================================
 Some text Hello
 Hello sometext
-Some text Hello
-Hello sometext
-Some text \cs_tmp:w 
-\cs_tmp:w  sometext
 ============================================================
 ============================================================
-TEST 2: Expansion in braces
+TEST 2: Purify in braces
 ============================================================
-{Hello} world \par with \ERROR &##
+Hello world with &##
 ============================================================
 ============================================================
-TEST 3: Expansion exclusions
+TEST 3: Purification exclusions
 ============================================================
-FOO \cite {\l_tmpa_tl } {BAZ}
-\emph {BAR} {BAZ}
+FOO Hello BAZ
+ BAZ
 ============================================================
 ============================================================
 TEST 4: Math-mode escape
 ============================================================
 Some text $y = \sin \theta $
 Opps not close token in $y = \sin \theta 
+\ensuremath {y=mx+c} is an equation
 ============================================================
 ============================================================
 TEST 5: Letter-like commands
 ============================================================
-\AA \aa \J \ae \dh \ss \l \O 
+^^c5^^e5^^e6^^f0^^dfł^^d8
 ============================================================
 ============================================================
 TEST 6: Accents
 ============================================================
-\"{a}\u {e}\H {i}\v {o}\.{u}
+^^e4ĕi̋ǒu̇
+^^f1r̨
 ============================================================
diff --git a/l3kernel/testfiles/m3text003.lvt b/l3kernel/testfiles/m3text003.lvt
new file mode 100644
index 000000000..c1031f295
--- /dev/null
+++ b/l3kernel/testfiles/m3text003.lvt
@@ -0,0 +1,62 @@
+%
+% Copyright (C) 2019 LaTeX3 Project
+%
+\input{regression-test}
+
+\RequirePackage[enable-debug]{expl3}
+\ExplSyntaxOn
+\debug_on:n { check-declarations , deprecation , log-functions }
+\ExplSyntaxOff
+
+\START
+\AUTHOR{Joseph Wright}
+\ExplSyntaxOn
+
+\OMIT
+  \tl_set:Nn \l_tmpa_tl { Hello }
+  \tl_set:Nn \l_tmpb_tl { \l_tmpa_tl }
+  \cs_set_protected:Npn \cs_tmp:w { \l_tmpa_tl }
+\TIMO
+
+\TESTEXP { Purify~content }
+  {
+    \text_purify:n { Some~\emph{text}~\l_tmpa_tl }
+    \NEWLINE
+    \text_purify:n { \l_tmpa_tl \c_space_tl some { \bfseries text } }
+  }
+
+\TESTEXP { Purify~in~braces }
+  {
+    \text_purify:n { { \emph { \l_tmpa_tl } }~world~\par with~\ERROR & # }
+  }
+
+\TESTEXP { Purification~exclusions }
+  {
+    \text_purify:n { FOO~\cite { \l_tmpa_tl } ~ { BAZ } }
+    \NEWLINE
+    \text_purify:n { \label { BAR } ~ { BAZ } }
+  }
+
+\TESTEXP { Math-mode~escape }
+  {
+    \text_purify:n { Some~text~$y~=~\sin \theta$ }
+    \NEWLINE
+    \text_purify:n { Opps~not~close~token~in~$y~=~\sin \theta }
+    \NEWLINE
+    \text_purify:n { \ensuremath { y = mx + c }~is~an~equation }
+  }
+
+\sys_if_engine_ptex:T { \END } % pTeX stops here: its .tlg holds tests 1--4 only
+
+\TESTEXP { Letter-like~commands }
+  {
+    \text_purify:n { \AA \aa \J \ae \dh \ss \l \O } % NOTE(review): \J is dropped (no equivalent); was \j meant?
+  }
+
+\TESTEXP { Accents }
+  {
+    \text_purify:n { \"{a} \u{e} \H{i} \v{o} \.{u} } \NEWLINE
+    \text_purify:n { \~{n} \k{r} }
+  }
+
+\END
diff --git a/l3kernel/testfiles/m3str002.uptex.tlg b/l3kernel/testfiles/m3text003.ptex.tlg
similarity index 74%
copy from l3kernel/testfiles/m3str002.uptex.tlg
copy to l3kernel/testfiles/m3text003.ptex.tlg
index 221de224e..f3efcb6c4 100644
--- a/l3kernel/testfiles/m3str002.uptex.tlg
+++ b/l3kernel/testfiles/m3text003.ptex.tlg
@@ -2,29 +2,26 @@ This is a generated file for the LaTeX (2e + expl3) validation system.
 Don't change this file in any respect.
 Author: Joseph Wright
 ============================================================
-TEST 1: Simple Latin case folding
+TEST 1: Purify content
 ============================================================
-"abc 123 abc !@"
-" abc 123 abc !@ "
-"some $&## odd text { } "
+Some text Hello
+Hello sometext
 ============================================================
 ============================================================
-TEST 2: Checking category codes
+TEST 2: Purify in braces
 ============================================================
-FALSE
-TRUE
+Hello world with &##
 ============================================================
 ============================================================
-TEST 3: Accented characters, etc.
+TEST 3: Purification exclusions
 ============================================================
-"ĆėƊṐṑ"
-"ᾪωΝ"
-"ΰῢﬆ"
-"Ꚉꚇ"
-"ＺꝎⓍ"
+FOO Hello BAZ
+ BAZ
 ============================================================
 ============================================================
-TEST 4: Characters with context-sensitive Unicode behaviour
+TEST 4: Math-mode escape
 ============================================================
-FALSE
+Some text $y = \sin \theta $
+Opps not close token in $y = \sin \theta 
+\ensuremath {y=mx+c} is an equation
 ============================================================
diff --git a/l3kernel/testfiles/m3text001.tlg b/l3kernel/testfiles/m3text003.tlg
similarity index 82%
copy from l3kernel/testfiles/m3text001.tlg
copy to l3kernel/testfiles/m3text003.tlg
index 66b9aba64..509f3fd2d 100644
--- a/l3kernel/testfiles/m3text001.tlg
+++ b/l3kernel/testfiles/m3text003.tlg
@@ -2,39 +2,37 @@ This is a generated file for the LaTeX (2e + expl3) validation system.
 Don't change this file in any respect.
 Author: Joseph Wright
 ============================================================
-TEST 1: Expanding content
+TEST 1: Purify content
 ============================================================
 Some text Hello
 Hello sometext
-Some text Hello
-Hello sometext
-Some text \cs_tmp:w 
-\cs_tmp:w  sometext
 ============================================================
 ============================================================
-TEST 2: Expansion in braces
+TEST 2: Purify in braces
 ============================================================
-{Hello} world \par with \ERROR &##
+Hello world with &##
 ============================================================
 ============================================================
-TEST 3: Expansion exclusions
+TEST 3: Purification exclusions
 ============================================================
-FOO \cite {\l_tmpa_tl } {BAZ}
-\emph {BAR} {BAZ}
+FOO Hello BAZ
+ BAZ
 ============================================================
 ============================================================
 TEST 4: Math-mode escape
 ============================================================
 Some text $y = \sin \theta $
 Opps not close token in $y = \sin \theta 
+\ensuremath {y=mx+c} is an equation
 ============================================================
 ============================================================
 TEST 5: Letter-like commands
 ============================================================
-\AA \aa \J \ae \dh \ss \l \O 
+^^c3^^85^^c3^^a5^^c3^^a6^^c3^^b0^^c3^^9f^^c5^^82^^c3^^98
 ============================================================
 ============================================================
 TEST 6: Accents
 ============================================================
-\"{a}\u {e}\H {i}\v {o}\.{u}
+^^c3^^a4^^c4^^95i^^cc^^8b^^c7^^92u^^cc^^87
+^^c3^^b1r^^cc^^a8
 ============================================================
diff --git a/l3kernel/testfiles/m3text001.tlg b/l3kernel/testfiles/m3text003.uptex.tlg
similarity index 82%
copy from l3kernel/testfiles/m3text001.tlg
copy to l3kernel/testfiles/m3text003.uptex.tlg
index 66b9aba64..165d69b57 100644
--- a/l3kernel/testfiles/m3text001.tlg
+++ b/l3kernel/testfiles/m3text003.uptex.tlg
@@ -2,39 +2,37 @@ This is a generated file for the LaTeX (2e + expl3) validation system.
 Don't change this file in any respect.
 Author: Joseph Wright
 ============================================================
-TEST 1: Expanding content
+TEST 1: Purify content
 ============================================================
 Some text Hello
 Hello sometext
-Some text Hello
-Hello sometext
-Some text \cs_tmp:w 
-\cs_tmp:w  sometext
 ============================================================
 ============================================================
-TEST 2: Expansion in braces
+TEST 2: Purify in braces
 ============================================================
-{Hello} world \par with \ERROR &##
+Hello world with &##
 ============================================================
 ============================================================
-TEST 3: Expansion exclusions
+TEST 3: Purification exclusions
 ============================================================
-FOO \cite {\l_tmpa_tl } {BAZ}
-\emph {BAR} {BAZ}
+FOO Hello BAZ
+ BAZ
 ============================================================
 ============================================================
 TEST 4: Math-mode escape
 ============================================================
 Some text $y = \sin \theta $
 Opps not close token in $y = \sin \theta 
+\ensuremath {y=mx+c} is an equation
 ============================================================
 ============================================================
 TEST 5: Letter-like commands
 ============================================================
-\AA \aa \J \ae \dh \ss \l \O 
+^^c5^^e5^^e6^^f0^^dfł^^d8
 ============================================================
 ============================================================
 TEST 6: Accents
 ============================================================
-\"{a}\u {e}\H {i}\v {o}\.{u}
+^^e4ĕi̋ǒu̇
+^^f1r̨
 ============================================================
diff --git a/l3kernel/testfiles/m3text001.tlg b/l3kernel/testfiles/m3text003.xetex.tlg
similarity index 82%
copy from l3kernel/testfiles/m3text001.tlg
copy to l3kernel/testfiles/m3text003.xetex.tlg
index 66b9aba64..165d69b57 100644
--- a/l3kernel/testfiles/m3text001.tlg
+++ b/l3kernel/testfiles/m3text003.xetex.tlg
@@ -2,39 +2,37 @@ This is a generated file for the LaTeX (2e + expl3) validation system.
 Don't change this file in any respect.
 Author: Joseph Wright
 ============================================================
-TEST 1: Expanding content
+TEST 1: Purify content
 ============================================================
 Some text Hello
 Hello sometext
-Some text Hello
-Hello sometext
-Some text \cs_tmp:w 
-\cs_tmp:w  sometext
 ============================================================
 ============================================================
-TEST 2: Expansion in braces
+TEST 2: Purify in braces
 ============================================================
-{Hello} world \par with \ERROR &##
+Hello world with &##
 ============================================================
 ============================================================
-TEST 3: Expansion exclusions
+TEST 3: Purification exclusions
 ============================================================
-FOO \cite {\l_tmpa_tl } {BAZ}
-\emph {BAR} {BAZ}
+FOO Hello BAZ
+ BAZ
 ============================================================
 ============================================================
 TEST 4: Math-mode escape
 ============================================================
 Some text $y = \sin \theta $
 Opps not close token in $y = \sin \theta 
+\ensuremath {y=mx+c} is an equation
 ============================================================
 ============================================================
 TEST 5: Letter-like commands
 ============================================================
-\AA \aa \J \ae \dh \ss \l \O 
+^^c5^^e5^^e6^^f0^^dfł^^d8
 ============================================================
 ============================================================
 TEST 6: Accents
 ============================================================
-\"{a}\u {e}\H {i}\v {o}\.{u}
+^^e4ĕi̋ǒu̇
+^^f1r̨
 ============================================================