[latex3-commits] [git/LaTeX3-latex3-latex3] text-purify: New function \text_purify:n plus support (2523f9a49)

Fri Jan 3 21:11:46 CET 2020

Repository : https://github.com/latex3/latex3
On branch  : text-purify
Link       : https://github.com/latex3/latex3/commit/2523f9a493f21df9b7db500a535ce40eb15e7c69

>---------------------------------------------------------------

commit 2523f9a493f21df9b7db500a535ce40eb15e7c69
Author: Joseph Wright <joseph.wright at morningstar2.co.uk>
Date:   Fri Jan 3 18:04:03 2020 +0000

    New function \text_purify:n plus support


>---------------------------------------------------------------

2523f9a493f21df9b7db500a535ce40eb15e7c69
 l3kernel/CHANGELOG.md                              |   1 +
 l3kernel/doc/source3body.tex                       |   3 +-
 l3kernel/l3.ins                                    |   1 +
 l3kernel/l3format.ins                              |   1 +
 l3kernel/l3text-purify.dtx                         | 785 +++++++++++++++++++++
 .../{m3text001.tlg => m3text003.luatex.tlg}        |  31 +-
 l3kernel/testfiles/m3text003.lvt                   |  62 ++
 .../{m3str002.luatex.tlg => m3text003.ptex.tlg}    |  27 +-
 .../testfiles/{m3text001.tlg => m3text003.tlg}     |  31 +-
 .../{m3text001.tlg => m3text003.uptex.tlg}         |  31 +-
 .../{m3text001.tlg => m3text003.xetex.tlg}         |  31 +-
 11 files changed, 924 insertions(+), 80 deletions(-)

diff --git a/l3kernel/CHANGELOG.md b/l3kernel/CHANGELOG.md
index 45d1b656c..662f23899 100644
--- a/l3kernel/CHANGELOG.md
+++ b/l3kernel/CHANGELOG.md
@@ -13,6 +13,7 @@ this project uses date-based 'snapshot' version identifiers.
 - `\str_<type>case:n`
 - `\text_<type>case:n(n)`
 - `\text_expand:n` and supporting data structures
+- `\text_purify:n` and supporting data structures
 
 ### Changed
 - Distribute LaTeX3 News
diff --git a/l3kernel/doc/source3body.tex b/l3kernel/doc/source3body.tex
index f985c0d14..78f18796c 100644
--- a/l3kernel/doc/source3body.tex
+++ b/l3kernel/doc/source3body.tex
@@ -488,7 +488,8 @@ used on top of \LaTeXe{} if \cs{outer} tokens are used in the arguments.
 \ExplSyntaxOn
 \clist_gput_right:Nn \g_docinput_clist
   {
-    l3text-case.dtx
+    l3text-case.dtx ,
+    l3text-purify.dtx
   }
 \ExplSyntaxOff
 \DocInput{l3legacy.dtx}
diff --git a/l3kernel/l3.ins b/l3kernel/l3.ins
index b509a2478..86d4ac528 100644
--- a/l3kernel/l3.ins
+++ b/l3kernel/l3.ins
@@ -106,6 +106,7 @@ and all files in that bundle must be distributed together.
         \from{l3unicode.dtx}    {package}
         \from{l3text.dtx}       {package}
         \from{l3text-case.dtx}  {package}
+        \from{l3text-purify.dtx}{package}
         \from{l3candidates.dtx} {package}
         \from{l3legacy.dtx}     {package}
         \from{l3deprecation.dtx}{package,kernel}
diff --git a/l3kernel/l3format.ins b/l3kernel/l3format.ins
index 6e0c98cb8..a4f728a0d 100644
--- a/l3kernel/l3format.ins
+++ b/l3kernel/l3format.ins
@@ -106,6 +106,7 @@ and all files in that bundle must be distributed together.
         \from{l3unicode.dtx}    {initex}
         \from{l3text.dtx}       {initex}
         \from{l3text-case.dtx}  {initex}
+        \from{l3text-purify.dtx}{initex}
         \from{l3candidates.dtx} {initex}
         % ======== FORMAT ONLY =========
         \from{l3final.dtx}      {initex}
diff --git a/l3kernel/l3text-purify.dtx b/l3kernel/l3text-purify.dtx
new file mode 100644
index 000000000..96b36c105
--- /dev/null
+++ b/l3kernel/l3text-purify.dtx
@@ -0,0 +1,785 @@
+% \iffalse meta-comment
+%
+%% File: l3text-purify.dtx
+%
+% Copyright (C) 2020 The LaTeX3 Project
+%
+% It may be distributed and/or modified under the conditions of the
+% LaTeX Project Public License (LPPL), either version 1.3c of this
+% license or (at your option) any later version.  The latest version
+% of this license is in the file
+%
+%    https://www.latex-project.org/lppl.txt
+%
+% This file is part of the "l3kernel bundle" (The Work in LPPL)
+% and all files in that bundle must be distributed together.
+%
+% -----------------------------------------------------------------------
+%
+% The development version of the bundle can be found at
+%
+%    https://github.com/latex3/latex3
+%
+% for those people who are interested.
+%
+%<*driver>
+\documentclass[full,kernel]{l3doc}
+\begin{document}
+  \DocInput{\jobname.dtx}
+\end{document}
+%</driver>
+% \fi
+%
+% \title{^^A
+%   The \textsf{l3text-purify} package: text processing (purification)^^A
+% }
+%
+% \author{^^A
+%  The \LaTeX3 Project\thanks
+%    {^^A
+%      E-mail:
+%        \href{mailto:latex-team at latex-project.org}
+%          {latex-team at latex-project.org}^^A
+%    }^^A
+% }
+%
+% \date{Released 2019-11-07}
+%
+% \maketitle
+%
+% \begin{documentation}
+%
+% \end{documentation}
+%
+% \begin{implementation}
+%
+% \section{\pkg{l3text-purify} implementation}
+%
+%    \begin{macrocode}
+%<*initex|package>
+%    \end{macrocode}
+%
+%    \begin{macrocode}
+%<@@=text>
+%    \end{macrocode}
+%
+% \subsection{Purifying text}
+%
+% \begin{macro}[rEXP]{\text_purify:n, \@@_purify:n}
+% \begin{macro}[rEXP]{\@@_purify_loop:w}
+% \begin{macro}[rEXP]{\@@_purify_group:n}
+% \begin{macro}[rEXP]{\@@_purify_space:w}
+% \begin{macro}[rEXP]{\@@_purify_N_type:N, \@@_purify_N_type_aux:N}
+% \begin{macro}[rEXP]{\@@_purify_math_search:NNN}
+% \begin{macro}[rEXP]{\@@_purify_math_start:NNw}
+% \begin{macro}[rEXP]{\@@_purify_math_store:n}
+% \begin{macro}[rEXP]{\@@_purify_math_store:nw}
+% \begin{macro}[rEXP]{\@@_purify_math_end:w, \@@_purify_math_stop:Nw}
+% \begin{macro}[rEXP]{\@@_purify_math_loop:NNw}
+% \begin{macro}[rEXP]{\@@_purify_math_N_type:NNN}
+% \begin{macro}[rEXP]{\@@_purify_math_group:NNn}
+% \begin{macro}[rEXP]{\@@_purify_math_space:NNw}
+% \begin{macro}[rEXP]{\@@_purify_math_cmd:N}
+% \begin{macro}[rEXP]{\@@_purify_math_cmd:NN}
+% \begin{macro}[rEXP]{\@@_purify_math_cmd:n}
+% \begin{macro}[rEXP]{\@@_purify_replace:N}
+% \begin{macro}[rEXP]{\@@_purify_replace:n}
+% \begin{macro}[rEXP]{\@@_purify_expand:N, \@@_purify_protect:N}
+%   As in the other parts of the module, we start off with a standard
+%   \enquote{action} loop, with expansion applied up-front. Here, as there
+%   will be no text commands left in the output, there is no concern about
+%   using \cs{exp_not:n} and \texttt{e}-type expansion.
+%    \begin{macrocode}
+\cs_new:Npn \text_purify:n #1
+  {
+    \group_align_safe_begin:
+      \exp_args:Ne \@@_purify:n
+        { \text_expand:n {#1} }
+    \group_align_safe_end:
+  }
+\cs_new:Npn \@@_purify:n #1
+  { \@@_purify_loop:w #1 \q_recursion_tail \q_recursion_stop }
+%    \end{macrocode}
+%   The main loop is a standard \enquote{tl action}. Unlike the expansion
+%   or case changing, here any groups have to be run inline. Most of the
+%   business end is as before in the \texttt{N}-type token processing.
+%    \begin{macrocode}
+\cs_new:Npn \@@_purify_loop:w #1 \q_recursion_stop
+  {
+    \tl_if_head_is_N_type:nTF {#1}
+      { \@@_purify_N_type:N }
+      {
+        \tl_if_head_is_group:nTF {#1}
+          { \@@_purify_group:n }
+          { \@@_purify_space:w }
+      }
+    #1 \q_recursion_stop
+  }
+\cs_new:Npn \@@_purify_group:n #1 { \@@_purify_loop:w #1 }
+\exp_last_unbraced:NNo \cs_new:Npn \@@_purify_space:w \c_space_tl
+  {
+    \c_space_tl
+    \@@_purify_loop:w
+  }
+%    \end{macrocode}
+%   The first part of handling math mode is exactly the same as in the
+%   other functions: look for a start-of-math mode token and if found start
+%   a new loop tracking the closing token.
+%    \begin{macrocode}
+\cs_new:Npn \@@_purify_N_type:N #1
+  {
+    \quark_if_recursion_tail_stop:N #1
+    \@@_purify_N_type_aux:N #1
+  }
+\cs_new:Npn \@@_purify_N_type_aux:N #1
+  {
+    \exp_after:wN \@@_purify_math_search:NNN
+      \exp_after:wN #1 \l_text_math_delims_tl
+      \q_recursion_tail ?
+      \q_recursion_stop
+  }
+\cs_new:Npn \@@_purify_math_search:NNN #1#2#3
+  {
+    \quark_if_recursion_tail_stop_do:Nn #2
+      { \@@_purify_math_cmd:N #1 }
+    \token_if_eq_meaning:NNTF #1 #2
+      {
+        \use_i_delimit_by_q_recursion_stop:nw
+           { \@@_purify_math_start:NNw #2 #3 }
+      }
+      { \@@_purify_math_search:NNN #1 }
+  }
+\cs_new:Npn \@@_purify_math_start:NNw #1#2#3 \q_recursion_stop
+  {
+    \@@_purify_math_loop:NNw #1#2#3 \q_recursion_stop
+      \@@_purify_math_result:n { }
+  }
+\cs_new:Npn \@@_purify_math_store:n #1
+  { \@@_purify_math_store:nw {#1} }
+\cs_new:Npn \@@_purify_math_store:nw #1#2 \@@_purify_math_result:n #3
+  { #2 \@@_purify_math_result:n { #3 #1 } }
+\cs_new:Npn \@@_purify_math_end:w #1 \@@_purify_math_result:n #2
+  {
+    \use:c { text_purify_math_ \l_@@_math_mode_tl :n } {#2}
+    \@@_purify_loop:w #1
+  }
+\cs_new:Npn \@@_purify_math_stop:Nw #1 \@@_purify_math_result:n #2
+  { \exp_not:n {#1#2} }
+\cs_new:Npn \@@_purify_math_loop:NNw #1#2#3 \q_recursion_stop
+  {
+    \tl_if_head_is_N_type:nTF {#3}
+      { \@@_purify_math_N_type:NNN }
+      {
+        \tl_if_head_is_group:nTF {#3}
+          { \@@_purify_math_group:NNn }
+          { \@@_purify_math_space:NNw }
+      }
+        #1#2#3 \q_recursion_stop
+  }
+\cs_new:Npn \@@_purify_math_N_type:NNN #1#2#3
+  {
+    \quark_if_recursion_tail_stop_do:Nn #3
+      { \@@_purify_math_stop:Nw #1 }
+    \token_if_eq_meaning:NNTF #3 #2
+      { \@@_purify_math_end:w }
+      {
+        \@@_purify_math_store:n {#3}
+        \@@_purify_math_loop:NNw #1#2
+      }
+  }
+\cs_new:Npn \@@_purify_math_group:NNn #1#2#3
+  {
+    \@@_purify_math_store:n { {#3} }
+    \@@_purify_math_loop:NNw #1#2
+  }
+\exp_after:wN \cs_new:Npn \exp_after:wN \@@_purify_math_space:NNw 
+  \exp_after:wN # \exp_after:wN 1
+  \exp_after:wN # \exp_after:wN 2 \c_space_tl
+  {
+    \@@_purify_math_store:n { ~ }
+    \@@_purify_math_loop:NNw #1#2
+  }
+%    \end{macrocode}
+%   Then handle math mode as an argument: same outcomes, different input
+%   syntax.
+%    \begin{macrocode}
+\cs_new:Npn \@@_purify_math_cmd:N #1
+  {
+    \exp_after:wN \@@_purify_math_cmd:NN \exp_after:wN #1
+      \l_text_math_arg_tl \q_recursion_tail \q_recursion_stop
+  }
+\cs_new:Npn \@@_purify_math_cmd:NN #1#2
+  {
+    \quark_if_recursion_tail_stop_do:Nn #2
+      { \@@_purify_replace:N #1 }
+    \cs_if_eq:NNTF #2 #1
+      {
+        \use_i_delimit_by_q_recursion_stop:nw
+          { \@@_purify_math_cmd:n }
+      }
+      { \@@_purify_math_cmd:NN #1 }
+  }
+\cs_new:Npn \@@_purify_math_cmd:n #1
+  { \@@_purify_math_end:w \@@_purify_math_result:n {#1} }
+%    \end{macrocode}
+%   For \texttt{N}-type tokens, we first look for a string-context replacement
+%   before anything else: this can therefore cover anything. Assuming we don't
+%   find one, check to see if we can expand control sequences: if not, they have
+%   to be dropped. We also allow for \LaTeXe{} \tn{protect}: there's an
+%   assumption that we don't have |\protect { \oops }| or similar, but that's
+%   also in the expansion code and seems like a reasonable balance.
+%    \begin{macrocode}
+\cs_new:Npn \@@_purify_replace:N #1
+  {
+    \bool_lazy_and:nnTF
+      { \cs_if_exist_p:c { l_@@_purify_ \token_to_str:N #1 _tl } }
+      {
+        \bool_lazy_or_p:nn
+          { \token_if_cs_p:N #1 }
+          { \token_if_active_p:N #1 }
+      }
+      {
+        \exp_args:Nv \@@_purify_replace:n
+          { l_@@_purify_ \token_to_str:N #1 _tl }
+      }
+      {
+        \token_if_cs:NTF #1
+          { \@@_purify_expand:N #1 }
+          {
+            \exp_not:n {#1}
+            \@@_purify_loop:w
+          }
+      }
+  }
+\cs_new:Npn \@@_purify_replace:n #1 { \@@_purify_loop:w #1 }
+\cs_new:Npn \@@_purify_expand:N #1
+  {
+    \str_if_eq:nnTF {#1} { \protect }
+      { \@@_purify_protect:N }
+      {
+        \@@_if_expandable:NTF #1
+          { \exp_after:wN \@@_purify_loop:w #1 }
+          { \@@_purify_loop:w }
+      }
+  }
+\cs_new:Npn \@@_purify_protect:N #1
+  {
+    \quark_if_recursion_tail_stop:N #1
+    \@@_purify_loop:w
+  }
+%    \end{macrocode}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+%
+% \begin{macro}
+%   {
+%     \text_declare_purify_equivalent:Nn,
+%     \text_declare_purify_equivalent:Nx
+%   }
+%    \begin{macrocode}
+\cs_new_protected:Npn \text_declare_purify_equivalent:Nn #1#2
+  {
+    \tl_clear_new:c { l_@@_purify_ \token_to_str:N #1 _tl }
+    \tl_set:cn { l_@@_purify_ \token_to_str:N #1 _tl } {#2}
+  }
+\cs_generate_variant:Nn \text_declare_purify_equivalent:Nn { Nx }
+%    \end{macrocode}
+% \end{macro}
+%
+% Now pre-define a range of standard commands that need dedicated definitions
+% in purified text. First handle font-related stuff: all of this needs to be
+% disabled.
+%    \begin{macrocode}
+\tl_map_inline:nn
+  {
+    \fontencoding
+    \fontfamily
+    \fontseries
+    \fontshape
+  }
+  { \text_declare_purify_equivalent:Nn #1 { \use_none:n } }
+\text_declare_purify_equivalent:Nn \fontsize { \use_none:nn }
+\text_declare_purify_equivalent:Nn \selectfont { }
+\text_declare_purify_equivalent:Nn \usefont { \use_none:nnnn }
+\tl_map_inline:nn
+  {
+    \emph
+    \text
+    \textnormal
+    \textrm
+    \textsf
+    \texttt
+    \textbf
+    \textmd
+    \textit
+    \textsl
+    \textup
+    \textsc
+    \textulc
+  }
+  { \text_declare_purify_equivalent:Nn #1 { \use:n } }
+\tl_map_inline:nn
+  {
+    \normalfont
+    \rmfamily
+    \sffamily
+    \ttfamily
+    \bfseries
+    \mdseries
+    \itshape
+    \scshape
+    \slshape
+    \upshape
+    \em
+    \Huge
+    \LARGE
+    \Large
+    \footnotesize
+    \huge
+    \large
+    \normalsize
+    \scriptsize
+    \small
+    \tiny
+  }
+  { \text_declare_purify_equivalent:Nn #1 { } }
+%    \end{macrocode}
+% Environments have to be handled by pure expansion: the replacement for
+% \tn{begin} turns the environment name into the control sequence of the
+% same name, whilst that for \tn{end} has to construct the name prefixed
+% by \texttt{end}, so that the end-of-environment code is used rather
+% than the start-of-environment code a second time.
+%    \begin{macrocode}
+\text_declare_purify_equivalent:Nn \begin { \use:c }
+\cs_new:Npn \@@_end_env:n #1 { \cs:w end #1 \cs_end: }
+\text_declare_purify_equivalent:Nn \end { \@@_end_env:n }
+%    \end{macrocode}
+% Some common symbols and similar ideas. A line break has no equivalent in
+% purified text so is mapped to nothing, whilst the escaped special
+% characters are each replaced by the bare character, obtained using
+% \cs{cs_to_str:N}.
+%    \begin{macrocode}
+\text_declare_purify_equivalent:Nn \\ { }
+\tl_map_inline:nn
+  { \{ \} \# \$ \% \_ \& }
+  { \text_declare_purify_equivalent:Nx #1 { \cs_to_str:N #1 } }
+%    \end{macrocode}
+% Cross-referencing and similar.
+%    \begin{macrocode}
+\tl_map_inline:nn
+  { \cite \label \ref }
+  { \text_declare_purify_equivalent:Nn #1 { \use_none:n } }
+%    \end{macrocode}
+% Spaces.
+%    \begin{macrocode}
+\group_begin:
+\char_set_catcode_active:N \~
+\use:n
+  {
+    \group_end:
+    \text_declare_purify_equivalent:Nx ~ { \c_space_tl }
+  }
+\text_declare_purify_equivalent:Nn \nobreakspace { ~ }
+\text_declare_purify_equivalent:Nn \  { ~ }
+\text_declare_purify_equivalent:Nn \, { ~ }
+%    \end{macrocode}
+%
+% \subsection{Accent and letter-like data for purifying text}
+%
+% In contrast to case changing, both $8$-bit and Unicode engines need
+% information for text purification to handle accents and letter-like
+% functions: these all need to be removed. However, the results are
+% of course engine-dependent.
+%
+% For the letter-like commands, life is relatively easy: they are all
+% simply added as standard exceptions. The only oddity is \tn{SS}, which
+% gets converted to two letters. (At some stage an alternative version
+% can presumably be added to \pkg{babel} or similar.)
+%    \begin{macrocode}
+\bool_lazy_or:nnTF
+  { \sys_if_engine_luatex_p: }
+  { \sys_if_engine_xetex_p: }
+  {
+    \cs_set_protected:Npn \@@_loop:Nn #1#2
+      {
+        \quark_if_recursion_tail_stop:N #1
+        \text_declare_purify_equivalent:Nx #1
+          {
+            \char_generate:nn { "#2 }
+              { \char_value_catcode:n { "#2 } }
+          }
+        \@@_loop:Nn
+      }
+  }
+  {
+    \cs_set_protected:Npn \@@_loop:Nn #1#2
+      {
+        \quark_if_recursion_tail_stop:N #1
+        \text_declare_purify_equivalent:Nx #1
+          {
+            \exp_args:Ne \@@_tmp:n
+              { \char_codepoint_to_bytes:n { "#2 } }
+          }
+        \@@_loop:Nn
+      }
+    \cs_set:Npn \@@_tmp:n #1 { \@@_tmp:nnnn #1 }
+    \cs_set:Npn \@@_tmp:nnnn #1#2#3#4
+      {
+        \exp_after:wN \exp_after:wN \exp_after:wN
+          \exp_not:N \char_generate:nn {#1} { 13 }
+        \exp_after:wN \exp_after:wN \exp_after:wN
+          \exp_not:N \char_generate:nn {#2} { 13 }
+      }
+  }
+\@@_loop:Nn
+  \AA { 00C5 }
+  \AE { 00C6 }
+  \DH { 00D0 }
+  \DJ { 0110 }
+  \IJ { 0132 }
+  \L  { 0141 }
+  \NG { 014A }
+  \O  { 00D8 }
+  \OE { 0152 }
+  \TH { 00DE }
+  \aa { 00E5 }
+  \ae { 00E6 }
+  \dh { 00F0 }
+  \dj { 0111 }
+  \i  { 0131 }
+  \j  { 0237 }
+  \ij { 0133 }
+  \l  { 0142 }
+  \ng { 014B }
+  \o  { 00F8 }
+  \oe { 0153 }
+  \ss { 00DF }
+  \th { 00FE }
+  \q_recursion_tail ?
+  \q_recursion_stop
+\text_declare_purify_equivalent:Nn \SS { SS }
+%    \end{macrocode}
+%
+% \begin{macro}[rEXP]{\@@_purify_accent:NN}
+%   Accent \textsc{licr} handling is a little more complex. Accents may exist
+%   as pre-composed codepoints or as independent glyphs. The former are all
+%   saved as single token lists, whilst for the latter the combining accent
+%   needs to be re-ordered compared to the character it applies to.
+%    \begin{macrocode}
+\cs_new:Npn \@@_purify_accent:NN #1#2
+  {
+    \cs_if_exist:cTF
+      { c_@@_purify_ \token_to_str:N #1 _ \token_to_str:N #2 _tl }
+      {
+        \exp_not:v
+          { c_@@_purify_ \token_to_str:N #1 _ \token_to_str:N #2 _tl }
+      }
+      {
+        \exp_not:n {#2}
+        \exp_not:v { c_@@_purify_ \token_to_str:N #1 _tl }
+      }
+  }
+\tl_map_inline:Nn \l_text_accents_tl
+  { \text_declare_purify_equivalent:Nn #1 { \@@_purify_accent:NN #1 } }
+%    \end{macrocode}
+%   First set up the combining accents.
+%    \begin{macrocode}
+\group_begin:
+  \cs_set_protected:Npn \@@_loop:Nn #1#2
+    {
+      \quark_if_recursion_tail_stop:N #1
+      \tl_const:cx { c_@@_purify_ \token_to_str:N #1 _tl }
+        { \@@_tmp:n {#2} }
+      \@@_loop:Nn
+    }
+  \bool_lazy_or:nnTF
+    { \sys_if_engine_luatex_p: }
+    { \sys_if_engine_xetex_p: }
+    {
+      \cs_set:Npn \@@_tmp:n #1
+        {
+          \char_generate:nn { "#1 }
+            { \char_value_catcode:n { "#1 } }
+        }
+    }
+    {
+      \cs_set:Npn \@@_tmp:n #1
+        {
+          \exp_args:Ne \@@_tmp_aux:n
+            { \char_codepoint_to_bytes:n { "#1 } }
+        }
+      \cs_set:Npn \@@_tmp_aux:n #1 { \@@_tmp:nnnn #1 }
+      \cs_set:Npn \@@_tmp:nnnn #1#2#3#4
+        {
+          \exp_after:wN \exp_after:wN \exp_after:wN
+            \exp_not:N \char_generate:nn {#1} { 13 }
+          \exp_after:wN \exp_after:wN \exp_after:wN
+            \exp_not:N \char_generate:nn {#2} { 13 }
+        }
+    }
+  \@@_loop:Nn
+    \` { 0300 }
+    \' { 0301 }
+    \^ { 0302 }
+    \~ { 0303 }
+    \= { 0304 }
+    \u { 0306 }
+    \. { 0307 }
+    \" { 0308 }
+    \r { 030A }
+    \H { 030B }
+    \v { 030C }
+    \d { 0323 }
+    \c { 0327 }
+    \k { 0328 }
+    \b { 0331 }
+    \t { 0361 }
+    \q_recursion_tail { }
+    \q_recursion_stop
+%    \end{macrocode}
+%   Now we handle the pre-composed accents: the list here is taken from
+%   \texttt{puenc.def}. All of the precomposed cases take a single letter
+%   as their second argument. We do not try to cover the case where an accent
+%   is added to a \enquote{real} dotless-i or -j, or a \ae/\AE. Rather, we
+%   assume that if the \textsc{utf}-8 character is used, it will have the
+%   real accent character too.
+%    \begin{macrocode}
+  \cs_set_protected:Npn \@@_loop:NNn #1#2#3
+    {
+      \quark_if_recursion_tail_stop:N #1
+      \tl_const:cx
+        { c_@@_purify_ \token_to_str:N #1 _ \token_to_str:N #2 _tl }
+        { \@@_tmp:n {#3} }
+      \@@_loop:NNn
+    }
+  \@@_loop:NNn
+    \` A   { 00C0 }
+    \' A   { 00C1 }
+    \^ A   { 00C2 }
+    \~ A   { 00C3 }
+    \" A   { 00C4 }
+    \r A   { 00C5 }
+    \c C   { 00C7 }
+    \` E   { 00C8 }
+    \' E   { 00C9 }
+    \^ E   { 00CA }
+    \" E   { 00CB }
+    \` I   { 00CC }
+    \' I   { 00CD }
+    \^ I   { 00CE }
+    \" I   { 00CF }
+    \~ N   { 00D1 }
+    \` O   { 00D2 }
+    \' O   { 00D3 }
+    \^ O   { 00D4 }
+    \~ O   { 00D5 }
+    \" O   { 00D6 }
+    \` U   { 00D9 }
+    \' U   { 00DA }
+    \^ U   { 00DB }
+    \" U   { 00DC }
+    \' Y   { 00DD }
+    \` a   { 00E0 }
+    \' a   { 00E1 }
+    \^ a   { 00E2 }
+    \~ a   { 00E3 }
+    \" a   { 00E4 }
+    \r a   { 00E5 }
+    \c c   { 00E7 }
+    \` e   { 00E8 }
+    \' e   { 00E9 }
+    \^ e   { 00EA }
+    \" e   { 00EB }
+    \` i   { 00EC }
+    \` \i  { 00EC }
+    \' i   { 00ED }
+    \' \i  { 00ED }
+    \^ i   { 00EE }
+    \^ \i  { 00EE }
+    \" i   { 00EF }
+    \" \i  { 00EF }
+    \~ n   { 00F1 }
+    \` o   { 00F2 }
+    \' o   { 00F3 }
+    \^ o   { 00F4 }
+    \~ o   { 00F5 }
+    \" o   { 00F6 }
+    \` u   { 00F9 }
+    \' u   { 00FA }
+    \^ u   { 00FB }
+    \" u   { 00FC }
+    \' y   { 00FD }
+    \" y   { 00FF }
+    \= A   { 0100 }
+    \= a   { 0101 }
+    \u A   { 0102 }
+    \u a   { 0103 }
+    \k A   { 0104 }
+    \k a   { 0105 }
+    \' C   { 0106 }
+    \' c   { 0107 }
+    \^ C   { 0108 }
+    \^ c   { 0109 }
+    \. C   { 010A }
+    \. c   { 010B }
+    \v C   { 010C }
+    \v c   { 010D }
+    \v D   { 010E }
+    \v d   { 010F }
+    \= E   { 0112 }
+    \= e   { 0113 }
+    \u E   { 0114 }
+    \u e   { 0115 }
+    \. E   { 0116 }
+    \. e   { 0117 }
+    \k E   { 0118 }
+    \k e   { 0119 }
+    \v E   { 011A }
+    \v e   { 011B }
+    \^ G   { 011C }
+    \^ g   { 011D }
+    \u G   { 011E }
+    \u g   { 011F }
+    \. G   { 0120 }
+    \. g   { 0121 }
+    \c G   { 0122 }
+    \c g   { 0123 }
+    \^ H   { 0124 }
+    \^ h   { 0125 }
+    \~ I   { 0128 }
+    \~ i   { 0129 }
+    \~ \i  { 0129 }
+    \= I   { 012A }
+    \= i   { 012B }
+    \= \i  { 012B }
+    \u I   { 012C }
+    \u i   { 012D }
+    \u \i  { 012D }
+    \k I   { 012E }
+    \k i   { 012F }
+    \k \i  { 012F }
+    \. I   { 0130 }
+    \^ J   { 0134 }
+    \^ j   { 0135 }
+    \^ \j  { 0135 }
+    \c K   { 0136 }
+    \c k   { 0137 }
+    \' L   { 0139 }
+    \' l   { 013A }
+    \c L   { 013B }
+    \c l   { 013C }
+    \v L   { 013D }
+    \v l   { 013E }
+    \. L   { 013F }
+    \. l   { 0140 }
+    \' N   { 0143 }
+    \' n   { 0144 }
+    \c N   { 0145 }
+    \c n   { 0146 }
+    \v N   { 0147 }
+    \v n   { 0148 }
+    \= O   { 014C }
+    \= o   { 014D }
+    \u O   { 014E }
+    \u o   { 014F }
+    \H O   { 0150 }
+    \H o   { 0151 }
+    \' R   { 0154 }
+    \' r   { 0155 }
+    \c R   { 0156 }
+    \c r   { 0157 }
+    \v R   { 0158 }
+    \v r   { 0159 }
+    \' S   { 015A }
+    \' s   { 015B }
+    \^ S   { 015C }
+    \^ s   { 015D }
+    \c S   { 015E }
+    \c s   { 015F }
+    \v S   { 0160 }
+    \v s   { 0161 }
+    \c T   { 0162 }
+    \c t   { 0163 }
+    \v T   { 0164 }
+    \v t   { 0165 }
+    \~ U   { 0168 }
+    \~ u   { 0169 }
+    \= U   { 016A }
+    \= u   { 016B }
+    \u U   { 016C }
+    \u u   { 016D }
+    \r U   { 016E }
+    \r u   { 016F }
+    \H U   { 0170 }
+    \H u   { 0171 }
+    \k U   { 0172 }
+    \k u   { 0173 }
+    \^ W   { 0174 }
+    \^ w   { 0175 }
+    \^ Y   { 0176 }
+    \^ y   { 0177 }
+    \" Y   { 0178 }
+    \' Z   { 0179 }
+    \' z   { 017A }
+    \. Z   { 017B }
+    \. z   { 017C }
+    \v Z   { 017D }
+    \v z   { 017E }
+    \v A   { 01CD }
+    \v a   { 01CE }
+    \v I   { 01CF }
+    \v \i  { 01D0 }
+    \v i   { 01D0 }
+    \v O   { 01D1 }
+    \v o   { 01D2 }
+    \v U   { 01D3 }
+    \v u   { 01D4 }
+    \v G   { 01E6 }
+    \v g   { 01E7 }
+    \v K   { 01E8 }
+    \v k   { 01E9 }
+    \k O   { 01EA }
+    \k o   { 01EB }
+    \v \j  { 01F0 }
+    \v j   { 01F0 }
+    \' G   { 01F4 }
+    \' g   { 01F5 }
+    \` N   { 01F8 }
+    \` n   { 01F9 }
+    \' \AE { 01FC }
+    \' \ae { 01FD }
+    \' \O  { 01FE }
+    \' \o  { 01FF }
+    \v H   { 021E }
+    \v h   { 021F }
+    \. A   { 0226 }
+    \. a   { 0227 }
+    \c E   { 0228 }
+    \c e   { 0229 }
+    \. O   { 022E }
+    \. o   { 022F }
+    \= Y   { 0232 }
+    \= y   { 0233 }
+    \q_recursion_tail ? { }
+    \q_recursion_stop
+\group_end:
+%    \end{macrocode}
+% \end{macro}
+%
+%    \begin{macrocode}
+%</initex|package>
+%    \end{macrocode}
+%
+% \end{implementation}
+%
+% \PrintIndex
diff --git a/l3kernel/testfiles/m3text001.tlg b/l3kernel/testfiles/m3text003.luatex.tlg
similarity index 77%
copy from l3kernel/testfiles/m3text001.tlg
copy to l3kernel/testfiles/m3text003.luatex.tlg
index b468aa196..cf6b57f22 100644
--- a/l3kernel/testfiles/m3text001.tlg
+++ b/l3kernel/testfiles/m3text003.luatex.tlg
@@ -2,38 +2,37 @@ This is a generated file for the LaTeX (2e + expl3) validation system.
 Don't change this file in any respect.
 Author: Joseph Wright
 ============================================================
-TEST 1: Expanding content
+TEST 1: Purify content
 ============================================================
 Some text Hello
 Hello sometext
-Some text Hello
-Hello sometext
-Some text \cs_tmp:w 
-\cs_tmp:w  sometext
 ============================================================
 ============================================================
-TEST 2: Expansion in braces
+TEST 2: Purify in braces
 ============================================================
-{Hello} world \par with \ERROR &##
+Hello world with &##
 ============================================================
 ============================================================
-TEST 3: Math-mode escape
+TEST 3: Purification exclusions
 ============================================================
-Some text $y = \sin \theta $
-Opps not close token in $y = \sin \theta 
+FOO  BAZ
+ BAZ
 ============================================================
 ============================================================
-TEST 4: Letter-like commands
+TEST 4: Math-mode escape
 ============================================================
-\AA \aa \J \ae \dh \ss \l \O 
+Some text \text_purify_math_:n {y = \protect \mathop {\mathgroup \symoperators sin}\nolimits \theta }
+Opps not close token in $y = \sin \theta 
+\text_purify_math_:n {y=mx+c} is an equation
 ============================================================
 ============================================================
-TEST 5: Accents
+TEST 5: Letter-like commands
 ============================================================
-\"{a}\u {e}\H {i}\v {o}\.{u}
+^^c5^^e5^^e6^^f0^^dfł^^d8
 ============================================================
 ============================================================
-TEST 6: Implicit tokens
+TEST 6: Accents
 ============================================================
-"A"B"C" " " "
+^^e4ĕi̋ǒu̇
+^^f1r̨
 ============================================================
diff --git a/l3kernel/testfiles/m3text003.lvt b/l3kernel/testfiles/m3text003.lvt
new file mode 100644
index 000000000..a0e84c878
--- /dev/null
+++ b/l3kernel/testfiles/m3text003.lvt
@@ -0,0 +1,62 @@
+%
+% Copyright (C) 2020 LaTeX3 Project
+%
+\input{regression-test}
+
+\RequirePackage[enable-debug]{expl3}
+\ExplSyntaxOn
+\debug_on:n { check-declarations , deprecation , log-functions }
+\ExplSyntaxOff
+
+\START
+\AUTHOR{Joseph Wright}
+\ExplSyntaxOn
+
+\OMIT
+  \tl_set:Nn \l_tmpa_tl { Hello }
+  \tl_set:Nn \l_tmpb_tl { \l_tmpa_tl }
+  \cs_set_protected:Npn \cs_tmp:w { \l_tmpa_tl }
+\TIMO
+
+\TESTEXP { Purify~content }
+  {
+    \text_purify:n { Some~\emph{text}~\l_tmpa_tl }
+    \NEWLINE
+    \text_purify:n { \l_tmpa_tl \c_space_tl some { \bfseries text } }
+  }
+
+\TESTEXP { Purify~in~braces }
+  {
+    \text_purify:n { { \emph { \l_tmpa_tl } }~world~\par with~\ERROR & # }
+  }
+
+\TESTEXP { Purification~exclusions }
+  {
+    \text_purify:n { FOO~\cite { \l_tmpa_tl } ~ { BAZ } }
+    \NEWLINE
+    \text_purify:n { \label { BAR } ~ { BAZ } }
+  }
+
+\TESTEXP { Math-mode~escape }
+  {
+    \text_purify:n { Some~text~$y~=~\sin \theta$ }
+    \NEWLINE
+    \text_purify:n { Opps~not~close~token~in~$y~=~\sin \theta }
+    \NEWLINE
+    \text_purify:n { \ensuremath { y = mx + c }~is~an~equation }
+  }
+
+\sys_if_engine_ptex:T { \END }
+
+\TESTEXP { Letter-like~commands }
+  {
+    \text_purify:n { \AA \aa \J \ae \dh \ss \l \O }
+  }
+
+\TESTEXP { Accents }
+  {
+    \text_purify:n { \"{a} \u{e} \H{i} \v{o} \.{u} } \NEWLINE
+    \text_purify:n { \~{n} \k{r} }
+  }
+
+\END
diff --git a/l3kernel/testfiles/m3str002.luatex.tlg b/l3kernel/testfiles/m3text003.ptex.tlg
similarity index 70%
copy from l3kernel/testfiles/m3str002.luatex.tlg
copy to l3kernel/testfiles/m3text003.ptex.tlg
index a03481826..256eb542b 100644
--- a/l3kernel/testfiles/m3str002.luatex.tlg
+++ b/l3kernel/testfiles/m3text003.ptex.tlg
@@ -2,29 +2,26 @@ This is a generated file for the LaTeX (2e + expl3) validation system.
 Don't change this file in any respect.
 Author: Joseph Wright
 ============================================================
-TEST 1: Simple Latin case folding
+TEST 1: Purify content
 ============================================================
-"abc 123 abc !@"
-" abc 123 abc !@ "
-"some $&## odd text { } "
+Some text Hello
+Hello sometext
 ============================================================
 ============================================================
-TEST 2: Checking category codes
+TEST 2: Purify in braces
 ============================================================
-FALSE
-TRUE
+Hello world with &##
 ============================================================
 ============================================================
-TEST 3: Accented characters, etc.
+TEST 3: Purification exclusions
 ============================================================
-"ćėɗṑṑ"
-"ὢιων"
-"ΰῢst"
-"ꚉꚇ"
-"ｚꝏⓧ"
+FOO  BAZ
+ BAZ
 ============================================================
 ============================================================
-TEST 4: Characters with context-sensitive Unicode behaviour
+TEST 4: Math-mode escape
 ============================================================
-TRUE
+Some text \text_purify_math_:n {y = \protect \mathop {\mathgroup \symoperators sin}\nolimits \theta }
+Opps not close token in $y = \sin \theta 
+\text_purify_math_:n {y=mx+c} is an equation
 ============================================================
diff --git a/l3kernel/testfiles/m3text001.tlg b/l3kernel/testfiles/m3text003.tlg
similarity index 74%
copy from l3kernel/testfiles/m3text001.tlg
copy to l3kernel/testfiles/m3text003.tlg
index b468aa196..3bf32391c 100644
--- a/l3kernel/testfiles/m3text001.tlg
+++ b/l3kernel/testfiles/m3text003.tlg
@@ -2,38 +2,37 @@ This is a generated file for the LaTeX (2e + expl3) validation system.
 Don't change this file in any respect.
 Author: Joseph Wright
 ============================================================
-TEST 1: Expanding content
+TEST 1: Purify content
 ============================================================
 Some text Hello
 Hello sometext
-Some text Hello
-Hello sometext
-Some text \cs_tmp:w 
-\cs_tmp:w  sometext
 ============================================================
 ============================================================
-TEST 2: Expansion in braces
+TEST 2: Purify in braces
 ============================================================
-{Hello} world \par with \ERROR &##
+Hello world with &##
 ============================================================
 ============================================================
-TEST 3: Math-mode escape
+TEST 3: Purification exclusions
 ============================================================
-Some text $y = \sin \theta $
-Opps not close token in $y = \sin \theta 
+FOO  BAZ
+ BAZ
 ============================================================
 ============================================================
-TEST 4: Letter-like commands
+TEST 4: Math-mode escape
 ============================================================
-\AA \aa \J \ae \dh \ss \l \O 
+Some text \text_purify_math_:n {y = \protect \mathop {\mathgroup \symoperators sin}\nolimits \theta }
+Opps not close token in $y = \sin \theta 
+\text_purify_math_:n {y=mx+c} is an equation
 ============================================================
 ============================================================
-TEST 5: Accents
+TEST 5: Letter-like commands
 ============================================================
-\"{a}\u {e}\H {i}\v {o}\.{u}
+^^c3^^85^^c3^^a5^^c3^^a6^^c3^^b0^^c3^^9f^^c5^^82^^c3^^98
 ============================================================
 ============================================================
-TEST 6: Implicit tokens
+TEST 6: Accents
 ============================================================
-"A"B"C" " " "
+^^c3^^a4^^c4^^95i^^cc^^8b^^c7^^92u^^cc^^87
+^^c3^^b1r^^cc^^a8
 ============================================================
diff --git a/l3kernel/testfiles/m3text001.tlg b/l3kernel/testfiles/m3text003.uptex.tlg
similarity index 77%
copy from l3kernel/testfiles/m3text001.tlg
copy to l3kernel/testfiles/m3text003.uptex.tlg
index b468aa196..cf6b57f22 100644
--- a/l3kernel/testfiles/m3text001.tlg
+++ b/l3kernel/testfiles/m3text003.uptex.tlg
@@ -2,38 +2,37 @@ This is a generated file for the LaTeX (2e + expl3) validation system.
 Don't change this file in any respect.
 Author: Joseph Wright
 ============================================================
-TEST 1: Expanding content
+TEST 1: Purify content
 ============================================================
 Some text Hello
 Hello sometext
-Some text Hello
-Hello sometext
-Some text \cs_tmp:w 
-\cs_tmp:w  sometext
 ============================================================
 ============================================================
-TEST 2: Expansion in braces
+TEST 2: Purify in braces
 ============================================================
-{Hello} world \par with \ERROR &##
+Hello world with &##
 ============================================================
 ============================================================
-TEST 3: Math-mode escape
+TEST 3: Purification exclusions
 ============================================================
-Some text $y = \sin \theta $
-Opps not close token in $y = \sin \theta 
+FOO  BAZ
+ BAZ
 ============================================================
 ============================================================
-TEST 4: Letter-like commands
+TEST 4: Math-mode escape
 ============================================================
-\AA \aa \J \ae \dh \ss \l \O 
+Some text \text_purify_math_:n {y = \protect \mathop {\mathgroup \symoperators sin}\nolimits \theta }
+Opps not close token in $y = \sin \theta 
+\text_purify_math_:n {y=mx+c} is an equation
 ============================================================
 ============================================================
-TEST 5: Accents
+TEST 5: Letter-like commands
 ============================================================
-\"{a}\u {e}\H {i}\v {o}\.{u}
+^^c5^^e5^^e6^^f0^^dfł^^d8
 ============================================================
 ============================================================
-TEST 6: Implicit tokens
+TEST 6: Accents
 ============================================================
-"A"B"C" " " "
+^^e4ĕi̋ǒu̇
+^^f1r̨
 ============================================================
diff --git a/l3kernel/testfiles/m3text001.tlg b/l3kernel/testfiles/m3text003.xetex.tlg
similarity index 77%
copy from l3kernel/testfiles/m3text001.tlg
copy to l3kernel/testfiles/m3text003.xetex.tlg
index b468aa196..cf6b57f22 100644
--- a/l3kernel/testfiles/m3text001.tlg
+++ b/l3kernel/testfiles/m3text003.xetex.tlg
@@ -2,38 +2,37 @@ This is a generated file for the LaTeX (2e + expl3) validation system.
 Don't change this file in any respect.
 Author: Joseph Wright
 ============================================================
-TEST 1: Expanding content
+TEST 1: Purify content
 ============================================================
 Some text Hello
 Hello sometext
-Some text Hello
-Hello sometext
-Some text \cs_tmp:w 
-\cs_tmp:w  sometext
 ============================================================
 ============================================================
-TEST 2: Expansion in braces
+TEST 2: Purify in braces
 ============================================================
-{Hello} world \par with \ERROR &##
+Hello world with &##
 ============================================================
 ============================================================
-TEST 3: Math-mode escape
+TEST 3: Purification exclusions
 ============================================================
-Some text $y = \sin \theta $
-Opps not close token in $y = \sin \theta 
+FOO  BAZ
+ BAZ
 ============================================================
 ============================================================
-TEST 4: Letter-like commands
+TEST 4: Math-mode escape
 ============================================================
-\AA \aa \J \ae \dh \ss \l \O 
+Some text \text_purify_math_:n {y = \protect \mathop {\mathgroup \symoperators sin}\nolimits \theta }
+Opps not close token in $y = \sin \theta 
+\text_purify_math_:n {y=mx+c} is an equation
 ============================================================
 ============================================================
-TEST 5: Accents
+TEST 5: Letter-like commands
 ============================================================
-\"{a}\u {e}\H {i}\v {o}\.{u}
+^^c5^^e5^^e6^^f0^^dfł^^d8
 ============================================================
 ============================================================
-TEST 6: Implicit tokens
+TEST 6: Accents
 ============================================================
-"A"B"C" " " "
+^^e4ĕi̋ǒu̇
+^^f1r̨
 ============================================================