[latex3-commits] [git/LaTeX3-latex3-latex3] l3text: Implement first pass at \text_purify_math_unicode:n (6974a4637)

Sat Dec 14 00:43:05 CET 2019

Repository : https://github.com/latex3/latex3
On branch  : l3text
Link       : https://github.com/latex3/latex3/commit/6974a46373bae43f1593a1ada8f244e7d25f9cca

>---------------------------------------------------------------

commit 6974a46373bae43f1593a1ada8f244e7d25f9cca
Author: Joseph Wright <joseph.wright at morningstar2.co.uk>
Date:   Tue Dec 10 10:00:39 2019 +0000

    Implement first pass at \text_purify_math_unicode:n
    
    Decisions to be made on the scope of support needed,
    clearly some additional commands to be added
    (what to do about matrices, \binom, ...).


>---------------------------------------------------------------

6974a46373bae43f1593a1ada8f244e7d25f9cca
 l3kernel/l3text.dtx | 1375 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 1370 insertions(+), 5 deletions(-)

diff --git a/l3kernel/l3text.dtx b/l3kernel/l3text.dtx
index e6ccea3bf..ebf96abab 100644
--- a/l3kernel/l3text.dtx
+++ b/l3kernel/l3text.dtx
@@ -59,6 +59,8 @@
 % operate by expansion. Begin-group and end-group tokens in the \meta{text}
 % are normalized and become |{| and |}|, respectively.
 %
+% \subsection{Expanding text}
+%
 % \begin{function}[EXP, added = 2019-11-20]{\text_expand:n}
 %   \begin{syntax}
 %     \cs{text_expand:n} \Arg{text}
@@ -76,6 +78,8 @@
 %   and \cs{l_text_letterlike_tl} are excluded from expansion.
 % \end{function}
 %
+% \subsection{Case changing}
+%
 % \begin{function}[EXP, added = 2019-11-20]
 %   {
 %     \text_lowercase:n,  \text_uppercase:n,  \text_titlecase:n,
@@ -161,6 +165,8 @@
 %   \end{itemize}
 % \end{function}
 %
+% \subsection{Removing formatting from text}
+%
 % \begin{function}[rEXP, added = 2019-12-05]{\text_purify:n}
 %   \begin{syntax}
 %     \cs{text_purify:n} \Arg{text}
@@ -189,10 +195,13 @@
 %   should be expandable.
 % \end{function}
 %
+% \subsection{Converting math mode material}
+%
 % \begin{function}[rEXP, added = 2019-12-08]
 %   {
 %     \text_purify_math_unchanged:n,
-%     \text_purify_math_chars:n
+%     \text_purify_math_chars:n    ,
+%     \text_purify_math_unicode:n
 %   }
 %   \begin{syntax}
 %     \cs{text_purify_math_unchanged:n} \Arg{math}
@@ -206,19 +215,121 @@
 %     \item \texttt{chars} Retains only the text characters (category code
 %      $11$ and $12$) and spaces in the math mode material, and drops all
 %      other content
+%     \item \texttt{unicode} Converts the math mode material as far as
+%       possible to Unicode characters; see below for further
+%       details
 %   \end{itemize}
 %   In all cases, the result is protected from further expansion by
 %   \cs{exp_not:n}.
 % \end{function}
 %
+% Whilst leaving math mode unchanged or retention of only characters is
+% relatively straight-forward, production of Unicode text from \LaTeX{}
+% math syntax is more involved. The approach here implements as far as
+% possible conversion of core \LaTeX{} and \pkg{amsmath} commands, plus
+% a small number of additional ideas from \pkg{unicode-math}.
+% \begin{itemize}
+%    \item Latin letters and Arabic numerals are input literally, whilst
+%      Greek letters use the control sequences
+%      \texttt{\textbackslash\meta{name}}, for example \tn{alpha} or
+%      \tn{Omega}
+%    \item Latin and Greek letters (including the \tn{var\dots} versions),
+%      Arabic numerals, \tn{nabla} and \tn{partial} are converted to
+%      the Unicode math range equivalent based on the currently-active
+%      math font command, with the latter being one of
+%      \begin{itemize}
+%        \item \tn{mathnormal}
+%        \item \tn{mathrm}
+%        \item \tn{mathup}
+%        \item \tn{mathbf} (alias \cs{mathbfup})
+%        \item \tn{mathit}
+%        \item \tn{mathbfit}
+%        \item \tn{mathscr}
+%        \item \tn{mathbfscr}
+%        \item \tn{mathfrak}
+%        \item \tn{mathbb}
+%        \item \tn{mathbffrak}
+%        \item \tn{mathsf} (alias \cs{mathsfup})
+%        \item \tn{mathsfup}
+%        \item \tn{mathbfsf} (alias \tn{mathbfsfup})
+%        \item \tn{mathsfit}
+%        \item \tn{mathbfsfit}
+%        \item \tn{mathtt}
+%      \end{itemize}
+%      along with the \texttt{unicode-math} \tn{sym\dots} equivalents.
+%    \item Super- and subscript material (marked up by |_|/|^| or by \tn{sp} and
+%      \tn{sb}) is converted to the Unicode raised or lowered codepoints if the
+%      entire script portion can be represented in this way; otherwise it is
+%      marked using a |^| or |_| followed by the argument in parentheses. The
+%      Unicode codepoints cover
+%      \begin{itemize}
+%        \item Arabic numerals
+%        \item Parentheses, |-|, |+| and |=|
+%        \item For subscripts, the letters |i| and |n|
+%        \item For superscripts, the letters |a|, |e|, |o|, |k|, |l|, |m|,
+%          |n|, |p|, |s| and |t|
+%      \end{itemize}
+%    \item The math accent commands
+%      \begin{itemize}
+%        \item \tn{grave}
+%        \item \tn{acute}
+%        \item \tn{hat}
+%        \item \tn{widehat}
+%        \item \tn{tilde}
+%        \item \tn{widetilde}
+%        \item \tn{bar}
+%        \item \tn{breve}
+%        \item \tn{dot}
+%        \item \tn{ddot}
+%      \end{itemize}
+%      are converted to the trailing modifier characters.
+%    \item \tn{frac} is converted to
+%      \texttt{\meta{numerator}/\meta{denominator}}, where parentheses are
+%      added where the \meta{numerator} or \meta{denominator} is not a single
+%      character, and where a space is added after the \meta{denominator}.
+%      The same approach is used to convert \tn{genfrac} from \pkg{amsmath}.
+%    \item \tn{sqrt} is converted to the Unicode symbol, with the mandatory
+%      argument in parentheses. Any optional argument is given \emph{before}
+%      the root.
+%    \item Operators such as \tn{sin} are converted to text, with a space
+%      added after the operator or, where there is a super/subscript, after
+%      any indices. (Operators are converted based on their internal
+%      structure, not by hard-coding their names.)
+%    \item Commands defined by the \LaTeX{} kernel which have a direct
+%      Unicode equivalent are replaced directly: there are around $200$ such
+%      commands.
+%    \item Other implicit characters are converted to their explicit
+%      equivalent: this is mainly relevant for use with \pkg{unicode-math}.
+%    \item Embedded textual content is treated as for \cs{text_purify:n},
+%      with the following commands marking up such text
+%      \begin{itemize}
+%        \item \tn{mbox}
+%        \item \tn{text}
+%        \item \tn{textrm}
+%        \item \tn{textsf}
+%        \item \tn{texttt}
+%        \item \tn{textnormal}
+%        \item \tn{textbf}
+%        \item \tn{textmd}
+%        \item \tn{textit}
+%        \item \tn{textsl}
+%        \item \tn{textsc}
+%        \item \tn{textup}
+%        \item \tn{textulc}
+%        \item \tn{emph}
+%      \end{itemize}
+% \end{itemize}
+%
 % \begin{function}[added = 2019-12-08]{\text_set_purify_math_mode:n}
 %   \begin{syntax}
 %     \cs{text_set_purify_math_mode:n} \meta{mode}
 %   \end{syntax}
-%   Sets the math purification mode to one of \texttt{unchanged} or
-%   \texttt{chars}.
+%   Sets the math purification mode to one of \texttt{unchanged},
+%   \texttt{chars} or \texttt{unicode}.
 % \end{function}
 %
+% \subsection{Control variables}
+%
 % \begin{variable}{\l_text_accents_tl}
 %   Lists commands which represent accents, and which are left unchanged
 %   by expansion.
@@ -2762,7 +2873,7 @@
     \k { 0328 }
     \b { 0331 }
     \t { 0361 }
-    \q_recursion_tail ?
+    \q_recursion_tail { }
     \q_recursion_stop
 %    \end{macrocode}
 %   Now we handle the pre-composed accents: the list here is taken from
@@ -2988,7 +3099,7 @@
     \. o   { 022F }
     \= Y   { 0232 }
     \= y   { 0233 }
-    \q_recursion_tail ? ?
+    \q_recursion_tail ? { }
     \q_recursion_stop
 \group_end:
 %    \end{macrocode}
@@ -3068,6 +3179,697 @@
 % \end{macro}
 % \end{macro}
 %
+% \begin{macro}[rEXP]{\text_purify_math_unicode:n}
+% \begin{macro}[rEXP]{\@@_purify_unimath_loop:nw}
+% \begin{macro}[rEXP]{\@@_purify_unimath_space:nw}
+% \begin{macro}[rEXP]{\@@_purify_unimath_group:nn}
+% \begin{macro}[rEXP]{\@@_purify_unimath_N_type:nN}
+% \begin{macro}[rEXP]{\@@_purify_unimath_chars:nn}
+% \begin{macro}[rEXP]
+%   {
+%     \@@_purify_unimath_normal:nnnnnnnnn ,
+%     \@@_purify_unimath_rm:nnnnnnnnn     ,
+%     \@@_purify_unimath_bf:nnnnnnnnn     ,
+%     \@@_purify_unimath_it:nnnnnnnnn     ,
+%     \@@_purify_unimath_bfit:nnnnnnnnn   ,
+%     \@@_purify_unimath_scr:nnnnnnnnn    ,
+%     \@@_purify_unimath_bfscr:nnnnnnnnn  ,
+%     \@@_purify_unimath_frak:nnnnnnnnn   ,
+%     \@@_purify_unimath_bb:nnnnnnnnn    ,
+%     \@@_purify_unimath_bffrak:nnnnnnnnn ,
+%     \@@_purify_unimath_sf:nnnnnnnnn     ,
+%     \@@_purify_unimath_bfsf:nnnnnnnnn   ,
+%     \@@_purify_unimath_itsf:nnnnnnnnn   ,
+%     \@@_purify_unimath_bfitsf:nnnnnnnnn ,
+%     \@@_purify_unimath_tt:nnnnnnnnn
+%   }
+% \begin{macro}[rEXP]
+%   {
+%     \@@_purify_unimath_sf:nnnnn     ,
+%     \@@_purify_unimath_bfsf:nnnnn   ,
+%     \@@_purify_unimath_itsf:nnnnn   ,
+%     \@@_purify_unimath_bfitsf:nnnnn ,
+%     \@@_purify_unimath_tt:nnnnn
+%   }
+% \begin{macro}[rEXP]
+%   {
+%     \@@_purify_unimath_cs:nN       ,
+%     \@@_purify_unimath_replace:nN
+%   }
+% \begin{macro}[rEXP]
+%   {
+%     \@@_purify_unimath_expand:nN , \@@_purify_unimath_expand:nw ,
+%     \@@_purify_unimath_char:nN   , \@@_purify_unimath_script:nnn ,
+%     \@@_purify_unimath_script:nnnnw }
+% \begin{macro}[rEXP]
+%   {
+%     \@@_purify_unimath_script_super: ,
+%     \@@_purify_unimath_script_sub:
+%   }
+%   Converting to Unicode text (called |unimath| for convenience in the
+%   code) has been implemented by a number of people in various tools.
+%   In particular, see \url{http://latex2unicode.herokuapp.com/}
+%   and the Unicode report on the approach taken by Word
+%   (\url{http://www.unicode.org/notes/tn28/UTN28-PlainTextMath-v3.1.pdf}).
+%
+%  In contrast to the text purifying code, here we are dealing with
+%  a set of tokens that are essentially fixed; they have to match with
+%  Unicode and with what other converters do. Thus the various mappings
+%  are \emph{not} flexible: all internal data.
+%
+%   As with the other math mode loops, we start off in the standard way.
+%   Here, we have to track which math font is active, so there is an
+%   \texttt{n}-type argument to pass.
+%    \begin{macrocode}
+\cs_new:Npn \text_purify_math_unicode:n #1
+  {
+    \@@_purify_unimath_loop:nw { normal } #1
+      \q_recursion_tail \q_recursion_stop
+  }
+\cs_new:Npn \@@_purify_unimath_loop:nw #1#2 \q_recursion_stop
+  {
+    \tl_if_head_is_N_type:nTF {#2}
+      { \@@_purify_unimath_N_type:nN }
+      {
+        \tl_if_head_is_group:nTF {#2}
+          { \@@_purify_unimath_group:nn }
+          { \@@_purify_unimath_space:nw }
+      }
+    {#1} #2 \q_recursion_stop
+  }
+\exp_after:wN \cs_new:Npn \exp_after:wN \@@_purify_unimath_space:nw
+  \exp_after:wN # \exp_after:wN 1 \c_space_tl
+  {
+    \c_space_tl
+    \@@_purify_unimath_loop:nw {#1}
+  }
+\cs_new:Npn \@@_purify_unimath_group:nn #1#2
+  { \@@_purify_unimath_loop:nw {#1} #2 }
+%    \end{macrocode}
+%   For \texttt{N}-type arguments, we first look for the cases that
+%   follow font choice. Those can be both chars and control sequences,
+%   hence going up-front. Much of the work there is done at the
+%   set-up stage, so the lookup is quite easy. Then we look for fixed
+%   mappings: again, these could be chars such as |-| or commands.
+%   Only if none of these apply do we split into two paths.
+%    \begin{macrocode}
+\cs_new:Npn \@@_purify_unimath_N_type:nN #1#2
+  {
+    \quark_if_recursion_tail_stop:N #2
+    \cs_if_exist:cTF
+      { c_@@_math_chars_ \token_to_str:N #2 _tl }
+      {
+        \exp_args:Nv \@@_purify_unimath_chars:nn
+          { c_@@_math_chars_ \token_to_str:N #2 _tl }
+          {#1}
+      }
+      {
+        \cs_if_exist:cTF
+          { c_@@_math_char_ \token_to_str:N #2 _tl }
+          {
+            \exp_not:v
+              { c_@@_math_char_ \token_to_str:N #2 _tl }
+            \@@_purify_unimath_loop:nw {#1}
+          }
+          {
+            \token_if_cs:NTF #2
+              { \@@_purify_unimath_cs:nN }
+              { \@@_purify_unimath_char:nN }
+                {#1} #2
+          }
+      }
+  }
+%    \end{macrocode}
+%   The font-sensitive char data is stored in token lists. Other than |normal|,
+%   everything is straight extraction. For |normal|, capital Greek letters use
+%   the upright entry and all else the italic one ($8$-bit engines check bytes).
+%    \begin{macrocode}
+\cs_new:Npn \@@_purify_unimath_chars:nn #1#2
+  {
+    \use:c { @@_purify_unimath_ #2 :nnnnnnnnn } #1
+    \@@_purify_unimath_loop:nw {#2}
+  }
+\bool_lazy_or:nnTF
+  { \sys_if_engine_luatex_p: }
+  { \sys_if_engine_xetex_p: }
+  {
+    \cs_new:Npn \@@_purify_unimath_normal:nnnnnnnnn #1#2#3#4#5#6#7#8#9
+      {
+        \bool_lazy_or:nnTF
+          { \int_compare_p:nNn { `#1 } < { "0391 } }
+          { \int_compare_p:nNn { `#1 } > { "03A9 } }
+          { \@@_purify_unimath_it:nnnnnnnnn }
+          { \@@_purify_unimath_rm:nnnnnnnnn }
+            {#1} {#2} {#3} {#4} {#5} {#6} {#7} {#8} {#9}
+      }
+  }
+  {
+    \cs_new:Npn \@@_purify_unimath_normal:nnnnnnnnn #1#2#3#4#5#6#7#8#9
+      {
+        \int_compare:nNnTF { \exp_after:wN ` \tl_head:w #1 \q_stop } = { "CE }
+          {
+            \bool_lazy_or:nnTF
+              {
+                \int_compare_p:nNn
+                  { \exp_after:wN ` \use_ii:nn #1 } < { "91 }
+              }
+              {
+                \int_compare_p:nNn
+                  { \exp_after:wN ` \use_ii:nn #1 } > { "A9 }
+              }
+              { \@@_purify_unimath_it:nnnnnnnnn }
+              { \@@_purify_unimath_rm:nnnnnnnnn }
+          }
+          { \@@_purify_unimath_it:nnnnnnnnn }
+            {#1} {#2} {#3} {#4} {#5} {#6} {#7} {#8} {#9}
+      }
+  }
+%    \end{macrocode}
+%   Lots and lots of simple lookups.
+%    \begin{macrocode}
+\cs_new:Npn \@@_purify_unimath_rm:nnnnnnnnn #1#2#3#4#5#6#7#8#9
+  {
+    \exp_not:n {#1}
+    \use_none:nnnnn
+  }
+\cs_new:Npn \@@_purify_unimath_bf:nnnnnnnnn #1#2#3#4#5#6#7#8#9
+  {
+    \exp_not:n {#2}
+    \use_none:nnnnn
+  }
+\cs_new:Npn \@@_purify_unimath_it:nnnnnnnnn #1#2#3#4#5#6#7#8#9
+  {
+    \exp_not:n {#3}
+    \use_none:nnnnn
+  }
+\cs_new:Npn \@@_purify_unimath_bfit:nnnnnnnnn #1#2#3#4#5#6#7#8#9
+  {
+    \exp_not:n {#4}
+    \use_none:nnnnn
+  }
+\cs_new:Npn \@@_purify_unimath_scr:nnnnnnnnn #1#2#3#4#5#6#7#8#9
+  {
+    \exp_not:n {#5}
+    \use_none:nnnnn
+  }
+\cs_new:Npn \@@_purify_unimath_bfscr:nnnnnnnnn #1#2#3#4#5#6#7#8#9
+  {
+    \exp_not:n {#6}
+    \use_none:nnnnn
+  }
+\cs_new:Npn \@@_purify_unimath_frak:nnnnnnnnn #1#2#3#4#5#6#7#8#9
+  {
+    \exp_not:n {#7}
+    \use_none:nnnnn
+  }
+\cs_new:Npn \@@_purify_unimath_bb:nnnnnnnnn #1#2#3#4#5#6#7#8#9
+  {
+    \exp_not:n {#8}
+    \use_none:nnnnn
+  }
+\cs_new:Npn \@@_purify_unimath_bffrak:nnnnnnnnn #1#2#3#4#5#6#7#8#9
+  {
+    \exp_not:n {#9}
+    \use_none:nnnnn
+  }
+\cs_new:Npn \@@_purify_unimath_sf:nnnnnnnnn #1#2#3#4#5#6#7#8#9
+  { \@@_purify_unimath_sf:nnnnn }
+\cs_new:Npn \@@_purify_unimath_sf:nnnnn #1#2#3#4#5
+  { \exp_not:n {#1} }
+\cs_new:Npn \@@_purify_unimath_bfsf:nnnnnnnnn #1#2#3#4#5#6#7#8#9
+  { \@@_purify_unimath_bfsf:nnnnn }
+\cs_new:Npn \@@_purify_unimath_bfsf:nnnnn #1#2#3#4#5
+  { \exp_not:n {#2} }
+\cs_new:Npn \@@_purify_unimath_itsf:nnnnnnnnn #1#2#3#4#5#6#7#8#9
+  { \@@_purify_unimath_itsf:nnnnn }
+\cs_new:Npn \@@_purify_unimath_itsf:nnnnn #1#2#3#4#5
+  { \exp_not:n {#3} }
+\cs_new:Npn \@@_purify_unimath_bfitsf:nnnnnnnnn #1#2#3#4#5#6#7#8#9
+  { \@@_purify_unimath_bfitsf:nnnnn }
+\cs_new:Npn \@@_purify_unimath_bfitsf:nnnnn #1#2#3#4#5
+  { \exp_not:n {#4} }
+\cs_new:Npn \@@_purify_unimath_tt:nnnnnnnnn #1#2#3#4#5#6#7#8#9
+  { \@@_purify_unimath_tt:nnnnn }
+\cs_new:Npn \@@_purify_unimath_tt:nnnnn #1#2#3#4#5
+  { \exp_not:n {#5} }
+%    \end{macrocode}
+%   For control sequences, we may have specific replacement functions,
+%   for example to handle |\frac|: these are covered here. The
+%   replacements themselves may loop before or after the additional
+%   tokens, so they are responsible for adding the loop function.
+%    \begin{macrocode}
+\cs_new:Npn \@@_purify_unimath_cs:nN #1#2
+  {
+    \cs_if_exist:cTF { @@_purify_unimath_ \token_to_str:N #2 :nw }
+      {
+        \exp_last_unbraced:Nno \@@_purify_unimath_replace:nN {#1}
+          { \cs:w @@_purify_unimath_ \token_to_str:N #2 :nw \cs_end: }
+      }
+      { \@@_purify_unimath_expand:nN {#1} #2 }
+  }
+\cs_new:Npn \@@_purify_unimath_replace:nN #1#2
+  { #2 {#1} }
+%    \end{macrocode}
+%   If we get this far, simply expand the token if possible, accounting
+%   for the case of taking an argument.
+%    \begin{macrocode}
+\cs_new:Npn \@@_purify_unimath_expand:nN #1#2
+  {
+    \token_if_expandable:NTF #2
+      { \@@_purify_unimath_expand:nw {#1} #2 }
+      { \@@_purify_unimath_loop:nw {#1} }
+  }
+\cs_new:Npn \@@_purify_unimath_expand:nw #1#2 \q_recursion_stop
+  {
+    \exp_last_unbraced:Nno \@@_purify_unimath_loop:nw {#1}
+      { #2 \q_recursion_stop }
+  }
+%    \end{macrocode}
+%   To deal with super- and subscripts, we need to map over everything
+%   and find the limited number of available slots. If there is no hit,
+%   check for implicit characters then move on.
+%    \begin{macrocode}
+\cs_new:Npn \@@_purify_unimath_char:nN #1#2
+  {
+    \token_if_math_subscript:NTF #2
+      { \@@_purify_unimath_script:nnn {#1} { sub } }
+      {
+        \token_if_math_superscript:NTF #2
+          { \@@_purify_unimath_script:nnn {#1} { super } }
+          {
+            \@@_token_to_explicit:N #2
+            \@@_purify_unimath_loop:nw {#1}
+          }
+      }
+  }
+\cs_new:Npn \@@_purify_unimath_script:nnn #1#2#3
+  {
+    \@@_purify_unimath_script:nnnnw { } {#1} {#2} {#3} #3
+      \q_recursion_tail \q_recursion_stop
+  }
+\cs_new:Npn \@@_purify_unimath_script:nnnnw #1#2#3#4#5
+  {
+    \quark_if_recursion_tail_stop_do:nn {#5}
+      {
+        \exp_not:n {#1}
+        \@@_purify_unimath_loop:nw {#2}
+      }
+    \cs_if_exist:cTF { c_@@_math_ #3 _ \tl_to_str:n {#5} _tl }
+      {
+        \exp_args:Ne \@@_purify_unimath_script:nnnnw
+          {
+            \exp_not:n {#1}
+            \exp_not:v { c_@@_math_ #3 _ \tl_to_str:n {#5} _tl }
+          }
+          {#2} {#3} {#4}
+      }
+      {
+        \use_i_delimit_by_q_recursion_stop:nw
+          {
+            \use:c { @@_purify_unimath_script_ #3 : }
+            \@@_purify_unimath_loop:nw {#2} ( #4 )
+          }
+      }
+  }
+\cs_new:Npn \@@_purify_unimath_script_super: { ^ }
+\cs_new:Npx \@@_purify_unimath_script_sub:
+  { \char_generate:nn { `\_ } { 8 } }
+%    \end{macrocode}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \begin{macro}[rEXP]{\@@_purify_unimath_accent:nNn}
+% \begin{macro}[rEXP]
+%   {
+%     \@@_purify_unimath_\grave:nw     ,
+%     \@@_purify_unimath_\acute:nw     ,
+%     \@@_purify_unimath_\hat:nw       ,
+%     \@@_purify_unimath_\widehat:nw   ,
+%     \@@_purify_unimath_\tilde:nw     ,
+%     \@@_purify_unimath_\widetilde:nw ,
+%     \@@_purify_unimath_\bar:nw       ,
+%     \@@_purify_unimath_\breve:nw     ,
+%     \@@_purify_unimath_\dot:nw       ,
+%     \@@_purify_unimath_\ddot:nw 
+%   }
+%    \begin{macrocode}
+\cs_new:Npn \@@_purify_unimath_accent:nNn #1#2#3
+  {
+    \quark_if_recursion_tail_stop:n {#3}
+    \exp_args:Nnnv \@@_purify_unimath_loop:nw {#1} {#3}
+      { c_@@_math_accent_ \token_to_str:N #2 _tl }
+  }
+\tl_map_inline:nn
+  {
+    \grave
+    \acute
+    \hat
+    \widehat
+    \tilde
+    \widetilde
+    \bar
+    \breve
+    \dot
+    \ddot
+  }
+  {
+    \cs_new:cpn { @@_purify_unimath_ \token_to_str:N #1 :nw } ##1
+      { \@@_purify_unimath_accent:nNn {##1} #1 }
+  }
+%    \end{macrocode}
+% \end{macro}
+% \end{macro}
+% \begin{macro}[rEXP]
+%   {
+%     \@@_purify_unimath_\frac:nw , \@@_purify_unimath_\frac:nn ,
+%     \@@_purify_unimath_\frac:n }
+%   Handling \tn{frac} requires that the two arguments are processed
+%   first, so that a check can be made on whether the two parts are
+%   single output characters. The exact detail then depends on whether
+%   the engine is $8$-bit or Unicode. For the former, a single character
+%   may be one or more bytes, so there is a check to see if we have
+%   more than one \enquote{real} character.
+%    \begin{macrocode}
+\cs_new:cpx { @@_purify_unimath_ \token_to_str:N \frac :nw } #1#2#3
+  {
+    \exp_not:c { @@_purify_unimath_ \token_to_str:N \frac :nn }
+       {#1} {#2}
+    /
+    \exp_not:c { @@_purify_unimath_ \token_to_str:N \frac :nn }
+       {#1} {#3}
+    \c_space_tl
+    \exp_not:N \@@_purify_unimath_loop:nw {#1}
+  }
+\bool_lazy_or:nnTF
+  { \sys_if_engine_luatex_p: }
+  { \sys_if_engine_xetex_p: }
+  {
+    \cs_new:cpx { @@_purify_unimath_ \token_to_str:N \frac :nn } #1#2
+      {
+        \exp_not:N \exp_args:Ne
+        \exp_not:c { @@_purify_unimath_ \token_to_str:N \frac :n }
+          {
+            \exp_not:N \@@_purify_unimath_loop:nw {#1} #2
+              \exp_not:N \q_recursion_tail \exp_not:N \q_recursion_stop
+          }
+      }
+    \cs_new:cpn { @@_purify_unimath_ \token_to_str:N \frac :n } #1
+      {
+        \tl_if_single_token:nTF {#1}
+          { \exp_not:n {#1} }
+          { ( \exp_not:n { #1 } ) }
+      }
+  }
+  {
+    \cs_new:cpx { @@_purify_unimath_ \token_to_str:N \frac :nn } #1#2
+      {
+        \exp_not:N \exp_args:Ne
+          \exp_not:c { @@_purify_unimath_ \token_to_str:N \frac :n }
+          {
+            \exp_not:N \@@_purify_unimath_loop:nw {#1} #2
+              \exp_not:N \q_recursion_tail \exp_not:N \q_recursion_stop
+          }
+      }
+    \cs_new:cpn { @@_purify_unimath_ \token_to_str:N \frac :n } #1
+      {
+        \tl_if_single_token:nTF {#1}
+          { \exp_not:n {#1} }
+          {
+            \int_compare:nNnTF { \exp_after:wN ` \tl_head:w #1 \q_stop }
+              < { "C2 }
+              { ( \exp_not:n {#1} ) }
+              {
+               \int_compare:nNnTF { \exp_after:wN ` \tl_head:w #1 \q_stop }
+                 < { "E0 }
+                 {
+                   \tl_if_blank:oTF { \use_none:nn #1 }
+                     { \exp_not:n {#1} }
+                     { ( \exp_not:n {#1} ) }
+                 }
+                 {
+                   \int_compare:nNnTF { \exp_after:wN ` \tl_head:w #1 \q_stop }
+                     < { "F0 }
+                       {
+                         \tl_if_blank:oTF { \use_none:nnn #1 }
+                           { \exp_not:n {#1} }
+                           { ( \exp_not:n {#1} ) }
+                       }
+                       {
+                         \int_compare:nNnTF
+                           { \exp_after:wN ` \tl_head:w #1 \q_stop }
+                           < { "F5 }
+                           {
+                             \tl_if_blank:oTF { \use_none:nnnn #1 }
+                               { \exp_not:n {#1} }
+                               { ( \exp_not:n {#1} ) }
+                           }
+                           { ( \exp_not:n {#1} ) }
+                       }
+                 }
+              }
+          }
+      }
+  }
+%    \end{macrocode}
+% \end{macro}
+% \begin{macro}[rEXP]{\@@_purify_unimath_\genfrac:nw}
+%  The \pkg{amsmath} generalised version: avoid needing to cover every
+%  single possible variant.
+%    \begin{macrocode}
+\cs_new:cpx { @@_purify_unimath_ \token_to_str:N \genfrac :nw } #1#2#3#4#5#6#7
+  {
+    \exp_not:N \@@_purify_unimath_loop:nw {#1} #2
+      \exp_not:N \q_recursion_tail \exp_not:N \q_recursion_stop
+    \exp_not:c { @@_purify_unimath_ \token_to_str:N \frac :nn }
+       {#1} {#6}
+    /
+    \exp_not:c { @@_purify_unimath_ \token_to_str:N \frac :nn }
+       {#1} {#7}
+    \exp_not:N \@@_purify_unimath_loop:nw {#1} #3
+    \c_space_tl
+  }
+%    \end{macrocode}
+% \end{macro}
+% \begin{macro}[rEXP]
+%   {
+%     \@@_purify_unimath_\mbox:nw       ,
+%     \@@_purify_unimath_\hbox:nw       ,
+%     \@@_purify_unimath_\text:nw       ,
+%     \@@_purify_unimath_\textrm:nw     ,
+%     \@@_purify_unimath_\textsf:nw     ,
+%     \@@_purify_unimath_\texttt:nw     ,
+%     \@@_purify_unimath_\textnormal:nw ,
+%     \@@_purify_unimath_\textbf:nw     ,
+%     \@@_purify_unimath_\textmd:nw     ,
+%     \@@_purify_unimath_\textit:nw     ,
+%     \@@_purify_unimath_\textsl:nw     ,
+%     \@@_purify_unimath_\textsc:nw     ,
+%     \@@_purify_unimath_\textup:nw     ,
+%     \@@_purify_unimath_\textulc:nw    ,
+%     \@@_purify_unimath_\emph:nw
+%   }
+%   Commands that produce text: these need to escape from math mode
+%   processing.
+%    \begin{macrocode}
+\cs_new:cpn { @@_purify_unimath_ \token_to_str:N \mbox :nw } #1#2
+  {
+   \text_purify:n {#2}
+   \@@_purify_unimath_loop:nw {#1}
+  }
+\tl_map_inline:nn
+  {
+    \hbox
+    \text
+    \textrm
+    \textsf
+    \texttt
+    \textnormal
+    \textbf
+    \textmd
+    \textit
+    \textsl
+    \textsc
+    \textup
+    \textulc
+    \emph
+  }
+  {
+    \cs_new_eq:cc
+      { @@_purify_unimath_ \token_to_str:N #1 :nw } 
+      { @@_purify_unimath_ \token_to_str:N \mbox :nw } 
+  }
+%    \end{macrocode}
+% \end{macro}
+% \begin{macro}[rEXP]
+%   {
+%      \@@_purify_unimath_\symnormal:nw ,
+%      \@@_purify_unimath_\symrm:nw     ,
+%      \@@_purify_unimath_\symup:nw     ,
+%      \@@_purify_unimath_\symbf:nw     ,
+%      \@@_purify_unimath_\symbfup:nw   ,
+%      \@@_purify_unimath_\symit:nw     ,
+%      \@@_purify_unimath_\symbfit:nw   ,
+%      \@@_purify_unimath_\symscr:nw    ,
+%      \@@_purify_unimath_\symbfscr:nw  ,
+%      \@@_purify_unimath_\symfrak:nw   ,
+%      \@@_purify_unimath_\symbb:nw     ,
+%      \@@_purify_unimath_\symbffrak:nw ,
+%      \@@_purify_unimath_\symsfup:nw   ,
+%      \@@_purify_unimath_\symbfsfup:nw ,
+%      \@@_purify_unimath_\symsfit:nw   ,
+%      \@@_purify_unimath_\symbfsfit:nw ,
+%      \@@_purify_unimath_\symtt:nw     ,
+%      \@@_purify_unimath_\mathnormal:nw ,
+%      \@@_purify_unimath_\mathrm:nw     ,
+%      \@@_purify_unimath_\mathup:nw     ,
+%      \@@_purify_unimath_\mathbf:nw     ,
+%      \@@_purify_unimath_\mathbfup:nw   ,
+%      \@@_purify_unimath_\mathit:nw     ,
+%      \@@_purify_unimath_\mathbfit:nw   ,
+%      \@@_purify_unimath_\mathscr:nw    ,
+%      \@@_purify_unimath_\mathbfscr:nw  ,
+%      \@@_purify_unimath_\mathfrak:nw   ,
+%      \@@_purify_unimath_\mathbb:nw     ,
+%      \@@_purify_unimath_\mathbffrak:nw ,
+%      \@@_purify_unimath_\mathsfup:nw   ,
+%      \@@_purify_unimath_\mathbfsf:nw   ,
+%      \@@_purify_unimath_\mathbfsfup:nw ,
+%      \@@_purify_unimath_\mathsfit:nw   ,
+%      \@@_purify_unimath_\mathbfsfit:nw ,
+%      \@@_purify_unimath_\mathtt:nw
+%   }
+%   For the math-font commands, we need to change the propagated
+%   information, do the recursion then switch back.
+%    \begin{macrocode}
+\group_begin:
+  \cs_set_protected:Npn \@@_tmp:nn #1#2
+    {
+      \quark_if_recursion_tail_stop:n {#1}
+      \cs_new:cpn { @@_purify_unimath_ \c_backslash_str sym #1 :nw } ##1##2
+        {
+          \@@_purify_unimath_loop:nw {#2} ##2
+            \q_recursion_tail \q_recursion_stop
+          \@@_purify_unimath_loop:nw {##1}
+        }
+      \cs_new_eq:cc
+        { @@_purify_unimath_ \c_backslash_str math #1 :nw }
+        { @@_purify_unimath_ \c_backslash_str sym #1 :nw }
+      \@@_tmp:nn
+    }
+  \@@_tmp:nn
+    { normal } { normal }
+    { rm }     { rm }
+    { up }     { rm }
+    { bf }     { bf }
+    { bfup }   { bf }
+    { it }     { it }
+    { bfit }   { bfit }
+    { scr }    { scr }
+    { bfscr }  { bfscr }
+    { frak }   { frak }
+    { bb }     { bb }
+    { bffrak } { bffrak }
+    { sf }     { sf }
+    { sfup }   { sf }
+    { bfsf   } { bfsf }
+    { bfsfup } { bfsf }
+    { sfit }   { itsf }
+    { bfsfit } { bfitsf }
+    { tt }     { tt }
+    { \q_recursion_tail } { }
+    \q_recursion_stop
+\group_end:
+%    \end{macrocode}
+% \end{macro}
+% \begin{macro}[rEXP]
+%   {
+%     \@@_purify_unimath_\mathop:nw  ,
+%     \@@_purify_unimath_\qopname:nw
+%   }
+% \begin{macro}[rEXP]{\@@_purify_unimath_opchk:nw}
+% \begin{macro}[rEXP]{\@@_purify_unimath_opchk:nN}
+% \begin{macro}[rEXP]{\@@_purify_unimath_opchk:nNn}
+%   To support operators, target both the \pkg{amsmath} and the standard
+%   internals. The look-ahead deals with super/subscripts, where a space
+%   should be moved.
+%    \begin{macrocode}
+\cs_new:cpn { @@_purify_unimath_ \token_to_str:N \mathop :nw } #1#2
+  {
+    \text_purify:n { \use_none:n #2 }
+    \@@_purify_unimath_opchk:nw {#1}
+  }
+\cs_new:cpn { @@_purify_unimath_ \token_to_str:N \qopname :nw } #1#2#3#4
+  {
+    \text_purify:n { #4 }
+    \@@_purify_unimath_opchk:nw {#1}
+  }
+\cs_new:Npn \@@_purify_unimath_opchk:nw #1#2 \q_recursion_stop
+  {
+    \tl_if_head_is_N_type:nTF {#2}
+      { \@@_purify_unimath_opchk:nN }
+      {
+        \tl_if_head_is_group:nT {#2}
+          { \c_space_tl }
+        \@@_purify_unimath_loop:nw
+      }
+      {#1} #2 \q_recursion_stop
+  }
+\cs_new:Npn \@@_purify_unimath_opchk:nN #1#2
+  {
+    \bool_lazy_or:nnTF
+      { \token_if_math_superscript_p:N #2 }
+      { \token_if_math_subscript_p:N #2 }
+      { \@@_purify_unimath_opchk:nNn {#1} #2 }
+      {
+        \c_space_tl
+        \@@_purify_unimath_loop:nw {#1} #2
+      }
+  }
+\cs_new:Npn \@@_purify_unimath_opchk:nNn #1#2#3
+  { \@@_purify_unimath_loop:nw {#1} #2 {#3} \c_space_tl }
+%    \end{macrocode}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \begin{macro}[rEXP]{\@@_purify_unimath_\sqrt:nw}
+% \begin{macro}[rEXP]{\@@_purify_unimath_\sqrt_aux:nw}
+%   Square root is tricky to handle; at present, an optional index is
+%   passed through in square brackets ahead of the root symbol.
+%    \begin{macrocode}
+\cs_new:cpx { @@_purify_unimath_ \token_to_str:N \sqrt :nw } #1#2
+  {
+    \exp_not:N \bool_lazy_and:nnTF
+      { \exp_not:N \tl_if_single_token_p:n {#2} }
+      {
+        \exp_not:N \token_if_eq_meaning_p:NN #2 [ % ]
+      }
+      { \exp_not:c { @@_purify_unimath_ \token_to_str:N \sqrt _aux:nw } {#1} }
+      {
+        \exp_not:N \exp_not:v { c_@@_math_ \token_to_str:N \sqrt _tl }
+        \exp_not:N \@@_purify_unimath_loop:nw {#1} { ( #2 ) }
+      }
+  }
+\cs_new:cpx { @@_purify_unimath_ \token_to_str:N \sqrt _aux:nw } % [
+  #1#2 ] #3
+  {
+    \c_space_tl
+    \exp_not:N \@@_purify_unimath_loop:nw {#1} [ #2 ]
+      \exp_not:c { c_@@_math_ \token_to_str:N \sqrt _tl }
+      ( #3 )
+  }
+%    \end{macrocode}
+% \end{macro}
+% \end{macro}
+%
 % \begin{macro}{\text_set_purify_math_mode:n}
 %   Set up the mode for purification.
 %    \begin{macrocode}
@@ -3087,6 +3889,569 @@
 %    \end{macrocode}
 % \end{macro}
 %
+% \subsection{Character data for Unicode math}
+%
+% For characters with math font variants, there is potentially a lot of
+% data to store. To avoid a combinatorial explosion, the approach here is to
+% use one token list per \enquote{base} character. The pre-generated forms are
+% then stored in a token list in the order
+% \begin{enumerate}
+%   \item \texttt{normal}
+%   \item \texttt{bf}
+%   \item \texttt{it}
+%   \item \texttt{bfit}
+%   \item \texttt{scr}
+%   \item \texttt{bfscr}
+%   \item \texttt{frak}
+%   \item \texttt{bb}
+%   \item \texttt{bffrak}
+%   \item \texttt{sans}
+%   \item \texttt{bfsans}
+%   \item \texttt{itsans}
+%   \item \texttt{bfitsans}
+%   \item \texttt{tt}
+% \end{enumerate}
+% (labelling as for \pkg{unicode-math}). Where the character does not have the
+% correct font variant, the normal one is used.
+%
+% Working in this way, both $8$-bit and Unicode engines have the full data set
+% for these characters in around $100$ token lists.
+%
+% The first step is to create the generator functions: this is the only part
+% that is engine-specific.
+%     \begin{macrocode}
+\group_begin:
+  \cs_set:Npn \@@_group:n { } % takes no argument: the brace group that follows is left in place
+  \bool_lazy_or:nnTF
+    { \sys_if_engine_luatex_p: }
+    { \sys_if_engine_xetex_p: }
+    {
+      \cs_set:Npn \@@_tmp:n #1 % LuaTeX/XeTeX: one token per code point
+        {
+          \@@_group:n
+            { \char_generate:nn {#1} { \char_value_catcode:n {#1} } } % #1 is used twice: char code and catcode lookup
+        }
+    }
+    {
+      \cs_set:Npn \@@_tmp:n #1 % other engines: go via \char_codepoint_to_bytes:n
+        { \exp_args:Ne \@@_tmp_aux:n { \char_codepoint_to_bytes:n {#1} } }
+      \cs_set:Npn \@@_tmp_aux:n #1
+        { \@@_tmp:nnnn #1 } % unbrace so the result is read as four arguments
+      \cs_set:Npn \@@_tmp:nnnn #1#2#3#4
+        {
+          \tl_if_blank:nTF {#2}
+            { \@@_group:n { \char_generate:nn {#1} { 11 } } } % only #1 present: a single catcode-11 char
+            {
+              \@@_group:n
+                {
+                  \exp_after:wN \exp_after:wN \exp_after:wN % expand \char_generate:nn twice before \exp_not:N acts
+                    \exp_not:N \char_generate:nn {#1} { 13 }
+                  \exp_after:wN \exp_after:wN \exp_after:wN
+                    \exp_not:N \char_generate:nn {#2} { 13 }
+                  \tl_if_blank:nF {#3} % third char only if present
+                    {
+                      \exp_after:wN \exp_after:wN \exp_after:wN
+                        \exp_not:N \char_generate:nn {#3} { 13 }
+                      \tl_if_blank:nF {#4} % fourth char only if present
+                        {
+                          \exp_after:wN \exp_after:wN \exp_after:wN
+                            \exp_not:N \char_generate:nn {#4} { 13 }
+                        }
+                    }
+                }
+            }
+        }
+    }
+%     \end{macrocode}
+% For Latin letters, the job is almost mechanical: each font variant is a
+% simple run for both cases, and the only issue with the runs is the gap
+% between the two cases. However, a number of slots in the Mathematical
+% Alphanumeric Symbols block are reserved, because the characters concerned
+% were already encoded in the Letterlike Symbols block (for example italic
+% \texttt{h} is U+210E). Those slots are redirected to the existing
+% characters by an auxiliary; all other values pass through unchanged.
+%     \begin{macrocode}
+  \cs_set:Npn \@@_tmp_hole:n #1
+    {
+      \int_case:nnF {#1}
+        {
+          { "1D455 } { "210E } % it    h
+          { "1D49D } { "212C } % scr   B
+          { "1D4A0 } { "2130 } % scr   E
+          { "1D4A1 } { "2131 } % scr   F
+          { "1D4A3 } { "210B } % scr   H
+          { "1D4A4 } { "2110 } % scr   I
+          { "1D4A7 } { "2112 } % scr   L
+          { "1D4A8 } { "2133 } % scr   M
+          { "1D4AD } { "211B } % scr   R
+          { "1D4BA } { "212F } % scr   e
+          { "1D4BC } { "210A } % scr   g
+          { "1D4C4 } { "2134 } % scr   o
+          { "1D506 } { "212D } % frak  C
+          { "1D50B } { "210C } % frak  H
+          { "1D50C } { "2111 } % frak  I
+          { "1D515 } { "211C } % frak  R
+          { "1D51D } { "2128 } % frak  Z
+          { "1D53A } { "2102 } % bb    C
+          { "1D53F } { "210D } % bb    H
+          { "1D545 } { "2115 } % bb    N
+          { "1D547 } { "2119 } % bb    P
+          { "1D548 } { "211A } % bb    Q
+          { "1D549 } { "211D } % bb    R
+          { "1D551 } { "2124 } % bb    Z
+        }
+        {#1}
+    }
+  \cs_set_protected:Npn \@@_tmp:nnn #1#2#3 % #1: index 0..25, #2: base letter, #3: case offset
+    {
+      \tl_const:cx
+        { c_@@_math_chars_ \char_generate:nn { #1 + #2 } { 12 } _tl }
+        {
+          \@@_tmp:n { #1 + #2 }
+          \@@_tmp:n { #1 + "1D400 + #3 }
+          \@@_tmp:n { \@@_tmp_hole:n { #1 + "1D434 + #3 } }
+          \@@_tmp:n { #1 + "1D468 + #3 }
+          \@@_tmp:n { \@@_tmp_hole:n { #1 + "1D49C + #3 } }
+          \@@_tmp:n { #1 + "1D4D0 + #3 }
+          \@@_tmp:n { \@@_tmp_hole:n { #1 + "1D504 + #3 } }
+          \@@_tmp:n { \@@_tmp_hole:n { #1 + "1D538 + #3 } }
+          \@@_tmp:n { #1 + "1D56C + #3 }
+          \@@_tmp:n { #1 + "1D5A0 + #3 }
+          \@@_tmp:n { #1 + "1D5D4 + #3 }
+          \@@_tmp:n { #1 + "1D608 + #3 }
+          \@@_tmp:n { #1 + "1D63C + #3 }
+          \@@_tmp:n { #1 + "1D670 + #3 }
+        }
+    }
+  \int_step_inline:nnn { 0 } { 25 }
+    {
+      \@@_tmp:nnn {#1} { `A } { 0 }
+      \@@_tmp:nnn {#1} { `a } { "1A }
+    }
+%     \end{macrocode}
+% Similarly for numbers, except that most of the variants are not present,
+% so the approach is biased that way.
+%     \begin{macrocode}
+  \int_step_inline:nnn { `0 } { `9 } % #1 runs over the character codes of 0..9
+    {
+      \tl_const:cx
+        { c_@@_math_chars_ \char_generate:nn {#1} { 12 } _tl }
+        {
+          \@@_tmp:n {#1} % normal
+          \@@_tmp:n { #1 - `0 + "1D7CE } % bf
+          \@@_tmp:n {#1} % it: plain digit reused
+          \@@_tmp:n {#1} % bfit: plain digit reused
+          \@@_tmp:n {#1} % scr: plain digit reused
+          \@@_tmp:n {#1} % bfscr: plain digit reused
+          \@@_tmp:n {#1} % frak: plain digit reused
+          \@@_tmp:n { #1 - `0 + "1D7D8 } % bb
+          \@@_tmp:n {#1} % bffrak: plain digit reused
+          \@@_tmp:n { #1 - `0 + "1D7E2 } % sans
+          \@@_tmp:n { #1 - `0 + "1D7EC } % bfsans
+          \@@_tmp:n {#1} % itsans: plain digit reused
+          \@@_tmp:n {#1} % bfitsans: plain digit reused
+          \@@_tmp:n { #1 - `0 + "1D7F6 } % tt
+        }
+    }
+%     \end{macrocode}
+% Greek is a lot more tricky. Some symbols have multiple forms, and the use
+% by mathematicians is different from that by Greek speakers. Then there are
+% the two forms of sigma. So we start with a pass to deal with the cases that
+% are straight-forward: those where the math symbol is the same as the Greek
+% one. We miss the problem cases, and re-start the mapping after the sigma
+% position to account for the variation in number of codepoints.
+%     \begin{macrocode}
+  \cs_set_protected:Npn \@@_tmp:Nnnnn #1#2#3#4#5 % #1: command, #2: index, #4: text base, #5: math offset; #3 is not used
+    {
+      \tl_const:cx
+        { c_@@_math_chars_ \token_to_str:N #1 _tl }
+        {
+          \@@_tmp:n { #2 + #4 } % normal
+          \@@_tmp:n { #2 + "1D6A8 + #5 } % bf
+          \@@_tmp:n { #2 + "1D6E2 + #5 } % it
+          \@@_tmp:n { #2 + "1D71C + #5 } % bfit
+          \@@_tmp:n { #2 + #4 } % scr: normal form reused
+          \@@_tmp:n { #2 + #4 } % bfscr: normal form reused
+          \@@_tmp:n { #2 + #4 } % frak: normal form reused
+          \@@_tmp:n { #2 + #4 } % bb: normal form reused
+          \@@_tmp:n { #2 + #4 } % bffrak: normal form reused
+          \@@_tmp:n { #2 + #4 } % sans: normal form reused
+          \@@_tmp:n { #2 + "1D756 + #5 } % bfsans
+          \@@_tmp:n { #2 + #4 } % itsans: normal form reused
+          \@@_tmp:n { #2 + "1D790 + #5 } % bfitsans
+          \@@_tmp:n { #2 + #4 } % tt: normal form reused
+        }
+    }
+  \cs_set_protected:Npn \@@_tmp:nn #1#2 % #1: running index, #2: lower-case command name
+    {
+      \quark_if_recursion_tail_stop:n {#2}
+      \tl_if_blank:nF {#2} % blank entry: nothing is defined, but the index below still advances
+        {
+          \exp_args:Nc \@@_tmp:Nnnnn
+            { \str_uppercase:n #2 } % #2 is unbraced, so only its first letter is the argument
+            {#1} { "0391 } { "0391 } { 0 } % upper case: text base U+0391, no math offset
+          \exp_args:Nc \@@_tmp:Nnnnn
+            {#2} {#1} { "1D6FC } { "03B1 } { "1A } % lower case: text base U+03B1, math offset "1A
+        }
+      \exp_args:Ne \@@_tmp:nn { \int_eval:n { #1 + 1 } } % recurse with the next index
+    }
+  \@@_tmp:nn { 0 }
+      { alpha }
+      { beta }
+      { gamma }
+      { delta }
+      { } % index 4 (U+0395/U+03B5) is left out of this pass
+      { zeta }
+      { eta }
+      { theta }
+      { iota }
+      { kappa }
+      { lambda }
+      { mu }
+      { nu }
+      { xi }
+      { omicron }
+      { pi }
+      { rho }
+      { \q_recursion_tail }
+      \q_recursion_stop
+  \cs_set_protected:Npn \@@_tmp:nn #1#2 % same loop as the first pass, restarted with new bases
+    {
+      \quark_if_recursion_tail_stop:n {#2}
+      \tl_if_blank:nF {#2} % blank entry: nothing is defined, but the index below still advances
+        {
+          \exp_args:Nc \@@_tmp:Nnnnn
+            { \str_uppercase:n #2 } % #2 is unbraced, so only its first letter is the argument
+            {#1} { "1D6F5 } { "03A4 } { "13 } % upper case: text base U+03A4, math offset "13
+          \exp_args:Nc \@@_tmp:Nnnnn
+            {#2} {#1} { "1D70F } { "03C4 } { "2D } % lower case: text base U+03C4, math offset "2D
+        }
+      \exp_args:Ne \@@_tmp:nn { \int_eval:n { #1 + 1 } } % recurse with the next index
+    }
+  \@@_tmp:nn { 0 }
+      { tau }
+      { upsilon }
+      { } % index 2 (U+03A6/U+03C6) is left out of this pass
+      { chi }
+      { psi }
+      { omega }
+      { \q_recursion_tail }
+      \q_recursion_stop
+%     \end{macrocode}
+% The problem cases are all done by hand: these are the symbols as-understood
+% by mathematicians. At this stage we switch to using the fact that there is
+% a known difference between codepoints here in the math part of Unicode.
+%     \begin{macrocode}
+  \cs_set_protected:Npn \@@_tmp:Nnn #1#2#3 % #1: command, #2: normal code point (hex), #3: bold code point (hex)
+    {
+      \quark_if_recursion_tail_stop:N #1
+      \tl_const:cx
+        { c_@@_math_chars_ \token_to_str:N #1 _tl }
+        {
+          \@@_tmp:n {"#2} % normal
+          \@@_tmp:n {"#3} % bf
+          \@@_tmp:n { "#3 + "3A } % it
+          \@@_tmp:n { "#3 + "74 } % bfit
+          \@@_tmp:n {"#2} % scr: normal form reused
+          \@@_tmp:n {"#2} % bfscr: normal form reused
+          \@@_tmp:n {"#2} % frak: normal form reused
+          \@@_tmp:n {"#2} % bb: normal form reused
+          \@@_tmp:n {"#2} % bffrak: normal form reused
+          \@@_tmp:n {"#2} % sans: normal form reused
+          \@@_tmp:n { "#3 + "AE } % bfsans
+          \@@_tmp:n {"#2} % itsans: normal form reused
+          \@@_tmp:n { "#3 + "E8 } % bfitsans
+          \@@_tmp:n {"#2} % tt: normal form reused
+        }
+      \@@_tmp:Nnn % loop: the next triplet is read from the input
+    }
+  \@@_tmp:Nnn
+    \epsilon { 03F5 } { 1D6DC }
+    \sigma   { 03C3 } { 1D6D4 }
+    \phi     { 03D5 } { 1D6DF }
+    \Epsilon { 0395 } { 1D6AC }
+    \Sigma   { 03A3 } { 1D6BA }
+    \Phi     { 03A6 } { 1D6BD }
+%     \end{macrocode}
+% Finally, deal with the variant symbols plus the odds and ends. As for
+% epsilon, the normal and bold entries have to describe the same shape:
+% \cs{phi} is the symbol form (U+03D5 with bold U+1D6DF) and \cs{varphi}
+% is the letter form (U+03C6 with bold U+1D6D7).
+%     \begin{macrocode}
+    \varepsilon { 03B5 } { 1D6C6 }
+    \vartheta   { 03D1 } { 1D6DD }
+    \varkappa   { 03F0 } { 1D6DE }
+    \varpi      { 03D6 } { 1D6E1 }
+    \varrho     { 03F1 } { 1D6E0 }
+    \varsigma   { 03C2 } { 1D6D3 }
+    \varphi     { 03C6 } { 1D6D7 }
+    \nabla      { 2207 } { 1D6C1 }
+    \partial    { 2202 } { 1D6DB }
+    \q_recursion_tail { } { }
+    \q_recursion_stop
+%     \end{macrocode}
+% Superscripts and subscripts.    
+%     \begin{macrocode}
+  \cs_set:Npn \@@_group:n #1 { #1 } % from here on the generated characters are no longer brace-wrapped
+  \cs_set_protected:Npn \@@_tmp:Nn #1#2 % #1: base character, #2: code point (hex)
+    {
+      \quark_if_recursion_tail_stop:N #1
+      \tl_const:cx
+        { c_@@_math_super_ #1 _tl } % #1 goes into the name as-is
+        { \@@_tmp:n {"#2} }
+      \@@_tmp:Nn % loop: the next pair is read from the input
+    }
+  \@@_tmp:Nn
+    2 { 00B2 }
+    3 { 00B3 }
+    1 { 00B9 }
+    0 { 2070 }
+    i { 2071 }
+    4 { 2074 }
+    5 { 2075 }
+    6 { 2076 }
+    7 { 2077 }
+    8 { 2078 }
+    9 { 2079 }
+    + { 207A }
+    - { 207B }
+    = { 207C }
+    ( { 207D } % ) (
+    ) { 207E }
+    n { 207F }
+    \q_recursion_tail { }
+    \q_recursion_stop
+  \cs_set_protected:Npn \@@_tmp:Nn #1#2 % #1: base character, #2: code point (hex)
+    {
+      \quark_if_recursion_tail_stop:N #1
+      \tl_const:cx
+        { c_@@_math_sub_ #1 _tl }
+        { \@@_tmp:n {"#2} }
+      \@@_tmp:Nn % loop: the next pair is read from the input
+    }
+  \@@_tmp:Nn
+    0 { 2080 }
+    1 { 2081 }
+    2 { 2082 }
+    3 { 2083 }
+    4 { 2084 }
+    5 { 2085 }
+    6 { 2086 }
+    7 { 2087 }
+    8 { 2088 }
+    9 { 2089 }
+    + { 208A }
+    - { 208B }
+    = { 208C }
+    ( { 208D } % ) (
+    ) { 208E }
+    a { 2090 }
+    e { 2091 }
+    o { 2092 }
+    x { 2093 } % U+2094 is subscript schwa, which has no ASCII base letter
+    h { 2095 }
+    k { 2096 }
+    l { 2097 }
+    m { 2098 }
+    n { 2099 }
+    p { 209A }
+    s { 209B }
+    t { 209C }
+    \q_recursion_tail { }
+    \q_recursion_stop
+%     \end{macrocode}
+%  Data for the math mode accents.
+%     \begin{macrocode}
+  \cs_set_protected:Npn \@@_tmp:Nn #1#2 % #1: accent command, #2: code point (hex)
+    {
+      \quark_if_recursion_tail_stop:N #1
+      \tl_const:cx
+        { c_@@_math_accent_ \token_to_str:N #1 _tl } % name includes the backslash of #1
+        { \@@_tmp:n {"#2} }
+      \@@_tmp:Nn % loop: the next pair is read from the input
+    }
+  \@@_tmp:Nn
+    \grave     { 0300 }
+    \acute     { 0301 }
+    \hat       { 0302 }
+    \widehat   { 0302 } % same code point as \hat
+    \tilde     { 0303 }
+    \widetilde { 0303 } % same code point as \tilde
+    \bar       { 0304 }
+    \breve     { 0306 }
+    \dot       { 0307 }
+    \ddot      { 0308 }
+    \q_recursion_tail { }
+    \q_recursion_stop
+%     \end{macrocode}
+%  Now move to the one-to-one mappings. The three up-front need non-standard
+%  category codes, then we move to the ones which can be done mechanically.
+%  Accent characters have to be pulled out as they need to be re-ordered
+%  relative to their parent letter. Notice that |-| is included here as
+%  it needs to be replaced.
+%     \begin{macrocode}
+  \tl_const:cx { c_@@_math_char_ \token_to_str:N \backslash _tl }
+    { \c_backslash_str } % taken from a kernel constant rather than generated from a code point
+  \tl_const:cx { c_@@_math_char_ \token_to_str:N \lbrace _tl }
+    { \c_left_brace_str }
+  \tl_const:cx { c_@@_math_char_ \token_to_str:N \rbrace _tl }
+    { \c_right_brace_str }
+  \cs_set_protected:Npn \@@_tmp:Nn #1#2 % #1: command (or character), #2: code point (hex)
+    {
+      \quark_if_recursion_tail_stop:N #1
+      \tl_const:cx
+        { c_@@_math_char_ \token_to_str:N #1 _tl }
+        { \@@_tmp:n {"#2} }
+      \@@_tmp:Nn % loop: the next pair is read from the input
+    }
+  \@@_tmp:Nn
+    \mathdollar          { 0024 }
+    \lbrack              { 005B }
+    \rbrack              { 005D }
+    \vert                { 007C }
+    \mathsterling        { 00A3 }
+    \mathsection         { 00A7 }
+    \neg                 { 00AC }
+    \pm                  { 00B1 }
+    \mathparagraph       { 00B6 }
+    \cdotp               { 00B7 }
+    \times               { 00D7 }
+    \div                 { 00F7 }
+    \check               { 030C } % NOTE(review): U+030C, U+034D and the U+20Dx/U+20Ex entries are combining marks but sit here, not in the accent table -- confirm intended
+    \underleftrightarrow { 034D }
+    \Vert                { 2016 }
+    \dagger              { 2020 }
+    \ddagger             { 2021 }
+    \prime               { 2032 }
+    \overleftarrow       { 20D6 }
+    \overrightarrow      { 20D7 }
+    \vec                 { 20D7 }
+    \dddot               { 20DB }
+    \ddddot              { 20DC }
+    \overleftrightarrow  { 20E1 }
+    \underleftarrow      { 20EE }
+    \underrightarrow     { 20EF }
+    \Im                  { 2111 }
+    \ell                 { 2113 }
+    \wp                  { 2118 }
+    \Re                  { 211C }
+    \mho                 { 2127 }
+    \aleph               { 2135 }
+    \leftarrow           { 2190 }
+    \uparrow             { 2191 }
+    \rightarrow          { 2192 }
+    \downarrow           { 2193 }
+    \leftrightarrow      { 2194 }
+    \updownarrow         { 2195 }
+    \nwarrow             { 2196 }
+    \nearrow             { 2197 }
+    \searrow             { 2198 }
+    \swarrow             { 2199 }
+    \mapsto              { 21A6 }
+    \hookleftarrow       { 21A9 }
+    \hookrightarrow      { 21AA }
+    \leftharpoonup       { 21BC }
+    \leftharpoondown     { 21BD }
+    \rightharpoonup      { 21C0 }
+    \rightharpoondown    { 21C1 }
+    \rightleftharpoons   { 21CC }
+    \Leftarrow           { 21D0 }
+    \Uparrow             { 21D1 }
+    \Rightarrow          { 21D2 }
+    \Downarrow           { 21D3 }
+    \Leftrightarrow      { 21D4 }
+    \Updownarrow         { 21D5 }
+    \forall              { 2200 }
+    \partial             { 2202 }
+    \exists              { 2203 }
+    \nabla               { 2207 }
+    \in                  { 2208 }
+    \notin               { 2209 }
+    \ni                  { 220B }
+    \prod                { 220F }
+    \coprod              { 2210 }
+    \sum                 { 2211 }
+    -                    { 2212 } % a plain character, not a command, is the key here
+    \mp                  { 2213 }
+    \ast                 { 2217 }
+    \surd                { 221A }
+    \propto              { 221D }
+    \infty               { 221E }
+    \angle               { 2220 }
+    \mid                 { 2223 }
+    \parallel            { 2225 }
+    \wedge               { 2227 }
+    \vee                 { 2228 }
+    \cap                 { 2229 }
+    \cup                 { 222A }
+    \int                 { 222B }
+    \iint                { 222C }
+    \iiint               { 222D }
+    \oint                { 222E }
+    \sim                 { 223C }
+    \wr                  { 2240 }
+    \simeq               { 2243 }
+    \cong                { 2245 }
+    \approx              { 2248 }
+    \asymp               { 224D }
+    \doteq               { 2250 }
+    \ne                  { 2260 }
+    \equiv               { 2261 }
+    \leq                 { 2264 }
+    \geq                 { 2265 }
+    \ll                  { 226A }
+    \gg                  { 226B }
+    \prec                { 227A }
+    \succ                { 227B }
+    \subset              { 2282 }
+    \supset              { 2283 }
+    \subseteq            { 2286 }
+    \supseteq            { 2287 }
+    \uplus               { 228E }
+    \sqsubset            { 228F }
+    \sqsupset            { 2290 }
+    \sqsubseteq          { 2291 }
+    \sqsupseteq          { 2292 }
+    \sqcap               { 2293 }
+    \sqcup               { 2294 }
+    \oplus               { 2295 }
+    \ominus              { 2296 }
+    \otimes              { 2297 }
+    \oslash              { 2298 }
+    \odot                { 2299 }
+    \vdash               { 22A2 }
+    \dashv               { 22A3 }
+    \top                 { 22A4 }
+    \bot                 { 22A5 }
+    \models              { 22A7 }
+    \bigwedge            { 22C0 }
+    \bigvee              { 22C1 }
+    \bigcap              { 22C2 }
+    \bigcup              { 22C3 }
+    \cdot                { 22C5 }
+    \star                { 22C6 }
+    \bowtie              { 22C8 }
+    \vdots               { 22EE }
+    \ddots               { 22F1 }
+    \lceil               { 2308 }
+    \rceil               { 2309 }
+    \lfloor              { 230A }
+    \rfloor              { 230B }
+    \frown               { 2322 }
+    \smile               { 2323 }
+    \lmoustache          { 23B0 }
+    \rmoustache          { 23B1 }
+    \overbrace           { 23DE }
+    \underbrace          { 23DF }
+    \bigtriangleup       { 25B3 }
+    \triangleright       { 25B7 }
+    \bigtriangledown     { 25BD }
+    \triangleleft        { 25C1 }
+    \spadesuit           { 2660 }
+    \heartsuit           { 2661 }
+    \diamondsuit         { 2662 }
+    \clubsuit            { 2663 }
+    \flat                { 266D }
+    \natural             { 266E }
+    \sharp               { 266F }
+    \perp                { 27C2 }
+    \langle              { 27E8 }
+    \rangle              { 27E9 }
+    \lgroup              { 27EE }
+    \rgroup              { 27EF }
+    \longleftarrow       { 27F5 }
+    \longrightarrow      { 27F6 }
+    \longleftrightarrow  { 27F7 }
+    \Longleftarrow       { 27F8 }
+    \Longrightarrow      { 27F9 }
+    \Longleftrightarrow  { 27FA }
+    \longmapsto          { 27FC }
+    \setminus            { 29F5 }
+    \bigodot             { 2A00 }
+    \bigoplus            { 2A01 }
+    \bigotimes           { 2A02 }
+    \biguplus            { 2A04 }
+    \bigsqcup            { 2A06 }
+    \iiiint              { 2A0C }
+    \Join                { 2A1D }
+    \amalg               { 2A3F }
+    \preceq              { 2AAF }
+    \succeq              { 2AB0 }
+    \imath              { 1D6A4 }
+    \jmath              { 1D6A5 }
+    \q_recursion_tail { }
+    \q_recursion_stop
+%    \end{macrocode}
+%   The \tn{sqrt} case is odd as it is a symbol but needs to be filtered out for
+%   the optional argument, so we store it separately.
+%    \begin{macrocode}
+    \tl_const:cx
+      { c_@@_math_ \token_to_str:N \sqrt _tl } % own name prefix: c_@@_math_, not c_@@_math_char_
+      { \@@_tmp:n { " 221A } } % U+221A, generated with the same \@@_tmp:n as the tables
+\group_end: % closes the \group_begin: above; the \cs_set temporaries do not outlive it
+%    \end{macrocode}
+%
 %    \begin{macrocode}
 %</initex|package>
 %    \end{macrocode}