[latex3-commits] [git/LaTeX3-latex3-latex3] l3text: Initial ideas on \text_expand:n (f7a76b95a)
Joseph Wright
joseph.wright at morningstar2.co.uk
Mon Nov 25 15:25:39 CET 2019
Repository : https://github.com/latex3/latex3
On branch : l3text
Link : https://github.com/latex3/latex3/commit/f7a76b95a1996964736e9a2bf8a9e0963643252d
>---------------------------------------------------------------
commit f7a76b95a1996964736e9a2bf8a9e0963643252d
Author: Joseph Wright <joseph.wright at morningstar2.co.uk>
Date: Mon Nov 25 13:12:12 2019 +0000
Initial ideas on \text_expand:n
This yields 'formatted' text.
Open question is handling of accents and letter
commands: they will need to be stripped by
a 'pure text' command in any case.
>---------------------------------------------------------------
f7a76b95a1996964736e9a2bf8a9e0963643252d
l3kernel/l3text.dtx | 577 ++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 577 insertions(+)
diff --git a/l3kernel/l3text.dtx b/l3kernel/l3text.dtx
index da472ed8e..dfb322346 100644
--- a/l3kernel/l3text.dtx
+++ b/l3kernel/l3text.dtx
@@ -52,6 +52,47 @@
%
% \section{\pkg{l3text} documentation}
%
+% This module deals with manipulation of (formatted) text; such material is
+% comprised of a restricted set of token list content. The functions provided
+% here concern conversion of textual content for example in case changing,
+% generation of bookmarks and extraction to tags.
+%
+% \begin{function}[rEXP]{\text_expand:n}
+% \begin{syntax}
+% \cs{text_expand:n} \Arg{text}
+% \end{syntax}
+% Takes user input \arg{text} and transforms expandable and implicit
+% content to the explicit equivalent. Protected commands (typically
+% formatting) are left in place, and no processing takes place of
+% math mode material (as delimited by pairs given in
+% \cs{l_text_math_delims_tl}). Commands which are neither engine-
+% nor \LaTeX{} protected are expanded exhaustively. Implicit tokens,
+% including implicit groups, are converted to their explicit equivalent.
+% The argument to any command listed in \cs{l_text_exclude_arg_tl} is
+% excluded from expansion. Any commands listed in \cs{l_text_accents_tl}
+% and \cs{l_text_letterlike_tl} are excluded from expansion.
+% \end{function}
+%
+% \begin{variable}{\l_text_accents_tl}
+% Lists commands which represent accents, and which are left unchanged
+% by expansion.
+% \end{variable}
+%
+% \begin{variable}{\l_text_exclude_arg_tl}
+% Lists commands present in the \meta{text} where the argument of the
+% command should be excluded from expansion, etc.
+% \end{variable}
+%
+% \begin{variable}{\l_text_letterlike_tl}
+% Lists commands which represent letters; these are left unchanged by
+% expansion.
+% \end{variable}
+%
+% \begin{variable}{\l_text_math_delims_tl}
+% Lists pairs of tokens which delimit (in-line) math mode content; such
+% content \emph{may} be excluded from processing.
+% \end{variable}
+%
% \end{documentation}
%
% \begin{implementation}
@@ -66,6 +107,542 @@
%<@@=text>
% \end{macrocode}
%
+% \begin{macro}[EXP]
+% {
+% \@@_token_to_explicit:N ,
+% \@@_token_to_explicit_char:N ,
+% \@@_token_to_explicit_cs:N ,
+% \@@_token_to_explicit_cs_aux:N
+% }
+% \begin{macro}[EXP]{\@@_token_to_explicit:n}
+% \begin{macro}[EXP]
+% {
+% \@@_token_to_explicit_auxi:w ,
+% \@@_token_to_explicit_auxii:w ,
+% \@@_token_to_explicit_auxiii:w
+% }
+% The idea here is to take a token and ensure that if it's an implicit
+% char, we output the explicit version. Otherwise, the token needs to be
+% unchanged. First, we have to split between control sequences and everything
+% else.
+% \begin{macrocode}
+\group_begin:
+ \char_set_catcode_active:n { 0 } % char 0 is made active for the definition below
+ \cs_new:Npn \@@_token_to_explicit:N #1 % #1: one token; dispatch to the cs or the char handler
+ {
+ \if_catcode:w \exp_not:N #1 % compare unexpanded #1 with the token left by the inner test
+ \if_catcode:w \scan_stop: \exp_not:N #1
+ \scan_stop: % inner test true: compare with this token
+ \else:
+ \exp_not:N ^^@ % inner test false: compare with the unexpanded active char 0
+ \fi:
+ \exp_after:wN \@@_token_to_explicit_cs:N % outer test true: control sequence branch
+ \else:
+ \exp_after:wN \@@_token_to_explicit_char:N % outer test false: character branch
+ \fi:
+ #1
+ }
+\group_end:
+% \end{macrocode}
+% For control sequences, we can check for macros versus other cases using
+% \cs{if_meaning:w}, then explicitly check for \tn{chardef} and
+% \tn{mathchardef}.
+% \begin{macrocode}
+\cs_new:Npn \@@_token_to_explicit_cs:N #1 % #1: token sent down the control sequence branch
+ {
+ \exp_after:wN \if_meaning:w \exp_not:N #1 #1 % meaning of unexpanded #1 against #1: separates macros from other cases
+ \exp_after:wN \use:nn \exp_after:wN
+ \@@_token_to_explicit_cs_aux:N % true: \use:nn unbraces #1 for the chardef check
+ \else:
+ \exp_after:wN \exp_not:n % false: return #1 unexpanded
+ \fi:
+ {#1}
+ }
+\cs_new:Npn \@@_token_to_explicit_cs_aux:N #1 % #1: control sequence to test for chardef or mathchardef
+ {
+ \bool_lazy_or:nnTF
+ { \token_if_chardef_p:N #1 }
+ { \token_if_mathchardef_p:N #1 }
+ {
+ \char_generate:nn {#1} % #1 itself supplies the character code
+ { \char_value_catcode:n {#1} } % category code looked up for that character code
+ }
+ {#1} % neither kind: return #1 as it is
+ }
+% \end{macrocode}
+% For character tokens, we need to filter out the implicit characters from
+% those that are explicit. That's done here, then if necessary we work out
+% the category code and generate the char. To avoid issues with alignment
+% tabs, that one is done by elimination rather than looking up the code
+% explicitly. The trick with finding the charcode is that the \TeX{}
+% messages are either \texttt{the \meta{something} character \meta{char}}
+% or \texttt{the \meta{type} \meta{char}}.
+% \begin{macrocode}
+\cs_new:Npn \@@_token_to_explicit_char:N #1 % #1: token sent down the character branch
+ {
+ \if:w % compares the two tokens left by the inner conditional
+ \if_catcode:w ^ \exp_args:No \str_tail:n { \token_to_str:N #1 } ^ % true only if the string form of #1 has an empty tail
+ \token_to_str:N #1 #1 % then compare that string with #1 itself
+ \else:
+ AB % two different letters, so the outer test is false
+ \fi:
+ \exp_after:wN \exp_not:n % explicit character: return it unchanged
+ \else:
+ \exp_after:wN \@@_token_to_explicit:n % implicit character: regenerate it
+ \fi:
+ {#1}
+ }
+\cs_new:Npn \@@_token_to_explicit:n #1 % #1: implicit character token
+ {
+ \exp_after:wN \@@_token_to_explicit_auxi:w
+ \int_value:w % the ladder below leaves the category code number of #1
+ \if_catcode:w \c_group_begin_token #1 1 \else:
+ \if_catcode:w \c_group_end_token #1 2 \else:
+ \if_catcode:w \c_math_toggle_token #1 3 \else:
+ \if_catcode:w ## #1 6 \else:
+ \if_catcode:w ^ #1 7 \else:
+ \if_catcode:w \c_math_subscript_token #1 8 \else:
+ \if_catcode:w \c_space_token #1 10 \else:
+ \if_catcode:w A #1 11 \else:
+ \if_catcode:w + #1 12 \else:
+ 4 \fi: \fi: \fi: \fi: \fi: \fi: \fi: \fi: \fi: % no match: 4, the alignment tab found by elimination
+ \exp_after:wN ; % the semicolon separates the number from the meaning
+ \token_to_meaning:N #1 \q_stop % meaning string, delimited by \q_stop
+ }
+\cs_new:Npn \@@_token_to_explicit_auxi:w #1 ; #2 \q_stop % #1: category code number; #2: meaning string
+ {
+ \char_generate:nn
+ {
+ \if_int_compare:w #1 < 9 \exp_stop_f: % picks which of the two meaning formats to strip
+ \exp_after:wN \@@_token_to_explicit_auxii:w % strips up to the word character
+ \else:
+ \exp_after:wN \@@_token_to_explicit_auxiii:w % strips two space-delimited words
+ \fi:
+ #2
+ }
+ {#1} % category code for the generated character
+ }
+\exp_last_unbraced:NNNNo \cs_new:Npn \@@_token_to_explicit_auxii:w
+ #1 { \tl_to_str:n { character ~ } } { ` } % drops everything up to the string character plus space, leaving a backquote before what remains
+\cs_new:Npn \@@_token_to_explicit_auxiii:w #1 ~ #2 ~ { ` } % drops two space-delimited words, leaving a backquote before what remains
+% \end{macrocode}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+%
+% \begin{variable}{\l_text_accents_tl, \l_text_letterlike_tl}
+% Special cases for accents and letter-like symbols, which in some cases will
+% need to be converted further.
+% \begin{macrocode}
+\tl_new:N \l_text_accents_tl % accent commands, searched together with the exclusion list
+\tl_set:Nn \l_text_accents_tl
+ {
+ \" \' \. \^ \` \~
+ \c \H \k \r \t \u \v
+ }
+\tl_new:N \l_text_letterlike_tl % letter-like commands, passed through unexpanded
+\tl_set:Nn \l_text_letterlike_tl
+ {
+ \AA \aa \AE \ae \DH \dh \DJ \dj
+ \IJ \ij \L \l \NG \ng \O \o
+ \OE \oe \SS \ss \TH \th
+ }
+% \end{macrocode}
+% \end{variable}
+%
+% \begin{variable}{\l_text_exclude_arg_tl}
+% Commands whose argument is excluded from expansion.
+% \begin{macrocode}
+\tl_new:N \l_text_exclude_arg_tl % commands whose one argument is passed through untouched
+\tl_set:Nn \l_text_exclude_arg_tl
+ {
+ \cite
+ \ensuremath
+ \label
+ \ref
+ }
+% \end{macrocode}
+% \end{variable}
+%
+% \begin{variable}{\l_text_math_delims_tl}
+% Paired math mode delimiters.
+% \begin{macrocode}
+\tl_new:N \l_text_math_delims_tl % flat list, read two tokens at a time as opening and closing delimiter
+\tl_set:Nn \l_text_math_delims_tl
+ {
+ $ $
+ \( \)
+ }
+% \end{macrocode}
+% \end{variable}
+%
+% \begin{variable}{\c_@@_chardef_space_token, \c_@@_mathchardef_space_token}
+% \begin{variable}
+% {\c_@@_chardef_group_begin_token, \c_@@_mathchardef_group_begin_token}
+% \begin{variable}
+% {\c_@@_chardef_group_end_token, \c_@@_mathchardef_group_end_token}
+% Markers for implicit char handling.
+% \begin{macrocode}
+\tex_chardef:D \c_@@_chardef_space_token = `\ % chardef holding the character code of a space
+\tex_mathchardef:D \c_@@_mathchardef_space_token = `\ % mathchardef holding the same code
+\tex_chardef:D \c_@@_chardef_group_begin_token = `\{ % `\}
+\tex_mathchardef:D \c_@@_mathchardef_group_begin_token = `\{ % `\} `\{
+\tex_chardef:D \c_@@_chardef_group_end_token = `\} % `\{
+\tex_mathchardef:D \c_@@_mathchardef_group_end_token = `\} % NOTE(review): the commented braces on the three lines above appear to keep the brace count balanced across these four lines; confirm before changing them
+% \end{macrocode}
+% \end{variable}
+% \end{variable}
+% \end{variable}
+%
+% \begin{macro}[rEXP]{\text_expand:n}
+% \begin{macro}[rEXP]{\@@_expand_loop:w}
+% \begin{macro}[rEXP]{\@@_expand_group:n}
+% \begin{macro}[rEXP]{\@@_expand_space:w}
+% \begin{macro}[rEXP]
+% {
+% \@@_expand_N_type:N ,
+% \@@_expand_N_type_auxi:N ,
+% \@@_expand_N_type_auxii:N ,
+% \@@_expand_N_type_auxiii:N
+% }
+% \begin{macro}[rEXP]{\@@_expand_math_search:NNN}
+% \begin{macro}[rEXP]{\@@_expand_math_loop:Nw}
+% \begin{macro}[rEXP]{\@@_expand_math_N_type:NN}
+% \begin{macro}[rEXP]{\@@_expand_math_group:Nn}
+% \begin{macro}[rEXP]{\@@_expand_math_space:Nw}
+% \begin{macro}[rEXP]{\@@_expand_exclude:N}
+% \begin{macro}[rEXP]{\@@_expand_exclude:NN}
+% \begin{macro}[rEXP]{\@@_expand_exclude:Nn}
+% \begin{macro}[rEXP]{\@@_expand_letterlike:N}
+% \begin{macro}[rEXP]{\@@_expand_letterlike:NN}
+% \begin{macro}[rEXP]
+% {
+% \@@_expand_implicit:N ,
+% \@@_expand_explicit:N ,
+% \@@_expand_cs:N ,
+% \@@_expand_protect:N
+% }
+% \begin{macro}[rEXP]{\@@_expand_protect:nN}
+% \begin{macro}[rEXP]{\@@_expand_protect:Nw}
+% \begin{macro}[rEXP]{\@@_expand_cs_expand:N}
+% \begin{macro}[rEXP]{\@@_expand_if_expand:NTF}
+% After precautions against |&| tokens, start a simple loop: that of
+% course means that \enquote{text} cannot contain the two recursion
+% quarks.
+% \begin{macrocode}
+\cs_new:Npn \text_expand:n #1 % #1: the text to process
+ {
+ \group_align_safe_begin: % precaution against alignment tab tokens in #1
+ \@@_expand_loop:w #1
+ \q_recursion_tail \q_recursion_stop % end markers: #1 must not contain either quark
+ \group_align_safe_end:
+ }
+% \end{macrocode}
+% The main loop is a standard \enquote{tl action}; groups are handled
+% recursively, while spaces are just passed through. Thus all of the
+% action is in handling \texttt{N}-type tokens.
+% \begin{macrocode}
+\cs_new:Npn \@@_expand_loop:w #1 \q_recursion_stop % #1: all input still to be processed
+ {
+ \tl_if_head_is_N_type:nTF {#1}
+ { \@@_expand_N_type:N } % head is a single token
+ {
+ \tl_if_head_is_group:nTF {#1}
+ { \@@_expand_group:n } % head is a brace group
+ { \@@_expand_space:w } % neither: handled as a space
+ }
+ #1 \q_recursion_stop % the input is put back for the chosen handler
+ }
+\cs_new:Npn \@@_expand_group:n #1 % #1: content of a brace group at the head of the input
+ {
+ { % the braces are re-emitted around the processed content
+ \@@_expand_loop:w #1
+ \q_recursion_tail \q_recursion_stop % inner end markers with nothing between them, as in \text_expand:n
+ }
+ \@@_expand_loop:w % continue with the material after the group
+ }
+\exp_last_unbraced:NNo \cs_new:Npn \@@_expand_space:w \c_space_tl % the parameter text is the expansion of \c_space_tl
+ {
+ \c_space_tl % pass the space through
+ \@@_expand_loop:w
+ }
+% \end{macrocode}
+% Before we get into the real work, we have to watch out for problematic
+% implicit characters: spaces and grouping tokens. Converting these to
+% explicit characters later would lead to real issues as they are \emph{not}
+% \texttt{N}-type. A space is the easy case, so it's dealt with first:
+% just insert the explicit token and continue the loop.
+% \begin{macrocode}
+\cs_new:Npx \@@_expand_N_type:N #1 % x-type definition: tokens not guarded by \exp_not:N are expanded now
+ {
+ \exp_not:N \quark_if_recursion_tail_stop:N #1 % stop when the end marker is reached
+ \exp_not:N \bool_lazy_any:nTF
+ {
+ { \exp_not:N \token_if_eq_meaning_p:NN #1 \c_space_token } % implicit space
+ {
+ \exp_not:N \token_if_eq_meaning_p:NN #1
+ \c_@@_chardef_space_token
+ }
+ {
+ \exp_not:N \token_if_eq_meaning_p:NN #1
+ \c_@@_mathchardef_space_token
+ }
+ }
+ { \exp_not:N \@@_expand_space:w \c_space_tl } % insert the explicit space and continue the loop
+ { \exp_not:N \@@_expand_N_type_auxi:N #1 }
+ }
+% \end{macrocode}
+% Implicit |{|/|}| offer two issues. First, the token could be an implicit
+% brace character: we need to avoid turning that into a brace group, so filter
+% out the cases manually. Then we handle the case where an implicit group is
+% present. That is done in an \enquote{open-ended} way: there's the possibility
+% the closing token is hidden somewhere.
+% \begin{macrocode}
+\cs_new:Npn \@@_expand_N_type_auxi:N #1 % #1: token that was not matched as a space
+ {
+ \bool_lazy_any:nTF % chardef or mathchardef for the begin-group character
+ {
+ { \token_if_eq_meaning_p:NN #1 \c_@@_chardef_group_begin_token }
+ { \token_if_eq_meaning_p:NN #1 \c_@@_mathchardef_group_begin_token }
+ }
+ {
+ \c_left_brace_str % emitted as a string so that no group is opened
+ \@@_expand_loop:w
+ }
+ {
+ \bool_lazy_any:nTF % chardef or mathchardef for the end-group character
+ {
+ { \token_if_eq_meaning_p:NN #1 \c_@@_chardef_group_end_token }
+ { \token_if_eq_meaning_p:NN #1 \c_@@_mathchardef_group_end_token }
+ }
+ {
+ \c_right_brace_str
+ \@@_expand_loop:w
+ }
+ { \@@_expand_N_type_auxii:N #1 } % not a brace character: go on to implicit groups
+ }
+ }
+\cs_new:Npn \@@_expand_N_type_auxii:N #1 % #1: token to test for an implicit group token
+ {
+ \token_if_eq_meaning:NNTF #1 \c_group_begin_token
+ {
+ { \if_false: } \fi: % leaves an explicit begin-group token; the closing brace is inside the false branch
+ \@@_expand_loop:w
+ }
+ {
+ \token_if_eq_meaning:NNTF #1 \c_group_end_token
+ {
+ \if_false: { \fi: } % leaves an explicit end-group token; the opening brace is inside the false branch
+ \@@_expand_loop:w
+ }
+ { \@@_expand_N_type_auxiii:N #1 } % not a group token: look for math mode next
+ }
+ }
+% \end{macrocode}
+% The first step in dealing with \texttt{N}-type tokens is to look for
+% math mode material: that needs to be left alone. The starting function
+% has to be split into two as we need \cs{quark_if_recursion_tail_stop:N}
+% first before we can trigger the search. We then look for matching
+% pairs of delimiters, allowing for the case where math mode starts
+% but does not end. Within math mode, we simply pass all the tokens
+% through unchanged, just checking the \texttt{N}-type ones against the
+% end marker.
+% \begin{macrocode}
+\cs_new:Npn \@@_expand_N_type_auxiii:N #1 % #1: token to check against the math delimiters
+ {
+ \exp_after:wN \@@_expand_math_search:NNN
+ \exp_after:wN #1 \l_text_math_delims_tl % the delimiter list is expanded in place after #1
+ \q_recursion_tail \q_recursion_tail % two quarks, as the search reads the list two tokens at a time
+ \q_recursion_stop
+ }
+\cs_new:Npn \@@_expand_math_search:NNN #1#2#3 % #1: current token; #2, #3: next opening and closing delimiter
+ {
+ \quark_if_recursion_tail_stop_do:Nn #2
+ { \@@_expand_exclude:N #1 } % list exhausted: not math, try the exclusion list
+ \token_if_eq_meaning:NNTF #1 #2
+ {
+ \use_i_delimit_by_q_recursion_stop:nw % discard the rest of the delimiter list
+ {
+ \exp_not:n {#1} % the opening delimiter is output unchanged
+ \@@_expand_math_loop:Nw #3 % pass tokens through until #3 is found
+ }
+ }
+ { \@@_expand_math_search:NNN #1 } % try the next pair
+ }
+\cs_new:Npn \@@_expand_math_loop:Nw #1#2 \q_recursion_stop % #1: closing delimiter; #2: remaining input
+ {
+ \tl_if_head_is_N_type:nTF {#2}
+ { \@@_expand_math_N_type:NN } % head is a single token
+ {
+ \tl_if_head_is_group:nTF {#2}
+ { \@@_expand_math_group:Nn } % head is a brace group
+ { \@@_expand_math_space:Nw } % neither: handled as a space
+ }
+ #1#2 \q_recursion_stop % delimiter and input are put back for the chosen handler
+ }
+\cs_new:Npn \@@_expand_math_N_type:NN #1#2 % #1: closing delimiter; #2: current token
+ {
+ \quark_if_recursion_tail_stop:N #2 % input ended while still in math mode
+ \exp_not:n {#2} % every token, the closing delimiter included, is output unchanged
+ \token_if_eq_meaning:NNTF #2 #1
+ { \@@_expand_loop:w } % math ended: back to the main loop
+ { \@@_expand_math_loop:Nw #1 }
+ }
+\cs_new:Npn \@@_expand_math_group:Nn #1#2 % #1: closing delimiter; #2: content of the brace group
+ {
+ { \exp_not:n {#2} } % the group content is output unchanged inside braces
+ \@@_expand_math_loop:Nw #1 % #1 is only carried along for the end-of-math test
+ }
+\exp_after:wN \cs_new:Npn \exp_after:wN \@@_expand_math_space:Nw
+ \exp_after:wN # \exp_after:wN 1 \c_space_tl % builds the parameter text: one argument followed by the expansion of \c_space_tl
+ {
+ \c_space_tl % pass the space through
+ \@@_expand_math_loop:Nw #1
+ }
+% \end{macrocode}
+% Next we exclude any commands with one argument that are explicitly
+% listed. This is mainly as there \emph{might} be an \cs{ensuremath},
+% but one never knows about for example accents in labels too. We also deal
+% with accents here: they are effectively the same situation.
+% \begin{macrocode}
+\cs_new:Npn \@@_expand_exclude:N #1 % #1: token to check against the exclusion and accent lists
+ {
+ \exp_after:wN \exp_after:wN \exp_after:wN \@@_expand_exclude:NN
+ \exp_after:wN \exp_after:wN \exp_after:wN #1
+ \exp_after:wN \l_text_exclude_arg_tl \l_text_accents_tl % the chain expands the accent list first, then the exclusion list, so both follow #1
+ \q_recursion_tail \q_recursion_stop
+ }
+\cs_new:Npn \@@_expand_exclude:NN #1#2 % #1: current token; #2: next entry of the combined list
+ {
+ \quark_if_recursion_tail_stop_do:Nn #2
+ { \@@_expand_letterlike:N #1 } % list exhausted: try the letter-like commands
+ \cs_if_eq:NNTF #2 #1
+ {
+ \use_i_delimit_by_q_recursion_stop:nw % discard the rest of the list
+ { \@@_expand_exclude:Nn #1 }
+ }
+ { \@@_expand_exclude:NN #1 } % try the next entry
+ }
+\cs_new:Npn \@@_expand_exclude:Nn #1#2 % #1: matched command; #2: the argument that follows it
+ {
+ \exp_not:n { #1 {#2} } % command and braced argument are output unexpanded
+ \@@_expand_loop:w
+ }
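+% \end{macrocode}
+% Letter-like commands are checked in the same way, but take no argument:
+% a match is passed through unchanged and the loop continues.
+% \begin{macrocode}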
+\cs_new:Npn \@@_expand_letterlike:N #1 % #1: token to check against the letter-like list
+ {
+ \exp_after:wN \@@_expand_letterlike:NN \exp_after:wN
+ #1 \l_text_letterlike_tl % the list is expanded in place after #1
+ \q_recursion_tail \q_recursion_stop
+ }
+\cs_new:Npn \@@_expand_letterlike:NN #1#2 % #1: current token; #2: next entry of the letter-like list
+ {
+ \quark_if_recursion_tail_stop_do:Nn #2
+ { \@@_expand_implicit:N #1 } % list exhausted: go on to implicit token conversion
+ \cs_if_eq:NNTF #2 #1
+ {
+ \use_i_delimit_by_q_recursion_stop:nw % discard the rest of the list
+ {
+ \exp_not:n {#1} % the command is output unexpanded
+ \@@_expand_loop:w
+ }
+ }
+ { \@@_expand_letterlike:NN #1 } % try the next entry
+ }
+% \end{macrocode}
+% Conversion of implicit to explicit tokens does not have to account for
+% spaces or brace groups: they are already fixed above. So we can assume that
+% the result of this conversion is still an \texttt{N}-type token.
+% \begin{macrocode}
+\cs_new:Npn \@@_expand_implicit:N #1 % #1: token that may be an implicit character
+ {
+ \exp_args:NNe \use:nn \@@_expand_explicit:N % e-expand the conversion, then \use:nn unbraces its result
+ { \@@_token_to_explicit:N #1 }
+ }
+% \end{macrocode}
+% At this stage, either we have a control sequence or a simple character:
+% split and handle.
+% \begin{macrocode}
+\cs_new:Npn \@@_expand_explicit:N #1 % #1: token after implicit-to-explicit conversion
+ {
+ \token_if_cs:NTF #1
+ { \@@_expand_cs:N #1 } % control sequence: may need expanding
+ {
+ \exp_not:n {#1} % character: output as it is
+ \@@_expand_loop:w
+ }
+ }
+% \end{macrocode}
+% \LaTeXe{}'s \cs{protect} makes life interesting. Where possible, we
+% simply remove it and replace with the \enquote{parent} command; of course,
+% the \cs{protect} might be explicit, in which case we need to leave it alone
+% if it's required.
+% \begin{macrocode}
+\cs_new:Npn \@@_expand_cs:N #1 % #1: an explicit control sequence
+ {
+ \str_if_eq:nnTF {#1} { \protect } % n-type comparison: the token itself against \protect
+ { \@@_expand_protect:N } % given no argument here, so it picks up the token after \protect
+ { \@@_expand_cs_expand:N #1 }
+ }
+\cs_new:Npn \@@_expand_protect:N #1 % #1: the token that follows \protect in the input
+ {
+ \exp_args:Ne \@@_expand_protect:nN
+ { \cs_to_str:N #1 } #1 % pass on both the name of #1 as a string and #1 itself
+ }
+\cs_new:Npn \@@_expand_protect:nN #1#2 % #1: name as a string; #2: the control sequence
+ { \@@_expand_protect:Nw #2 #1 \q_nil #1 ~ \q_nil \q_nil \q_stop } % the name is laid out twice so the delimited helper can test for a trailing space
+\cs_new:Npn \@@_expand_protect:Nw #1 #2 ~ \q_nil #3 \q_nil #4 \q_stop % #1: control sequence; #2: text before the first space followed by \q_nil
+ {
+ \quark_if_nil:nTF {#4} % #4 is \q_nil only when the name ended in a space
+ {
+ \cs_if_exist:cTF {#2}
+ { \exp_not:c {#2} } % a command with the space-free name exists: output that one
+ { \exp_not:n { \protect #1 } } % otherwise keep \protect and the original
+ }
+ { \exp_not:n { \protect #1 } } % name has no trailing space: keep \protect and the original
+ \@@_expand_loop:w
+ }
+% \end{macrocode}
+% Finally, expand any macros which can be: this then loops back around to
+% deal with what they produce.
+% \begin{macrocode}
+\cs_new:Npn \@@_expand_cs_expand:N #1 % #1: control sequence other than \protect
+ {
+ \@@_expand_if_expand:NTF #1
+ { \exp_after:wN \@@_expand_loop:w #1 } % expand #1 once, then loop over what it produced
+ {
+ \exp_not:n {#1} % not to be expanded: output unchanged
+ \@@_expand_loop:w
+ }
+ }
+\cs_new:Npn \@@_expand_if_expand:NTF #1 % #1: control sequence; leaves \use_i:nn or \use_ii:nn to select the following true or false branch
+ {
+ \bool_lazy_and:nnTF
+ { \token_if_expandable_p:N #1 }
+ {
+ \bool_lazy_none_p:n % none of the reasons to hold back may apply
+ {
+ { \token_if_protected_macro_p:N #1 }
+ { \token_if_protected_long_macro_p:N #1 }
+ { \token_if_eq_meaning_p:NN \q_recursion_tail #1 }
+ }
+ }
+ { \use_i:nn } % expandable and not held back: select the true branch
+ { \use_ii:nn } % everything else: select the false branch
+ }
+% \end{macrocode}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+%
% \begin{macrocode}
%</initex|package>
% \end{macrocode}
More information about the latex3-commits
mailing list