[latex3-commits] [git/LaTeX3-latex3-latex3] peek-analysis: Implement \peek_analysis_map_inline:n (b0a30dc9d)
Joseph Wright
joseph.wright at morningstar2.co.uk
Thu Dec 3 13:56:04 CET 2020
Repository : https://github.com/latex3/latex3
On branch : peek-analysis
Link : https://github.com/latex3/latex3/commit/b0a30dc9d63f4759e21c0f6610c9c05e79d69a68
>---------------------------------------------------------------
commit b0a30dc9d63f4759e21c0f6610c9c05e79d69a68
Author: Bruno Le Floch <bruno at le-floch.fr>
Date: Tue Jul 14 16:01:28 2020 +0200
Implement \peek_analysis_map_inline:n
>---------------------------------------------------------------
b0a30dc9d63f4759e21c0f6610c9c05e79d69a68
l3kernel/l3tl-analysis.dtx | 460 ++++++++++++++++++++++++++++++++++++++++++---
1 file changed, 436 insertions(+), 24 deletions(-)
diff --git a/l3kernel/l3tl-analysis.dtx b/l3kernel/l3tl-analysis.dtx
index be4bd2406..c864770a3 100644
--- a/l3kernel/l3tl-analysis.dtx
+++ b/l3kernel/l3tl-analysis.dtx
@@ -52,10 +52,27 @@
%
% \section{\pkg{l3tl-analysis} documentation}
%
-% This module mostly provides internal functions for use in the
-% \pkg{l3regex} module. However, it provides as a side-effect a user
-% debugging function, very similar to the \cs{ShowTokens} macro from the
-% \pkg{ted} package.
+% This module provides functions that are particularly useful in the
+% \pkg{l3regex} module for mapping through a token list one \meta{token}
+% at a time (including begin-group/end-group tokens). The token list is
+% either given as an argument or found in the input stream. In both
+% cases the user provides \meta{inline code} that receives three
+% arguments for each \meta{token}:
+% \begin{itemize}
+% \item \meta{tokens}, which both \texttt{o}-expand and
+% \texttt{x}-expand to the \meta{token}. The detailed form of
+% \meta{token} may change in later releases.
+% \item \meta{char code}, a decimal representation of the character
+% code of the \meta{token}, $-1$ if it is a control sequence.
+% \item \meta{catcode}, a capital hexadecimal digit which denotes the
+% category code of the \meta{token} (0: control sequence, 1:
+% begin-group, 2: end-group, 3: math shift, 4: alignment tab, 6:
+% parameter, 7: superscript, 8: subscript, A: space, B: letter,
+% C: other, D: active). This can be converted to an integer by
+% writing |"|\meta{catcode}.
+% \end{itemize}
+% In addition, there is a debugging function \cs{tl_analysis_show:n},
+% very similar to the \cs{ShowTokens} macro from the \pkg{ted} package.
%
% \begin{function}[added = 2018-04-09]{\tl_analysis_show:N, \tl_analysis_show:n}
% \begin{syntax}
@@ -74,23 +91,33 @@
% \end{syntax}
% Applies the \meta{inline function} to each individual \meta{token}
% in the \meta{token list}. The \meta{inline function} receives three
-% arguments:
-% \begin{itemize}
-% \item \meta{tokens}, which both \texttt{o}-expand and
-% \texttt{x}-expand to the \meta{token}. The detailed form of
-% \meta{token} may change in later releases.
-% \item \meta{char code}, a decimal representation of the character
-% code of the token, $-1$ if it is a control sequence (with
-% \meta{catcode} $0$).
-% \item \meta{catcode}, a capital hexadecimal digit which denotes
-% the category code of the \meta{token} (0: control sequence, 1:
-% begin-group, 2: end-group, 3: math shift, 4: alignment tab, 6:
-% parameter, 7: superscript, 8: subscript, A: space, B: letter,
-% C:other, D:active).
-% \end{itemize}
-% As all other mappings the mapping is done at the current group
-% level, \emph{i.e.}~any local assignments made by the \meta{inline
-% function} remain in effect after the loop.
+% arguments as explained above. As with all other mappings, the mapping
+% is done at the current group level, \emph{i.e.}~any local assignments
+% made by the \meta{inline function} remain in effect after the loop.
+% \end{function}
+%
+% \begin{function}[added = 2020-07-14]{\peek_analysis_map_inline:n}
+% \begin{syntax}
+% \cs{peek_analysis_map_inline:n} \Arg{inline function}
+% \end{syntax}
+% Applies the \meta{inline function} to each individual \meta{token}
+% in the input stream that follows. The \meta{inline function}
+% receives three arguments as explained above. As with all other
+% mappings, the mapping is done at the current group level,
+% \emph{i.e.}~any local assignments made by the \meta{inline function}
+% remain in effect after the loop. The tokens are removed from the
+% input stream. The loop can be stopped by \cs{peek_analysis_map_break:}.
+% Within the code, \cs{l_peek_token} is set equal (as a token, not a
+% token list) to the token under consideration.
+% \end{function}
+%
+% \begin{function}[added = 2020-07-14]{\peek_analysis_map_break:, \peek_analysis_map_break:n}
+% \begin{syntax}
+% \cs{peek_analysis_map_inline:n} |{| \dots{} \cs{peek_analysis_map_break:n} \Arg{code} |}|
+% \end{syntax}
+% Stops the \cs{peek_analysis_map_inline:n} loop from seeking more
+% tokens, and inserts \meta{code} in the input stream (empty for
+% \cs{peek_analysis_map_break:}).
% \end{function}
%
% \end{documentation}
@@ -182,18 +209,51 @@
% \end{macrocode}
% \end{variable}
%
-% \begin{variable}{\l_@@_analysis_token}
-% \begin{variable}{\l_@@_analysis_char_token}
+% \begin{variable}
+% {\l_@@_analysis_token, \l_@@_analysis_char_token, \l_@@_analysis_next_token}
% The tokens in the token list are probed with the \TeX{} primitive
% \tn{futurelet}. We use \cs{l_@@_analysis_token} in that
% construction. In some cases, we convert the following token to a
% string before probing it: then the token variable used is
-% \cs{l_@@_analysis_char_token}.
+% \cs{l_@@_analysis_char_token}. When getting tokens from the input
+% stream we may need to look two tokens ahead, for which we use
+% \cs{l_@@_analysis_next_token}.
% \begin{macrocode}
\cs_new_eq:NN \l_@@_analysis_token ?
\cs_new_eq:NN \l_@@_analysis_char_token ?
+\cs_new_eq:NN \l_@@_analysis_next_token ?
+% \end{macrocode}
+% \end{variable}
+%
+% \begin{variable}{\l_@@_peek_code_tl}
+% Holds the user-code function of \cs{peek_analysis_map_inline:n} and
+% the arguments found so far; run once the token is fully analysed.
+% \begin{macrocode}
+\tl_new:N \l_@@_peek_code_tl
% \end{macrocode}
% \end{variable}
+%
+% \begin{variable}{\c_@@_peek_catcodes_tl}
+% A token list containing the character number~$32$ (space) with all
+% possible category codes except $1$ and $2$ (begin-group and
+% end-group), each followed by its hexadecimal \meta{catcode} digit.
+% Why $32$? Some \LuaTeX{} versions only allow creation of
+% catcode~$10$ (space) tokens with this character code, and in other
+% engines too it is easiest, as \cs{char_generate:nn} refuses spaces.
+% \begin{macrocode}
+\tl_const:Nx \c_@@_peek_catcodes_tl
+ {
+ \char_generate:nn { 32 } { 3 } 3
+ \char_generate:nn { 32 } { 4 } 4
+ # \char_generate:nn { 32 } { 6 } 6
+ \char_generate:nn { 32 } { 7 } 7
+ \char_generate:nn { 32 } { 8 } 8
+ \c_space_tl \token_to_str:N A
+ \char_generate:nn { 32 } { 11 } \token_to_str:N B
+ \char_generate:nn { 32 } { 12 } \token_to_str:N C
+ \char_generate:nn { 32 } { 13 } \token_to_str:N D
+ }
+% \end{macrocode}
% \end{variable}
%
% \begin{variable}{\l_@@_analysis_normal_int}
@@ -1075,6 +1135,358 @@
% \end{macro}
% \end{macro}
%
+% \subsection{Peeking ahead}
+%
+% \begin{macro}[EXP]{\peek_analysis_map_break:, \peek_analysis_map_break:n}
+% Both break statements call \cs{prg_map_break:Nn} with the same marker.
+% \begin{macrocode}
+\cs_new:Npn \peek_analysis_map_break:
+ { \prg_map_break:Nn \peek_analysis_map_break: { } }
+\cs_new:Npn \peek_analysis_map_break:n
+ { \prg_map_break:Nn \peek_analysis_map_break: }
+% \end{macrocode}
+% \end{macro}
+%
+% \begin{variable}{\l_@@_peek_charcode_int}
+% \begin{macrocode}
+\int_new:N \l_@@_peek_charcode_int
+% \end{macrocode}
+% \end{variable}
+%
+% \begin{macro}{\@@_analysis_char_arg:Nw, \@@_analysis_char_arg_aux:Nw}
+% After a call to \tn{futurelet} \cs{l_@@_analysis_token} followed by
+% a stringified character token (either explicit space or catcode
+% other character), make sure |#1| receives it as an argument. Only a
+% space needs work: it is grabbed as a delimiter and given as |{ ~ }|.
+% \begin{macrocode}
+\cs_new:Npn \@@_analysis_char_arg:Nw
+ {
+ \if_meaning:w \l_@@_analysis_token \c_space_token
+ \exp_after:wN \@@_analysis_char_arg_aux:Nw
+ \fi:
+ }
+\cs_new:Npn \@@_analysis_char_arg_aux:Nw #1 ~ { #1 { ~ } }
+% \end{macrocode}
+% \end{macro}
+%
+% \begin{macro}
+% {
+% \peek_analysis_map_inline:n,
+% \@@_peek_analysis_loop:NNn, \@@_peek_analysis_test:,
+% \@@_peek_analysis_normal:N, \@@_peek_analysis_cs:,
+% \@@_peek_analysis_char:N, \@@_peek_analysis_char:nN,
+% \@@_peek_analysis_special:, \@@_peek_analysis_retest:,
+% \@@_peek_analysis_next:, \@@_peek_analysis_str:,
+% \@@_peek_analysis_str:w, \@@_peek_analysis_str:n,
+% \@@_peek_analysis_active_str:n, \@@_peek_analysis_explicit:n,
+% \@@_peek_analysis_escape:, \@@_peek_analysis_collect:w,
+% \@@_peek_analysis_collect:n, \@@_peek_analysis_collect_loop:,
+% \@@_peek_analysis_collect_test:, \@@_peek_analysis_collect_end:NNN
+% }
+% Save the user's code in a control sequence that is suitable for
+% nested maps. We may wish to pass to this function an \tn{outer}
+% control sequence or active character; for this we will undefine
+% potentially-\tn{outer} tokens within a group, closed after the
+% function receives its arguments. This user's code function also
+% calls the loop auxiliary, followed by the \cs{prg_break_point:Nn}
+% needed when the user stops the loop. The loop auxiliary grabs that
+% break point as its three arguments because it must look at the input
+% stream; the initial call passes three dummy |?| instead.
+% \begin{macrocode}
+\cs_new_protected:Npn \peek_analysis_map_inline:n #1
+ {
+ \int_gincr:N \g__kernel_prg_map_int
+ \cs_set_protected:cpn
+ { @@_analysis_map_ \int_use:N \g__kernel_prg_map_int :nnN }
+ ##1##2##3
+ {
+ \group_end:
+ #1
+ \@@_peek_analysis_loop:NNn
+ \prg_break_point:Nn \peek_analysis_map_break: { }
+ }
+ \@@_peek_analysis_loop:NNn ? ? ?
+ }
+% \end{macrocode}
+% The loop starts a group (closed by the user-code function above)
+% with the escape character set to a backslash, and checks if the next
+% token is special (begin-group, end-group, space) or \texttt{N}-type.
+% \begin{macrocode}
+\cs_new_protected:Npn \@@_peek_analysis_loop:NNn #1#2#3
+ {
+ \group_begin:
+ \tl_set:Nx \l_@@_peek_code_tl
+ {
+ \exp_not:c
+ { @@_analysis_map_ \int_use:N \g__kernel_prg_map_int :nnN }
+ }
+ \int_set:Nn \tex_escapechar:D { `\\ }
+ \peek_after:Nw \@@_peek_analysis_test:
+ }
+\cs_new_protected:Npn \@@_peek_analysis_test:
+ {
+ \if_int_odd:w
+ \if_catcode:w \exp_not:N \l_peek_token { 1 \exp_stop_f: \fi:
+ \if_catcode:w \exp_not:N \l_peek_token } 1 \exp_stop_f: \fi:
+ \if_meaning:w \l_peek_token \c_space_token 1 \exp_stop_f: \fi:
+ 0 \exp_stop_f:
+ \exp_after:wN \@@_peek_analysis_special:
+ \else:
+ \exp_after:wN \exp_after:wN
+ \exp_after:wN \@@_peek_analysis_normal:N
+ \exp_after:wN \exp_not:N
+ \fi:
+ }
+% \end{macrocode}
+% Normal tokens can be \tn{outer}, hence the \cs{exp_not:N} above. An
+% expandable token might be \tn{outer} or a \TeX{} conditional, so to
+% be safe we set it to \cs{scan_stop:} (locally: this is undone by the
+% \cs{group_end:} upon calling the user's code). Then distinguish
+% characters (including active and macro parameter characters) from
+% control sequences (whose string representation has more than one
+% character because the escape character is printable). For a control
+% sequence call the user code directly; for a character, lowercase it
+% to character~$32$ and find its catcode in \cs{c_@@_peek_catcodes_tl}.
+% \begin{macrocode}
+\cs_new_protected:Npn \@@_peek_analysis_normal:N #1
+ {
+ \exp_after:wN \reverse_if:N \exp_after:wN \if_meaning:w
+ \exp_not:N #1 #1
+ \tex_let:D #1 \scan_stop:
+ \tl_put_right:Nn \l_@@_peek_code_tl { { \exp_not:N #1 } }
+ \else:
+ \tl_put_right:Nn \l_@@_peek_code_tl { { \exp_not:n {#1} } }
+ \fi:
+ \if_charcode:w
+ \scan_stop:
+ \exp_after:wN \use_none:n \token_to_str:N #1 \prg_do_nothing:
+ \scan_stop:
+ \exp_after:wN \@@_peek_analysis_char:N
+ \exp_after:wN #1
+ \else:
+ \exp_after:wN \@@_peek_analysis_cs:
+ \fi:
+ }
+\cs_new_protected:Npn \@@_peek_analysis_cs:
+ { \l_@@_peek_code_tl { -1 } 0 }
+\cs_new_protected:Npn \@@_peek_analysis_char:N #1
+ {
+ \char_set_lccode:nn { `#1 } { 32 }
+ \tex_lowercase:D { \@@_peek_analysis_char:nN {#1} } #1
+ }
+\cs_new_protected:Npn \@@_peek_analysis_char:nN #1#2
+ {
+ \cs_set_protected:Npn \@@_tmp:w ##1 #1 ##2 ##3 \scan_stop:
+ { \exp_args:No \l_@@_peek_code_tl { \int_value:w `#2 } ##2 }
+ \exp_after:wN \@@_tmp:w \c_@@_peek_catcodes_tl \scan_stop:
+ }
+% \end{macrocode}
+% For special characters the idea is to eventually act with
+% \cs{token_to_str:N}, then pick up one by one the characters of this
+% string representation until hitting the token that follows. First
+% determine the character code of (the meaning of) the \meta{token}
+% (which we know is a special token), make sure the escape character
+% is different from it, set two active characters and the empty
+% control sequence to \cs{scan_stop:}, and filter out these cases
+% (now safe to grab as \texttt{N}-type) in \cs{@@_peek_analysis_retest:}.
+% \begin{macrocode}
+\cs_new_protected:Npn \@@_peek_analysis_special:
+ {
+ \tex_let:D \l_@@_analysis_token = ~ \l_peek_token
+ \int_set:Nn \l_@@_peek_charcode_int
+ { \@@_analysis_extract_charcode: }
+ \if_int_compare:w \l_@@_peek_charcode_int = \tex_escapechar:D
+ \int_set:Nn \tex_escapechar:D { `\/ }
+ \fi:
+ \char_set_active_eq:nN { \l_@@_peek_charcode_int } \scan_stop:
+ \char_set_active_eq:nN { \tex_escapechar:D } \scan_stop:
+ \cs_set_eq:cN { } \scan_stop:
+ \tex_futurelet:D \l_@@_analysis_token
+ \@@_peek_analysis_retest:
+ }
+\cs_new_protected:Npn \@@_peek_analysis_retest:
+ {
+ \if_meaning:w \l_@@_analysis_token \scan_stop:
+ \exp_after:wN \@@_peek_analysis_normal:N
+ \else:
+ \exp_after:wN \@@_peek_analysis_next:
+ \fi:
+ }
+% \end{macrocode}
+% At this point we know the meaning of the \meta{token} in the input
+% stream is \cs{l_peek_token}, either a space (32, 10) or a
+% begin-group or end-group token (catcode $1$ or~$2$), and we excluded
+% a few cases that would be difficult later (empty control sequence,
+% active character with the same character code as its meaning or as
+% the escape character). Now look at the \meta{next token} following
+% it using a combination of \tn{afterassignment} and \tn{futurelet}.
+% The syntax of the latter is \tn{futurelet} \meta{peek token}
+% \meta{first token} \meta{next token}, and it sets \meta{peek token}
+% equal to \meta{next token}. Traditionally, \meta{first token} is a
+% macro that regains control and analyses \meta{peek token}. Here,
+% both \meta{first token} and \meta{next token} are mostly unknown
+% tokens in the input stream (but we know the \meta{first token} has
+% catcode $1$, $2$ or $10$), where \meta{first token} was already
+% stored as \cs{l_peek_token}, and we regain control using
+% \tn{afterassignment}, which inserts its argument after the
+% assignment, hence after \meta{peek token} but before \meta{first
+% token}. Beforehand, an empty \tn{everyeof} is set to \cs{scan_stop:}.
+% \begin{macrocode}
+\cs_new_protected:Npn \@@_peek_analysis_next:
+ {
+ \tl_if_empty:oT { \tex_the:D \tex_everyeof:D }
+ { \tex_everyeof:D { \scan_stop: } }
+ \tex_afterassignment:D \@@_peek_analysis_str:
+ \tex_futurelet:D \l_@@_analysis_next_token
+ }
+% \end{macrocode}
+% We then hit the \meta{first token} with \cs{token_to_str:N} and grab
+% characters until finding \cs{l_@@_analysis_next_token}. More
+% precisely, by looking at the first character in the string
+% representation of the \meta{first token} we distinguish three cases:
+% a stringified control sequence starts with the escape character; for
+% an explicit character we find that same character; for an active
+% character we find anything else (we made sure to exclude the case of
+% an active character whose string representation coincides with the
+% other two cases).
+% \begin{macrocode}
+\cs_new_protected:Npn \@@_peek_analysis_str:
+ {
+ \exp_after:wN \tex_futurelet:D
+ \exp_after:wN \l_@@_analysis_token
+ \exp_after:wN \@@_peek_analysis_str:w
+ \token_to_str:N
+ }
+\cs_new_protected:Npn \@@_peek_analysis_str:w
+ { \@@_analysis_char_arg:Nw \@@_peek_analysis_str:n }
+\cs_new_protected:Npn \@@_peek_analysis_str:n #1
+ {
+ \int_case:nnF { `#1 }
+ {
+ { \l_@@_peek_charcode_int }
+ { \@@_peek_analysis_explicit:n {#1} }
+ { \tex_escapechar:D } { \@@_peek_analysis_escape: }
+ }
+ { \@@_peek_analysis_active_str:n {#1} }
+ }
+% \end{macrocode}
+% When |#1| is a stringified active character we call the user's code
+% with that active character (thankfully \cs{char_generate:nn} can
+% make active characters), its character code, and catcode~|D|.
+% \begin{macrocode}
+\cs_new_protected:Npn \@@_peek_analysis_active_str:n #1
+ {
+ \tl_put_right:Nx \l_@@_peek_code_tl
+ {
+ { \char_generate:nn { `#1 } { 13 } }
+ { \int_value:w `#1 }
+ \token_to_str:N D
+ }
+ \l_@@_peek_code_tl
+ }
+% \end{macrocode}
+% When |#1| matches the character we had extracted from the meaning of
+% \cs{l_peek_token}, the token was an explicit character, which can be
+% a standard space (catcode~|A|, character code~$32$), or a begin-group
+% or end-group character with some character code. In the latter two
+% cases we call \cs{char_generate:nn} with suitable arguments and put
+% suitable \cs{if_false:} \cs{fi:} constructions to make the result
+% balanced and such that \texttt{o}-expanding or \texttt{x}-expanding
+% gives back a single (unbalanced) begin-group or end-group character.
+% \begin{macrocode}
+\cs_new_protected:Npn \@@_peek_analysis_explicit:n #1
+ {
+ \tl_put_right:Nx \l_@@_peek_code_tl
+ {
+ \if_meaning:w \l_peek_token \c_space_token
+ { ~ } { 32 } \token_to_str:N A
+ \else:
+ \if_catcode:w \l_peek_token \c_group_begin_token
+ {
+ \exp_not:N \exp_after:wN
+ \char_generate:nn { `#1 } { 1 }
+ \exp_not:N \if_false:
+ \if_false: { \fi: }
+ \exp_not:N \fi:
+ }
+ { \int_value:w `#1 }
+ 1
+ \else:
+ {
+ \exp_not:N \if_false:
+ { \if_false: } \fi:
+ \exp_not:N \fi:
+ \char_generate:nn { `#1 } { 2 }
+ }
+ { \int_value:w `#1 }
+ 2
+ \fi:
+ \fi:
+ }
+ \l_@@_peek_code_tl
+ }
+% \end{macrocode}
+% Finally there is the case of a special token whose string
+% representation starts with an escape character, namely the token was
+% a control sequence. In that case we could have grabbed the token
+% directly as an \texttt{N}-type argument, but of course we couldn't
+% know that until we had run all the various tests including
+% stringifying the token. We are thus left with the hard work of
+% picking up one by one the characters in the csname (being careful
+% about spaces) into \cs{l_@@_internal_a_tl}, until finding a token
+% that matches the \meta{next token} picked up earlier (which was not
+% stringified), such that the control sequence that we found so far
+% indeed has the expected meaning \cs{l_peek_token}. This comparison
+% catches a reasonably common case like \cs{c_group_begin_token} |_|
+% in which the trailing |_| has category code other: without
+% comparison of the constructed csname with \cs{l_peek_token}
+% collection would stop at \cs[no-index]{c}, which is wrong.
+% \begin{macrocode}
+\cs_new_protected:Npn \@@_peek_analysis_escape:
+ {
+ \tl_clear:N \l_@@_internal_a_tl
+ \tex_futurelet:D \l_@@_analysis_token
+ \@@_peek_analysis_collect:w
+ }
+\cs_new_protected:Npn \@@_peek_analysis_collect:w
+ { \@@_analysis_char_arg:Nw \@@_peek_analysis_collect:n }
+\cs_new_protected:Npn \@@_peek_analysis_collect:n #1
+ {
+ \tl_put_right:Nn \l_@@_internal_a_tl {#1}
+ \@@_peek_analysis_collect_loop:
+ }
+\cs_new_protected:Npn \@@_peek_analysis_collect_loop:
+ {
+ \tex_futurelet:D \l_@@_analysis_token
+ \@@_peek_analysis_collect_test:
+ }
+\cs_new_protected:Npn \@@_peek_analysis_collect_test:
+ {
+ \if_meaning:w \l_@@_analysis_token \l_@@_analysis_next_token
+ \exp_after:wN \if_meaning:w \cs:w \l_@@_internal_a_tl \cs_end: \l_peek_token
+ \@@_peek_analysis_collect_end:NNN
+ \fi:
+ \fi:
+ \@@_peek_analysis_collect:w
+ }
+% \end{macrocode}
+% End by calling the user code with suitable arguments, which closes
+% the group (|#1|, |#2| are \cs{fi:}; |#3|, the collect loop, is dropped).
+% \begin{macrocode}
+\cs_new_protected:Npn \@@_peek_analysis_collect_end:NNN #1#2#3
+ {
+ #1 #2
+ \tl_put_right:Nx \l_@@_peek_code_tl
+ {
+ { \exp_not:N \exp_not:n { \exp_not:c { \l_@@_internal_a_tl } } }
+ { -1 }
+ 0
+ }
+ \l_@@_peek_code_tl
+ }
+% \end{macrocode}
+% \end{macro}
+%
% \subsection{Messages}
%
% \begin{variable}{\c_@@_analysis_show_etc_str}
More information about the latex3-commits
mailing list.