[latex3-commits] [git/LaTeX3-latex3-latex3] main: Fix how peek analysis deals with normal tokens (fix #1109) (fix #1110) (24f718890)

Mon Oct 3 22:14:00 CEST 2022

Repository : https://github.com/latex3/latex3
On branch  : main
Link       : https://github.com/latex3/latex3/commit/24f7188904d6a3695019b03c036221cc4abf3dac

>---------------------------------------------------------------

commit 24f7188904d6a3695019b03c036221cc4abf3dac
Author: Bruno Le Floch <blflatex at gmail.com>
Date:   Mon Oct 3 21:58:30 2022 +0200

    Fix how peek analysis deals with normal tokens (fix #1109) (fix #1110)
    
    Some contortions meant to support outer macros led to a very obvious
    bug: the code was setting scanned tokens willy-nilly to \scan_stop:,
    which broke when these tokens were things like \exp_after:wN.
    I had also used arbitrary characters as delimiters, which broke for
    macro parameter characters.  Now fixed by revamping the logic to
    better separate these two difficulties.


>---------------------------------------------------------------

24f7188904d6a3695019b03c036221cc4abf3dac
 l3kernel/CHANGELOG.md            |   9 ++
 l3kernel/l3tl-analysis.dtx       | 213 +++++++++++++++++++++++++++++----------
 l3kernel/l3token.dtx             |   3 +-
 l3kernel/testfiles/m3peek003.tlg |   2 +-
 4 files changed, 169 insertions(+), 58 deletions(-)

diff --git a/l3kernel/CHANGELOG.md b/l3kernel/CHANGELOG.md
index 33b9759ed..a76cafb94 100644
--- a/l3kernel/CHANGELOG.md
+++ b/l3kernel/CHANGELOG.md
@@ -7,6 +7,15 @@ this project uses date-based 'snapshot' version identifiers.
 
 ## [Unreleased]
 
+### Changed
+- In `\peek_analysis_map_inline:n`, omit unnecessary `\exp_not:n` when the token
+  seen is a character that is neither active nor a macro parameter character
+
+### Fixed
+- `\peek_analysis_map_inline:n` support for macro parameter characters (issue
+  [\#1109](https://github.com/latex3/latex3/issues/1109)) and for many
+  expandable tokens (issue [\#1110](https://github.com/latex3/latex3/issues/1110))
+
 ## [2022-09-28]
 
 ### Added
diff --git a/l3kernel/l3tl-analysis.dtx b/l3kernel/l3tl-analysis.dtx
index 612ab5e72..8228bff64 100644
--- a/l3kernel/l3tl-analysis.dtx
+++ b/l3kernel/l3tl-analysis.dtx
@@ -219,9 +219,10 @@
 %   A token list containing the character number~$32$ (space) with all
 %   possible category codes except $1$ and $2$ (begin-group and
 %   end-group).  Why $32$?  Because some \LuaTeX{} versions only allow
-%   creation of catcode~$10$ (space) tokens with this character code,
-%   and because even in other engines it is much easier to produce since
-%   \cs{char_generate:nn} refuses to produce spaces.
+%   creation of catcode~$10$ (space) tokens with this character code, so
+%   we decided to make \cs{char_generate:nn} refuse to create such
+%   weird spaces as well.  We do not include the macro parameter case
+%   (catcode~$6$) because it cannot be used as a macro delimiter.
 %    \begin{macrocode}
 \group_begin:
 \char_set_active_eq:NN \  \scan_stop:
@@ -229,7 +230,6 @@
   {
     \char_generate:nn { 32 } { 3 }   3
     \char_generate:nn { 32 } { 4 }   4
-    # \char_generate:nn { 32 } { 6 } 6
     \char_generate:nn { 32 } { 7 }   7
     \char_generate:nn { 32 } { 8 }   8
     \c_space_tl                     \token_to_str:N A
@@ -809,7 +809,8 @@
         \scan_stop:
         \exp_after:wN \use_none:n \token_to_str:N #3 \prg_do_nothing:
         \scan_stop:
-      \exp_after:wN \@@_analysis_b_char:Nww
+      \exp_after:wN \@@_analysis_b_char:Nn
+      \exp_after:wN \@@_analysis_b_char_aux:nww
     \else:
       \exp_after:wN \@@_analysis_b_cs:Nww
     \fi:
@@ -819,35 +820,43 @@
 % \end{macro}
 % \end{macro}
 %
-% \begin{macro}[EXP]{\@@_analysis_b_char:Nww}
+% \begin{macro}[EXP]{\@@_analysis_b_char:Nn, \@@_analysis_b_char_aux:nww}
+%   This function is called here with arguments
+%   \cs{@@_analysis_b_char_aux:nww} and a normal character, while in the
+%   peek analysis code it is called with \cs{use_none:n} and possibly a
+%   space character, which is why the function has signature |Nn|.
 %   If the normal token we grab is a character, leave
 %   \meta{catcode} \meta{charcode} followed by \cs{s_@@}
 %   in the input stream, and call \cs{@@_analysis_b_normals:ww}
 %   with its first argument decremented.
 %    \begin{macrocode}
-\cs_new:Npx \@@_analysis_b_char:Nww #1
+\cs_new:Npx \@@_analysis_b_char:Nn #1#2
   {
-    \exp_not:N \if_meaning:w #1 \exp_not:N \tex_undefined:D
+    \exp_not:N \if_meaning:w #2 \exp_not:N \tex_undefined:D
       \token_to_str:N D \exp_not:N \else:
-    \exp_not:N \if_catcode:w #1 \c_catcode_other_token
+    \exp_not:N \if_catcode:w #2 \c_catcode_other_token
       \token_to_str:N C \exp_not:N \else:
-    \exp_not:N \if_catcode:w #1 \c_catcode_letter_token
+    \exp_not:N \if_catcode:w #2 \c_catcode_letter_token
       \token_to_str:N B \exp_not:N \else:
-    \exp_not:N \if_catcode:w #1 \c_math_toggle_token      3
+    \exp_not:N \if_catcode:w #2 \c_math_toggle_token      3
       \exp_not:N \else:
-    \exp_not:N \if_catcode:w #1 \c_alignment_token        4
+    \exp_not:N \if_catcode:w #2 \c_alignment_token        4
       \exp_not:N \else:
-    \exp_not:N \if_catcode:w #1 \c_math_superscript_token 7
+    \exp_not:N \if_catcode:w #2 \c_math_superscript_token 7
       \exp_not:N \else:
-    \exp_not:N \if_catcode:w #1 \c_math_subscript_token   8
+    \exp_not:N \if_catcode:w #2 \c_math_subscript_token   8
       \exp_not:N \else:
-    \exp_not:N \if_catcode:w #1 \c_space_token
+    \exp_not:N \if_catcode:w #2 \c_space_token
       \token_to_str:N A \exp_not:N \else:
       6
     \exp_not:n { \fi: \fi: \fi: \fi: \fi: \fi: \fi: \fi: }
-    \exp_not:N \int_value:w `#1 \s_@@
-   \exp_not:N \exp_after:wN \exp_not:N \@@_analysis_b_normals:ww
-     \exp_not:N \int_value:w \exp_not:N \int_eval:w - 1 +
+    #1 {#2}
+  }
+\cs_new:Npn \@@_analysis_b_char_aux:nww #1
+  {
+    \int_value:w `#1 \s_@@
+    \exp_after:wN \@@_analysis_b_normals:ww
+      \int_value:w \int_eval:w - 1 +
   }
 %    \end{macrocode}
 % \end{macro}
@@ -1170,8 +1179,9 @@
 %   {
 %     \peek_analysis_map_inline:n,
 %     \@@_peek_analysis_loop:NNn, \@@_peek_analysis_test:,
-%     \@@_peek_analysis_normal:N, \@@_peek_analysis_cs:,
-%     \@@_peek_analysis_char:N, \@@_peek_analysis_char:nN,
+%     \@@_peek_analysis_exp:N, \@@_peek_analysis_exp_active:N,
+%     \@@_peek_analysis_nonexp:N, \@@_peek_analysis_cs:N,
+%     \@@_peek_analysis_char:N, \@@_peek_analysis_char:w,
 %     \@@_peek_analysis_special:, \@@_peek_analysis_retest:,
 %     \@@_peek_analysis_next:, \@@_peek_analysis_str:,
 %     \@@_peek_analysis_str:w, \@@_peek_analysis_str:n,
@@ -1184,7 +1194,8 @@
 %   nested maps.  We may wish to pass to this function an \tn{outer}
 %   control sequence or active character; for this we will undefine
 %   potentially-\tn{outer} tokens within a group, closed after the
-%   function receives its arguments.  This user's code function also
+%   function reads its arguments (for an \tn{outer} active character
+%   there is no good alternative).  This user's code function also
 %   calls the loop auxiliary, and includes the trailing
 %   \cs{prg_break_point:Nn} for when the user wants to stop the loop.
 %   The loop auxiliary must remove that break point because it must look
@@ -1209,7 +1220,8 @@
 %    \end{macrocode}
 %   The loop starts a group (closed by the user-code function defined
 %   above) with a normalized escape character, and checks if the next
-%   token is special or \texttt{N}-type.
+%   token is special or \texttt{N}-type (distinguishing expandable from
+%   non-expandable tokens).
 %    \begin{macrocode}
 \cs_new_protected:Npn \@@_peek_analysis_loop:NNn #1#2#3
   {
@@ -1224,60 +1236,149 @@
   }
 \cs_new_protected:Npn \@@_peek_analysis_test:
   {
-    \if_int_odd:w
-      \if_catcode:w \exp_not:N \l_peek_token {   \c_zero_int \fi:
-      \if_catcode:w \exp_not:N \l_peek_token }   \c_zero_int \fi:
-      \if_meaning:w \l_peek_token \c_space_token \c_zero_int \fi:
-      \c_one_int
+    \if_case:w
+      \if_catcode:w \exp_not:N \l_peek_token {   \c_max_int \fi:
+      \if_catcode:w \exp_not:N \l_peek_token }   \c_max_int \fi:
+      \if_meaning:w \l_peek_token \c_space_token \c_max_int \fi:
+      \exp_after:wN \if_meaning:w \exp_not:N \l_peek_token \l_peek_token
+        \c_one_int
+      \fi:
+      \c_zero_int
       \exp_after:wN \exp_after:wN
-      \exp_after:wN \@@_peek_analysis_normal:N
+      \exp_after:wN \@@_peek_analysis_exp:N
       \exp_after:wN \exp_not:N
+    \or:
+      \exp_after:wN \@@_peek_analysis_nonexp:N
     \else:
       \exp_after:wN \@@_peek_analysis_special:
     \fi:
   }
 %    \end{macrocode}
-%   Normal tokens are not too hard, but can be \tn{outer}, hence the
-%   \cs{exp_not:N} in the code above.  If the token is expandable then
-%   it might be an \tn{outer} or a \TeX{} conditional, so to be safe we
-%   set it to \cs{scan_stop:} (the assignment is local and stopped by
-%   the \cs{group_end:} upon calling the user's code).  Then distinguish
-%   characters (including active ones and macro parameter characters)
-%   from control sequences (whose string representation is more than one
-%   character because the escape character is printable).  For a control
-%   sequence call the user code with suitable arguments.
+%   Expandable tokens (which are automatically |N|-type) can be
+%   \tn{outer} macros, hence the need for \cs{exp_after:wN} and
+%   \cs{exp_not:N} in the code above, which allows the next function to
+%   safely grab the token as an argument.  We run some code that is
+%   expanded using the primitive \cs{cs_set_nopar:Npx} rather than
+%   \cs{tl_set:Nx} to avoid grabbing that code as an argument, since |#1|
+%   may be \tn{outer}.  To allow~|#1| as an argument of the user's function
+%   (stored in \cs{l_@@_peek_code_tl}), we set it equal to
+%   \cs{scan_stop:} first, immediately before running the code as |#1|
+%   may be some pretty important function such as \cs{exp_after:wN}.
+%   Then we put the user's function and the first argument
+%   \cs{exp_not:N} |#1|.  Then we must add |{-1}0| if the token is a
+%   control sequence and \Arg{charcode}|D| otherwise.  Distinguishing
+%   the two cases is easy: since we have made the escape character
+%   printable, \cs{token_to_str:N} gives at least two characters for a
+%   control sequence versus a single one for an active character
+%   (possibly being a space).  Producing the right outcome is trickier,
+%   as |#1| cannot appear in either branch of the conditional (it could
+%   be \tn{outer}, or simply a \TeX{} conditional), and can only be
+%   safely discarded by \cs{use_none:n} if it is first hit with
+%   \cs{exp_not:N}.
 %    \begin{macrocode}
-\cs_new_protected:Npn \@@_peek_analysis_normal:N #1
+\cs_new_protected:Npn \@@_peek_analysis_exp:N #1
+  {
+    \cs_set_nopar:Npx \l_@@_peek_code_tl
+      {
+        \tex_let:D \exp_not:N #1 \scan_stop:
+        \exp_not:o \l_@@_peek_code_tl
+        { \exp_not:N \exp_not:N \exp_not:N #1 }
+        \if:w \scan_stop:
+              \exp_after:wN \use_none:n \token_to_str:N #1 \prg_do_nothing:
+              \scan_stop:
+          \exp_after:wN \exp_after:wN
+          \exp_after:wN \@@_peek_analysis_exp_active:N
+        \else:
+          { -1 } 0
+          \exp_after:wN \exp_after:wN
+          \exp_after:wN \use_none:n
+        \fi:
+        \exp_not:N #1
+      }
+    \l_@@_peek_code_tl
+  }
+\cs_new:Npx \@@_peek_analysis_exp_active:N #1
+  { { \exp_not:N \int_value:w `#1 } \token_to_str:N D }
+%    \end{macrocode}
+%   For normal non-expandable tokens we must distinguish characters
+%   (including active ones and macro parameter characters) from control
+%   sequences (whose string representation is more than one character
+%   because we made the escape character printable).  For a control
+%   sequence call the user code with suitable arguments, wrapping |#1|
+%   within \cs{exp_not:n} just in case it happens to be equal to a macro
+%   parameter character.  We do not skip \cs{exp_not:n} when
+%   unnecessary, because there might be situations where the argument
+%   could be used by the user after further redefinitions of the token,
+%   and it seems more convenient to know that \cs{exp_not:n} is always
+%   used.
+%    \begin{macrocode}
+\cs_new_protected:Npn \@@_peek_analysis_nonexp:N #1
   {
-    \exp_after:wN \reverse_if:N \exp_after:wN \if_meaning:w
-        \exp_not:N #1 #1
-      \tex_let:D #1 \scan_stop:
-      \tl_put_right:Nn \l_@@_peek_code_tl { { \exp_not:N #1 } }
-    \else:
-      \tl_put_right:Nn \l_@@_peek_code_tl { { \exp_not:n {#1} } }
-    \fi:
     \if_charcode:w
         \scan_stop:
         \exp_after:wN \use_none:n \token_to_str:N #1 \prg_do_nothing:
         \scan_stop:
       \exp_after:wN \@@_peek_analysis_char:N
-      \exp_after:wN #1
     \else:
-      \exp_after:wN \@@_peek_analysis_cs:
+      \exp_after:wN \@@_peek_analysis_cs:N
     \fi:
+    #1
   }
-\cs_new_protected:Npn \@@_peek_analysis_cs:
-  { \l_@@_peek_code_tl { -1 } 0 }
-\cs_new_protected:Npn \@@_peek_analysis_char:N #1
+\cs_new_protected:Npn \@@_peek_analysis_cs:N #1
+  { \l_@@_peek_code_tl { \exp_not:n {#1} } { -1 } 0 }
+%    \end{macrocode}
+%   For normal characters we must determine their catcode.  The main
+%   difficulty is that the character may be an active character
+%   masquerading as (i.e., set equal to) itself with a different
+%   catcode.  Two approaches based on \tn{lowercase} can detect this.
+%   One could make an active character with the same catcode as~|#1| and
+%   change its definition before testing the catcode of~|#1|, but in
+%   some Unicode engines this fills up the hash table uselessly.
+%   Instead, we lowercase~|#1| itself, changing its character code
+%   to~$32$, namely space (because \LuaTeX{} cannot turn catcode~$10$
+%   characters into anything other than character code~$32$), then we apply
+%   \cs{@@_analysis_b_char:Nn}, which detects active characters by
+%   comparing them to \cs{tex_undefined:D}, and we must have undefined
+%   the active space for this test to work ---we use an |x|-expanding
+%   assignment to get the active space in the right place.  Finally
+%   \cs{@@_peek_analysis_char:w} puts the arguments in the correct
+%   order, including \cs{exp_not:n} for macro parameter characters and
+%   active characters (the latter could be macro parameter characters,
+%   and it seems more uniform to always put \cs{exp_not:n}).
+%    \begin{macrocode}
+\group_begin:
+\char_set_active_eq:NN \ \scan_stop:
+\cs_new_protected:Npx \@@_peek_analysis_char:N #1
   {
-    \char_set_lccode:nn { `#1 } { 32 }
-    \tex_lowercase:D { \@@_peek_analysis_char:nN {#1} } #1
+    \cs_set_eq:NN
+      \char_generate:nn { 32 } { 13 }
+      \exp_not:N \tex_undefined:D
+    \tex_lccode:D `#1 = 32 \exp_stop_f:
+    \tex_lowercase:D
+      {
+        \tl_put_right:Nx \exp_not:N \l_@@_peek_code_tl
+          { \exp_not:n { \@@_analysis_b_char:Nn \use_none:n } {#1} }
+      }
+    \exp_not:n
+      {
+        \exp_after:wN \@@_peek_analysis_char:w
+        \int_value:w
+      }
+      `#1
+    \exp_not:n { \exp_after:wN \s_@@ \l_@@_peek_code_tl }
+    #1
   }
-\cs_new_protected:Npn \@@_peek_analysis_char:nN #1#2
+\group_end:
+\cs_new_protected:Npn \@@_peek_analysis_char:w #1 \s_@@ #2#3#4
   {
-    \cs_set_protected:Npn \@@_tmp:w ##1 #1 ##2 ##3 \scan_stop:
-      { \exp_args:No \l_@@_peek_code_tl { \int_value:w `#2 } ##2 }
-    \exp_after:wN \@@_tmp:w \c_@@_peek_catcodes_tl \scan_stop:
+    \if_charcode:w 6 #3
+    \else:
+      \if_charcode:w D #3
+      \else:
+        \exp_args:NNNo
+      \fi:
+    \fi:
+    #2 { \exp_not:n {#4} } {#1} #3
   }
 %    \end{macrocode}
 %   For special characters the idea is to eventually act with
diff --git a/l3kernel/l3token.dtx b/l3kernel/l3token.dtx
index 0312bf987..1aa442299 100644
--- a/l3kernel/l3token.dtx
+++ b/l3kernel/l3token.dtx
@@ -960,7 +960,8 @@
 %   (as appropriate to the result of the test).
 % \end{function}
 %
-% \begin{function}[added = 2020-12-03]{\peek_analysis_map_inline:n}
+% \begin{function}[added = 2020-12-03, updated = 2022-10-03]
+%   {\peek_analysis_map_inline:n}
 %   \begin{syntax}
 %     \cs{peek_analysis_map_inline:n} \Arg{inline function}
 %   \end{syntax}
diff --git a/l3kernel/testfiles/m3peek003.tlg b/l3kernel/testfiles/m3peek003.tlg
index c51252dc9..d87929e4f 100644
--- a/l3kernel/testfiles/m3peek003.tlg
+++ b/l3kernel/testfiles/m3peek003.tlg
@@ -4,7 +4,7 @@ Author: Bruno Le Floch
 ============================================================
 TEST 1: Peek analysis map inline
 ============================================================
-\exp_not:n {a},97,B
+a,97,B
 \exp_after:wN {\if_false: }\fi: ,123,1
  ,32,A
 \exp_after:wN {\if_false: }\fi: ,123,1