[latex3-commits] [git/LaTeX3-latex3-latex3] master: Remove further use of toks while matching in l3regex (870d2d7fa)

Thu Dec 3 17:27:34 CET 2020

Repository : https://github.com/latex3/latex3
On branch  : master
Link       : https://github.com/latex3/latex3/commit/870d2d7fa6331980840071836b780ee13b5b0424

>---------------------------------------------------------------

commit 870d2d7fa6331980840071836b780ee13b5b0424
Author: Joseph Wright <joseph.wright at morningstar2.co.uk>
Date:   Thu Dec 3 15:36:45 2020 +0000

    Remove further use of toks while matching in l3regex
    
    The eventual aim is to let users run code while the regex is being
    matched. For this we need to remove all abuses of TeX registers.
    The only one left is toks used for storing states of the nfa.


>---------------------------------------------------------------

870d2d7fa6331980840071836b780ee13b5b0424
 l3kernel/l3regex.dtx | 195 +++++++++++++++++++--------------------------------
 1 file changed, 73 insertions(+), 122 deletions(-)

diff --git a/l3kernel/l3regex.dtx b/l3kernel/l3regex.dtx
index 433414a7b..c2d44e338 100644
--- a/l3kernel/l3regex.dtx
+++ b/l3kernel/l3regex.dtx
@@ -885,14 +885,6 @@
 %     of all submatches, and then the ends of all submatches. The
 %     \meta{threads} are ordered starting from the best to the least
 %     preferred.
-%   \item \cs{g_@@_charcode_intarray} and \cs{g_@@_catcode_intarray} hold the
-%     character codes and category codes of tokens at each
-%     \meta{position} in the query.
-%   \item \cs{g_@@_balance_intarray} holds the balance of begin-group and
-%     end-group character tokens which appear before that point in the
-%     token list.
-%   \item \tn{toks}\meta{position} holds \meta{tokens} which \texttt{o}-
-%     and \texttt{x}-expand to the \meta{position}-th token in the query.
 %   \item \cs{g_@@_submatch_prev_intarray}, \cs{g_@@_submatch_begin_intarray}
 %     and \cs{g_@@_submatch_end_intarray} hold, for each submatch (as would
 %     be extracted by \cs{regex_extract_all:nnN}), the place where the
@@ -903,6 +895,14 @@
 %     block corresponding to one match with all its submatches stored in
 %     consecutive entries.
 % \end{itemize}
+% When actually building the result,
+% \begin{itemize}
+%   \item \tn{toks}\meta{position} holds \meta{tokens} which \texttt{o}-
+%     and \texttt{x}-expand to the \meta{position}-th token in the query.
+%   \item \cs{g_@@_balance_intarray} holds the balance of begin-group and
+%     end-group character tokens which appear before that point in the
+%     token list.
+% \end{itemize}
 %
 % The code is structured as follows. Variables are introduced in the
 % relevant section. First we present some generic helper functions. Then
@@ -1085,19 +1085,6 @@
 %    \end{macrocode}
 % \end{variable}
 %
-% \begin{variable}{\g_@@_charcode_intarray, \g_@@_catcode_intarray, \g_@@_balance_intarray}
-%   The first thing we do when matching is to go once through the query
-%   token list and store the information for each token into
-%   \cs{g_@@_charcode_intarray}, \cs{g_@@_catcode_intarray} and \tn{toks}
-%   registers.  We also store the balance of begin-group/end-group
-%   characters into \cs{g_@@_balance_intarray}.
-%    \begin{macrocode}
-\intarray_new:Nn \g_@@_charcode_intarray { 65536 }
-\intarray_new:Nn \g_@@_catcode_intarray { 65536 }
-\intarray_new:Nn \g_@@_balance_intarray { 65536 }
-%    \end{macrocode}
-% \end{variable}
-%
 % \begin{variable}{\l_@@_balance_int}
 %   During this phase, \cs{l_@@_balance_int} counts the balance of
 %   begin-group and end-group character tokens which appear before a
@@ -1108,15 +1095,6 @@
 %    \end{macrocode}
 % \end{variable}
 %
-% \begin{variable}{\l_@@_cs_name_tl}
-%   This variable is used in \cs{@@_item_cs:n} to store the csname of
-%   the currently-tested token when the regex contains a sub-regex for
-%   testing csnames.
-%    \begin{macrocode}
-\tl_new:N \l_@@_cs_name_tl
-%    \end{macrocode}
-% \end{variable}
-%
 % \subsubsection{Testing characters}
 %
 % \begin{macro}{\c_@@_ascii_min_int, \c_@@_ascii_max_control_int, \c_@@_ascii_max_int}
@@ -1368,21 +1346,18 @@
 %   First test the catcode of the current token to be zero.
 %   Then perform the matching test, and break if the csname
 %   indeed matches.
-%   We store the cs name before building states for the cs, as those
-%   states may overlap with toks registers storing the user's input.
 %    \begin{macrocode}
 \cs_new_protected:Npn \@@_item_cs:n #1
   {
     \int_compare:nNnT \l_@@_curr_catcode_int = 0
       {
         \group_begin:
-          \__kernel_tl_set:Nx \l_@@_cs_name_tl { \@@_curr_cs_to_str: }
           \@@_single_match:
           \@@_disable_submatches:
           \@@_build_for_cs:n {#1}
           \bool_set_eq:NN \l_@@_saved_success_bool
             \g_@@_success_bool
-          \exp_args:NV \@@_match_cs:n \l_@@_cs_name_tl
+          \exp_args:Nx \@@_match_cs:n { \@@_curr_cs_to_str: }
           \if_meaning:w \c_true_bool \g_@@_success_bool
             \group_insert_after:N \@@_break_true:w
           \fi:
@@ -4491,9 +4466,8 @@
 %   }
 %   The tokens in the query are indexed from \texttt{min_pos} for the
 %   first to $\texttt{max_pos}-1$ for the last, and their information is
-%   stored in several arrays and \tn{toks} registers with those numbers. We
-%   don't start from $0$ because the \tn{toks} registers with low
-%   numbers are used to hold the states of the \textsc{nfa}. We match
+%   stored in several arrays and \tn{toks} registers with those numbers.
+%   We match
 %   without backtracking, keeping all threads in lockstep at the
 %   \texttt{curr_pos} in the query. The starting point of the current
 %   match attempt is \texttt{start_pos}, and \texttt{success_pos},
@@ -4514,11 +4488,13 @@
 %     \l_@@_curr_catcode_int,
 %     \l_@@_curr_token_tl,
 %     \l_@@_last_char_int,
+%     \l_@@_last_char_success_int,
 %     \l_@@_case_changed_char_int
 %   }
 %   The character and category codes of the token at the current
 %   position and a token list expanding to that token; the character
 %   code of the token at the previous position;
+%   the character code of the token just before a successful match;
 %   and the character code of the result of changing the case of the
 %   current token (|A-Z|$\leftrightarrow$|a-z|). This last integer is
 %   only computed when necessary, and is otherwise \cs{c_max_int}.  The
@@ -4529,6 +4505,7 @@
 \int_new:N \l_@@_curr_catcode_int
 \tl_new:N \l_@@_curr_token_tl
 \int_new:N \l_@@_last_char_int
+\int_new:N \l_@@_last_char_success_int
 \int_new:N \l_@@_case_changed_char_int
 %    \end{macrocode}
 % \end{variable}
@@ -4676,8 +4653,7 @@
 %
 % \begin{macro}{\@@_match:n, \@@_match_cs:n}
 % \begin{macro}{\@@_match_init:}
-%   First store the query into \tn{toks} registers and arrays (see
-%   \cs{@@_query_set:nnn}). Then initialize the variables that should
+%   Initialize the variables that should
 %   be set once for each user function (even for multiple
 %   matches). Namely, the overall matching is not yet successful; none of
 %   the states should be marked as visited (\cs{g_@@_state_active_intarray}), and
@@ -4688,36 +4664,19 @@
 %    \begin{macrocode}
 \cs_new_protected:Npn \@@_match:n #1
   {
-    \int_zero:N \l_@@_balance_int
-    \int_set:Nn \l_@@_curr_pos_int { 1 + \l_@@_max_state_int }
-    \@@_query_set:nnn { } { -1 } { -2 }
-    \int_set_eq:NN \l_@@_min_pos_int \l_@@_curr_pos_int
-    \tl_analysis_map_inline:nn {#1}
-      { \@@_query_set:nnn {##1} {"##3} {##2} }
-    \int_set_eq:NN \l_@@_max_pos_int \l_@@_curr_pos_int
-    \@@_query_set:nnn { } { -1 } { -2 }
+    \int_set:Nn \l_@@_min_pos_int { 2 }
+    \int_set:Nn \l_@@_max_pos_int { 2 + \tl_count_tokens:n {#1} }
     \@@_match_init:
     \@@_match_once_init:
     \tl_analysis_map_inline:nn {#1}
       { \@@_match_one_token:nnN {##1} {##2} ##3 }
-    \@@_match_one_token:nnN { } { -2 } 9
+    \@@_match_one_token:nnN { } { -2 } F
     \prg_break_point:Nn \@@_maplike_break: { }
   }
 \cs_new_protected:Npn \@@_match_cs:n #1
   {
-    \int_zero:N \l_@@_balance_int
-    \int_set:Nn \l_@@_curr_pos_int
-      { 1 + \int_max:nn \l_@@_max_state_int \l_@@_max_pos_int }
-    \@@_query_set:nnn { } { -1 } { -2 }
-    \int_set_eq:NN \l_@@_min_pos_int \l_@@_curr_pos_int
-    \str_map_inline:nn {#1}
-      {
-        \@@_query_set:nnn { \exp_not:n {##1} }
-          { \tl_if_blank:nTF {##1} { 10 } { 12 } }
-          { `##1 }
-      }
-    \int_set_eq:NN \l_@@_max_pos_int \l_@@_curr_pos_int
-    \@@_query_set:nnn { } { -1 } { -2 }
+    \int_set:Nn \l_@@_min_pos_int { 2 + \l_@@_max_pos_int }
+    \int_set:Nn \l_@@_max_pos_int { \l_@@_min_pos_int + \str_count:n {#1} }
     \int_set_eq:NN \l_@@_min_thread_int \l_@@_max_thread_int
     \@@_match_init:
     \@@_match_once_init:
@@ -4727,7 +4686,7 @@
           { \@@_match_one_token:nnN {##1} {`##1} A }
           { \@@_match_one_token:nnN {##1} {`##1} C }
       }
-    \@@_match_one_token:nnN { } { -2 } 9
+    \@@_match_one_token:nnN { } { -2 } F
     \prg_break_point:Nn \@@_maplike_break: { }
   }
 \cs_new_protected:Npn \@@_match_init:
@@ -4763,8 +4722,10 @@
 %   successful, and has no submatches yet; clear the array of active
 %   threads, and put the starting state $0$ in it. We are then almost
 %   ready to read our first token in the query, but we actually start
-%   one position earlier than the start, and \texttt{get} that token, to
-%   set \texttt{last_char} properly for word boundaries.
+%   one position earlier than the start because
+%   \cs{@@_match_one_token:nnN} increments \cs{l_@@_curr_pos_int} and
+%   saves \cs{l_@@_curr_char_int} as the \texttt{last_char} so that word
+%   boundaries can be correctly identified.
 %    \begin{macrocode}
 \cs_new_protected:Npn \@@_match_once_init:
   {
@@ -4785,7 +4746,7 @@
     \@@_store_state:n { \l_@@_min_state_int }
     \int_set:Nn \l_@@_curr_pos_int
       { \l_@@_start_pos_int - 1 }
-    \@@_query_get:
+    \int_set_eq:NN \l_@@_curr_char_int \l_@@_last_char_success_int
     \exp_args:NNf \@@_match_once_init_aux:
     \tl_map_inline:nn { \exp_after:wN \l_@@_matched_analysis_tl \l_@@_curr_analysis_tl }
       { \@@_match_one_token:nnN ##1 }
@@ -4853,9 +4814,6 @@
     \tl_set:Nn \l_@@_curr_token_tl {#1}
     \int_set:Nn \l_@@_curr_char_int {#2}
     \int_set:Nn \l_@@_curr_catcode_int { "#3 }
-    \if_int_compare:w 9 = \l_@@_curr_catcode_int
-      \int_set:Nn \l_@@_curr_catcode_int { -1 } % ^^A todo
-    \fi:
     \tl_put_right:No \l_@@_matched_analysis_tl { \l_@@_curr_analysis_tl }
     \tl_set:Nn \l_@@_curr_analysis_tl { { {#1} {#2} #3 } }
     \use:x
@@ -4888,53 +4846,6 @@
 % \end{macro}
 % \end{macro}
 %
-% \begin{macro}{\@@_query_set:nnn}
-%   The arguments are: tokens that \texttt{o} and \texttt{x} expand to
-%   one token of the query, the catcode, and the character code. Store
-%   those, and the current brace balance (used later to check for
-%   overall brace balance) in a \tn{toks} register and some arrays,
-%   then update the \texttt{balance}.
-%    \begin{macrocode}
-\cs_new_protected:Npn \@@_query_set:nnn #1#2#3
-  {
-    \__kernel_intarray_gset:Nnn \g_@@_charcode_intarray
-      { \l_@@_curr_pos_int } {#3}
-    \__kernel_intarray_gset:Nnn \g_@@_catcode_intarray
-      { \l_@@_curr_pos_int } {#2}
-    \__kernel_intarray_gset:Nnn \g_@@_balance_intarray
-      { \l_@@_curr_pos_int } { \l_@@_balance_int }
-    \@@_toks_set:Nn \l_@@_curr_pos_int {#1}
-    \int_incr:N \l_@@_curr_pos_int
-    \if_case:w #2 \exp_stop_f:
-    \or: \int_incr:N \l_@@_balance_int
-    \or: \int_decr:N \l_@@_balance_int
-    \fi:
-  }
-%    \end{macrocode}
-% \end{macro}
-%
-% \begin{macro}{\@@_query_get:}
-%   Extract the current character and category codes at the current
-%   position from the appropriate arrays.
-%    \begin{macrocode}
-\cs_new_protected:Npn \@@_query_get:
-  {
-    \int_set:Nn \l_@@_curr_char_int
-      {
-        \__kernel_intarray_item:Nn \g_@@_charcode_intarray
-          { \l_@@_curr_pos_int }
-      }
-    \int_set:Nn \l_@@_curr_catcode_int
-      {
-        \__kernel_intarray_item:Nn \g_@@_catcode_intarray
-          { \l_@@_curr_pos_int }
-      }
-    \tl_set:Nx \l_@@_curr_token_tl
-      { \tex_the:D \tex_toks:D \l_@@_curr_pos_int }
-  }
-%    \end{macrocode}
-% \end{macro}
-%
 % \subsubsection{Using states of the \textsc{nfa}}
 %
 % \begin{macro}{\@@_use_state:}
@@ -5141,6 +5052,7 @@
         \bool_set_eq:NN \l_@@_empty_success_bool
           \l_@@_fresh_thread_bool
         \int_set_eq:NN \l_@@_success_pos_int \l_@@_curr_pos_int
+        \int_set_eq:NN \l_@@_last_char_success_int \l_@@_last_char_int
         \tl_clear:N \l_@@_matched_analysis_tl
         \tl_set_eq:NN \l_@@_success_submatches_tl
           \l_@@_curr_submatches_tl
@@ -6023,6 +5935,14 @@
 %    \end{macrocode}
 % \end{variable}
 %
+% \begin{variable}{\g_@@_balance_intarray}
+%   When building the result (after matching), we store the balance of
+%   begin-group/end-group characters into \cs{g_@@_balance_intarray}.
+%    \begin{macrocode}
+\intarray_new:Nn \g_@@_balance_intarray { 65536 }
+%    \end{macrocode}
+% \end{variable}
+%
 % \begin{macro}{\@@_return:}
 %   This function triggers either \cs{prg_return_false:} or
 %   \cs{prg_return_true:} as appropriate to whether a match was found or
@@ -6039,6 +5959,35 @@
 %    \end{macrocode}
 % \end{macro}
 %
+% \begin{macro}{\@@_query_set:n, \@@_query_set_aux:nN}
+%   To easily extract subsets of the input once we have found the
+%   positions at which to cut, store the input tokens one by one into
+%   successive \tn{toks} registers.  Also store in an array the brace
+%   balance before each token (used to check for overall brace balance).
+%    \begin{macrocode}
+\cs_new_protected:Npn \@@_query_set:n #1
+  {
+    \int_zero:N \l_@@_balance_int
+    \int_set:Nn \l_@@_curr_pos_int { 1 }
+    \@@_query_set_aux:nN { } F
+    \tl_analysis_map_inline:nn {#1}
+      { \@@_query_set_aux:nN {##1} ##3 }
+    \@@_query_set_aux:nN { } F
+  }
+\cs_new_protected:Npn \@@_query_set_aux:nN #1#2
+  {
+    \@@_toks_set:Nn \l_@@_curr_pos_int {#1}
+    \__kernel_intarray_gset:Nnn \g_@@_balance_intarray
+      { \l_@@_curr_pos_int } { \l_@@_balance_int }
+    \int_incr:N \l_@@_curr_pos_int
+    \if_case:w "#2 \exp_stop_f:
+    \or: \int_incr:N \l_@@_balance_int
+    \or: \int_decr:N \l_@@_balance_int
+    \fi:
+  }
+%    \end{macrocode}
+% \end{macro}
+%
 % \subsubsection{Matching}
 %
 % \begin{macro}{\@@_if_match:nn}
@@ -6094,7 +6043,7 @@
       #1
       \@@_match:n {#2}
       \@@_extract:
-    \@@_group_end_extract_seq:N #3
+    \@@_group_end_extract_seq:Nn #3 {#2}
   }
 \cs_new_protected:Npn \@@_extract_all:nnN #1#2#3
   {
@@ -6102,7 +6051,7 @@
       \@@_multi_match:n { \@@_extract: }
       #1
       \@@_match:n {#2}
-    \@@_group_end_extract_seq:N #3
+    \@@_group_end_extract_seq:Nn #3 {#2}
   }
 %    \end{macrocode}
 % \end{macro}
@@ -6141,7 +6090,6 @@
         }
       #1
       \@@_match:n {#2}
-%<assert>\assert_int:n { \l_@@_curr_pos_int = \l_@@_max_pos_int }
       \__kernel_intarray_gset:Nnn \g_@@_submatch_prev_intarray
         { \l_@@_submatch_int } { 0 }
       \__kernel_intarray_gset:Nnn \g_@@_submatch_end_intarray
@@ -6156,12 +6104,12 @@
           \int_decr:N \l_@@_submatch_int
         \fi:
       \fi:
-    \@@_group_end_extract_seq:N #3
+    \@@_group_end_extract_seq:Nn #3 {#2}
   }
 %    \end{macrocode}
 % \end{macro}
 %
-% \begin{macro}{\@@_group_end_extract_seq:N}
+% \begin{macro}{\@@_group_end_extract_seq:Nn}
 %   The end-points of submatches are stored as entries of two arrays
 %   from \cs{l_@@_min_submatch_int} to
 %   \cs{l_@@_submatch_int} (exclusive). Extract the relevant ranges
@@ -6170,8 +6118,9 @@
 %   whenever we see too many begin-group or end-group tokens in a
 %   submatch.
 %    \begin{macrocode}
-\cs_new_protected:Npn \@@_group_end_extract_seq:N #1
+\cs_new_protected:Npn \@@_group_end_extract_seq:Nn #1#2
   {
+      \@@_query_set:n {#2}
       \flag_clear:n { @@_begin }
       \flag_clear:n { @@_end }
       \seq_set_from_function:NnN \l_@@_internal_seq
@@ -6287,11 +6236,12 @@
       \@@_single_match:
       #1
       \@@_replacement:n {#2}
-      \exp_args:No \@@_match:n { #3 }
+      \exp_args:No \@@_match:n {#3}
       \if_meaning:w \c_false_bool \g_@@_success_bool
         \group_end:
       \else:
         \@@_extract:
+        \exp_args:No \@@_query_set:n {#3}
         \int_set:Nn \l_@@_balance_int
           {
             \@@_replacement_balance_one_match:n
@@ -6333,6 +6283,7 @@
       #1
       \@@_replacement:n {#2}
       \exp_args:No \@@_match:n {#3}
+      \exp_args:No \@@_query_set:n {#3}
       \int_set:Nn \l_@@_balance_int
         {
           0