[latex3-commits] [git/LaTeX3-latex3-latex3] peek-regex: Internal change to how begin/end anchors are implemented (bf8e352e0)

Mon Jul 20 02:19:19 CEST 2020

Repository : https://github.com/latex3/latex3
On branch  : peek-regex
Link       : https://github.com/latex3/latex3/commit/bf8e352e008c24976bf58ea1a76718066ca4b9ea

>---------------------------------------------------------------

commit bf8e352e008c24976bf58ea1a76718066ca4b9ea
Author: Bruno Le Floch <blflatex at gmail.com>
Date:   Sun Jul 19 17:32:15 2020 +0200

    Internal change to how begin/end anchors are implemented
    
    The aim is to remove uses of max_pos, because that will not be available
    when peeking for a regex


>---------------------------------------------------------------

bf8e352e008c24976bf58ea1a76718066ca4b9ea
 l3kernel/l3regex.dtx | 140 ++++++++++++++++++++++-----------------------------
 1 file changed, 59 insertions(+), 81 deletions(-)

diff --git a/l3kernel/l3regex.dtx b/l3kernel/l3regex.dtx
index cfb002475..0878f68e2 100644
--- a/l3kernel/l3regex.dtx
+++ b/l3kernel/l3regex.dtx
@@ -1825,7 +1825,7 @@
 %   \item \cs{@@_command_K:}
 %   \item \cs{@@_assertion:Nn} \meta{boolean} \Arg{assertion test},
 %     where the \meta{assertion test} is \cs{@@_b_test:} or
-%     |{|\cs{@@_anchor:N} \meta{integer}|}|
+%     \cs{@@_Z_test:} or \cs{@@_A_test:} or \cs{@@_G_test:}
 % \end{itemize}
 % Tests can be the following:
 % \begin{itemize}
@@ -2666,73 +2666,56 @@
 %
 % \subsubsection{Anchoring and simple assertions}
 %
-% \begin{macro}{\@@_compile_anchor:NF}
+% \begin{macro}{\@@_compile_anchor_letter:NNN}
+% \begin{macro}{\@@_compile_/A:, \@@_compile_/G:, \@@_compile_/Z:, \@@_compile_/z:, \@@_compile_/b:, \@@_compile_/B:}
 % \begin{macro}+\@@_compile_^:+
-% \begin{macro}{\@@_compile_/A:, \@@_compile_/G:}
 % \begin{macro}+\@@_compile_$:+
-% \begin{macro}{\@@_compile_/Z:, \@@_compile_/z:}
-%   In modes where assertions are allowed, anchor to the start of the
-%   query, the start of the match, or the end of the query, depending on
-%   the integer |#1|. In other modes, |#2| treats the character as raw,
-%   with an error for escaped letters (|$| is valid in a class, but |\A|
-%   is definitely a mistake on the user's part).
+%   In modes where assertions are forbidden, anchors such as |\A|
+%   produce an error (|\A|~is invalid in classes); otherwise they add an
+%   \cs{@@_assertion:Nn} test as appropriate (the only negative
+%   assertion is~|\B|).  The test functions are defined later.  The
+%   implementation for
+%   |$| and |^| is only different from |\A| etc because these are valid
+%   in a class.
 %    \begin{macrocode}
-\cs_new_protected:Npn \@@_compile_anchor:NF #1#2
+\cs_new_protected:Npn \@@_compile_anchor_letter:NNN #1#2#3
   {
-    \@@_if_in_class_or_catcode:TF {#2}
+    \@@_if_in_class_or_catcode:TF { \@@_compile_raw_error:N #1 }
       {
         \tl_build_put_right:Nn \l_@@_build_tl
-          { \@@_assertion:Nn \c_true_bool { \@@_anchor:N #1 } }
+          { \@@_assertion:Nn #2 {#3} }
       }
   }
-\cs_set_protected:Npn \@@_tmp:w #1#2
-  {
-    \cs_new_protected:cpn { @@_compile_/#1: }
-      { \@@_compile_anchor:NF #2 { \@@_compile_raw_error:N #1 } }
-  }
-\@@_tmp:w A \l_@@_min_pos_int
-\@@_tmp:w G \l_@@_start_pos_int
-\@@_tmp:w Z \l_@@_max_pos_int
-\@@_tmp:w z \l_@@_max_pos_int
+\cs_new_protected:cpn { @@_compile_/A: }
+  { \@@_compile_anchor_letter:NNN A \c_true_bool \@@_A_test: }
+\cs_new_protected:cpn { @@_compile_/G: }
+  { \@@_compile_anchor_letter:NNN G \c_true_bool \@@_G_test: }
+\cs_new_protected:cpn { @@_compile_/Z: }
+  { \@@_compile_anchor_letter:NNN Z \c_true_bool \@@_Z_test: }
+\cs_new_protected:cpn { @@_compile_/z: }
+  { \@@_compile_anchor_letter:NNN z \c_true_bool \@@_Z_test: }
+\cs_new_protected:cpn { @@_compile_/b: }
+  { \@@_compile_anchor_letter:NNN b \c_true_bool \@@_b_test: }
+\cs_new_protected:cpn { @@_compile_/B: }
+  { \@@_compile_anchor_letter:NNN B \c_false_bool \@@_b_test: }
 \cs_set_protected:Npn \@@_tmp:w #1#2
   {
     \cs_new_protected:cpn { @@_compile_#1: }
-      { \@@_compile_anchor:NF #2 { \@@_compile_raw:N #1 } }
+      {
+        \@@_if_in_class_or_catcode:TF { \@@_compile_raw:N #1 }
+          {
+            \tl_build_put_right:Nn \l_@@_build_tl
+              { \@@_assertion:Nn \c_true_bool {#2} }
+          }
+      }
   }
-\exp_args:Nx \@@_tmp:w { \iow_char:N \^ } \l_@@_min_pos_int
-\exp_args:Nx \@@_tmp:w { \iow_char:N \$ } \l_@@_max_pos_int
+\exp_args:Nx \@@_tmp:w { \iow_char:N \^ } { \@@_A_test: }
+\exp_args:Nx \@@_tmp:w { \iow_char:N \$ } { \@@_Z_test: }
 %    \end{macrocode}
 % \end{macro}
 % \end{macro}
 % \end{macro}
 % \end{macro}
-% \end{macro}
-%
-% \begin{macro}{\@@_compile_/b:, \@@_compile_/B:}
-%   Contrarily to |^| and |$|, which could be implemented without really
-%   knowing what precedes in the token list, this requires more
-%   information, namely, the knowledge of the last character code.
-%    \begin{macrocode}
-\cs_new_protected:cpn { @@_compile_/b: }
-  {
-    \@@_if_in_class_or_catcode:TF
-      { \@@_compile_raw_error:N b }
-      {
-        \tl_build_put_right:Nn \l_@@_build_tl
-          { \@@_assertion:Nn \c_true_bool { \@@_b_test: } }
-      }
-  }
-\cs_new_protected:cpn { @@_compile_/B: }
-  {
-    \@@_if_in_class_or_catcode:TF
-      { \@@_compile_raw_error:N B }
-      {
-        \tl_build_put_right:Nn \l_@@_build_tl
-          { \@@_assertion:Nn \c_false_bool { \@@_b_test: } }
-      }
-  }
-%    \end{macrocode}
-% \end{macro}
 %
 % \subsubsection{Character classes}
 %
@@ -3515,7 +3498,9 @@
             { \bool_if:NF ##1 { negative~ } assertion:~##2 }
         }
       \cs_set:Npn \@@_b_test: { word~boundary }
-      \cs_set_eq:NN \@@_anchor:N \@@_show_anchor_to_str:N
+      \cs_set:Npn \@@_Z_test: { anchor~at~end~(\iow_char:N\\Z) }
+      \cs_set:Npn \@@_A_test: { anchor~at~start~(\iow_char:N\\A) }
+      \cs_set:Npn \@@_G_test: { anchor~at~start~of~match~(\iow_char:N\\G) }
       \cs_set_protected:Npn \@@_item_caseful_equal:n ##1
         { \@@_show_one:n { char~code~\int_eval:n{##1} } }
       \cs_set_protected:Npn \@@_item_caseful_range:nn ##1##2
@@ -3658,24 +3643,6 @@
 %    \end{macrocode}
 % \end{macro}
 %
-% \begin{macro}[rEXP]{\@@_show_anchor_to_str:N}
-%   The argument is an integer telling us where the anchor is. We
-%   convert that to the relevant info.
-%    \begin{macrocode}
-\cs_new:Npn \@@_show_anchor_to_str:N #1
-  {
-    anchor~at~
-    \str_case:nnF { #1 }
-      {
-        { \l_@@_min_pos_int   } { start~(\iow_char:N\\A) }
-        { \l_@@_start_pos_int } { start~of~match~(\iow_char:N\\G) }
-        { \l_@@_max_pos_int   } { end~(\iow_char:N\\Z) }
-      }
-      { <error:~'#1'~not~recognized> }
-  }
-%    \end{macrocode}
-% \end{macro}
-%
 % \begin{macro}{\@@_show_item_catcode:NnT}
 %   Produce a sequence of categories which the catcode bitmap |#2|
 %   contains, and show it, indenting the tests on which this catcode
@@ -3855,7 +3822,7 @@
     \@@_pop_lr_states:
     \@@_toks_put_right:Nn \l_@@_right_state_int
       {
-        \if_int_compare:w \l_@@_curr_pos_int = \l_@@_max_pos_int
+        \if_int_compare:w -2 = \l_@@_curr_char_int
           \exp_after:wN \@@_action_success:
         \fi:
       }
@@ -4343,16 +4310,14 @@
 %
 % \subsubsection{Others}
 %
-% \begin{macro}{\@@_assertion:Nn, \@@_b_test:, \@@_anchor:N}
+% \begin{macro}{\@@_assertion:Nn, \@@_b_test:, \@@_A_test:, \@@_G_test:, \@@_Z_test:}
 %   Usage: \cs{@@_assertion:Nn} \meta{boolean} \Arg{test}, where the
 %   \meta{test} is one of the other functions listed above. Add a free
 %   transition to a new state, conditionally to the assertion test. The
 %   \cs{@@_b_test:} test is used by the |\b| and |\B| escape: check
 %   if the last character was a word character or not, and do the same
 %   to the current character. The boundary-markers of the string are
-%   non-word characters for this purpose.  Anchors at the start or end
-%   of match use \cs{@@_anchor:N}, with a position controlled by the
-%   integer |#1|.
+%   non-word characters for this purpose.
 %    \begin{macrocode}
 \cs_new_protected:Npn \@@_assertion:Nn #1#2
   {
@@ -4372,12 +4337,6 @@
           \bool_if:NT #1 { { } }
       }
   }
-\cs_new_protected:Npn \@@_anchor:N #1
-  {
-    \if_int_compare:w #1 = \l_@@_curr_pos_int
-      \exp_after:wN \@@_break_true:w
-    \fi:
-  }
 \cs_new_protected:Npn \@@_b_test:
   {
     \group_begin:
@@ -4387,6 +4346,24 @@
         { \group_end: \@@_item_reverse:n \@@_prop_w: }
         { \group_end: \@@_prop_w: }
   }
+\cs_new_protected:Npn \@@_Z_test:
+  {
+    \if_int_compare:w -2 = \l_@@_curr_char_int
+      \exp_after:wN \@@_break_true:w
+    \fi:
+  }
+\cs_new_protected:Npn \@@_A_test:
+  {
+    \if_int_compare:w -2 = \l_@@_last_char_int
+      \exp_after:wN \@@_break_true:w
+    \fi:
+  }
+\cs_new_protected:Npn \@@_G_test:
+  {
+    \if_int_compare:w \l_@@_curr_pos_int = \l_@@_start_pos_int
+      \exp_after:wN \@@_break_true:w
+    \fi:
+  }
 %    \end{macrocode}
 % \end{macro}
 %
@@ -4701,6 +4678,7 @@
       }
     \int_zero:N \l_@@_step_int
     \int_set_eq:NN \l_@@_success_pos_int \l_@@_min_pos_int
+    \int_set:Nn \l_@@_last_char_success_int { -2 }
     \tl_build_begin:N \l_@@_matched_analysis_tl
     \tl_clear:N \l_@@_curr_analysis_tl
     \int_set:Nn \l_@@_min_submatch_int { 1 }