[latex3-commits] [git/LaTeX3-latex3-latex3] master: Implement \peek_regex:nTF and \peek_regex_remove:nTF (8b9bc4ba3)

Thu Dec 3 17:27:34 CET 2020

Repository : https://github.com/latex3/latex3
On branch  : master
Link       : https://github.com/latex3/latex3/commit/8b9bc4ba313e4e5f26065a5b71373ab986b627e6

>---------------------------------------------------------------

commit 8b9bc4ba313e4e5f26065a5b71373ab986b627e6
Author: Bruno Le Floch <blflatex at gmail.com>
Date:   Mon Jul 20 02:04:02 2020 +0200

    Implement \peek_regex:nTF and \peek_regex_remove:nTF
    
    These functions check what follows in the input stream, but they
    do not give access to it in the T and F branches.  I'll also add
    \peek_regex_replace:nnTF that also performs a replacement on the
    tokens found.  It might be nice to store the tokens found into
    some token list, but unfortunately they can very well have
    unbalanced braces.


>---------------------------------------------------------------

8b9bc4ba313e4e5f26065a5b71373ab986b627e6
 l3kernel/CHANGELOG.md             |   1 +
 l3kernel/l3regex.dtx              | 144 ++++++++++++++++++++++++++++++++++----
 l3kernel/l3token.dtx              |  43 ++++++++++++
 l3kernel/testfiles/m3regex011.lvt |  38 ++++++++++
 l3kernel/testfiles/m3regex011.tlg |  53 ++++++++++++++
 5 files changed, 266 insertions(+), 13 deletions(-)

diff --git a/l3kernel/CHANGELOG.md b/l3kernel/CHANGELOG.md
index 3eab183c7..110e6f7a3 100644
--- a/l3kernel/CHANGELOG.md
+++ b/l3kernel/CHANGELOG.md
@@ -9,6 +9,7 @@ this project uses date-based 'snapshot' version identifiers.
 
 ### Added
 - `\peek_analysis_map_inlione:n`
+- `\peek_regex:nTF` and `\peek_regex_remove:nTF`
 
 ### Unchanged
 - Extend `\text_expand:n` to cover `\@protected@testopt`
diff --git a/l3kernel/l3regex.dtx b/l3kernel/l3regex.dtx
index 98088779a..06ea8c3d5 100644
--- a/l3kernel/l3regex.dtx
+++ b/l3kernel/l3regex.dtx
@@ -3728,8 +3728,9 @@
 % Each state of the \textsc{nfa} is stored in a \tn{toks}. The
 % operations which can appear in the \tn{toks} are
 % \begin{itemize}
-%   \item \cs{@@_action_start_wildcard:} inserted at the start
-%     of the regular expression to make it unanchored.
+%   \item \cs{@@_action_start_wildcard:N} \meta{boolean} inserted at the
+%     start of the regular expression, where a \texttt{true}
+%     \meta{boolean} makes it unanchored.
 %   \item \cs{@@_action_success:} marks the exit state of the
 %     \textsc{nfa}.
 %   \item \cs{@@_action_cost:n} \Arg{shift} is a transition from the
@@ -3765,20 +3766,26 @@
 %     corresponding end-points of nested groups.
 % \end{itemize}
 %
-% \begin{macro}{\@@_build:n, \@@_build:N}
+% \begin{macro}{\@@_build:n, \@@_build_aux:Nn, \@@_build:N, \@@_build_aux:NN}
 %   The \texttt{n}-type function first compiles its argument. Reset some
 %   variables. Allocate two states, and put a wildcard in state $0$
 %   (transitions to state $1$ and $0$ state). Then build the regex
 %   within a (capturing) group numbered $0$ (current
 %   value of \texttt{capturing_group}). Finally, if the match reaches the
-%   last state, it is successful.
+%   last state, it is successful.  A \texttt{false} boolean for argument
+%   |#1| for the auxiliaries will suppress the wildcard and make the
+%   match anchored: used for \cs{peek_regex:nTF} and similar.
 %    \begin{macrocode}
-\cs_new_protected:Npn \@@_build:n #1
+\cs_new_protected:Npn \@@_build:n
+  { \@@_build_aux:Nn \c_true_bool }
+\cs_new_protected:Npn \@@_build:N
+  { \@@_build_aux:NN \c_true_bool }
+\cs_new_protected:Npn \@@_build_aux:Nn #1#2
   {
-    \@@_compile:n {#1}
-    \@@_build:N \l_@@_internal_regex
+    \@@_compile:n {#2}
+    \@@_build_aux:NN #1 \l_@@_internal_regex
   }
-\cs_new_protected:Npn \@@_build:N #1
+\cs_new_protected:Npn \@@_build_aux:NN #1#2
   {
     \@@_standard_escapechar:
     \int_zero:N \l_@@_capturing_group_int
@@ -3786,8 +3793,8 @@
     \@@_build_new_state:
     \@@_build_new_state:
     \@@_toks_put_right:Nn \l_@@_left_state_int
-      { \@@_action_start_wildcard: }
-    \@@_group:nnnN {#1} { 1 } { 0 } \c_false_bool
+      { \@@_action_start_wildcard:N #1 }
+    \@@_group:nnnN {#2} { 1 } { 0 } \c_false_bool
     \@@_toks_put_right:Nn \l_@@_right_state_int
       { \@@_action_success: }
   }
@@ -4871,7 +4878,7 @@
 %
 % \subsubsection{Actions when matching}
 %
-% \begin{macro}{\@@_action_start_wildcard:}
+% \begin{macro}{\@@_action_start_wildcard:N}
 %   For an unanchored match, state $0$ has a free transition to the next
 %   and a costly one to itself, to repeat at the next position. To catch
 %   repeated identical empty matches, we need to know if a successful
@@ -4879,12 +4886,12 @@
 %   \cs{l_@@_fresh_thread_bool} may be skipped by a successful
 %   thread, hence we had to add it to \cs{@@_match_one_token:nnN} too.
 %    \begin{macrocode}
-\cs_new_protected:Npn \@@_action_start_wildcard:
+\cs_new_protected:Npn \@@_action_start_wildcard:N #1
   {
     \bool_set_true:N \l_@@_fresh_thread_bool
     \@@_action_free:n {1}
     \bool_set_false:N \l_@@_fresh_thread_bool
-    \@@_action_cost:n {0}
+    \bool_if:NT #1 { \@@_action_cost:n {0} }
   }
 %    \end{macrocode}
 % \end{macro}
@@ -6326,6 +6333,117 @@
 %    \end{macrocode}
 % \end{macro}
 %
+% \subsubsection{Peeking ahead}
+%
+% \begin{variable}{\l_@@_peek_true_tl, \l_@@_peek_false_tl}
+%   True/false code arguments of \cs{peek_regex:nTF} or similar.
+%    \begin{macrocode}
+\tl_new:N \l_@@_peek_true_tl
+\tl_new:N \l_@@_peek_false_tl
+%    \end{macrocode}
+% \end{variable}
+%
+% \begin{variable}{\l_@@_reinsert_tl}
+% \begin{macro}[EXP]{\@@_reinsert:n}
+%   Token list such that hitting it with \cs{exp:w} will expand to all
+%   tokens we found.  It is constructed using the |tl_build| machinery
+%   and takes the form of one call to \cs{@@_reinsert:n} for each token
+%   to reinsert.  The argument is something that \texttt{o}-expands to
+%   the single token we wish to put back.
+%    \begin{macrocode}
+\tl_new:N \l_@@_reinsert_tl
+\cs_new:Npn \@@_reinsert:n #1
+  {
+    \exp_after:wN \exp_after:wN
+    \exp_after:wN \exp_end:
+    \exp_after:wN \exp_after:wN
+    #1
+    \exp:w
+  }
+%    \end{macrocode}
+% \end{macro}
+% \end{variable}
+%
+% \begin{macro}[TF]{\peek_regex:n, \peek_regex:N, \peek_regex_remove:n, \peek_regex_remove:N}
+% \begin{macro}{\@@_peek:nnTF, \@@_peek_end:, \@@_peek_end_aux:N, \@@_peek_remove_end:n}
+%    \begin{macrocode}
+\cs_new_protected:Npn \peek_regex:nTF #1
+  {
+    \@@_peek:nnTF { \@@_peek_end: }
+      { \@@_build_aux:Nn \c_false_bool {#1} }
+  }
+\cs_new_protected:Npn \peek_regex:nT #1#2
+  { \peek_regex:nTF {#1} {#2} { } }
+\cs_new_protected:Npn \peek_regex:nF #1 { \peek_regex:nTF {#1} { } }
+\cs_new_protected:Npn \peek_regex:NTF #1
+  {
+    \@@_peek:nnTF { \@@_peek_end: }
+      { \@@_build_aux:NN \c_false_bool #1 }
+  }
+\cs_new_protected:Npn \peek_regex:NT #1#2
+  { \peek_regex:NTF #1 {#2} { } }
+\cs_new_protected:Npn \peek_regex:NF #1 { \peek_regex:NTF #1 { } }
+\cs_new_protected:Npn \peek_regex_remove:nTF #1
+  {
+    \@@_peek:nnTF { \@@_peek_remove_end:n {##1} }
+      { \@@_build_aux:Nn \c_false_bool {#1} }
+  }
+\cs_new_protected:Npn \peek_regex_remove:nT #1#2
+  { \peek_regex_remove:nTF {#1} {#2} { } }
+\cs_new_protected:Npn \peek_regex_remove:nF #1
+  { \peek_regex_remove:nTF {#1} { } }
+\cs_new_protected:Npn \peek_regex_remove:NTF #1
+  {
+    \@@_peek:nnTF { \@@_peek_remove_end:n {##1} }
+      { \@@_build_aux:NN \c_false_bool #1 }
+  }
+\cs_new_protected:Npn \peek_regex_remove:NT #1#2
+  { \peek_regex_remove:NTF #1 {#2} { } }
+\cs_new_protected:Npn \peek_regex_remove:NF #1
+  { \peek_regex_remove:NTF #1 { } }
+\cs_new_protected:Npn \@@_peek:nnTF #1#2#3#4
+  {
+    \group_begin:
+      \tl_set:Nn \l_@@_peek_true_tl { \group_end: #3 }
+      \tl_set:Nn \l_@@_peek_false_tl { \group_end: #4 }
+      \@@_disable_submatches:
+      \@@_single_match:
+      #2
+      \@@_match_init:
+      \tl_build_clear:N \l_@@_reinsert_tl
+      \@@_match_once_init:
+      \peek_analysis_map_inline:n
+        {
+          \tl_build_put_right:Nn \l_@@_reinsert_tl
+            { \@@_reinsert:n {##1} }
+          \@@_match_one_token:nnN {##1} {##2} ##3
+          \use_none:nnn
+          \prg_break_point:Nn \@@_maplike_break:
+            { \peek_analysis_map_break:n {#1} }
+        }
+  }
+\cs_new_protected:Npn \@@_peek_end:
+  {
+    \bool_if:NTF \g_@@_success_bool
+      { \@@_peek_end_aux:N \l_@@_peek_true_tl }
+      { \@@_peek_end_aux:N \l_@@_peek_false_tl }
+  }
+\cs_new_protected:Npn \@@_peek_end_aux:N #1
+  {
+    \tl_build_end:N \l_@@_reinsert_tl
+    \exp_after:wN #1
+    \exp:w \l_@@_reinsert_tl \exp_end:
+  }
+\cs_new_protected:Npn \@@_peek_remove_end:n #1
+  {
+    \bool_if:NTF \g_@@_success_bool
+      { \exp_after:wN \l_@@_peek_true_tl #1 }
+      { \@@_peek_end_aux:N \l_@@_peek_false_tl }
+  }
+%    \end{macrocode}
+% \end{macro}
+% \end{macro}
+%
 % \subsection{Messages}
 %
 % Messages for the preparsing phase.
diff --git a/l3kernel/l3token.dtx b/l3kernel/l3token.dtx
index ac2a1ea9f..4abb84099 100644
--- a/l3kernel/l3token.dtx
+++ b/l3kernel/l3token.dtx
@@ -1035,6 +1035,49 @@
 %   \cs{peek_analysis_map_break:}).
 % \end{function}
 %
+% \begin{function}[added = 2020-07-20, TF]{\peek_regex:n, \peek_regex:N}
+%   \begin{syntax}
+%     \cs{peek_regex:nTF} \Arg{regex} \Arg{true code} \Arg{false code}
+%   \end{syntax}
+%   Tests if the \meta{tokens} that follow in the input stream match the
+%   \meta{regular expression}.  Any \meta{tokens} that have been read
+%   are left in the input stream after the \meta{true code} or
+%   \meta{false code} (as appropriate to the result of the test).  See
+%   \pkg{l3regex} for documentation of the syntax of regular
+%   expressions.  The \meta{regular expression} is implicitly anchored
+%   at the start, so for instance \cs{peek_regex:nTF}~|{|~|a|~|}| is
+%   essentially equivalent to \cs{peek_charcode:NTF}~|a|.
+%   \begin{texnote}
+%     Implicit character tokens are correctly considered by
+%     \cs{peek_regex:nTF} as control sequences, while functions that
+%     inspect individual tokens (for instance \cs{peek_charcode:NTF})
+%     only take into account their meaning.
+%   \end{texnote}
+% \end{function}
+%
+% \begin{function}[added = 2020-07-20, TF]
+%   {\peek_regex_remove:n, \peek_regex_remove:N}
+%   \begin{syntax}
+%     \cs{peek_regex_remove:nTF} \Arg{regex} \Arg{true code} \Arg{false code}
+%   \end{syntax}
+%   Tests if the \meta{tokens} that follow in the input stream match the
+%   \meta{regex}.  If the test is true, the \meta{tokens} are removed
+%   from the input stream and the \meta{true code} is inserted, while if
+%   the test is false, the \meta{false code} is inserted followed by the
+%   \meta{tokens} that have been read in the process of matching the
+%   \meta{regex}.  See \pkg{l3regex} for documentation of the syntax of
+%   regular expressions.  The \meta{regular expression} is implicitly
+%   anchored at the start, so for instance
+%   \cs{peek_regex_remove:nTF}~|{|~|a|~|}| is essentially equivalent to
+%   \cs{peek_charcode_remove:NTF}~|a|.
+%   \begin{texnote}
+%     Implicit character tokens are correctly considered by
+%     \cs{peek_regex_remove:nTF} as control sequences, while functions
+%     that inspect individual tokens (for instance
+%     \cs{peek_charcode:NTF}) only take into account their meaning.
+%   \end{texnote}
+% \end{function}
+%
 % \section{Description of all possible tokens}
 % \label{sec:l3token:all-tokens}
 %
diff --git a/l3kernel/testfiles/m3regex011.lvt b/l3kernel/testfiles/m3regex011.lvt
index 9d5d9cae9..bde3cbd93 100644
--- a/l3kernel/testfiles/m3regex011.lvt
+++ b/l3kernel/testfiles/m3regex011.lvt
@@ -31,4 +31,42 @@
   }
 
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+\OMIT
+\cs_set:Npn \test:w #1 \s_stop { \TYPE { | \exp_not:n {#1} | } }
+\TIMO
+
+\TEST { Peek~regex }
+  {
+    \peek_regex:nTF { } { \TRUE \test:w } { \ERROR \test:w } \aaa \s_stop
+    \peek_regex:nT { a } { \TRUE \test:w } a \s_stop
+    \peek_regex:nF { . (. a()) } { \ERROR \test:w } \test:w { a b \s_stop \show } \s_stop
+    \peek_regex:nTF { a \{ } { \TRUE \test:w } { \ERROR \test:w } a { b } \s_stop
+    \peek_regex:nTF { \cL. } { \ERROR \test:w } { \FALSE \test:w } \aaa \s_stop
+    \peek_regex:nT { b } { \ERROR \test:w } \test:w a \s_stop
+    \peek_regex:nF { \c[^C] . (. a()) } { \FALSE \test:w } \test:w { a b } c \s_stop
+    \peek_regex:nTF { a \{ \b c } { \ERROR \test:w } { \FALSE \test:w } a { b } \s_stop
+    \SEPARATOR
+    \peek_regex_remove:nTF { } { \TRUE \test:w } { \ERROR \test:w } \aaa \s_stop
+    \peek_regex_remove:nT { a } { \TRUE \test:w } a \s_stop
+    \peek_regex_remove:nF { . (. a()) } { \ERROR \test:w } \test:w { a b \s_stop \show } \s_stop
+    \peek_regex_remove:nTF { a \{ } { \TRUE \test:w } { \ERROR \test:w } a { b \s_stop \show } \s_stop
+    \peek_regex_remove:nTF { \cL. } { \ERROR \test:w } { \FALSE \test:w } \aaa \s_stop
+    \peek_regex_remove:nT { b } { \ERROR \test:w } \test:w a \s_stop
+    \peek_regex_remove:nF { \c[^C] . (. a()) } { \FALSE \test:w } \test:w { a b } c \s_stop
+    \peek_regex_remove:nTF { a \{ \b c } { \ERROR \test:w } { \FALSE \test:w } a { b } \s_stop
+  }
+
+\TEST { Peek~regex~compiled }
+  {
+    \regex_set:Nn \l_tmpa_regex { a | \c{test:w} }
+    \peek_regex:NTF \l_tmpa_regex { \TRUE \test:w } { \ERROR \test:w } a \s_stop
+    \peek_regex:NT \l_tmpa_regex { \TRUE \test:w } a \s_stop
+    \peek_regex:NF \l_tmpa_regex { \ERROR \test:w } \test:w a \s_stop
+    \peek_regex_remove:NTF \l_tmpa_regex { \TRUE \test:w } { \ERROR \test:w } a \s_stop
+    \peek_regex_remove:NT \l_tmpa_regex { \TRUE \test:w } a \s_stop
+    \peek_regex_remove:NF \l_tmpa_regex { \ERROR \test:w } \test:w a \s_stop
+  }
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 \END
diff --git a/l3kernel/testfiles/m3regex011.tlg b/l3kernel/testfiles/m3regex011.tlg
index 05eb3b2f2..c37bd68dd 100644
--- a/l3kernel/testfiles/m3regex011.tlg
+++ b/l3kernel/testfiles/m3regex011.tlg
@@ -13,3 +13,56 @@ TEST 2: No groups within \c
 <recently read> }
 l. ...  }
 ============================================================
+============================================================
+TEST 3: Peek regex
+============================================================
+TRUE
+|\aaa |
+TRUE
+|a|
+|ab\s_stop \show |
+TRUE
+|a{b}|
+FALSE
+|\aaa |
+|a|
+FALSE
+|\test:w {ab}c|
+FALSE
+|a{b}|
+============================================================
+TRUE
+|\aaa |
+TRUE
+||
+> end-group character }.
+<argument> ... \test:w }\test:w {ab\s_stop \show }
+                                                  \s_stop \peek_regex_remove...
+l. ...  }
+TRUE
+|b|
+> end-group character }.
+<argument> ...}{\ERROR \test:w }a{b\s_stop \show }
+                                                  \s_stop \peek_regex_remove...
+l. ...  }
+FALSE
+|\aaa |
+|a|
+FALSE
+|\test:w {ab}c|
+FALSE
+|a{b}|
+============================================================
+============================================================
+TEST 4: Peek regex compiled
+============================================================
+TRUE
+|a|
+TRUE
+|a|
+|a|
+TRUE
+||
+TRUE
+||
+============================================================