[latex3-commits] [git/LaTeX3-latex3-latex3] peek-regex: Implement \peek_regex:nTF and \peek_regex_remove:nTF (1099d2503)
Bruno Le Floch
blflatex at gmail.com
Mon Jul 20 02:19:51 CEST 2020
Repository : https://github.com/latex3/latex3
On branch : peek-regex
Link : https://github.com/latex3/latex3/commit/1099d25039cdf52d028b3cf8830fe6c311ace329
>---------------------------------------------------------------
commit 1099d25039cdf52d028b3cf8830fe6c311ace329
Author: Bruno Le Floch <blflatex at gmail.com>
Date: Mon Jul 20 02:04:02 2020 +0200
Implement \peek_regex:nTF and \peek_regex_remove:nTF
These functions check what follows in the input stream, but they
do not give access to it in the T and F branches. I'll also add
\peek_regex_replace:nnTF that also performs a replacement on the
tokens found. It might be nice to store the tokens found into
some token list, but unfortunately they can very well have
unbalanced braces.
>---------------------------------------------------------------
1099d25039cdf52d028b3cf8830fe6c311ace329
l3kernel/CHANGELOG.md | 1 +
l3kernel/l3regex.dtx | 144 ++++++++++++++++++++++++++++++++++----
l3kernel/l3token.dtx | 43 ++++++++++++
l3kernel/testfiles/m3regex011.lvt | 38 ++++++++++
l3kernel/testfiles/m3regex011.tlg | 53 ++++++++++++++
5 files changed, 266 insertions(+), 13 deletions(-)
diff --git a/l3kernel/CHANGELOG.md b/l3kernel/CHANGELOG.md
index 4b3da5bf2..c6b134661 100644
--- a/l3kernel/CHANGELOG.md
+++ b/l3kernel/CHANGELOG.md
@@ -9,6 +9,7 @@ this project uses date-based 'snapshot' version identifiers.
### Added
- `\peek_analysis_map_inline:n`
+- `\peek_regex:nTF` and `\peek_regex_remove:nTF`
## [2020-07-17]
diff --git a/l3kernel/l3regex.dtx b/l3kernel/l3regex.dtx
index a7d10f277..2d3faf3f5 100644
--- a/l3kernel/l3regex.dtx
+++ b/l3kernel/l3regex.dtx
@@ -3728,8 +3728,9 @@
% Each state of the \textsc{nfa} is stored in a \tn{toks}. The
% operations which can appear in the \tn{toks} are
% \begin{itemize}
-% \item \cs{@@_action_start_wildcard:} inserted at the start
-% of the regular expression to make it unanchored.
+% \item \cs{@@_action_start_wildcard:N} \meta{boolean} inserted at the
+% start of the regular expression, where a \texttt{true}
+% \meta{boolean} makes it unanchored.
% \item \cs{@@_action_success:} marks the exit state of the
% \textsc{nfa}.
% \item \cs{@@_action_cost:n} \Arg{shift} is a transition from the
@@ -3765,20 +3766,26 @@
% corresponding end-points of nested groups.
% \end{itemize}
%
-% \begin{macro}{\@@_build:n, \@@_build:N}
+% \begin{macro}{\@@_build:n, \@@_build_aux:Nn, \@@_build:N, \@@_build_aux:NN}
% The \texttt{n}-type function first compiles its argument. Reset some
% variables. Allocate two states, and put a wildcard in state $0$
% (transitions to state $1$ and $0$ state). Then build the regex
% within a (capturing) group numbered $0$ (current
% value of \texttt{capturing_group}). Finally, if the match reaches the
-% last state, it is successful.
+% last state, it is successful. Passing a \texttt{false} boolean as
+% argument |#1| of the auxiliaries suppresses the wildcard and makes
+% the match anchored: this is used for \cs{peek_regex:nTF} and similar.
% \begin{macrocode}
-\cs_new_protected:Npn \@@_build:n #1
+\cs_new_protected:Npn \@@_build:n
+ { \@@_build_aux:Nn \c_true_bool }
+\cs_new_protected:Npn \@@_build:N
+ { \@@_build_aux:NN \c_true_bool }
+\cs_new_protected:Npn \@@_build_aux:Nn #1#2
{
- \@@_compile:n {#1}
- \@@_build:N \l_@@_internal_regex
+ \@@_compile:n {#2}
+ \@@_build_aux:NN #1 \l_@@_internal_regex
}
-\cs_new_protected:Npn \@@_build:N #1
+\cs_new_protected:Npn \@@_build_aux:NN #1#2
{
\@@_standard_escapechar:
\int_zero:N \l_@@_capturing_group_int
@@ -3786,8 +3793,8 @@
\@@_build_new_state:
\@@_build_new_state:
\@@_toks_put_right:Nn \l_@@_left_state_int
- { \@@_action_start_wildcard: }
- \@@_group:nnnN {#1} { 1 } { 0 } \c_false_bool
+ { \@@_action_start_wildcard:N #1 }
+ \@@_group:nnnN {#2} { 1 } { 0 } \c_false_bool
\@@_toks_put_right:Nn \l_@@_right_state_int
{ \@@_action_success: }
}
@@ -4871,7 +4878,7 @@
%
% \subsubsection{Actions when matching}
%
-% \begin{macro}{\@@_action_start_wildcard:}
+% \begin{macro}{\@@_action_start_wildcard:N}
% For an unanchored match, state $0$ has a free transition to the next
% and a costly one to itself, to repeat at the next position. To catch
% repeated identical empty matches, we need to know if a successful
@@ -4879,12 +4886,12 @@
% \cs{l_@@_fresh_thread_bool} may be skipped by a successful
% thread, hence we had to add it to \cs{@@_match_one_token:nnN} too.
% \begin{macrocode}
-\cs_new_protected:Npn \@@_action_start_wildcard:
+\cs_new_protected:Npn \@@_action_start_wildcard:N #1
{
\bool_set_true:N \l_@@_fresh_thread_bool
\@@_action_free:n {1}
\bool_set_false:N \l_@@_fresh_thread_bool
- \@@_action_cost:n {0}
+ \bool_if:NT #1 { \@@_action_cost:n {0} }
}
% \end{macrocode}
% \end{macro}
@@ -6326,6 +6333,117 @@
% \end{macrocode}
% \end{macro}
%
+% \subsubsection{Peeking ahead}
+%
+% \begin{variable}{\l_@@_peek_true_tl, \l_@@_peek_false_tl}
+% True/false code arguments of \cs{peek_regex:nTF} or similar.
+% \begin{macrocode}
+\tl_new:N \l_@@_peek_true_tl
+\tl_new:N \l_@@_peek_false_tl
+% \end{macrocode}
+% \end{variable}
+%
+% \begin{variable}{\l_@@_reinsert_tl}
+% \begin{macro}[EXP]{\@@_reinsert:n}
+% Token list such that hitting it with \cs{exp:w} will expand to all
+% tokens we found. It is constructed using the |tl_build| machinery
+% and takes the form of one call to \cs{@@_reinsert:n} for each token
+% to reinsert. The argument is something that \texttt{o}-expands to
+% the single token we wish to put back.
+% \begin{macrocode}
+\tl_new:N \l_@@_reinsert_tl
+\cs_new:Npn \@@_reinsert:n #1
+ {
+ \exp_after:wN \exp_after:wN
+ \exp_after:wN \exp_end:
+ \exp_after:wN \exp_after:wN
+ #1
+ \exp:w
+ }
+% \end{macrocode}
+% \end{macro}
+% \end{variable}
+%
+% \begin{macro}[TF]{\peek_regex:n, \peek_regex:N, \peek_regex_remove:n, \peek_regex_remove:N}
+% \begin{macro}{\@@_peek:nnTF, \@@_peek_end:, \@@_peek_end_aux:N, \@@_peek_remove_end:n}
+% \begin{macrocode}
+\cs_new_protected:Npn \peek_regex:nTF #1
+ {
+ \@@_peek:nnTF { \@@_peek_end: }
+ { \@@_build_aux:Nn \c_false_bool {#1} }
+ }
+\cs_new_protected:Npn \peek_regex:nT #1#2
+ { \peek_regex:nTF {#1} {#2} { } }
+\cs_new_protected:Npn \peek_regex:nF #1 { \peek_regex:nTF {#1} { } }
+\cs_new_protected:Npn \peek_regex:NTF #1
+ {
+ \@@_peek:nnTF { \@@_peek_end: }
+ { \@@_build_aux:NN \c_false_bool #1 }
+ }
+\cs_new_protected:Npn \peek_regex:NT #1#2
+ { \peek_regex:NTF #1 {#2} { } }
+\cs_new_protected:Npn \peek_regex:NF #1 { \peek_regex:NTF #1 { } }
+\cs_new_protected:Npn \peek_regex_remove:nTF #1
+ {
+ \@@_peek:nnTF { \@@_peek_remove_end:n {##1} }
+ { \@@_build_aux:Nn \c_false_bool {#1} }
+ }
+\cs_new_protected:Npn \peek_regex_remove:nT #1#2
+ { \peek_regex_remove:nTF {#1} {#2} { } }
+\cs_new_protected:Npn \peek_regex_remove:nF #1
+ { \peek_regex_remove:nTF {#1} { } }
+\cs_new_protected:Npn \peek_regex_remove:NTF #1
+ {
+ \@@_peek:nnTF { \@@_peek_remove_end:n {##1} }
+ { \@@_build_aux:NN \c_false_bool #1 }
+ }
+\cs_new_protected:Npn \peek_regex_remove:NT #1#2
+ { \peek_regex_remove:NTF #1 {#2} { } }
+\cs_new_protected:Npn \peek_regex_remove:NF #1
+ { \peek_regex_remove:NTF #1 { } }
+\cs_new_protected:Npn \@@_peek:nnTF #1#2#3#4 % #1: end code, #2: regex build code, #3/#4: true/false code
+ {
+ \group_begin: % closed by the \group_end: stored at the start of the two token lists below
+ \tl_set:Nn \l_@@_peek_true_tl { \group_end: #3 }
+ \tl_set:Nn \l_@@_peek_false_tl { \group_end: #4 }
+ \@@_disable_submatches:
+ \@@_single_match:
+ #2 % callers pass \@@_build_aux:Nn or \@@_build_aux:NN with \c_false_bool here
+ \@@_match_init:
+ \tl_build_clear:N \l_@@_reinsert_tl
+ \@@_match_once_init:
+ \peek_analysis_map_inline:n
+ {
+ \tl_build_put_right:Nn \l_@@_reinsert_tl % record one \@@_reinsert:n item per token read
+ { \@@_reinsert:n {##1} }
+ \@@_match_one_token:nnN {##1} {##2} ##3
+ \use_none:nnn % gobbles the break point and its two arguments in normal flow
+ \prg_break_point:Nn \@@_maplike_break:
+ { \peek_analysis_map_break:n {#1} } % passes the end code #1 to the map break
+ }
+ }
+\cs_new_protected:Npn \@@_peek_end:
+ {
+ \bool_if:NTF \g_@@_success_bool
+ { \@@_peek_end_aux:N \l_@@_peek_true_tl }
+ { \@@_peek_end_aux:N \l_@@_peek_false_tl }
+ }
+\cs_new_protected:Npn \@@_peek_end_aux:N #1
+ {
+ \tl_build_end:N \l_@@_reinsert_tl
+ \exp_after:wN #1
+ \exp:w \l_@@_reinsert_tl \exp_end:
+ }
+\cs_new_protected:Npn \@@_peek_remove_end:n #1
+ {
+ \bool_if:NTF \g_@@_success_bool
+ { \exp_after:wN \l_@@_peek_true_tl #1 }
+ { \@@_peek_end_aux:N \l_@@_peek_false_tl }
+ }
+% \end{macrocode}
+% \end{macro}
+% \end{macro}
+%
% \subsection{Messages}
%
% Messages for the preparsing phase.
diff --git a/l3kernel/l3token.dtx b/l3kernel/l3token.dtx
index af08b088f..7b92cc2f7 100644
--- a/l3kernel/l3token.dtx
+++ b/l3kernel/l3token.dtx
@@ -1026,6 +1026,49 @@
% \cs{peek_analysis_map_break:}).
% \end{function}
%
+% \begin{function}[added = 2020-07-20, TF]{\peek_regex:n, \peek_regex:N}
+% \begin{syntax}
+% \cs{peek_regex:nTF} \Arg{regex} \Arg{true code} \Arg{false code}
+% \end{syntax}
+% Tests if the \meta{tokens} that follow in the input stream match the
+% \meta{regular expression}. Any \meta{tokens} that have been read
+% are left in the input stream after the \meta{true code} or
+% \meta{false code} (as appropriate to the result of the test). See
+% \pkg{l3regex} for documentation of the syntax of regular
+% expressions. The \meta{regular expression} is implicitly anchored
+% at the start, so for instance \cs{peek_regex:nTF}~|{|~|a|~|}| is
+% essentially equivalent to \cs{peek_charcode:NTF}~|a|.
+% \begin{texnote}
+% Implicit character tokens are correctly considered by
+% \cs{peek_regex:nTF} as control sequences, while functions that
+% inspect individual tokens (for instance \cs{peek_charcode:NTF})
+% only take into account their meaning.
+% \end{texnote}
+% \end{function}
+%
+% \begin{function}[added = 2020-07-20, TF]
+% {\peek_regex_remove:n, \peek_regex_remove:N}
+% \begin{syntax}
+% \cs{peek_regex_remove:nTF} \Arg{regex} \Arg{true code} \Arg{false code}
+% \end{syntax}
+% Tests if the \meta{tokens} that follow in the input stream match the
+% \meta{regex}. If the test is true, the \meta{tokens} are removed
+% from the input stream and the \meta{true code} is inserted, while if
+% the test is false, the \meta{false code} is inserted followed by the
+% \meta{tokens} that have been read in the process of matching the
+% \meta{regex}. See \pkg{l3regex} for documentation of the syntax of
+% regular expressions. The \meta{regular expression} is implicitly
+% anchored at the start, so for instance
+% \cs{peek_regex_remove:nTF}~|{|~|a|~|}| is essentially equivalent to
+% \cs{peek_charcode_remove:NTF}~|a|.
+% \begin{texnote}
+% Implicit character tokens are correctly considered by
+% \cs{peek_regex_remove:nTF} as control sequences, while functions
+% that inspect individual tokens (for instance
+% \cs{peek_charcode:NTF}) only take into account their meaning.
+% \end{texnote}
+% \end{function}
+%
% \section{Description of all possible tokens}
% \label{sec:l3token:all-tokens}
%
diff --git a/l3kernel/testfiles/m3regex011.lvt b/l3kernel/testfiles/m3regex011.lvt
index 9d5d9cae9..bde3cbd93 100644
--- a/l3kernel/testfiles/m3regex011.lvt
+++ b/l3kernel/testfiles/m3regex011.lvt
@@ -31,4 +31,42 @@
}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+\OMIT
+\cs_set:Npn \test:w #1 \s_stop { \TYPE { | \exp_not:n {#1} | } }
+\TIMO
+
+\TEST { Peek~regex }
+ {
+ \peek_regex:nTF { } { \TRUE \test:w } { \ERROR \test:w } \aaa \s_stop
+ \peek_regex:nT { a } { \TRUE \test:w } a \s_stop
+ \peek_regex:nF { . (. a()) } { \ERROR \test:w } \test:w { a b \s_stop \show } \s_stop
+ \peek_regex:nTF { a \{ } { \TRUE \test:w } { \ERROR \test:w } a { b } \s_stop
+ \peek_regex:nTF { \cL. } { \ERROR \test:w } { \FALSE \test:w } \aaa \s_stop
+ \peek_regex:nT { b } { \ERROR \test:w } \test:w a \s_stop
+ \peek_regex:nF { \c[^C] . (. a()) } { \FALSE \test:w } \test:w { a b } c \s_stop
+ \peek_regex:nTF { a \{ \b c } { \ERROR \test:w } { \FALSE \test:w } a { b } \s_stop
+ \SEPARATOR
+ \peek_regex_remove:nTF { } { \TRUE \test:w } { \ERROR \test:w } \aaa \s_stop
+ \peek_regex_remove:nT { a } { \TRUE \test:w } a \s_stop
+ \peek_regex_remove:nF { . (. a()) } { \ERROR \test:w } \test:w { a b \s_stop \show } \s_stop
+ \peek_regex_remove:nTF { a \{ } { \TRUE \test:w } { \ERROR \test:w } a { b \s_stop \show } \s_stop
+ \peek_regex_remove:nTF { \cL. } { \ERROR \test:w } { \FALSE \test:w } \aaa \s_stop
+ \peek_regex_remove:nT { b } { \ERROR \test:w } \test:w a \s_stop
+ \peek_regex_remove:nF { \c[^C] . (. a()) } { \FALSE \test:w } \test:w { a b } c \s_stop
+ \peek_regex_remove:nTF { a \{ \b c } { \ERROR \test:w } { \FALSE \test:w } a { b } \s_stop
+ }
+
+\TEST { Peek~regex~compiled }
+ {
+ \regex_set:Nn \l_tmpa_regex { a | \c{test:w} }
+ \peek_regex:NTF \l_tmpa_regex { \TRUE \test:w } { \ERROR \test:w } a \s_stop
+ \peek_regex:NT \l_tmpa_regex { \TRUE \test:w } a \s_stop
+ \peek_regex:NF \l_tmpa_regex { \ERROR \test:w } \test:w a \s_stop
+ \peek_regex_remove:NTF \l_tmpa_regex { \TRUE \test:w } { \ERROR \test:w } a \s_stop
+ \peek_regex_remove:NT \l_tmpa_regex { \TRUE \test:w } a \s_stop
+ \peek_regex_remove:NF \l_tmpa_regex { \ERROR \test:w } \test:w a \s_stop
+ }
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\END
diff --git a/l3kernel/testfiles/m3regex011.tlg b/l3kernel/testfiles/m3regex011.tlg
index 05eb3b2f2..c37bd68dd 100644
--- a/l3kernel/testfiles/m3regex011.tlg
+++ b/l3kernel/testfiles/m3regex011.tlg
@@ -13,3 +13,56 @@ TEST 2: No groups within \c
<recently read> }
l. ... }
============================================================
+============================================================
+TEST 3: Peek regex
+============================================================
+TRUE
+|\aaa |
+TRUE
+|a|
+|ab\s_stop \show |
+TRUE
+|a{b}|
+FALSE
+|\aaa |
+|a|
+FALSE
+|\test:w {ab}c|
+FALSE
+|a{b}|
+============================================================
+TRUE
+|\aaa |
+TRUE
+||
+> end-group character }.
+<argument> ... \test:w }\test:w {ab\s_stop \show }
+ \s_stop \peek_regex_remove...
+l. ... }
+TRUE
+|b|
+> end-group character }.
+<argument> ...}{\ERROR \test:w }a{b\s_stop \show }
+ \s_stop \peek_regex_remove...
+l. ... }
+FALSE
+|\aaa |
+|a|
+FALSE
+|\test:w {ab}c|
+FALSE
+|a{b}|
+============================================================
+============================================================
+TEST 4: Peek regex compiled
+============================================================
+TRUE
+|a|
+TRUE
+|a|
+|a|
+TRUE
+||
+TRUE
+||
+============================================================
More information about the latex3-commits
mailing list.