[latex3-commits] [git/LaTeX3-latex3-latex3] gh433-regex-case: Implement regex_case_match (see #433) (45836eb26)

Sun May 16 11:50:39 CEST 2021

Repository : https://github.com/latex3/latex3
On branch  : gh433-regex-case
Link       : https://github.com/latex3/latex3/commit/45836eb266afb362375fd34643cc1a5c97416ebc

>---------------------------------------------------------------

commit 45836eb266afb362375fd34643cc1a5c97416ebc
Author: Bruno Le Floch <blflatex at gmail.com>
Date:   Thu May 6 13:20:41 2021 +0200

    Implement regex_case_match (see #433)


>---------------------------------------------------------------

45836eb266afb362375fd34643cc1a5c97416ebc
 l3kernel/l3regex.dtx                               | 236 ++++++++++++++++++++-
 l3kernel/testfiles/m3regex012.lvt                  |  50 +++++
 .../testfiles/{m3clist006.tlg => m3regex012.tlg}   |  39 ++--
 3 files changed, 299 insertions(+), 26 deletions(-)

diff --git a/l3kernel/l3regex.dtx b/l3kernel/l3regex.dtx
index f10b2f320..d32a6006f 100644
--- a/l3kernel/l3regex.dtx
+++ b/l3kernel/l3regex.dtx
@@ -98,7 +98,7 @@
 %
 % \section{Syntax of regular expressions}
 %
-% \subsection{Regex examples}
+% \subsection{Regular expression examples}
 %
 % We start with a few examples, and encourage the reader to apply
 % \cs{regex_show:n} to these regular expressions.
@@ -610,6 +610,43 @@
 %   results in \cs[no-index]{l_foo_int} taking the value $5$.
 % \end{function}
 %
+% \begin{function}[noTF, added = 2021-05-15]{\regex_case_match:nn}
+%   \begin{syntax}
+%     \cs{regex_case_match:nnTF}
+%     ~~|{| \\
+%     ~~~~\Arg{regex_1} \Arg{code case_1} \\
+%     ~~~~\Arg{regex_2} \Arg{code case_2} \\
+%     ~~~~\ldots \\
+%     ~~~~\Arg{regex_n} \Arg{code case_n} \\
+%     ~~|}| \Arg{token list}
+%     ~~\Arg{true code} \Arg{false code}
+%   \end{syntax}
+%   Determines which of the \meta{regular expressions} matches at the
+%   earliest point in the \meta{token list}, and leaves the
+%   corresponding \meta{code case_i} followed by the \meta{true code}
+%   in the input stream.  If several \meta{regex} match starting at the
+%   same point, then the first one in the list is selected and the
+%   others are discarded.  If none of the \meta{regex} match, the
+%   \meta{false code} is left in the input stream.
+%
+%   In detail, for each starting position in the \meta{token list}, each
+%   of the \meta{regex} is searched in turn.  If one of them matches
+%   then the corresponding \meta{code} is used and everything else is
+%   discarded, while if none of the \meta{regex} match at a given
+%   position then the next starting position is attempted.  If none of
+%   the \meta{regex} match anywhere in the \meta{token list} then only
+%   the \meta{false code} (if any) is left in the input stream.  This
+%   differs from nested \cs{regex_match:nnTF} statements since all
+%   \meta{regex} are attempted at each position rather than matching
+%   \meta{regex_1} at every position before moving on to \meta{regex_2}.
+%
+%   Each \meta{regex} can either be given as a regex variable or as an
+%   explicit regular expression.  It may be useful to include as the
+%   last case \meta{regex_n} the regex |{\Z}|, which always matches at
+%   the very end of the \meta{token list}, so as to include fall-back
+%   code \meta{code case_n} used when no other case matches.
+% \end{function}
+%
 % \section{Submatch extraction}
 %
 % \begin{function}[noTF, added = 2017-05-26]
@@ -1088,6 +1125,27 @@
 %    \end{macrocode}
 % \end{macro}
 %
+% \begin{macro}[EXP]{\@@_tl_pairs_map:nNF, \@@_tl_pairs_map:Nnn}
+%   Expandably apply the function |#2| to the items of |#1| two at a
+%   time; the trailing \texttt{F} code is used only for an odd count.
+%    \begin{macrocode}
+\cs_new:Npn \@@_tl_pairs_map:nNF #1#2
+  {
+    \@@_tl_pairs_map:Nnn #2 #1 \q_@@_nil \q_@@_nil
+    \prg_break_point:
+  }
+\cs_new:Npn \@@_tl_pairs_map:Nnn #1#2#3
+  {
+    \@@_use_none_delimit_by_q_nil:w
+      #2 \prg_break:n \use_none:n
+      #3 \prg_break:n \use:n
+    \q_@@_nil
+    #1 {#2} {#3}
+    \@@_tl_pairs_map:Nnn #1
+  }
+%    \end{macrocode}
+% \end{macro}
+%
 % \subsubsection{Constants and variables}
 %
 % \begin{macro}{\@@_tmp:w}
@@ -1174,9 +1232,17 @@
 %    \end{macrocode}
 % \end{variable}
 %
+% \begin{variable}{\q_@@_nil}
+%   Internal quarks.
+%    \begin{macrocode}
+\quark_new:N \q_@@_nil
+%    \end{macrocode}
+% \end{variable}
+%
 % \begin{macro}[EXP]{
 %     \@@_use_none_delimit_by_q_recursion_stop:w,
-%     \@@_use_i_delimit_by_q_recursion_stop:nw
+%     \@@_use_i_delimit_by_q_recursion_stop:nw,
+%     \@@_use_none_delimit_by_q_nil:w,
 %   }
 %   Functions to gobble up to a quark.
 %    \begin{macrocode}
@@ -1184,16 +1250,10 @@
   #1 \q_@@_recursion_stop { }
 \cs_new:Npn \@@_use_i_delimit_by_q_recursion_stop:nw
   #1 #2 \q_@@_recursion_stop {#1}
+\cs_new:Npn \@@_use_none_delimit_by_q_nil:w #1 \q_@@_nil { }
 %    \end{macrocode}
 % \end{macro}
 %
-% \begin{variable}{\q_@@_nil}
-%   Internal quarks.
-%    \begin{macrocode}
-\quark_new:N \q_@@_nil
-%    \end{macrocode}
-% \end{variable}
-%
 % \begin{macro}[pTF]{\@@_quark_if_nil:n}
 %   Branching quark conditional.
 %    \begin{macrocode}
@@ -2331,6 +2391,24 @@
 %    \end{macrocode}
 % \end{macro}
 %
+% \begin{macro}{\@@_compile_use:n}
+%   Use a regex given either explicitly (then compiled) or as a regex
+%   variable, as allowed in \cs{regex_case_match:nn} and the like.  If
+%   |#1| is a single token whose expansion starts with \cs{@@_branch:n}
+%   then \cs{use_ii:nnn} drops the compilation and leaves |#1| itself.
+%    \begin{macrocode}
+\cs_new_protected:Npn \@@_compile_use:n #1
+  {
+    \tl_if_single_token:nT {#1}
+      {
+        \exp_args:No \tl_if_head_eq_meaning:nNT #1 \@@_branch:n
+          { \use_ii:nnn }
+      }
+    \@@_compile:n {#1} \l_@@_internal_regex
+  }
+%    \end{macrocode}
+% \end{macro}
+%
 % \begin{macro}{\@@_compile_escaped:N, \@@_compile_special:N}
 %   If the special character or escaped alphanumeric has a particular
 %   meaning in regexes, the corresponding function is used. Otherwise,
@@ -4142,6 +4220,76 @@
 %    \end{macrocode}
 % \end{macro}
 %
+% \begin{variable}{\g_@@_case_int}
+%   Number of the case matched by \cs{regex_case_match:nn} and related
+%   functions (zero if none); also counts the cases during the build.
+%    \begin{macrocode}
+\int_new:N \g_@@_case_int
+%    \end{macrocode}
+% \end{variable}
+%
+% \begin{variable}{\l_@@_case_max_group_int}
+%   The largest group number appearing in any of the \meta{regex} in the
+%   argument of \cs{regex_case_match:nn} and related functions.
+%    \begin{macrocode}
+\int_new:N \l_@@_case_max_group_int
+%    \end{macrocode}
+% \end{variable}
+%
+% \begin{macro}{\@@_case_build:n, \@@_case_build_aux:Nn, \@@_case_build_loop:n}
+%   See \cs{@@_build:n}, but with a loop.
+%    \begin{macrocode}
+\cs_new_protected:Npn \@@_case_build:n
+  { \@@_case_build_aux:Nn \c_true_bool }
+\cs_new_protected:Npn \@@_case_build_aux:Nn #1#2
+  {
+    \@@_standard_escapechar:
+    \int_set_eq:NN \l_@@_max_state_int \l_@@_min_state_int
+    \@@_build_new_state:
+    \@@_build_new_state:
+    \@@_toks_put_right:Nn \l_@@_left_state_int
+      { \@@_action_start_wildcard:N #1 }
+    %
+    \@@_build_new_state:
+    \@@_toks_put_left:Nx \l_@@_left_state_int
+      { \@@_action_submatch:nN { 0 } < }
+    \@@_push_lr_states:
+    \int_zero:N \l_@@_case_max_group_int
+    \int_gzero:N \g_@@_case_int
+    \tl_map_inline:nn {#2}
+      {
+        \int_gincr:N \g_@@_case_int
+        \@@_case_build_loop:n {##1}
+      }
+    \int_set_eq:NN \l_@@_capturing_group_int \l_@@_case_max_group_int
+    \@@_pop_lr_states:
+  }
+\cs_new_protected:Npn \@@_case_build_loop:n #1
+  {
+    \int_set:Nn \l_@@_capturing_group_int { 1 }
+    \@@_compile_use:n {#1}
+    \int_set:Nn \l_@@_case_max_group_int
+      {
+        \int_max:nn { \l_@@_case_max_group_int }
+          { \l_@@_capturing_group_int }
+      }
+    \seq_pop:NN \l_@@_right_state_seq \l_@@_internal_a_tl
+    \int_set:Nn \l_@@_right_state_int \l_@@_internal_a_tl
+    \@@_toks_put_left:Nx \l_@@_right_state_int
+      {
+        \@@_action_submatch:nN { 0 } >
+        \int_gset:Nn \g_@@_case_int
+          { \int_use:N \g_@@_case_int }
+        \@@_action_success:
+      }
+    \@@_toks_clear:N \l_@@_max_state_int
+    \seq_push:No \l_@@_right_state_seq
+      { \int_use:N \l_@@_max_state_int }
+    \int_incr:N \l_@@_max_state_int
+  }
+%    \end{macrocode}
+% \end{macro}
+%
 % \begin{macro}{\@@_build_for_cs:n}
 %   The matching code relies on some global intarray variables, but only
 %   uses a range of their entries.  Specifically,
@@ -4483,7 +4631,7 @@
 %   the group, and leaves \texttt{internal_a} pointing to the left end
 %   of the last repetition. We only record the submatch information at
 %   the last repetition. Finally, add a state at the end (the transition
-%   to it has been taken care of by the replicating auxiliary.
+%   to it has been taken care of by the replicating auxiliary).
 %    \begin{macrocode}
 \cs_new_protected:Npn \@@_group_repeat:nn #1#2
   {
@@ -6257,6 +6405,34 @@
 %    \end{macrocode}
 % \end{macro}
 %
+% \begin{macro}[noTF]{\regex_case_match:nn}
+%   The auxiliary errors if |#1| has an odd number of items, and
+%   otherwise it sets \cs{g_@@_case_int} according to which case was
+%   found (zero if not found).  Overall, the \texttt{false} branch is
+%   taken either if the cases |#1| were nonsense (odd number of items)
+%   or if there was no match.  The \texttt{true} branch is taken
+%   otherwise, and we leave the corresponding code in the input stream.
+%    \begin{macrocode}
+\cs_new_protected:Npn \regex_case_match:nnTF #1#2#3#4
+  {
+    \int_gzero:N \g_@@_case_int
+    \@@_case_match:nn {#1} {#2}
+    \int_compare:nNnTF \g_@@_case_int = 0
+      {#4}
+      {
+        \tl_item:nn {#1} { 2 * \g_@@_case_int }
+        #3
+      }
+  }
+\cs_new_protected:Npn \regex_case_match:nn #1#2
+  { \regex_case_match:nnTF {#1} {#2} { } { } }
+\cs_new_protected:Npn \regex_case_match:nnT #1#2#3
+  { \regex_case_match:nnTF {#1} {#2} {#3} { } }
+\cs_new_protected:Npn \regex_case_match:nnF #1#2#3
+  { \regex_case_match:nnTF {#1} {#2} { } {#3} }
+%    \end{macrocode}
+% \end{macro}
+%
 % \begin{macro}[noTF]
 %   {
 %     \regex_extract_once:nnN, \regex_extract_once:NnN,
@@ -6421,6 +6597,37 @@
 %    \end{macrocode}
 % \end{macro}
 %
+% \begin{macro}{\@@_case_match:nn}
+% \begin{macro}[EXP]{\@@_case_match_aux:nn}
+%   The code would get badly messed up if the number of items in |#1|
+%   were not even, so we catch this case, then follow the same code as
+%   \cs{regex_match:nnTF} but using \cs{@@_case_build:n} and without
+%   returning a result.
+%    \begin{macrocode}
+\cs_new_protected:Npn \@@_case_match:nn #1#2
+  {
+    \int_if_odd:nTF { \tl_count:n {#1} }
+      {
+        \__kernel_msg_error:nnxxxx { regex } { case-odd }
+          { \token_to_str:N \regex_case_match:nn } { code }
+          { \tl_count:n {#1} } { \tl_to_str:n {#1} }
+      }
+      {
+        \group_begin:
+          \@@_disable_submatches:
+          \@@_single_match:
+          \exp_args:Nx \@@_case_build:n
+            { \@@_tl_pairs_map:nNF {#1} \@@_case_match_aux:nn { } }
+          \int_gzero:N \g_@@_case_int
+          \@@_match:n {#2}
+        \group_end:
+      }
+  }
+\cs_new:Npn \@@_case_match_aux:nn #1#2 { \exp_not:n { {#1} } }
+%    \end{macrocode}
+% \end{macro}
+% \end{macro}
+%
 % \begin{macro}{\@@_count:nnN}
 %   Again, we don't care about submatches. Instead of aborting after the
 %   first \enquote{longest match} is found, we search for multiple
@@ -7600,8 +7807,15 @@
   { The~values~given~in~a~quantifier~must~be~in~order. }
 %    \end{macrocode}
 %
-% Used when showing a regex.
+% Used in user commands, and when showing a regex.
 %    \begin{macrocode}
+\__kernel_msg_new:nnnn { regex } { case-odd }
+  { #1~with~odd~number~of~items }
+  {
+    There~must~be~a~#2~part~for~each~regex:~
+    found~odd~number~of~items~(#3)~in\\
+    \iow_indent:n {#4}
+  }
 \__kernel_msg_new:nnn { regex } { show }
   {
     >~Compiled~regex~
diff --git a/l3kernel/testfiles/m3regex012.lvt b/l3kernel/testfiles/m3regex012.lvt
new file mode 100644
index 000000000..32f93aa47
--- /dev/null
+++ b/l3kernel/testfiles/m3regex012.lvt
@@ -0,0 +1,50 @@
+%
+% Copyright (C) 2021 The LaTeX Project
+\documentclass{minimal}
+\input{regression-test}
+
+\RequirePackage[enable-debug]{expl3}
+\ExplSyntaxOn
+\debug_on:n { check-declarations , deprecation , log-functions }
+\ExplSyntaxOff
+
+% \begin{document}
+
+\START
+\AUTHOR{Bruno Le Floch}
+\ExplSyntaxOn
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+\TEST { regex_case_match:nn }
+  {
+    \regex_set:Nn \l_tmpa_regex { [a-z]bc }
+    \cs_set_protected:Npn \test:n #1
+      {
+        \TYPE{#1:}
+        \regex_case_match:nnTF
+          {
+            \l_tmpa_regex { \TYPE{abc} }
+            { (?i) Y \w } { \TYPE{Y} }
+            { z \Z } { \TYPE{Z} }
+          }
+          {#1} { \TRUE } { \FALSE }
+      }
+    \test:n { }
+    \test:n { abc }
+    \test:n { Y abc }
+    \test:n { y abc }
+    \test:n { y bc }
+    \test:n { y ; bc }
+    \test:n { y ; bc z }
+  }
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+\TEST { regex_case_match:nn ~ error }
+  {
+    \regex_case_match:nn { * } { .. }
+  }
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\END
diff --git a/l3kernel/testfiles/m3clist006.tlg b/l3kernel/testfiles/m3regex012.tlg
similarity index 58%
copy from l3kernel/testfiles/m3clist006.tlg
copy to l3kernel/testfiles/m3regex012.tlg
index 7c17ac21a..d006b485b 100644
--- a/l3kernel/testfiles/m3clist006.tlg
+++ b/l3kernel/testfiles/m3regex012.tlg
@@ -2,26 +2,35 @@ This is a generated file for the LaTeX (2e + expl3) validation system.
 Don't change this file in any respect.
 Author: Bruno Le Floch
 ============================================================
-TEST 1: clist_reverse:n
+TEST 1: regex_case_match:nn
 ============================================================
-|| |{}|,|, |,{}| ,|{},|
-|,,, { },{ }, {},{}, |\c ,\a \b |
-============================================================
-============================================================
-TEST 2: clist_reverse:N
-============================================================
-Defining \c_test_clist on line ...
-\c ,{,},{ },\a \b 
-TRUE
+:
 FALSE
+abc:
+abc
 TRUE
+Yabc:
+Y
 TRUE
-============================================================
-============================================================
-TEST 3: clist_if_in:nnTF
-============================================================
-TRUE
+yabc:
+Y
 TRUE
+ybc:
+abc
 TRUE
+y;bc:
 FALSE
+y;bcz:
+Z
+TRUE
+============================================================
+============================================================
+TEST 2: regex_case_match:nn error
+============================================================
+! LaTeX3 Error: \regex_case_match:nn with odd number of items
+For immediate help type H <return>.
+ ...                                              
+l. ...  }
+There must be a code part for each regex: found odd number of items (1) in
+    *
 ============================================================