[latex3-commits] [git/LaTeX3-latex3-latex3] gh590: Provide \ur escape to compose regex more easily (see #590) (fdbda83d1)

Sat Apr 24 00:55:34 CEST 2021

Repository : https://github.com/latex3/latex3
On branch  : gh590
Link       : https://github.com/latex3/latex3/commit/fdbda83d187e545e398177b82e1894ba7885bbd5

>---------------------------------------------------------------

commit fdbda83d187e545e398177b82e1894ba7885bbd5
Author: Bruno Le Floch <blflatex at gmail.com>
Date:   Sat Apr 24 00:53:31 2021 +0200

    Provide \ur escape to compose regex more easily (see #590)
    
    This takes the first approach suggested by Phelype and supported by
    eg9.  Thoughts welcome.


>---------------------------------------------------------------

fdbda83d187e545e398177b82e1894ba7885bbd5
 l3kernel/CHANGELOG.md             |  1 +
 l3kernel/l3regex.dtx              | 93 ++++++++++++++++++++++++++++-----------
 l3kernel/testfiles/m3regex007.lvt |  1 +
 l3kernel/testfiles/m3regex007.tlg | 25 +++++++++++
 4 files changed, 95 insertions(+), 25 deletions(-)

diff --git a/l3kernel/CHANGELOG.md b/l3kernel/CHANGELOG.md
index 055589332..87f5f9b0c 100644
--- a/l3kernel/CHANGELOG.md
+++ b/l3kernel/CHANGELOG.md
@@ -11,6 +11,7 @@ this project uses date-based 'snapshot' version identifiers.
 - `\seq_set_split_keep_spaces:Nnn` (see #784)
 - `\tracingstacklevels`
 - Color export in comma-separated format
+- `\ur{...}` escape in `l3regex` to compose regexes
 
 ### Changed
 - `\__kernel_file_name_sanitize:n` now uses a faster `\csname`-based
diff --git a/l3kernel/l3regex.dtx b/l3kernel/l3regex.dtx
index 5569afdd8..237c614a8 100644
--- a/l3kernel/l3regex.dtx
+++ b/l3kernel/l3regex.dtx
@@ -337,6 +337,12 @@
 % not supported directly: use a group, for instance as in
 % |(?:\u|\Arg{var~name}|){2,4}|.
 %
+% The |\ur| escape sequence allows inserting the contents of a |regex|
+% variable into a larger regular expression.  For instance,
+% |A\ur{l_tmpa_regex}B| matches the tokens |A| and |B| separated by
+% something that matches the regular expression
+% \cs[no-index]{l_tmpa_regex}.
+%
 % The option |(?i)| makes the match case insensitive (identifying
 % \texttt{A}--\texttt{Z} with \texttt{a}--\texttt{z}; no Unicode support
 % yet). This applies until the end of the group in which it appears, and
@@ -444,8 +450,8 @@
 %   submatches |\0|, |\1|, and so on, as in the example for |\u| below.
 % \end{l3regex-syntax}
 %
-% The escape sequence |\u|\Arg{tl~var~name} allows to insert the
-% contents of the token list with name \meta{tl~var~name} directly into
+% The escape sequence |\u|\Arg{var~name} allows inserting the
+% contents of the variable with name \meta{var~name} directly into
 % the replacement, giving an easier control of category codes.  When
 % nested in |\c{|\ldots{}|}| and |\u{|\ldots{}|}| constructions, the
 % |\u| and |\c|~escape sequences perform \cs{tl_to_str:v}, namely
@@ -756,9 +762,6 @@
 %   \item Unicode properties: |\p{..}| and |\P{..}|;
 %     |\X| which should match any \enquote{extended} Unicode sequence.
 %     This requires to manipulate a lot of data, probably using tree-boxes.
-%   \item Provide a syntax such as |\ur{l_my_regex}| to use an
-%     already-compiled regex in a more complicated regex.  This makes
-%     regexes more easily composable.
 % \end{itemize}
 %
 % The following features of \textsc{pcre} or Perl may or may not be
@@ -3324,33 +3327,55 @@
 % \subsubsection{Raw token lists with \cs{u}}
 %
 % \begin{macro}{\@@_compile_/u:}
-% \begin{macro}[EXP]{\@@_compile_u_loop:NN}
 %   The |\u| escape is invalid in classes and directly following a
-%   catcode test. Otherwise, it must be followed by a left brace. We
-%   then collect the characters for the argument of |\u| within an
-%   \texttt{x}-expanding assignment. In principle we could just wait to
-%   encounter a right brace, but this is unsafe: if the right brace was
-%   missing, then we would reach the end-markers of the regex, and
-%   continue, leading to obscure fatal errors. Instead, we only allow
-%   raw and special characters, and stop when encountering a special
-%   right brace, any escaped character, or the end-marker.
+%   catcode test. Otherwise a raw |r| just after |\u| (namely |\ur|)
+%   is dropped, and an auxiliary looks for the braced variable name.
 %    \begin{macrocode}
 \cs_new_protected:cpn { @@_compile_/u: } #1#2
   {
     \@@_if_in_class_or_catcode:TF
       { \@@_compile_raw_error:N u #1 #2 }
       {
-        \@@_two_if_eq:NNNNTF #1 #2 \@@_compile_special:N \c_left_brace_str
-          {
-            \__kernel_tl_set:Nx \l_@@_internal_a_tl { \if_false: } \fi:
-            \@@_compile_u_loop:NN
-          }
-          {
-            \__kernel_msg_error:nn { regex } { u-missing-lbrace }
-            \@@_compile_raw:N u #1 #2
-          }
+        \@@_two_if_eq:NNNNTF #1 #2 \@@_compile_raw:N r
+          { \@@_compile_u_brace:NNN \@@_compile_ur_end: }
+          { \@@_compile_u_brace:NNN \@@_compile_u_end: #1 #2 }
       }
   }
+%    \end{macrocode}
+% \end{macro}
+%
+% \begin{macro}{\@@_compile_u_brace:NNN}
+%   This enforces the presence of a left brace |#2#3|, then stores the
+%   end function |#1| and starts a loop to find the variable name.
+%    \begin{macrocode}
+\cs_new_protected:Npn \@@_compile_u_brace:NNN #1#2#3
+  {
+    \@@_two_if_eq:NNNNTF #2 #3 \@@_compile_special:N \c_left_brace_str
+      {
+        \tl_set:Nn \l_@@_internal_b_tl {#1}
+        \__kernel_tl_set:Nx \l_@@_internal_a_tl { \if_false: } \fi:
+        \@@_compile_u_loop:NN
+      }
+      {
+        \__kernel_msg_error:nn { regex } { u-missing-lbrace }
+        \token_if_eq_meaning:NNTF #1 \@@_compile_ur_end:
+          { \@@_compile_raw:N u \@@_compile_raw:N r }
+          { \@@_compile_raw:N u }
+        #2 #3
+      }
+  }
+%    \end{macrocode}
+% \end{macro}
+%
+% \begin{macro}[EXP]{\@@_compile_u_loop:NN}
+%   We collect the characters for the argument of |\u| within an
+%   \texttt{x}-expanding assignment. In principle we could just wait to
+%   encounter a right brace, but this is unsafe: if the right brace was
+%   missing, then we would reach the end-markers of the regex, and
+%   continue, leading to obscure fatal errors. Instead, we only allow
+%   raw and special characters, and stop when encountering a special
+%   right brace, any escaped character, or the end-marker.
+%    \begin{macrocode}
 \cs_new:Npn \@@_compile_u_loop:NN #1#2
   {
     \token_if_eq_meaning:NNTF #1 \@@_compile_raw:N
@@ -3359,19 +3384,37 @@
         \token_if_eq_meaning:NNTF #1 \@@_compile_special:N
           {
             \exp_after:wN \token_if_eq_charcode:NNTF \c_right_brace_str #2
-              { \if_false: { \fi: } \@@_compile_u_end: }
+              { \if_false: { \fi: } \l_@@_internal_b_tl }
               { #2 \@@_compile_u_loop:NN }
           }
           {
             \if_false: { \fi: }
             \__kernel_msg_error:nnx { regex } { u-missing-rbrace } {#2}
-            \@@_compile_u_end:
+            \l_@@_internal_b_tl
             #1 #2
           }
       }
   }
 %    \end{macrocode}
 % \end{macro}
+%
+% \begin{macro}{\@@_compile_ur_end:}
+%   For the |\ur{...}| construction, the variable's name has been
+%   stored in \cs{l_@@_internal_a_tl}.  We insert the value of that
+%   variable (expected to be a compiled regex, which is not checked
+%   here) into a non-capturing group with no repetition, to respect the
+%   structure of regexes.  This group could perhaps be omitted.
+%    \begin{macrocode}
+\cs_new_protected:Npn \@@_compile_ur_end:
+  {
+    \tl_build_put_right:Nx \l_@@_build_tl
+      {
+        \exp_not:N \@@_group_no_capture:nnnN
+          { \exp_not:v { \l_@@_internal_a_tl } }
+          { 1 } { 0 } \exp_not:N \c_false_bool
+      }
+  }
+%    \end{macrocode}
 % \end{macro}
 %
 % \begin{macro}{\@@_compile_u_end:}
diff --git a/l3kernel/testfiles/m3regex007.lvt b/l3kernel/testfiles/m3regex007.lvt
index 7e6c4bc5b..fb1b65fb6 100644
--- a/l3kernel/testfiles/m3regex007.lvt
+++ b/l3kernel/testfiles/m3regex007.lvt
@@ -23,6 +23,7 @@
     \regex_new:N \l_foo_regex
     \regex_set:Nn \l_foo_regex { \A a|b| }
     \regex_show:N \l_foo_regex
+    \regex_show:n { a \ur{l_foo_regex} b \c{\ur{l_foo_regex}|D} }
     \regex_show:n { a\c{bc}\u{c_space_tl}\c{\u{c_space_tl}|} }
     \tl_set:Nn \l_tmpa_tl { \abc }
     \int_set:Nn \l_tmpa_int { 7 }
diff --git a/l3kernel/testfiles/m3regex007.tlg b/l3kernel/testfiles/m3regex007.tlg
index b797aa26f..09b55e7d9 100644
--- a/l3kernel/testfiles/m3regex007.tlg
+++ b/l3kernel/testfiles/m3regex007.tlg
@@ -39,6 +39,31 @@ Defining \l_foo_regex on line ...
 +-branch.
 <recently read> }
 l. ...  }
+> Compiled regex {a\ur {l_foo_regex}b\c {\ur {l_foo_regex}|D}}:
++-branch
+  char code 97
+  ,-group begin (no capture)
+  | assertion: anchor at start (\A)
+  | char code 97
+  +-branch
+  | char code 98
+  +-branch
+  `-group end
+  char code 98
+  Match
+    control sequence
+    +-branch
+      ,-group begin (no capture)
+      | assertion: anchor at start (\A)
+      | char code 97
+      +-branch
+      | char code 98
+      +-branch
+      `-group end
+    +-branch
+      char code 68.
+<recently read> }
+l. ...  }
 > Compiled regex {a\c {bc}\u {c_space_tl}\c {\u {c_space_tl}|}}:
 +-branch
   char code 97