[latex3-commits] [git/LaTeX3-latex3-latex3] gh590: Provide \ur escape to compose regex more easily (see #590) (fdbda83d1)
Bruno Le Floch
blflatex at gmail.com
Sat Apr 24 00:55:34 CEST 2021
Repository : https://github.com/latex3/latex3
On branch : gh590
Link : https://github.com/latex3/latex3/commit/fdbda83d187e545e398177b82e1894ba7885bbd5
>---------------------------------------------------------------
commit fdbda83d187e545e398177b82e1894ba7885bbd5
Author: Bruno Le Floch <blflatex at gmail.com>
Date: Sat Apr 24 00:53:31 2021 +0200
Provide \ur escape to compose regex more easily (see #590)
This takes the first approach suggested by Phelype and supported by
eg9. Thoughts welcome.
>---------------------------------------------------------------
fdbda83d187e545e398177b82e1894ba7885bbd5
l3kernel/CHANGELOG.md | 1 +
l3kernel/l3regex.dtx | 93 ++++++++++++++++++++++++++++-----------
l3kernel/testfiles/m3regex007.lvt | 1 +
l3kernel/testfiles/m3regex007.tlg | 25 +++++++++++
4 files changed, 95 insertions(+), 25 deletions(-)
diff --git a/l3kernel/CHANGELOG.md b/l3kernel/CHANGELOG.md
index 055589332..87f5f9b0c 100644
--- a/l3kernel/CHANGELOG.md
+++ b/l3kernel/CHANGELOG.md
@@ -11,6 +11,7 @@ this project uses date-based 'snapshot' version identifiers.
- `\seq_set_split_keep_spaces:Nnn` (see #784)
- `\tracingstacklevels`
- Color export in comma-separated format
+- `\ur{...}` escape in `l3regex` to compose regexes
### Changed
- `\__kernel_file_name_sanitize:n` now uses a faster `\csname`-based
diff --git a/l3kernel/l3regex.dtx b/l3kernel/l3regex.dtx
index 5569afdd8..237c614a8 100644
--- a/l3kernel/l3regex.dtx
+++ b/l3kernel/l3regex.dtx
@@ -337,6 +337,12 @@
% not supported directly: use a group, for instance as in
% |(?:\u|\Arg{var~name}|){2,4}|.
%
+% The |\ur| escape sequence allows one to insert the contents of a
+% |regex| variable into a larger regular expression. For instance,
+% |A\ur{l_tmpa_regex}B| matches the tokens |A| and |B| separated by
+% something that matches the regular expression
+% \cs[no-index]{l_tmpa_regex}.
+%
% The option |(?i)| makes the match case insensitive (identifying
% \texttt{A}--\texttt{Z} with \texttt{a}--\texttt{z}; no Unicode support
% yet). This applies until the end of the group in which it appears, and
@@ -444,8 +450,8 @@
% submatches |\0|, |\1|, and so on, as in the example for |\u| below.
% \end{l3regex-syntax}
%
-% The escape sequence |\u|\Arg{tl~var~name} allows to insert the
-% contents of the token list with name \meta{tl~var~name} directly into
+% The escape sequence |\u|\Arg{var~name} allows one to insert the
+% contents of the variable with name \meta{var~name} directly into
% the replacement, giving an easier control of category codes. When
% nested in |\c{|\ldots{}|}| and |\u{|\ldots{}|}| constructions, the
% |\u| and |\c|~escape sequences perform \cs{tl_to_str:v}, namely
@@ -756,9 +762,6 @@
% \item Unicode properties: |\p{..}| and |\P{..}|;
% |\X| which should match any \enquote{extended} Unicode sequence.
% This requires to manipulate a lot of data, probably using tree-boxes.
-% \item Provide a syntax such as |\ur{l_my_regex}| to use an
-% already-compiled regex in a more complicated regex. This makes
-% regexes more easily composable.
% \end{itemize}
%
% The following features of \textsc{pcre} or Perl may or may not be
@@ -3324,33 +3327,55 @@
% \subsubsection{Raw token lists with \cs{u}}
%
% \begin{macro}{\@@_compile_/u:}
-% \begin{macro}[EXP]{\@@_compile_u_loop:NN}
% The |\u| escape is invalid in classes and directly following a
-% catcode test. Otherwise, it must be followed by a left brace. We
-% then collect the characters for the argument of |\u| within an
-% \texttt{x}-expanding assignment. In principle we could just wait to
-% encounter a right brace, but this is unsafe: if the right brace was
-% missing, then we would reach the end-markers of the regex, and
-% continue, leading to obscure fatal errors. Instead, we only allow
-% raw and special characters, and stop when encountering a special
-% right brace, any escaped character, or the end-marker.
+% catcode test. Otherwise, test for a following |r| (for |\ur|), and
+% pass the matching end function to the auxiliary finding the variable name.
% \begin{macrocode}
\cs_new_protected:cpn { @@_compile_/u: } #1#2
{
\@@_if_in_class_or_catcode:TF
{ \@@_compile_raw_error:N u #1 #2 }
{
- \@@_two_if_eq:NNNNTF #1 #2 \@@_compile_special:N \c_left_brace_str
- {
- \__kernel_tl_set:Nx \l_@@_internal_a_tl { \if_false: } \fi:
- \@@_compile_u_loop:NN
- }
- {
- \__kernel_msg_error:nn { regex } { u-missing-lbrace }
- \@@_compile_raw:N u #1 #2
- }
+ \@@_two_if_eq:NNNNTF #1 #2 \@@_compile_raw:N r
+ { \@@_compile_u_brace:NNN \@@_compile_ur_end: }
+ { \@@_compile_u_brace:NNN \@@_compile_u_end: #1 #2 }
}
}
+% \end{macrocode}
+% \end{macro}
+%
+% \begin{macro}{\@@_compile_u_brace:NNN}
+%   Enforce a left brace after |\u| or |\ur|, store the end function~|#1|
+%   in \cs{l_@@_internal_b_tl}, and start the loop that finds the name.
+%    \begin{macrocode}
+\cs_new_protected:Npn \@@_compile_u_brace:NNN #1#2#3
+  {
+    \@@_two_if_eq:NNNNTF #2 #3 \@@_compile_special:N \c_left_brace_str
+      {
+        \tl_set:Nn \l_@@_internal_b_tl {#1}
+        \__kernel_tl_set:Nx \l_@@_internal_a_tl { \if_false: } \fi:
+        \@@_compile_u_loop:NN
+      }
+      {
+        \__kernel_msg_error:nn { regex } { u-missing-lbrace }
+        \token_if_eq_meaning:NNTF #1 \@@_compile_ur_end:
+          { \@@_compile_raw:N u \@@_compile_raw:N r }
+          { \@@_compile_raw:N u }
+        #2 #3
+      }
+  }
+%    \end{macrocode}
+% \end{macro}
+%
+% \begin{macro}[EXP]{\@@_compile_u_loop:NN}
+% We collect the characters for the argument of |\u| within an
+% \texttt{x}-expanding assignment. In principle we could just wait to
+% encounter a right brace, but this is unsafe: if the right brace was
+% missing, then we would reach the end-markers of the regex, and
+% continue, leading to obscure fatal errors. Instead, we only allow
+% raw and special characters, and stop when encountering a special
+% right brace, any escaped character, or the end-marker.
+% \begin{macrocode}
\cs_new:Npn \@@_compile_u_loop:NN #1#2
{
\token_if_eq_meaning:NNTF #1 \@@_compile_raw:N
@@ -3359,19 +3384,37 @@
\token_if_eq_meaning:NNTF #1 \@@_compile_special:N
{
\exp_after:wN \token_if_eq_charcode:NNTF \c_right_brace_str #2
- { \if_false: { \fi: } \@@_compile_u_end: }
+ { \if_false: { \fi: } \l_@@_internal_b_tl }
{ #2 \@@_compile_u_loop:NN }
}
{
\if_false: { \fi: }
\__kernel_msg_error:nnx { regex } { u-missing-rbrace } {#2}
- \@@_compile_u_end:
+ \l_@@_internal_b_tl
#1 #2
}
}
}
% \end{macrocode}
% \end{macro}
+%
+% \begin{macro}{\@@_compile_ur_end:}
+% For the |\ur{...}| construction, once we have extracted the
+% variable's name, we simply insert the compiled regex (which the
+% variable should be) into a non-capturing group (with no repetition)
+% to respect the structure of regexes. It might be possible to omit
+% this group.
+% \begin{macrocode}
+\cs_new_protected:Npn \@@_compile_ur_end:
+ {
+ \tl_build_put_right:Nx \l_@@_build_tl
+ {
+ \exp_not:N \@@_group_no_capture:nnnN
+ { \exp_not:v { \l_@@_internal_a_tl } }
+ { 1 } { 0 } \exp_not:N \c_false_bool
+ }
+ }
+% \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_compile_u_end:}
diff --git a/l3kernel/testfiles/m3regex007.lvt b/l3kernel/testfiles/m3regex007.lvt
index 7e6c4bc5b..fb1b65fb6 100644
--- a/l3kernel/testfiles/m3regex007.lvt
+++ b/l3kernel/testfiles/m3regex007.lvt
@@ -23,6 +23,7 @@
\regex_new:N \l_foo_regex
\regex_set:Nn \l_foo_regex { \A a|b| }
\regex_show:N \l_foo_regex
+ \regex_show:n { a \ur{l_foo_regex} b \c{\ur{l_foo_regex}|D} }
\regex_show:n { a\c{bc}\u{c_space_tl}\c{\u{c_space_tl}|} }
\tl_set:Nn \l_tmpa_tl { \abc }
\int_set:Nn \l_tmpa_int { 7 }
diff --git a/l3kernel/testfiles/m3regex007.tlg b/l3kernel/testfiles/m3regex007.tlg
index b797aa26f..09b55e7d9 100644
--- a/l3kernel/testfiles/m3regex007.tlg
+++ b/l3kernel/testfiles/m3regex007.tlg
@@ -39,6 +39,31 @@ Defining \l_foo_regex on line ...
+-branch.
<recently read> }
l. ... }
+> Compiled regex {a\ur {l_foo_regex}b\c {\ur {l_foo_regex}|D}}:
++-branch
+ char code 97
+ ,-group begin (no capture)
+ | assertion: anchor at start (\A)
+ | char code 97
+ +-branch
+ | char code 98
+ +-branch
+ `-group end
+ char code 98
+ Match
+ control sequence
+ +-branch
+ ,-group begin (no capture)
+ | assertion: anchor at start (\A)
+ | char code 97
+ +-branch
+ | char code 98
+ +-branch
+ `-group end
+ +-branch
+ char code 68.
+<recently read> }
+l. ... }
> Compiled regex {a\c {bc}\u {c_space_tl}\c {\u {c_space_tl}|}}:
+-branch
char code 97
More information about the latex3-commits
mailing list.