[latex3-commits] [latex3/latex3] gh890: Add wrapper functions for regex concepts (c568af530)

Tue Nov 26 18:27:28 CET 2024

Repository : https://github.com/latex3/latex3
On branch  : gh890
Link       : https://github.com/latex3/latex3/commit/c568af530295df45760d3f9e5fb989bd68481ddc

>---------------------------------------------------------------

commit c568af530295df45760d3f9e5fb989bd68481ddc
Author: Joseph Wright <joseph at texdev.net>
Date:   Tue Nov 26 17:10:32 2024 +0000

    Add wrapper functions for regex concepts


>---------------------------------------------------------------

c568af530295df45760d3f9e5fb989bd68481ddc
 l3kernel/CHANGELOG.md |   7 +++
 l3kernel/l3int.dtx    |  66 ++++++++++++++++++++++++++
 l3kernel/l3seq.dtx    |  97 +++++++++++++++++++++++++++++++++++++
 l3kernel/l3tl.dtx     | 129 ++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 299 insertions(+)

diff --git a/l3kernel/CHANGELOG.md b/l3kernel/CHANGELOG.md
index ce71252de..3673a0c88 100644
--- a/l3kernel/CHANGELOG.md
+++ b/l3kernel/CHANGELOG.md
@@ -11,6 +11,13 @@ this project uses date-based 'snapshot' version identifiers.
 - `\bitset_use:N` and `\clist_use:N`: this clarifies that bitsets and clists
   can be used with `V`-type expansion
 - `\sys_if_engine_opentype:(TF)`
+- Add equivalent interfaces for regex functions:
+  - `\int_(g)set_regex_count:Nnn`, `\int_(g)set_regex_count:NNn`
+  - `\seq_(g)set_regex_extract:Nnn`, `\seq_(g)set_regex_extract:NNn`
+  - `\seq_(g)set_regex_split:Nnn`, `\seq_(g)set_regex_split:NNn`
+  - `\tl_if_regex_match:nn(TF)`, `\tl_if_regex_match:nN(TF)`
+  - `\tl_regex_(g)replace_once:Nnn`, `\tl_regex_(g)replace_once:NNn`
+  - `\tl_regex_(g)replace_all:Nnn`, `\tl_regex_(g)replace_all:NNn`
 
 ### Removed
 - `\c_catcode_active_tl`: was missing a `_`, always intended to be internal
diff --git a/l3kernel/l3int.dtx b/l3kernel/l3int.dtx
index 7f3399429..08a827463 100644
--- a/l3kernel/l3int.dtx
+++ b/l3kernel/l3int.dtx
@@ -307,6 +307,33 @@
 %   \cs{int_eval:n}).
 % \end{function}
 %
+% \begin{function}[added = 2024-11-26]
+%   {
+%     \int_set_regex_count:Nnn, \int_set_regex_count:cnn,
+%     \int_set_regex_count:NNn, \int_set_regex_count:cNn,
+%     \int_gset_regex_count:Nnn, \int_gset_regex_count:cnn,
+%     \int_gset_regex_count:NNn, \int_gset_regex_count:cNn,
+%   }
+%   \begin{syntax}
+%     \cs{int_set_regex_count:Nnn} \meta{int var} \Arg{regular expression} \Arg{token list}
+%     \cs{int_set_regex_count:NNn} \meta{int var} \meta{regex~var} \Arg{token list}
+%   \end{syntax}
+%   Sets \meta{int var} equal to the number of times
+%   \meta{regular expression} appears in \meta{token list}.
+%   The search starts by finding the left-most longest match,
+%   respecting greedy and lazy (non-greedy) operators. Then the search
+%   starts again from the character following the last character
+%   of the previous match, until reaching the end of the token list.
+%   Infinite loops are prevented in the case where the regular expression
+%   can match an empty token list: then we count one match between each
+%   pair of characters.
+%   For instance,
+%   \begin{verbatim}
+%     \int_set_regex_count:Nnn \l_foo_int { (b+|c) } { abbababcbb }
+%   \end{verbatim}
+%   results in \cs[no-index]{l_foo_int} taking the value $5$.
+% \end{function}
+%
 % \begin{function}[updated = 2011-10-22]
 %   {\int_sub:Nn, \int_sub:cn, \int_gsub:Nn, \int_gsub:cn}
 %   \begin{syntax}
@@ -1495,6 +1522,45 @@
 % \end{macro}
 % \end{macro}
 %
+% \begin{macro}
+%   {
+%     \int_set_regex_count:Nnn, \int_set_regex_count:cnn,
+%     \int_gset_regex_count:Nnn, \int_gset_regex_count:cnn
+%   }
+% \begin{macro}
+%   {
+%     \int_set_regex_count:NNn, \int_set_regex_count:cNn,
+%     \int_gset_regex_count:NNn, \int_gset_regex_count:cNn
+%   }
+%   Wrappers around \cs{regex_count:nnN} and \cs{regex_count:NnN}. They
+%   take the \meta{int var} first, followed by the regular expression (|#2|)
+%   and the token list (|#3|), and hand these on in the order expected by
+%   the regex functions: expression, token list, integer. The regex
+%   functions assign locally, so the global versions count into a scratch
+%   integer inside a group and copy the result out with a global
+%   assignment before the group ends. The count always overwrites the
+%   scratch integer, so it needs no initialisation.
+%    \begin{macrocode}
+\cs_new_protected:Npn \int_set_regex_count:Nnn #1#2#3
+  { \regex_count:nnN {#2} {#3} #1 }
+\cs_generate_variant:Nn \int_set_regex_count:Nnn { c }
+\cs_new_protected:Npn \int_gset_regex_count:Nnn #1#2#3
+  {
+    \group_begin:
+      \regex_count:nnN {#2} {#3} \l_@@_internal_a_int
+      \int_gset_eq:NN #1 \l_@@_internal_a_int
+    \group_end:
+  }
+\cs_generate_variant:Nn \int_gset_regex_count:Nnn { c }
+\cs_new_protected:Npn \int_set_regex_count:NNn #1#2#3
+  { \regex_count:NnN #2 {#3} #1 }
+\cs_generate_variant:Nn \int_set_regex_count:NNn { c }
+\cs_new_protected:Npn \int_gset_regex_count:NNn #1#2#3
+  {
+    \group_begin:
+      \regex_count:NnN #2 {#3} \l_@@_internal_a_int
+      \int_gset_eq:NN #1 \l_@@_internal_a_int
+    \group_end:
+  }
+\cs_generate_variant:Nn \int_gset_regex_count:NNn { c }
+%    \end{macrocode}
+% \end{macro}
+% \end{macro}
+%
 % \subsection{Using integers}
 %
 % \begin{macro}{\int_use:N, \int_use:c}
diff --git a/l3kernel/l3seq.dtx b/l3kernel/l3seq.dtx
index 120081197..885a044e1 100644
--- a/l3kernel/l3seq.dtx
+++ b/l3kernel/l3seq.dtx
@@ -189,6 +189,45 @@
 %   \end{texnote}
 % \end{function}
 %
+% \begin{function}[added = 2024-11-26]
+%   {
+%     \seq_set_regex_extract:Nnn, \seq_set_regex_extract:cnn,
+%     \seq_set_regex_extract:NNn, \seq_set_regex_extract:cNn,
+%     \seq_gset_regex_extract:Nnn, \seq_gset_regex_extract:cnn,
+%     \seq_gset_regex_extract:NNn, \seq_gset_regex_extract:cNn,
+%   }
+%   \begin{syntax}
+%     \cs{seq_set_regex_extract:Nnn} \meta{seq~var} \Arg{regular expression} \Arg{token list}
+%     \cs{seq_set_regex_extract:NNn} \meta{seq~var} \meta{regex~var} \Arg{token list}
+%   \end{syntax}
+%   Finds the first match of the \meta{regular expression} in the
+%   \meta{token list}. If it exists, the match is stored as the first
+%   item of the \meta{seq~var}, and further items are the contents of
+%   capturing groups, in the order of their opening parenthesis. If
+%   there is no match, the \meta{seq~var} is cleared.
+% \end{function}
+%
+% \begin{function}[added = 2024-11-26]
+%   {
+%     \seq_set_regex_split:Nnn, \seq_set_regex_split:cnn,
+%     \seq_set_regex_split:NNn, \seq_set_regex_split:cNn,
+%     \seq_gset_regex_split:Nnn, \seq_gset_regex_split:cnn,
+%     \seq_gset_regex_split:NNn, \seq_gset_regex_split:cNn,
+%   }
+%   \begin{syntax}
+%     \cs{seq_set_regex_split:Nnn} \meta{seq~var} \Arg{regular expression} \Arg{token list}
+%     \cs{seq_set_regex_split:NNn} \meta{seq~var} \meta{regex~var} \Arg{token list}
+%   \end{syntax}
+%   Splits the \meta{token list} into a sequence of parts, delimited by
+%   matches of the \meta{regular expression}. If the \meta{regular expression}
+%   has capturing groups, then the token lists that they match are stored as
+%   items of the sequence as well. The assignment to \meta{seq~var} is
+%   local for the \texttt{set} functions and global for the \texttt{gset} ones.
+%   If no match is found the resulting \meta{seq~var} has the
+%   \meta{token list} as its sole item. If the \meta{regular expression}
+%   matches the empty token list, then the \meta{token list} is split
+%   into single tokens.
+% \end{function}
+%
 % \begin{function}
 %   {\seq_concat:NNN, \seq_concat:ccc, \seq_gconcat:NNN, \seq_gconcat:ccc}
 %   \begin{syntax}
@@ -1411,6 +1450,64 @@
 % \end{macro}
 % \end{macro}
 %
+% \begin{macro}
+%   {
+%     \seq_set_regex_extract:Nnn, \seq_set_regex_extract:cnn,
+%     \seq_gset_regex_extract:Nnn, \seq_gset_regex_extract:cnn
+%   }
+% \begin{macro}
+%   {
+%     \seq_set_regex_extract:NNn, \seq_set_regex_extract:cNn,
+%     \seq_gset_regex_extract:NNn, \seq_gset_regex_extract:cNn
+%   }
+% \begin{macro}
+%   {
+%     \seq_set_regex_split:Nnn, \seq_set_regex_split:cnn,
+%     \seq_gset_regex_split:Nnn, \seq_gset_regex_split:cnn
+%   }
+% \begin{macro}
+%   {
+%     \seq_set_regex_split:NNn, \seq_set_regex_split:cNn,
+%     \seq_gset_regex_split:NNn, \seq_gset_regex_split:cNn
+%   }
+%   Wrappers around \cs{regex_extract_once:nnN}, \cs{regex_split:nnN} and
+%   their |NnN| forms. The wrappers take the \meta{seq~var} first, then
+%   the regular expression (|#2|) and the token list (|#3|), and pass
+%   these on in the order expected by the regex functions: expression,
+%   token list, sequence. The local versions are direct calls.
+%    \begin{macrocode}
+\cs_new_protected:Npn \seq_set_regex_extract:Nnn #1#2#3
+  { \regex_extract_once:nnN {#2} {#3} #1 }
+\cs_generate_variant:Nn \seq_set_regex_extract:Nnn { c }
+\cs_new_protected:Npn \seq_set_regex_extract:NNn #1#2#3
+  { \regex_extract_once:NnN #2 {#3} #1 }
+\cs_generate_variant:Nn \seq_set_regex_extract:NNn { c }
+\cs_new_protected:Npn \seq_set_regex_split:Nnn #1#2#3
+  { \regex_split:nnN {#2} {#3} #1 }
+\cs_generate_variant:Nn \seq_set_regex_split:Nnn { c }
+\cs_new_protected:Npn \seq_set_regex_split:NNn #1#2#3
+  { \regex_split:NnN #2 {#3} #1 }
+\cs_generate_variant:Nn \seq_set_regex_split:NNn { c }
+%    \end{macrocode}
+%   The global versions are generated by a scratch function whose
+%   arguments are: |#1|, the operation name used in the \texttt{seq}
+%   function; |#2|, the name of the corresponding \texttt{regex}
+%   operation; |#3|, the argument type of the regular expression
+%   (|n| or |N|); |#4|, either empty or \cs{use:n}. In the latter case
+%   the \cs{use:n} is expanded while the definition is built, which
+%   removes the braces around the regex variable. Each generated function
+%   works in a group: the result is stored locally in a scratch variable
+%   and then copied to the \meta{seq~var} by a global assignment. The
+%   scratch variable needs no initial value because the regex functions
+%   always overwrite their target.
+%    \begin{macrocode}
+\group_begin:
+  \cs_set_protected:Npn \@@_tmp:w #1#2#3#4
+    {
+      \cs_new_protected:cpe { seq_gset_regex_ #1 :N #3 n } ##1##2##3
+        {
+          \group_begin:
+            \exp_not:c { regex_ #2 : #3 nN }
+              #4 {##2} {##3} \exp_not:N \l_@@_internal_a_tl
+            \seq_gset_eq:NN ##1 \exp_not:N \l_@@_internal_a_tl
+          \group_end:
+        }
+      \cs_generate_variant:cn { seq_gset_regex_ #1 :N #3 n } { c }
+    }
+  \@@_tmp:w { extract } { extract_once } n { }
+  \@@_tmp:w { extract } { extract_once } N \use:n
+  \@@_tmp:w { split }   { split }        n { }
+  \@@_tmp:w { split }   { split }        N \use:n
+\group_end:
+%    \end{macrocode}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+% \end{macro}
+%
 % \begin{macro}{\seq_concat:NNN, \seq_concat:ccc}
 % \UnitTested
 % \begin{macro}{\seq_gconcat:NNN, \seq_gconcat:ccc}
diff --git a/l3kernel/l3tl.dtx b/l3kernel/l3tl.dtx
index 3d08f2354..c40b0aa66 100644
--- a/l3kernel/l3tl.dtx
+++ b/l3kernel/l3tl.dtx
@@ -376,6 +376,24 @@
 %   Token groups (|{|\ldots|}|) are not single tokens.
 % \end{function}
 %
+% \begin{function}[TF, added = 2024-11-26]
+%   {
+%     \tl_if_regex_match:nn, \tl_if_regex_match:Vn,
+%     \tl_if_regex_match:nN, \tl_if_regex_match:VN,
+%   }
+%   \begin{syntax}
+%     \cs{tl_if_regex_match:nnTF} \Arg{token list} \Arg{regular expression} \Arg{true code} \Arg{false code}
+%     \cs{tl_if_regex_match:nNTF} \Arg{token list} \meta{regex~var} \Arg{true code} \Arg{false code}
+%   \end{syntax}
+%   Tests whether the \meta{regular expression} matches any part
+%   of the \meta{token list}. For instance,
+%   \begin{verbatim}
+%     \tl_if_regex_match:nnTF { abecdcx } { b [cde]* } { TRUE } { FALSE }
+%     \tl_if_regex_match:nnTF { example } { [b-dq-w] } { TRUE } { FALSE }
+%   \end{verbatim}
+%   leaves \texttt{TRUE} then \texttt{FALSE} in the input stream.
+% \end{function}
+%
 % \subsection{Testing the first token}
 %
 % \begin{function}[updated = 2012-07-09, EXP, pTF]
@@ -1158,6 +1176,43 @@
 %   for an example).
 % \end{function}
 %
+% \begin{function}[added = 2024-11-26]
+%   {
+%     \tl_regex_replace_once:Nnn, \tl_regex_replace_once:cnn,
+%     \tl_regex_replace_once:NNn, \tl_regex_replace_once:cNn,
+%     \tl_regex_greplace_once:Nnn, \tl_regex_greplace_once:cnn,
+%     \tl_regex_greplace_once:NNn, \tl_regex_greplace_once:cNn
+%   }
+%   \begin{syntax}
+%     \cs{tl_regex_replace_once:Nnn} \meta{tl~var} \Arg{regular expression} \Arg{replacement}
+%     \cs{tl_regex_replace_once:NNn} \meta{tl~var} \meta{regex~var} \Arg{replacement}
+%   \end{syntax}
+%   Searches for the \meta{regular expression} in the contents of the
+%   \meta{tl~var} and replaces the first match with the
+%   \meta{replacement}. In the \meta{replacement},
+%   |\0| represents the full match, |\1| represents the contents of the
+%   first capturing group, |\2| of the second, \emph{etc.}
+% \end{function}
+%
+% \begin{function}[added = 2024-11-26]
+%   {
+%     \tl_regex_replace_all:Nnn, \tl_regex_replace_all:cnn,
+%     \tl_regex_replace_all:NNn, \tl_regex_replace_all:cNn,
+%     \tl_regex_greplace_all:Nnn, \tl_regex_greplace_all:cnn,
+%     \tl_regex_greplace_all:NNn, \tl_regex_greplace_all:cNn
+%   }
+%   \begin{syntax}
+%     \cs{tl_regex_replace_all:Nnn} \meta{tl~var} \Arg{regular expression} \Arg{replacement}
+%     \cs{tl_regex_replace_all:NNn} \meta{tl~var} \meta{regex~var} \Arg{replacement}
+%   \end{syntax}
+%   Replaces all occurrences of the \meta{regular expression} in the
+%   contents of the \meta{tl~var}
+%   by the \meta{replacement}, where |\0| represents
+%   the full match, |\1| represents the contents of the first capturing
+%   group, |\2| of the second, \emph{etc.} Every match is treated
+%   independently, and matches cannot overlap.
+% \end{function}
+%
 % \begin{function}[updated = 2011-08-11]
 %   {
 %     \tl_remove_once:Nn,  \tl_remove_once:NV,  \tl_remove_once:Ne,
@@ -2249,6 +2304,57 @@
 %
 % \begin{macro}
 %   {
+%     \tl_regex_replace_once:Nnn, \tl_regex_replace_once:cnn,
+%     \tl_regex_replace_once:NNn, \tl_regex_replace_once:cNn,
+%     \tl_regex_greplace_once:Nnn, \tl_regex_greplace_once:cnn,
+%     \tl_regex_greplace_once:NNn, \tl_regex_greplace_once:cNn
+%   }
+% \begin{macro}
+%   {
+%     \tl_regex_replace_all:Nnn, \tl_regex_replace_all:cnn,
+%     \tl_regex_replace_all:NNn, \tl_regex_replace_all:cNn,
+%     \tl_regex_greplace_all:Nnn, \tl_regex_greplace_all:cnn,
+%     \tl_regex_greplace_all:NNn, \tl_regex_greplace_all:cNn
+%   }
+%   Wrappers around \cs{regex_replace_once:nnN}, \cs{regex_replace_all:nnN}
+%   and their |NnN| forms, taking the \meta{tl~var} first, then the
+%   regular expression (|#2|) and the replacement (|#3|). The local
+%   versions are direct calls; a regex variable is passed on unbraced.
+%    \begin{macrocode}
+\cs_new_protected:Npn \tl_regex_replace_once:Nnn #1#2#3
+  { \regex_replace_once:nnN {#2} {#3} #1 }
+\cs_generate_variant:Nn \tl_regex_replace_once:Nnn { c }
+\cs_new_protected:Npn \tl_regex_replace_once:NNn #1#2#3
+  { \regex_replace_once:NnN #2 {#3} #1 }
+\cs_generate_variant:Nn \tl_regex_replace_once:NNn { c }
+\cs_new_protected:Npn \tl_regex_replace_all:Nnn #1#2#3
+  { \regex_replace_all:nnN {#2} {#3} #1 }
+\cs_generate_variant:Nn \tl_regex_replace_all:Nnn { c }
+\cs_new_protected:Npn \tl_regex_replace_all:NNn #1#2#3
+  { \regex_replace_all:NnN #2 {#3} #1 }
+\cs_generate_variant:Nn \tl_regex_replace_all:NNn { c }
+%    \end{macrocode}
+%   The global versions are generated by a scratch function whose
+%   arguments are: |#1|, the operation (|once| or |all|); |#2|, the
+%   argument type of the regular expression (|n| or |N|); |#3|, either
+%   empty or \cs{use:n}. In the latter case the \cs{use:n} is expanded
+%   while the definition is built, which removes the braces around the
+%   regex variable. Each generated function works in a group: the
+%   current value of the \meta{tl~var} is copied to a scratch variable,
+%   the replacement is done there locally, and the result is copied back
+%   by a global assignment.
+%    \begin{macrocode}
+\group_begin:
+  \cs_set_protected:Npn \@@_tmp:w #1#2#3
+    {
+      \cs_new_protected:cpe { tl_regex_greplace_ #1 :N #2 n } ##1##2##3
+        {
+          \group_begin:
+            \tl_set_eq:NN \exp_not:N \l_@@_internal_a_tl ##1
+            \exp_not:c { regex_replace_ #1 : #2 nN }
+              #3 {##2} {##3} \exp_not:N \l_@@_internal_a_tl
+            \tl_gset_eq:NN ##1 \exp_not:N \l_@@_internal_a_tl
+          \group_end:
+        }
+      \cs_generate_variant:cn { tl_regex_greplace_ #1 :N #2 n } { c }
+    }
+  \@@_tmp:w { once } n { }
+  \@@_tmp:w { once } N \use:n
+  \@@_tmp:w { all } n { }
+  \@@_tmp:w { all } N \use:n
+\group_end:
+%    \end{macrocode}
+% \end{macro}
+% \end{macro}
+%
+% \begin{macro}
+%   {
 %     \tl_remove_once:Nn, \tl_remove_once:NV, \tl_remove_once:Ne,
 %     \tl_remove_once:cn, \tl_remove_once:cV, \tl_remove_once:ce
 %   }
@@ -2611,6 +2717,29 @@
 %    \end{macrocode}
 % \end{macro}
 %
+% \begin{macro}[TF]
+%   {
+%     \tl_if_regex_match:nn, \tl_if_regex_match:Vn,
+%     \tl_if_regex_match:nN, \tl_if_regex_match:VN
+%   }
+%   Wrappers around \cs{regex_match:nnTF} and \cs{regex_match:NnTF}.
+%   Here the token list comes first (|#1|) and the regular expression
+%   second (|#2|), so the two are swapped when calling the regex
+%   function; a regex variable is passed on unbraced.
+%    \begin{macrocode}
+\prg_new_protected_conditional:Npnn \tl_if_regex_match:nn #1#2 { TF , T , F }
+  {
+    \regex_match:nnTF {#2} {#1}
+      { \prg_return_true: } { \prg_return_false: }
+  }
+\prg_generate_conditional_variant:Nnn \tl_if_regex_match:nn
+  { V } { TF , T , F }
+\prg_new_protected_conditional:Npnn \tl_if_regex_match:nN #1#2 { TF , T , F }
+  {
+    \regex_match:NnTF #2 {#1}
+      { \prg_return_true: } { \prg_return_false: }
+  }
+\prg_generate_conditional_variant:Nnn \tl_if_regex_match:nN
+  { V } { TF , T , F }
+%    \end{macrocode}
+% \end{macro}
+%
 % \subsection{Mapping over token lists}
 %
 % \begin{macro}