[latex3-commits] [l3svn] r7124 - Apply category codes to groups in l3regex replacement

Tue Apr 25 06:51:07 CEST 2017

Author: bruno
Date: 2017-04-25 06:51:07 +0200 (Tue, 25 Apr 2017)
New Revision: 7124

Modified:
   trunk/l3experimental/l3str/l3regex.dtx
   trunk/l3experimental/l3str/testfiles/m3regex005.lvt
   trunk/l3experimental/l3str/testfiles/m3regex005.tlg
Log:
Apply category codes to groups in l3regex replacement


Modified: trunk/l3experimental/l3str/l3regex.dtx
===================================================================

--- trunk/l3experimental/l3str/l3regex.dtx	2017-04-25 03:23:27 UTC (rev 7123)
+++ trunk/l3experimental/l3str/l3regex.dtx	2017-04-25 04:51:07 UTC (rev 7124)
@@ -381,7 +381,8 @@
 % \end{verbatim}
 % results in \cs{l_my_tl} holding |H(ell--el)(o,--o) w(or--o)(ld--l)!|
 %
-% Submatches keep the same category codes as in the original token list.
+% Submatches always keep the same category codes as in the original
+% token list.
 % The characters inserted by the replacement have category code $12$
 % (other) by default, with the exception of space characters.  Spaces
 % inserted through \verb*|\ | have category code $10$, while spaces
@@ -389,9 +390,11 @@
 % The escape sequence |\c| allows to insert characters
 % with arbitrary category codes, as well as control sequences.
 % \begin{l3regex-syntax}
-% \item[\\cXY] Produces the character~|Y| (which can be given as an
-%   escape sequence such as~|\t| for tab) with category code~|X|, which
-%   must be one of |CBEMTPUDSLOA| as in regular expressions.
+% \item[\\cX(\ldots{})] Produces the characters \enquote{\ldots{}} with
+%   category~|X|, which must be one of |CBEMTPUDSLOA| as in regular
+%   expressions.  Parentheses are optional for a single character (which
+%   can be an escape sequence).  This can be nested, for instance
+%   |\cL(Hello\cS\ world)!| gives this text with standard category codes.
 % \item[\\c\Arg{text}] Produces the control sequence with csname
 %   \meta{text}.  The \meta{text} may contain references to the
 %   submatches |\0|, |\1|, and so on, as in the example for |\u| below.
@@ -399,7 +402,7 @@
 %
 % The escape sequence |\u|\Arg{tl~var~name} allows to insert the
 % contents of the token list with name \meta{tl~var~name} directly into
-% the replacement, avoiding the need to escape special characters.
+% the replacement, giving easier control over category codes.
 % Within |\c{|\ldots{}|}| and |\u{|\ldots{}|}| constructions, the |\u|
 % and |\c|~escape sequences perform \cs{tl_to_str:v}, namely extract the
 % value of the control sequence and turn it into a string.
@@ -678,7 +681,6 @@
 % The following features are likely to be implemented at some point
 % in the future.
 % \begin{itemize}
-%   \item Allow |\cL(abc)| in replacement text.
 %   \item General look-ahead/behind assertions.
 %   \item Regex matching on external files.
 %   \item Conditional subpatterns with look ahead/behind: \enquote{if
@@ -4828,6 +4830,15 @@
 %    \end{macrocode}
 % \end{variable}
 %
+% \begin{variable}{\l_@@_replacement_category_tl, \l_@@_replacement_category_seq}
+%   The token list holds the category letter currently in force; the
+%   sequence saves enclosing ones for nested groups like |\cL(abc\cD(_)d)|.
+%    \begin{macrocode}
+\tl_new:N \l_@@_replacement_category_tl
+\seq_new:N \l_@@_replacement_category_seq
+%    \end{macrocode}
+% \end{variable}
+%
 % \begin{variable}{\l_@@_balance_tl}
 %   This token list holds the replacement text for
 %   \cs{@@_replacement_balance_one_match:n} while it is being built
@@ -4992,10 +5003,14 @@
       \@@_escape_use:nnnn
         {
           \if_charcode:w \c_right_brace_str ##1
-            \@@_replacement_rbrace:N \else: \__tl_build_one:n \fi: ##1
+            \@@_replacement_rbrace:N
+          \else:
+            \@@_replacement_normal:n
+          \fi:
+          ##1
         }
         { \@@_replacement_escaped:N ##1 }
-        { \__tl_build_one:n ##1 }
+        { \@@_replacement_normal:n ##1 }
         {#1}
       \prg_do_nothing: \prg_do_nothing:
       \if_int_compare:w \l_@@_replacement_csnames_int > 0 \exp_stop_f:
@@ -5004,6 +5019,12 @@
         \__tl_build_one:x
           { \prg_replicate:nn \l_@@_replacement_csnames_int \cs_end: }
       \fi:
+      \seq_if_empty:NF \l_@@_replacement_category_seq
+        {
+          \__msg_kernel_error:nnx { regex } { replacement-missing-rparen }
+            { \seq_count:N \l_@@_replacement_category_seq }
+          \seq_clear:N \l_@@_replacement_category_seq
+        }
       \cs_gset:Npx \@@_replacement_balance_one_match:n ##1
         {
           + \int_use:N \l_@@_balance_int
@@ -5028,6 +5049,35 @@
 % \end{macro}
 % \end{macro}
 %
+% \begin{macro}[aux]{\@@_replacement_normal:n}
+%   Most characters are simply sent to the output by
+%   \cs{__tl_build_one:n}, unless a particular category code has been
+%   requested: then \cs{@@_replacement_c_A:w} or a similar auxiliary is
+%   called.  The exception is a right parenthesis, which restores the
+%   category code that was in force before the group started.  Note that
+%   the sequence is non-empty there: it contains an empty entry
+%   corresponding to the initial value of
+%   \cs{l_@@_replacement_category_tl}.
+%    \begin{macrocode}
+\cs_new_protected:Npn \@@_replacement_normal:n #1 % #1 = character to output
+  {
+    \tl_if_empty:NTF \l_@@_replacement_category_tl
+      { \__tl_build_one:n {#1} } % no category requested: output #1 unchanged
+      { % (
+        \token_if_eq_charcode:NNTF #1 )
+          {
+            \seq_pop:NN \l_@@_replacement_category_seq % closing parenthesis:
+              \l_@@_replacement_category_tl % restore the saved category
+          }
+          {
+            \use:c { @@_replacement_c_ \l_@@_replacement_category_tl :w } % category-specific auxiliary
+              \@@_replacement_normal:n {#1} % presumably mimics the marker + character pair these auxiliaries expect
+          }
+      }
+  }
+%    \end{macrocode}
+% \end{macro}
+%
 % \begin{macro}[aux]{\@@_replacement_escaped:N}
 %   As in parsing a regular expression, we use an auxiliary built from
 %   |#1| if defined. Otherwise, check for escaped digits (standing from
@@ -5041,7 +5091,8 @@
         \if_int_compare:w 1 < 1#1 \exp_stop_f:
           \@@_replacement_put_submatch:n {#1}
         \else:
-          \__tl_build_one:o { \token_to_str:N #1 }
+          \exp_args:No \@@_replacement_normal:n
+            { \token_to_str:N #1 }
         \fi:
       }
   }
@@ -5079,7 +5130,7 @@
 %    \begin{macrocode}
 \cs_new_protected:Npn \@@_replacement_g:w #1#2
   {
-    \str_if_eq_x:nnTF { #1#2 } { \__tl_build_one:n \c_left_brace_str }
+    \str_if_eq_x:nnTF { #1#2 } { \@@_replacement_normal:n \c_left_brace_str }
       {
         \int_zero:N \l_@@_internal_a_int
         \@@_replacement_g_digits:NN
@@ -5088,7 +5139,7 @@
   }
 \cs_new_protected:Npn \@@_replacement_g_digits:NN #1#2
   {
-    \token_if_eq_meaning:NNTF #1 \__tl_build_one:n
+    \token_if_eq_meaning:NNTF #1 \@@_replacement_normal:n
       {
         \if_int_compare:w 1 < 1#2 \exp_stop_f:
           \int_set:Nn \l_@@_internal_a_int
@@ -5118,26 +5169,27 @@
 % \subsubsection{Csnames in replacement}
 %
 % \begin{macro}[aux]{\@@_replacement_c:w}
-% \begin{macro}[aux]+\@@_replacement_c_{:w+
-%   |\c| can be followed by a left brace, or by a letter for which we
-%   have defined a way to produce that category of characters.  The
-%   appropriate definitions for catcodes are introduced later.  For
-%   control sequences, call an auxiliary that starts a control sequence.
+%   |\c| may only be followed by an unescaped character.  If followed by
+%   a left brace, start a control sequence by calling an auxiliary
+%   common with |\u|.  Otherwise test whether the category is known; if
+%   it is not, complain.
 %    \begin{macrocode}
 \cs_new_protected:Npn \@@_replacement_c:w #1#2
   {
-    \token_if_eq_meaning:NNTF #1 \__tl_build_one:n
+    \token_if_eq_meaning:NNTF #1 \@@_replacement_normal:n
       {
-        \cs_if_exist_use:cF { @@_replacement_c_#2:w }
-          { \@@_replacement_error:NNN c #1#2 }
+        \exp_after:wN \token_if_eq_charcode:NNTF \c_left_brace_str #2
+          { \@@_replacement_cu_aux:Nw \@@_replacement_exp_not:N }
+          {
+            \cs_if_exist:cTF { @@_replacement_c_#2:w }
+              { \@@_replacement_cat:NNN #2 }
+              { \@@_replacement_error:NNN c #1#2 }
+          }
       }
       { \@@_replacement_error:NNN c #1#2 }
   }
-\cs_new_protected:cpn { @@_replacement_c_ \c_left_brace_str :w }
-  { \@@_replacement_cu_aux:Nw \@@_replacement_exp_not:N }
 %    \end{macrocode}
 % \end{macro}
-% \end{macro}
 %
 % \begin{macro}[aux]{\@@_replacement_cu_aux:Nw}
 %   Start a control sequence with \cs{cs:w}, which will be protected
@@ -5166,7 +5218,7 @@
 %    \begin{macrocode}
 \cs_new_protected:Npn \@@_replacement_u:w #1#2
   {
-    \str_if_eq_x:nnTF { #1#2 } { \__tl_build_one:n \c_left_brace_str }
+    \str_if_eq_x:nnTF { #1#2 } { \@@_replacement_normal:n \c_left_brace_str }
       { \@@_replacement_cu_aux:Nw \exp_not:V }
       { \@@_replacement_error:NNN u #1#2 }
   }
@@ -5184,7 +5236,7 @@
       \__tl_build_one:n \cs_end:
       \int_decr:N \l_@@_replacement_csnames_int
     \else:
-      \__tl_build_one:n #1
+      \@@_replacement_normal:n {#1}
     \fi:
   }
 %    \end{macrocode}
@@ -5192,6 +5244,39 @@
 %
 % \subsubsection{Characters in replacement}
 %
+% \begin{macro}[aux]{\@@_replacement_cat:NNN}
+%   Here, |#1| is a letter among |BEMTPUDSLOA| and |#2#3| denote the
+%   next character.  Complain if we reach the end of the replacement or
+%   if the construction appears inside |\c{|\ldots{}|}| or
+%   |\u{|\ldots{}|}|, and detect the case of a parenthesis.  In that
+%   case, store the current category in a sequence and switch to a new
+%   one.
+%    \begin{macrocode}
+\cs_new_protected:Npn \@@_replacement_cat:NNN #1#2#3 % #1 = category letter, #2#3 = next item
+  {
+    \token_if_eq_meaning:NNTF \prg_do_nothing: #3 % \prg_do_nothing: marks the end of the replacement
+      { \__msg_kernel_error:nn { regex } { replacement-catcode-end } }
+      {
+        \int_compare:nNnTF { \l_@@_replacement_csnames_int } > 0 % positive when a csname is still open
+          {
+            \__msg_kernel_error:nnnn
+              { regex } { replacement-catcode-in-cs } {#1} {#3}
+            #2 #3 % the category is dropped; the next item is processed as usual
+          }
+          {
+            \str_if_eq:nnTF { #2 #3 } { \@@_replacement_normal:n ( } % )
+              {
+                \seq_push:NV \l_@@_replacement_category_seq % save the enclosing category
+                  \l_@@_replacement_category_tl
+                \tl_set:Nn \l_@@_replacement_category_tl {#1} % category #1 now applies to the group
+              }
+              { \use:c { @@_replacement_c_#1:w } #2 #3 } % no group: hand the single next item to the auxiliary
+          }
+      }
+  }
+%    \end{macrocode}
+% \end{macro}
+%
 % We will need to change the category code of the null character many
 % times, hence work in a group. The catcode-specific macros below are
 % defined in alphabetical order; if you are trying to understand the
@@ -5212,12 +5297,8 @@
 %    \begin{macrocode}
   \cs_new_protected:Npn \@@_replacement_char:nNN #1#2#3
     {
-      \if_meaning:w \prg_do_nothing: #3
-        \__msg_kernel_error:nn { regex } { replacement-catcode-end }
-      \else:
-        \tex_lccode:D 0 = `#3 \scan_stop:
-        \tex_lowercase:D { \__tl_build_one:n {#1} }
-      \fi:
+      \tex_lccode:D 0 = `#3 \scan_stop:
+      \tex_lowercase:D { \__tl_build_one:n {#1} }
     }
 %    \end{macrocode}
 % \end{macro}
@@ -5343,15 +5424,11 @@
 %    \begin{macrocode}
   \cs_new_protected:Npn \@@_replacement_c_S:w #1#2
     {
-      \if_meaning:w \prg_do_nothing: #2
-        \__msg_kernel_error:nn { regex } { replacement-catcode-end }
-      \else:
-        \if_int_compare:w `#2 = 0 \exp_stop_f:
-          \__msg_kernel_error:nn { regex } { replacement-null-space }
-        \fi:
-        \tex_lccode:D `\ = `#2 \scan_stop:
-        \tex_lowercase:D { \__tl_build_one:n {~} }
+      \if_int_compare:w `#2 = 0 \exp_stop_f:
+        \__msg_kernel_error:nn { regex } { replacement-null-space }
       \fi:
+      \tex_lccode:D `\ = `#2 \scan_stop:
+      \tex_lowercase:D { \__tl_build_one:n {~} }
     }
 %    \end{macrocode}
 % \end{macro}
@@ -6206,6 +6283,16 @@
     the~character~category.~Then,~a~character~must~follow.~LaTeX~
     reached~the~end~of~the~replacement~when~looking~for~that.
   }
+\__msg_kernel_new:nnnn { regex } { replacement-catcode-in-cs }
+  { % NOTE(review): the caller passes only two arguments, so #3 below is empty; #2 looks intended -- confirm
+    Category~code~'\iow_char:N\\c#1#3'~ignored~inside~
+    '\iow_char:N\\c\{...\}'~in~a~replacement~text.
+  }
+  {
+    In~a~replacement~text,~the~category~codes~of~the~argument~of~
+    '\iow_char:N\\c\{...\}'~are~ignored~when~building~the~control~
+    sequence~name.
+  }
 \__msg_kernel_new:nnnn { regex } { replacement-null-space }
   { TeX~cannot~build~a~space~token~with~character~code~0. }
   {
@@ -6221,6 +6308,12 @@
     There~ \int_compare:nTF { #1 = 1 } { was } { were } ~ #1~
     missing~right~\int_compare:nTF { #1 = 1 } { brace } { braces } .
   }
+\__msg_kernel_new:nnnn { regex } { replacement-missing-rparen }
+  { Missing~right~parenthesis~inserted~in~replacement~text. }
+  {
+    There~ \int_compare:nTF { #1 = 1 } { was } { were } ~ #1~
+    missing~right~\int_compare:nTF { #1 = 1 } { parenthesis } { parentheses } .
+  }
 %    \end{macrocode}
 %
 % \begin{macro}[aux]{\@@_msg_repeated:nnN}

Modified: trunk/l3experimental/l3str/testfiles/m3regex005.lvt
===================================================================
--- trunk/l3experimental/l3str/testfiles/m3regex005.lvt	2017-04-25 03:23:27 UTC (rev 7123)
+++ trunk/l3experimental/l3str/testfiles/m3regex005.lvt	2017-04-25 04:51:07 UTC (rev 7124)
@@ -1,5 +1,5 @@
 %
-% Copyright (C) 2011-2014 LaTeX3 Project
+% Copyright (C) 2011-2014,2017 LaTeX3 Project
 %
 
 \documentclass{minimal}
@@ -171,4 +171,20 @@
   }
 
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\OMIT
+\cs_new:Npn \foo { Z }
+\TIMO
+\TEST { Catcode~group~in~replacement }
+  {
+    \tl_set:Nx \l_tmpa_tl { ab \tl_to_str:n { ab } }
+    \regex_replace_all:nnN { a(b) }
+      { q \cD( e \cU( t \0 \u{foo} \cM\1 p ) s ) f } \l_tmpa_tl
+    \tl_show_analysis:N \l_tmpa_tl
+    \regex_replace_all:nnN { a } { \cL( } \l_tmpb_tl
+    \tl_log:N \l_tmpb_tl
+    \regex_replace_all:nnN { a } { \cL(\cU(( } \l_tmpb_tl
+    \tl_log:N \l_tmpb_tl
+  }
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 \END

Modified: trunk/l3experimental/l3str/testfiles/m3regex005.tlg
===================================================================
--- trunk/l3experimental/l3str/testfiles/m3regex005.tlg	2017-04-25 03:23:27 UTC (rev 7123)
+++ trunk/l3experimental/l3str/testfiles/m3regex005.tlg	2017-04-25 04:51:07 UTC (rev 7124)
@@ -192,6 +192,21 @@
 ============================================================
 TEST 4: Macro parameters
 ============================================================
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+!
+! LaTeX error: "regex/replacement-catcode-in-cs"
+! 
+! Category code '\cP' ignored inside '\c{...}' in a replacement text.
+! 
+! See the LaTeX3 documentation for further information.
+! 
+! For immediate help type H <return>.
+!...............................................  
+l. ...  }
+|'''''''''''''''''''''''''''''''''''''''''''''''
+| In a replacement text, the category codes of the argument of '\c{...}' are
+| ignored when building the control sequence name.
+|...............................................
 The token list \l_tmpa_tl contains the tokens:
 >  \# (control sequence=\char"23=35)
 >  # (macro parameter character #)
@@ -269,6 +284,21 @@
 l. ...  }
 {1} {a} 
 {c3} {bc} 
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+!
+! LaTeX error: "regex/replacement-catcode-in-cs"
+! 
+! Category code '\cB' ignored inside '\c{...}' in a replacement text.
+! 
+! See the LaTeX3 documentation for further information.
+! 
+! For immediate help type H <return>.
+!...............................................  
+l. ...  }
+|'''''''''''''''''''''''''''''''''''''''''''''''
+| In a replacement text, the category codes of the argument of '\c{...}' are
+| ignored when building the control sequence name.
+|...............................................
 The token list \l_tmpa_tl contains the tokens:
 >  a (the letter a)
 >  < (the character <)
@@ -330,3 +360,60 @@
 the character  
 the character  
 ============================================================
+============================================================
+TEST 14: Catcode group in replacement
+============================================================
+The token list \l_tmpa_tl contains the tokens:
+>  q (the character q)
+>  e (subscript character e)
+>  t (superscript character t)
+>  a (the letter a)
+>  b (the letter b)
+>  Z (the letter Z)
+>  1 (math shift character 1)
+>  p (superscript character p)
+>  s (subscript character s)
+>  f (the character f)
+>  q (the character q)
+>  e (subscript character e)
+>  t (superscript character t)
+>  a (the character a)
+>  b (the character b)
+>  Z (the letter Z)
+>  1 (math shift character 1)
+>  p (superscript character p)
+>  s (subscript character s)
+>  f (the character f).
+<recently read> }
+l. ...  }
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+!
+! LaTeX error: "regex/replacement-missing-rparen"
+! 
+! Missing right parenthesis inserted in replacement text.
+! 
+! See the LaTeX3 documentation for further information.
+! 
+! For immediate help type H <return>.
+!...............................................  
+l. ...  }
+|'''''''''''''''''''''''''''''''''''''''''''''''
+| There was 1 missing right parenthesis.
+|...............................................
+> \l_tmpb_tl=.
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+!
+! LaTeX error: "regex/replacement-missing-rparen"
+! 
+! Missing right parenthesis inserted in replacement text.
+! 
+! See the LaTeX3 documentation for further information.
+! 
+! For immediate help type H <return>.
+!...............................................  
+l. ...  }
+|'''''''''''''''''''''''''''''''''''''''''''''''
+| There were 2 missing right parentheses.
+|...............................................
+> \l_tmpb_tl=.
+============================================================