[latex3-commits] [l3svn] r7352 - Add example regular expressions to the documentation of l3regex

Wed Jul 12 19:11:03 CEST 2017

Author: bruno
Date: 2017-07-12 19:11:03 +0200 (Wed, 12 Jul 2017)
New Revision: 7352

Modified:
   trunk/l3kernel/l3regex.dtx
Log:
Add example regular expressions to the documentation of l3regex

This includes both things I found asked on TeX.stackexchange and basic 
things like how to match an integer, or a dimension.


Modified: trunk/l3kernel/l3regex.dtx
===================================================================

--- trunk/l3kernel/l3regex.dtx	2017-07-12 11:07:36 UTC (rev 7351)
+++ trunk/l3kernel/l3regex.dtx	2017-07-12 17:11:03 UTC (rev 7352)
@@ -70,15 +70,16 @@
 % \enquote{\texttt{This cat.}}, where the first
 % occurrence of \enquote{\texttt{at}} was replaced
 % by \enquote{\texttt{is}}. A more complicated example is
-% a pattern to add a comma at the end of each word:
+% a pattern to emphasize each word and add a comma after it:
 % \begin{verbatim}
-%   \regex_replace_all:nnN { \w+ } { \0 , } \l_my_tl
+%   \regex_replace_all:nnN { \w+ } { \c{emph}\cB\{ \0 \cE\} , } \l_my_tl
 % \end{verbatim}
-% The |\w| sequence represents any \enquote{word} character,
-% and |+| indicates that the |\w| sequence should be repeated
-% as many times as possible (at least once), hence matching a word in the
-% input token list. In the replacement text, |\0| denotes the full match
-% (here, a word).
+% The |\w| sequence represents any \enquote{word} character, and |+|
+% indicates that the |\w| sequence should be repeated as many times as
+% possible (at least once), hence matching a word in the input token
+% list. In the replacement text, |\0| denotes the full match (here, a
+% word).  The command |\emph| is inserted using |\c{emph}|, and its
+% argument |\0| is put between braces |\cB\{| and |\cE\}|.
 %
 % If a regular expression is to be used several times,
 % it can be compiled once, and stored in a regex
@@ -97,6 +98,49 @@
 %
 % \subsection{Syntax of regular expressions}
 %
+% We start with a few examples, and encourage the reader to apply
+% \cs{regex_show:n} to these regular expressions.
+% \begin{itemize}
+% \item |Cat| matches the word \enquote{Cat} capitalized in this way,
+%   but also matches the beginning of the word \enquote{Cattle}: use
+%   |\bCat\b| to match a complete word only.
+% \item |[abc]| matches one letter among \enquote{a}, \enquote{b},
+%   \enquote{c}; the pattern \verb"(a|b|c)" matches the same three
+%   possible letters (but see the discussion of submatches below).
+% \item |[A-Za-z]*| matches any number (due to the quantifier
+%   \verb"*") of Latin letters (not accented).
+% \item |\c{[A-Za-z]*}| matches a control sequence made of Latin
+%   letters.
+% \item |\_[^\_]*\_| matches an underscore, any number of characters
+%   other than underscore, and another underscore; it is equivalent to
+%   |\_.*?\_| where |.| matches arbitrary characters and the
+%   lazy quantifier |*?| means to match as few characters as
+%   possible, thus avoiding matching underscores.
+% \item |[+-]?\d+| matches an explicit integer with at most one
+%   sign.
+% \item \verb*"[+-\ ]*\d+\ *" matches an explicit integer with any
+%   number of $+$ and $-$ signs, with spaces allowed except within the
+%   mantissa, and surrounded by spaces.
+% \item \verb*"[+-\ ]*(\d+|\d*\.\d+)\ *" matches an explicit integer or
+%   decimal number; using \verb*"[.,]" instead of \verb*"\." would allow
+%   the comma as a decimal marker.
+% \item
+%   \verb*"[+-\ ]*(\d+|\d*\.\d+)\ *((?i)pt|in|[cem]m|ex|[bs]p|[dn]d|[pcn]c)\ *"
+%   matches an explicit dimension with any unit that \TeX{} knows, where
+%   \verb*"(?i)" means to treat lowercase and uppercase letters
+%   identically.
+% \item \verb*"[+-\ ]*((?i)nan|inf|(\d+|\d*\.\d+)(\ *e[+-\ ]*\d+)?)\ *"
+%   matches an explicit floating point number or the special values
+%   \verb*"nan" and \verb*"inf" (with signs).
+% \item \verb*"[+-\ ]*(\d+|\cC.)\ *" matches an explicit integer or
+%   control sequence (without checking whether it is an integer
+%   variable).
+% \end{itemize}
+% While it is impossible for a regular expression to match only integer
+% expressions, \verb*"[+-\(]*\d+\)*([+-*/][+-\(]*\d+\)*)*" matches among
+% other things all valid integer expressions (made only with explicit
+% integers).  One should follow it with further testing.
+%
 % Most characters match exactly themselves,
 % with an arbitrary category code. Some characters are
 % special and must be escaped with a backslash (\emph{e.g.}, |\*|
@@ -208,7 +252,7 @@
 %     or two |\W| tokens (including the boundary).
 %   \item[\char`^ \textrm{or} \\A]
 %     Start of the subject token list.
-%   \item[\char`$\textrm{,} \\Z \textrm{or} \\z]
+%   \item[\char`$\textrm{,} \\Z \textrm{or} \\z] ^^A $
 %     End of the subject token list.
 %   \item[\\G] Start of the current match. This is only different from |^|
 %     in the case of multiple matches: for instance
@@ -335,12 +379,12 @@
 %
 % Most of the features described in regular expressions do not make
 % sense within the replacement text.  Backslash introduces various
-% special constructions:
+% special constructions, described further below:
 % \begin{itemize}
 %   \item |\0| is the whole match;
-%   \item |\1|, |\2|, \ldots{}, |\9| or |\g{|\meta{number}|}| are the
-%     submatches (empty if there are fewer than \meta{number} capturing
-%     groups);
+%   \item |\1| is the submatch that was matched by the first (capturing)
+%     group |(...)|; similarly for |\2|, \ldots{}, |\9| and
+%     |\g{|\meta{number}|}|;
 %   \item \verb*|\ | inserts a space (spaces are ignored when not
 %     escaped);
 %   \item |\a|, |\e|, |\f|, |\n|, |\r|, |\t|, |\xhh|, |\x{hhh}|
@@ -363,8 +407,15 @@
 % \end{verbatim}
 % results in \cs{l_my_tl} holding |H(ell--el)(o,--o) w(or--o)(ld--l)!|
 %
-% Submatches always keep the same category codes as in the original
-% token list.
+% The submatches are numbered according to the order in which the
+% opening parentheses of capturing groups appear in the regular
+% expression to match.  The $n$-th submatch is empty if there are fewer
+% than $n$ capturing groups or for capturing groups that appear in
+% alternatives that were not used for the match.  In case a capturing
+% group matches several times during a match (due to quantifiers) only
+% the last match is used in the replacement text. Submatches always keep
+% the same category codes as in the original token list.
+%
 % The characters inserted by the replacement have category code $12$
 % (other) by default, with the exception of space characters.  Spaces
 % inserted through \verb*|\ | have category code $10$, while spaces
@@ -384,12 +435,11 @@
 %
 % The escape sequence |\u|\Arg{tl~var~name} allows to insert the
 % contents of the token list with name \meta{tl~var~name} directly into
-% the replacement, giving an easier control of category codes.
-% Within |\c{|\ldots{}|}| and |\u{|\ldots{}|}| constructions, the |\u|
-% and |\c|~escape sequences perform \cs{tl_to_str:v}, namely extract the
-% value of the control sequence and turn it into a string.
-%
-% Matches can be used within the arguments of |\c| and |\u|.  For
+% the replacement, giving an easier control of category codes.  When
+% nested in |\c{|\ldots{}|}| and |\u{|\ldots{}|}| constructions, the
+% |\u| and |\c|~escape sequences perform \cs{tl_to_str:v}, namely
+% extract the value of the control sequence and turn it into a string.
+% Matches can also be used within the arguments of |\c| and |\u|.  For
 % instance,
 % \begin{verbatim}
 %   \tl_set:Nn \l_my_one_tl { first }
@@ -502,15 +552,15 @@
 %     \cs{regex_extract_once:nnN} \Arg{regex} \Arg{token list} \meta{seq~var}
 %     \cs{regex_extract_once:nnNTF} \Arg{regex} \Arg{token list} \meta{seq~var} \Arg{true code} \Arg{false code}
 %   \end{syntax}
-%   Finds the first match of the \meta{regular expression}
-%   in the \meta{token list}. If it exists, the match is stored
-%   as the zeroeth item of the \meta{seq~var}, and further
-%   items are the contents of capturing groups, in the order
-%   of their opening parenthesis. The \meta{seq~var}
-%   is assigned locally. If there is no match,
-%   the \meta{seq~var} is cleared.
-%   The testing versions insert the \meta{true code} into the input
-%   stream if a match was found, and the \meta{false code} otherwise.
+%   Finds the first match of the \meta{regular expression} in the
+%   \meta{token list}. If it exists, the match is stored as the first
+%   item of the \meta{seq~var}, and further items are the contents of
+%   capturing groups, in the order of their opening parentheses. The
+%   \meta{seq~var} is assigned locally. If there is no match, the
+%   \meta{seq~var} is cleared.  The testing versions insert the
+%   \meta{true code} into the input stream if a match was found, and the
+%   \meta{false code} otherwise.
+%
 %   For instance, assume that you type
 %   \begin{verbatim}
 %     \regex_extract_once:nnNTF { \A(La)?TeX(!*)\Z } { LaTeX!!! } \l_foo_seq
@@ -522,6 +572,9 @@
 %   group, |(!*)|, matches |!!!|. Thus, |\l_foo_seq| contains as a result
 %   the items |{LaTeX!!!}|, |{La}|, and |{!!!}|, and the \texttt{true}
 %   branch is left in the input stream.
+%   Note that the $n$-th item of |\l_foo_seq|, as obtained using
+%   \cs{seq_item:Nn}, corresponds to the submatch numbered $(n-1)$ in
+%   functions such as \cs{regex_replace_once:nnN}.
 % \end{function}
 %
 % \begin{function}[TF, added = 2017-05-26]