[latex3-commits] [l3svn] r7352 - Add example regular expressions to the documentation of l3regex
noreply at latex-project.org
noreply at latex-project.org
Wed Jul 12 19:11:03 CEST 2017
Author: bruno
Date: 2017-07-12 19:11:03 +0200 (Wed, 12 Jul 2017)
New Revision: 7352
Modified:
trunk/l3kernel/l3regex.dtx
Log:
Add example regular expressions to the documentation of l3regex
This includes both things I found asked on TeX.stackexchange and basic
things like how to match an integer, or a dimension.
Modified: trunk/l3kernel/l3regex.dtx
===================================================================
--- trunk/l3kernel/l3regex.dtx 2017-07-12 11:07:36 UTC (rev 7351)
+++ trunk/l3kernel/l3regex.dtx 2017-07-12 17:11:03 UTC (rev 7352)
@@ -70,15 +70,16 @@
% \enquote{\texttt{This cat.}}, where the first
% occurrence of \enquote{\texttt{at}} was replaced
% by \enquote{\texttt{is}}. A more complicated example is
-% a pattern to add a comma at the end of each word:
+% a pattern to emphasize each word and add a comma after it:
% \begin{verbatim}
-% \regex_replace_all:nnN { \w+ } { \0 , } \l_my_tl
+% \regex_replace_all:nnN { \w+ } { \c{emph}\cB\{ \0 \cE\} , } \l_my_tl
% \end{verbatim}
-% The |\w| sequence represents any \enquote{word} character,
-% and |+| indicates that the |\w| sequence should be repeated
-% as many times as possible (at least once), hence matching a word in the
-% input token list. In the replacement text, |\0| denotes the full match
-% (here, a word).
+% The |\w| sequence represents any \enquote{word} character, and |+|
+% indicates that the |\w| sequence should be repeated as many times as
+% possible (at least once), hence matching a word in the input token
+% list. In the replacement text, |\0| denotes the full match (here, a
+% word). The command |\emph| is inserted using |\c{emph}|, and its
+% argument |\0| is put between braces |\cB\{| and |\cE\}|.
%
% If a regular expression is to be used several times,
% it can be compiled once, and stored in a regex
@@ -97,6 +98,49 @@
%
% \subsection{Syntax of regular expressions}
%
+% We start with a few examples, and encourage the reader to apply
+% \cs{regex_show:n} to these regular expressions.
+% \begin{itemize}
+% \item |Cat| matches the word \enquote{Cat} capitalized in this way,
+% but also matches the beginning of the word \enquote{Cattle}: use
+% |\bCat\b| to match a complete word only.
+% \item |[abc]| matches one letter among \enquote{a}, \enquote{b},
+% \enquote{c}; the pattern \verb"(a|b|c)" matches the same three
+% possible letters (but see the discussion of submatches below).
+% \item |[A-Za-z]*| matches any number (due to the quantifier
+% \verb"*") of Latin letters (not accented).
+% \item |\c{[A-Za-z]*}| matches a control sequence made of Latin
+% letters.
+% \item |\_[^\_]*\_| matches an underscore, any number of characters
+% other than underscore, and another underscore; it is equivalent to
+% |\_.*?\_| where |.| matches arbitrary characters and the
+% lazy quantifier |*?| means to match as few characters as
+% possible, thus avoiding matching underscores.
+% \item |[+-]?\d+| matches an explicit integer with at most one
+% sign.
+% \item \verb*"[+\-\ ]*\d+\ *" matches an explicit integer with any
+% number of $+$ and $-$ signs, with spaces allowed except within the
+% mantissa, and surrounded by spaces.
+% \item \verb*"[+\-\ ]*(\d+|\d*\.\d+)\ *" matches an explicit integer or
+% decimal number; using \verb*"[.,]" instead of \verb*"\." would allow
+% the comma as a decimal marker.
+% \item
+% \verb*"[+\-\ ]*(\d+|\d*\.\d+)\ *((?i)pt|in|[cem]m|ex|[bs]p|[dn]d|[pcn]c)\ *"
+% matches an explicit dimension with any unit that \TeX{} knows, where
+% \verb*"(?i)" means to treat lowercase and uppercase letters
+% identically.
+% \item \verb*"[+\-\ ]*((?i)nan|inf|(\d+|\d*\.\d+)(\ *e[+\-\ ]*\d+)?)\ *"
+% matches an explicit floating point number or the special values
+% \verb*"nan" and \verb*"inf" (with signs).
+% \item \verb*"[+\-\ ]*(\d+|\cC.)\ *" matches an explicit integer or
+% control sequence (without checking whether it is an integer
+% variable).
+% \end{itemize}
+% While it is impossible for a regular expression to match only integer
+% expressions, \verb*"[+\-\(]*\d+\)*([+\-*/][+\-\(]*\d+\)*)*" matches among
+% other things all valid integer expressions (made only with explicit
+% integers). One should follow it with further testing.
+%
% Most characters match exactly themselves,
% with an arbitrary category code. Some characters are
% special and must be escaped with a backslash (\emph{e.g.}, |\*|
@@ -208,7 +252,7 @@
% or two |\W| tokens (including the boundary).
% \item[\char`^ \textrm{or} \\A]
% Start of the subject token list.
-% \item[\char`$\textrm{,} \\Z \textrm{or} \\z]
+% \item[\char`$\textrm{,} \\Z \textrm{or} \\z] ^^A $
% End of the subject token list.
% \item[\\G] Start of the current match. This is only different from |^|
% in the case of multiple matches: for instance
@@ -335,12 +379,12 @@
%
% Most of the features described in regular expressions do not make
% sense within the replacement text. Backslash introduces various
-% special constructions:
+% special constructions, described further below:
% \begin{itemize}
% \item |\0| is the whole match;
-% \item |\1|, |\2|, \ldots{}, |\9| or |\g{|\meta{number}|}| are the
-% submatches (empty if there are fewer than \meta{number} capturing
-% groups);
+% \item |\1| is the submatch that was matched by the first (capturing)
+% group |(...)|; similarly for |\2|, \ldots{}, |\9| and
+% |\g{|\meta{number}|}|;
% \item \verb*|\ | inserts a space (spaces are ignored when not
% escaped);
% \item |\a|, |\e|, |\f|, |\n|, |\r|, |\t|, |\xhh|, |\x{hhh}|
@@ -363,8 +407,15 @@
% \end{verbatim}
% results in \cs{l_my_tl} holding |H(ell--el)(o,--o) w(or--o)(ld--l)!|
%
-% Submatches always keep the same category codes as in the original
-% token list.
+% The submatches are numbered according to the order in which the
+% opening parentheses of capturing groups appear in the regular
+% expression to match. The $n$-th submatch is empty if there are fewer
+% than $n$ capturing groups or for capturing groups that appear in
+% alternatives that were not used for the match. In case a capturing
+% group matches several times during a match (due to quantifiers) only
+% the last match is used in the replacement text. Submatches always keep
+% the same category codes as in the original token list.
+%
% The characters inserted by the replacement have category code $12$
% (other) by default, with the exception of space characters. Spaces
% inserted through \verb*|\ | have category code $10$, while spaces
@@ -384,12 +435,11 @@
%
% The escape sequence |\u|\Arg{tl~var~name} allows to insert the
% contents of the token list with name \meta{tl~var~name} directly into
-% the replacement, giving an easier control of category codes.
-% Within |\c{|\ldots{}|}| and |\u{|\ldots{}|}| constructions, the |\u|
-% and |\c|~escape sequences perform \cs{tl_to_str:v}, namely extract the
-% value of the control sequence and turn it into a string.
-%
-% Matches can be used within the arguments of |\c| and |\u|. For
+% the replacement, giving an easier control of category codes. When
+% nested in |\c{|\ldots{}|}| and |\u{|\ldots{}|}| constructions, the
+% |\u| and |\c|~escape sequences perform \cs{tl_to_str:v}, namely
+% extract the value of the control sequence and turn it into a string.
+% Matches can also be used within the arguments of |\c| and |\u|. For
% instance,
% \begin{verbatim}
% \tl_set:Nn \l_my_one_tl { first }
@@ -502,15 +552,15 @@
% \cs{regex_extract_once:nnN} \Arg{regex} \Arg{token list} \meta{seq~var}
% \cs{regex_extract_once:nnNTF} \Arg{regex} \Arg{token list} \meta{seq~var} \Arg{true code} \Arg{false code}
% \end{syntax}
-% Finds the first match of the \meta{regular expression}
-% in the \meta{token list}. If it exists, the match is stored
-% as the zeroeth item of the \meta{seq~var}, and further
-% items are the contents of capturing groups, in the order
-% of their opening parenthesis. The \meta{seq~var}
-% is assigned locally. If there is no match,
-% the \meta{seq~var} is cleared.
-% The testing versions insert the \meta{true code} into the input
-% stream if a match was found, and the \meta{false code} otherwise.
+% Finds the first match of the \meta{regular expression} in the
+% \meta{token list}. If it exists, the match is stored as the first
+% item of the \meta{seq~var}, and further items are the contents of
+% capturing groups, in the order of their opening parenthesis. The
+% \meta{seq~var} is assigned locally. If there is no match, the
+% \meta{seq~var} is cleared. The testing versions insert the
+% \meta{true code} into the input stream if a match was found, and the
+% \meta{false code} otherwise.
+%
% For instance, assume that you type
% \begin{verbatim}
% \regex_extract_once:nnNTF { \A(La)?TeX(!*)\Z } { LaTeX!!! } \l_foo_seq
@@ -522,6 +572,9 @@
% group, |(!*)|, matches |!!!|. Thus, |\l_foo_seq| contains as a result
% the items |{LaTeX!!!}|, |{La}|, and |{!!!}|, and the \texttt{true}
% branch is left in the input stream.
+% Note that the $n$-th item of |\l_foo_seq|, as obtained using
+% \cs{seq_item:Nn}, corresponds to the submatch numbered $(n-1)$ in
+% functions such as \cs{regex_replace_once:nnN}.
% \end{function}
%
% \begin{function}[TF, added = 2017-05-26]
More information about the latex3-commits
mailing list