[latex3-commits] [git/LaTeX3-latex3-latex3] master: implemented an expandable and about 40% faster keyval_parse (33121e415)

Tue Mar 3 17:59:10 CET 2020

Repository : https://github.com/latex3/latex3
On branch  : master
Link       : https://github.com/latex3/latex3/commit/33121e41520f4cca0eaebc24ab139600d7a7201e

>---------------------------------------------------------------

commit 33121e41520f4cca0eaebc24ab139600d7a7201e
Author: Jonathan Spratte <jspratte at yahoo.de>
Date:   Thu Feb 20 18:51:39 2020 +0100

    implemented an expandable and about 40% faster keyval_parse


>---------------------------------------------------------------

33121e41520f4cca0eaebc24ab139600d7a7201e
 l3kernel/l3keys.dtx | 457 +++++++++++++++++++++++++++++++---------------------
 1 file changed, 269 insertions(+), 188 deletions(-)

diff --git a/l3kernel/l3keys.dtx b/l3kernel/l3keys.dtx
index dcf83e0d8..a976349af 100644
--- a/l3kernel/l3keys.dtx
+++ b/l3kernel/l3keys.dtx
@@ -891,7 +891,7 @@
 % \end{verbatim}
 % are treated identically.
 %
-% \begin{function}[updated = 2011-09-08]{\keyval_parse:NNn}
+% \begin{function}[EXP,updated = 2020-02-20]{\keyval_parse:NNn}
 %   \begin{syntax}
 %     \cs{keyval_parse:NNn} \meta{function_1} \meta{function_2} \Arg{key--value list}
 %   \end{syntax}
@@ -920,6 +920,11 @@
 %   all). Spaces are trimmed from the ends of the \meta{key} and \meta{value},
 %   then one \emph{outer} set of braces is removed from the \meta{key}
 %   and \meta{value} as part of the processing.
+%   \begin{texnote}
+%     The result is returned within \cs{exp_not:n}, which means that the
+%     converted input stream does not expand further when appearing in an
+%     \texttt{x}-type or \texttt{e}-type argument expansion.
+%   \end{texnote}
 % \end{function}
 %
 % \end{documentation}
@@ -946,233 +951,309 @@
 %<@@=keyval>
 %    \end{macrocode}
 %
-% \begin{variable}{\l_@@_key_tl, \l_@@_value_tl}
-%   The current key name and value.
+% \begin{macro}[internal]{\@@_tmp:NN}
+%   This temporary macro will be used since some of the definitions will need an
+%   active comma or equals sign. Inside of this macro |#1| will be the active
+%   comma and |#2| will be the active equals sign.
 %    \begin{macrocode}
-\tl_new:N \l_@@_key_tl
-\tl_new:N \l_@@_value_tl
-%    \end{macrocode}
-% \end{variable}
-%
-% \begin{variable}{\l_@@_sanitise_tl}
-%   A token list variable for dealing with awkward category codes in the
-%   input.
-%    \begin{macrocode}
-\tl_new:N \l_@@_sanitise_tl
+\group_begin:
+\cs_set:Npn \@@_tmp:NN #1 #2
+  {
 %    \end{macrocode}
-% \end{variable}
+% \end{macro}
 %
 % \begin{macro}{\keyval_parse:NNn}
-%   The main function starts off by normalising category codes in package mode.
-%   That's relatively \enquote{expensive} so is skipped (hopefully) in format
-%   mode. We then hand off to the parser. The use of \cs{q_mark} here prevents
-%   loss of braces from the key argument. Notice that by passing the two
-%   processor commands along the input stack we avoid the need to track these
-%   at all.
+%   The main function starts the first of two input loops. The outer loop splits
+%   the key--value list at active commas, the inner loop will do so at other
+%   commas. The use of \cs{q_mark} here prevents loss of braces from the key
+%   argument.
 %    \begin{macrocode}
-\cs_new_protected:Npn \keyval_parse:NNn #1#2#3
-  {
-%<*initex>
-    \@@_loop:NNw #1#2 \q_mark #3 , \q_recursion_tail ,
-%</initex>
-%<*package>
-    \tl_set:Nn \l_@@_sanitise_tl {#3}
-    \@@_sanitise_equals:
-    \@@_sanitise_comma:
-    \exp_after:wN \@@_loop:NNw \exp_after:wN #1 \exp_after:wN #2
-      \exp_after:wN \q_mark \l_@@_sanitise_tl , \q_recursion_tail ,
-%</package>
-  }
+    \cs_new:Npn \keyval_parse:NNn ##1 ##2 ##3
+      {
+        \@@_loop_active:NNw ##1 ##2 \q_mark ##3 #1 \q_recursion_tail #1
+      }
 %    \end{macrocode}
 % \end{macro}
 %
-% \begin{macro}{\@@_sanitise_equals:, \@@_sanitise_comma:}
-% \begin{macro}
-%   {
-%     \@@_sanitise_equals_auxi:w, \@@_sanitise_equals_auxii:w,
-%     \@@_sanitise_comma_auxi:w, \@@_sanitise_comma_auxii:w,
-%     \@@_sanitise_aux:w
-%   }
-%   A reasonably fast search and replace set up specifically for the active
-%   tokens. The nature of the input is known so everything is hard-coded.
-%   With only two tokens to cover, the speed gain from using dedicated
-%   functions is worth it.
+% \begin{macro}[internal]{\@@_loop_active:NNw}
+%   First a fast test for the end of the loop is done, it'll gobble everything
+%   up to a \cs{q_mark} immediately followed by a \cs{q_recursion_tail}. The
+%   loop ending macro will gobble everything to the last \cs{q_mark} in this
+%   definition.
+%   If the end isn't reached yet, start the second loop splitting at other
+%   commas, and after that one iterate the current loop.
 %    \begin{macrocode}
-%<*package>
-\group_begin:
-  \char_set_catcode_active:n { `\= }
-  \char_set_catcode_active:n { `\, }
-  \cs_new_protected:Npn \@@_sanitise_equals:
-    {
-      \exp_after:wN \@@_sanitise_equals_auxi:w \l_@@_sanitise_tl
-        \q_mark = \q_nil =
-      \exp_after:wN \@@_sanitise_aux:w \l_@@_sanitise_tl
-    }
-    \cs_new_protected:Npn \@@_sanitise_equals_auxi:w #1 =
+    \cs_new:Npn \@@_loop_active:NNw ##1 ##2 ##3 #1
       {
-        \tl_set:Nn \l_@@_sanitise_tl {#1}
-        \@@_sanitise_equals_auxii:w
+        \@@_if_recursion_tail:w ##3
+          \@@_end_loop_active:w \q_mark \q_recursion_tail
+        \@@_loop_other:NNw ##1 ##2 ##3 , \q_recursion_tail ,
+        \@@_loop_active:NNw ##1 ##2 \q_mark
       }
-    \cs_new_protected:Npn \@@_sanitise_equals_auxii:w #1 =
+%    \end{macrocode}
+% \end{macro}
+%
+% \begin{macro}[internal]{\@@_loop_other:NNw}
+%   The second loop uses the same test for its end as the first loop, next it
+%   tests whether there are other or active equals signs, throwing an error if
+%   there are both. If there are none, test whether the argument is blank or is
+%   a single key. If there are only active equals signs split at those, else
+%   split at others. Finally, iterate the loop.
+%    \begin{macrocode}
+    \cs_new:Npn \@@_loop_other:NNw ##1 ##2 ##3 ,
       {
-        \if_meaning:w \q_nil #1 \scan_stop:
-        \else:
-          \tl_set:Nx \l_@@_sanitise_tl
-            {
-              \exp_not:o \l_@@_sanitise_tl
-              \token_to_str:N =
-              \exp_not:n {#1}
-            }
-          \exp_after:wN \@@_sanitise_equals_auxii:w
-        \fi:
+        \@@_if_recursion_tail:w ##3
+          \@@_end_loop_other:w \q_mark \q_recursion_tail
+        \@@_if_has_equal_other:w ##3 = \q_stop
+          \@@_has_false:w \q_mark \q_stop \use_i:nn
+          {
+            \@@_if_has_equal_active:w ##3 #2 \q_stop
+              \@@_has_false:w \q_mark \q_stop \use_i:nn
+              \@@_misplaced_equal_error:
+              { \@@_split_other:wN ##3 = \q_stop ##2 }
+          }
+          {
+            \@@_if_has_equal_active:w ##3 #2 \q_stop
+              \@@_has_false:w \q_mark \q_stop \use_i:nn
+              { \@@_split_active:wN ##3 #2 \q_stop ##2 }
+              {
+                \@@_if_blank:w ##3 \q_nil \q_stop
+                  \@@_blank_true:w \q_mark \q_stop \use:n
+                  { \@@_trim:nN { ##3 } \@@_key:nN ##1 }
+              }
+          }
+        \@@_loop_other:NNw ##1 ##2 \q_mark
       }
-  \cs_new_protected:Npn \@@_sanitise_comma:
-    {
-      \exp_after:wN \@@_sanitise_comma_auxi:w \l_@@_sanitise_tl
-        \q_mark , \q_nil ,
-      \exp_after:wN \@@_sanitise_aux:w \l_@@_sanitise_tl
-    }
-    \cs_new_protected:Npn \@@_sanitise_comma_auxi:w #1 ,
+%    \end{macrocode}
+% \end{macro}
+%
+% \begin{macro}[internal]{\@@_split_active:wN, \@@_split_active_aux:nwN}
+%   Splits at the first active equals sign and tests whether there are any more
+%   valid split points, if so throw an error and gobble the remaining
+%   \meta{function_2}, which will not yet be gobbled. If there was only one
+%   active equals sign start trimming the spaces off the key and give control to
+%   \cs[no-index]{@@_key_val:nnN}.
+%    \begin{macrocode}
+    \cs_new:Npn \@@_split_active:wN ##1 #2
       {
-        \tl_set:Nn \l_@@_sanitise_tl {#1}
-        \@@_sanitise_comma_auxii:w
+        \@@_trim:nN { ##1 } \@@_split_active_aux:nwN \q_mark
       }
-    \cs_new_protected:Npn \@@_sanitise_comma_auxii:w #1 ,
+      \cs_new:Npn \@@_split_active_aux:nwN ##1 ##2 #2 ##3 \q_stop
+        {
+          \@@_if_empty:w \q_mark ##3 \q_stop
+            \@@_has_false:w \q_mark \q_stop \use_i:nn
+            { \@@_misplaced_equal_error: \use_none:n }
+            { \@@_trim:nN { ##2 } \@@_key_val:nnN { ##1 } }
+        }
+%    \end{macrocode}
+% \end{macro}
+%
+% \begin{macro}[internal]{\@@_if_has_equal_active:w}
+%   The test for an active equals sign just gobbles tokens until the first
+%   active equals sign and then runs the test for an empty argument.
+%    \begin{macrocode}
+    \cs_new:Npn \@@_if_has_equal_active:w ##1 #2
       {
-        \if_meaning:w \q_nil #1 \scan_stop:
-        \else:
-          \tl_set:Nx \l_@@_sanitise_tl
-            {
-              \exp_not:o \l_@@_sanitise_tl
-              \token_to_str:N ,
-              \exp_not:n {#1}
-            }
-          \exp_after:wN \@@_sanitise_comma_auxii:w
-        \fi:
+        \@@_if_empty:w \q_mark
       }
+%    \end{macrocode}
+% \end{macro}
+%
+% We're done with the macros which need active equals signs or commas in their
+% definition, so we can end that scope and call the temporary macro which will
+% do the definitions.
+%    \begin{macrocode}
+  }
+\char_set_catcode_active:n { `\, }
+\char_set_catcode_active:n { `\= }
+\@@_tmp:NN , =
 \group_end:
-\cs_new_protected:Npn \@@_sanitise_aux:w #1 \q_mark
-  { \tl_set:Nn \l_@@_sanitise_tl {#1} }
-%</package>
-%    \end{macrocode}
-% \end{macro}
-% \end{macro}
-%
-% \begin{macro}{\@@_loop:NNw}
-%   A fast test for the end of the loop, remembering to remove the leading
-%   quark first. Assuming that is not the case, look for a key and value then
-%   loop around, re-inserting a leading quark in front of the next position.
-%    \begin{macrocode}
-\cs_new_protected:Npn \@@_loop:NNw #1#2#3 ,
-  {
-    \exp_after:wN \if_meaning:w \exp_after:wN \q_recursion_tail
-      \use_none:n #3 \prg_do_nothing:
-    \else:
-      \@@_split:NNw #1#2#3 == \q_stop
-      \exp_after:wN \@@_loop:NNw \exp_after:wN #1 \exp_after:wN #2
-        \exp_after:wN \q_mark
-    \fi:
-  }
-%    \end{macrocode}
-% \end{macro}
-%
-% \begin{macro}{\@@_split:NNw, \@@_split_value:NNw}
-% \begin{macro}{\@@_split_tidy:w}
-% \begin{macro}{\@@_action:}
-%   The value is picked up separately from the key so there can be another
-%   quark inserted at the front, keeping braces and allowing both parts to
-%   share the same code paths. The key is found first then there's a check
-%   that there is something there: this is biased to the common case of there
-%   actually being a key. For the value, we first need to see if there is
-%   anything to do: if there is, extract it. The appropriate action is then
-%   inserted in front of the key and value. Doing this using an assignment is
-%   marginally faster than an expansion chain.
-%    \begin{macrocode}
-\cs_new_protected:Npn \@@_split:NNw #1#2#3 =
-  {
-    \@@_def:Nn \l_@@_key_tl {#3}
-    \if_meaning:w \l_@@_key_tl \c_empty_tl
-      \exp_after:wN \@@_split_tidy:w
-    \else:
-      \exp_after:wN \@@_split_value:NNw
-        \exp_after:wN #1
-        \exp_after:wN #2
-        \exp_after:wN \q_mark
-    \fi:
-  }
-\cs_new_protected:Npn \@@_split_value:NNw #1#2#3 = #4 \q_stop
-  {
-    \if:w \scan_stop: \tl_to_str:n {#4} \scan_stop:
-      \cs_set:Npx \@@_action:
-        { \exp_not:N #1 { \exp_not:o \l_@@_key_tl } }
-    \else:
-      \if:w
-        \scan_stop:
-        \__kernel_tl_to_str:w \exp_after:wN { \use_none:n #4 }
-        \scan_stop:
-        \@@_def:Nn \l_@@_value_tl {#3}
-        \cs_set:Npx \@@_action:
-          {
-            \exp_not:N #2
-              { \exp_not:o \l_@@_key_tl }
-              { \exp_not:o \l_@@_value_tl }
-          }
-      \else:
-        \cs_set:Npn \@@_action:
-          {
-            \__kernel_msg_error:nn { kernel }
-              { misplaced-equals-sign }
-          }
-      \fi:
-    \fi:
-    \@@_action:
+%    \end{macrocode}
+%
+% \begin{macro}[internal]{\@@_end_loop_active:w,\@@_end_loop_other:w}
+%   Both of these macros just have to gobble a few tokens to remove the remainder
+%   of the loop's current iteration. We do this in a pretty static manner,
+%   explicitly stating every token we know it'll gobble because this is slightly
+%   faster.
+%    \begin{macrocode}
+\cs_new:Npn \@@_end_loop_active:w
+    \q_mark \q_recursion_tail
+    \@@_loop_other:NNw #1 , \q_recursion_tail ,
+    \@@_loop_active:NNw #2 \q_mark
+  {}
+\cs_new:Npn \@@_end_loop_other:w
+    \q_mark \q_recursion_tail
+    \@@_if_has_equal_other:w #1 = \q_stop
+    \@@_has_false:w \q_mark \q_stop \use_i:nn
+    #2
+    \@@_loop_other:NNw #3 \q_mark
+  {}
+%    \end{macrocode}
+% \end{macro}
+%
+% \begin{macro}[internal]{\@@_split_other:wN, \@@_split_other_aux:nwN}
+%   These work exactly as \cs[no-index]{@@_split_active:wN}, just for
+%   equals signs of category other.
+%    \begin{macrocode}
+\cs_new:Npn \@@_split_other:wN #1 =
+  {
+    \@@_trim:nN { #1 } \@@_split_other_aux:nwN \q_mark
   }
-\cs_new_protected:Npn \@@_split_tidy:w #1 \q_stop
+  \cs_new:Npn \@@_split_other_aux:nwN #1 #2 = #3 \q_stop
+    {
+      \@@_if_empty:w \q_mark #3 \q_stop
+        \@@_has_false:w \q_mark \q_stop \use_i:nn
+        { \@@_misplaced_equal_error: \use_none:n }
+        { \@@_trim:nN { #2 } \@@_key_val:nnN { #1 } }
+    }
+%    \end{macrocode}
+% \end{macro}
+%
+% \begin{macro}[internal]{\@@_key:nN}
+%   This will get the current key with spaces trimmed and \meta{function_1} as
+%   its arguments. All it has to do is put them in an \cs{exp_not:n} and reorder
+%   them.
+%    \begin{macrocode}
+\cs_new:Npn \@@_key:nN #1 #2
   {
-    \if:w
-      \scan_stop:
-      \__kernel_tl_to_str:w \exp_after:wN { \use_none:n #1 }
-      \scan_stop:
-    \else:
-      \exp_after:wN \@@_empty_key:
-    \fi:
+    \exp_not:n { #2 { #1 } }
   }
-\cs_new:Npn \@@_action: { }
-\cs_new_protected:Npn \@@_empty_key:
-  { \__kernel_msg_error:nn { kernel } { misplaced-equals-sign } }
 %    \end{macrocode}
 % \end{macro}
+%
+% \begin{macro}[internal]{\@@_key_val:nnN}
+%   This will get the value, the key name (both with spaces trimmed) and
+%   \meta{function_2}. It has to assert that the key name isn't empty, then
+%   puts them into an \cs{exp_not:n} in the order function, key, value. If the
+%   key is empty they are gobbled and an error is thrown instead.
+%    \begin{macrocode}
+\cs_new:Npn \@@_key_val:nnN #1 #2 #3
+  {
+    \@@_if_empty:w \q_mark #2 \q_stop
+      \@@_empty_key:w \q_mark \q_stop
+    \exp_not:n { #3 { #2 } { #1 } }
+  }
+%    \end{macrocode}
 % \end{macro}
+%
+% \begin{macro}[internal]{\@@_if_empty:w,\@@_if_blank:w,\@@_if_recursion_tail:w}
+%   All these tests work by gobbling tokens until a certain combination is met,
+%   which makes them pretty fast. The test for a blank argument should be called
+%   with an arbitrary token following the argument. Each of these utilizes the
+%   fact that the argument will contain a leading \cs{q_mark}.
+%    \begin{macrocode}
+\cs_new:Npn \@@_if_empty:w #1 \q_mark \q_stop {}
+\cs_new:Npn \@@_if_blank:w \q_mark #1 { \@@_if_empty:w \q_mark }
+\cs_new:Npn \@@_if_recursion_tail:w #1 \q_mark \q_recursion_tail {}
+%    \end{macrocode}
 % \end{macro}
 %
-% \begin{macro}{\@@_def:Nn}
-% \begin{macro}[EXP]{\@@_def_aux:n}
-% \begin{macro}[EXP]{\@@_def_aux:w}
-%   First remove the leading quark, then trim spaces off, and finally remove
-%   a set of braces.
+% \begin{macro}[internal]{\@@_has_false:w,\@@_blank_true:w,\@@_empty_key:w}
+%   These macros will be called if the tests above didn't gobble them, they
+%   execute the branching.
 %    \begin{macrocode}
-\cs_new_protected:Npn \@@_def:Nn #1#2
+\cs_new:Npn \@@_has_false:w \q_mark \q_stop \use_i:nn #1 #2 { #2 }
+\cs_new:Npn \@@_blank_true:w \q_mark \q_stop \use:n #1 {}
+\cs_new:Npn \@@_empty_key:w \q_mark \q_stop \exp_not:n #1
   {
-    \tl_set:Nx #1
-      { \tl_trim_spaces_apply:oN { \use_none:n #2 } \@@_def_aux:n }
+    \@@_misplaced_equal_error:
   }
-\cs_new:Npn \@@_def_aux:n #1
-  { \@@_def_aux:w #1 \q_stop }
-\cs_new:Npn \@@_def_aux:w #1 \q_stop { \exp_not:n {#1} }
 %    \end{macrocode}
 % \end{macro}
+%
+% \begin{macro}[internal]{\@@_gobble_q_mark:w}
+%   If we know what to gobble (the \cs{q_mark}), it is faster to name it
+%   explicitly than to use \cs{use_none:n}.
+%    \begin{macrocode}
+\cs_new:Npn \@@_gobble_q_mark:w \q_mark {}
+%    \end{macrocode}
 % \end{macro}
+%
+% \begin{macro}[internal]{\@@_if_has_equal_other:w}
+%   Another test that works by gobbling tokens until a specific one is hit.
+%    \begin{macrocode}
+\cs_new:Npn \@@_if_has_equal_other:w #1 =
+  {
+    \@@_if_empty:w \q_mark
+  }
+%    \end{macrocode}
+% \end{macro}
+%
+% \begin{macro}[internal]{\@@_misplaced_equal_error:}
+%   Just throw an error expandably. This is hidden inside a macro so that other
+%   macros don't have to gobble so many tokens, which increases speed.
+%    \begin{macrocode}
+\cs_new:Npn \@@_misplaced_equal_error:
+  {
+    \__kernel_msg_expandable_error:nn { kernel } { misplaced-equals-sign }
+  }
+%    \end{macrocode}
 % \end{macro}
 %
 % One message for the low level parsing system.
 %    \begin{macrocode}
-\__kernel_msg_new:nnnn { kernel } { misplaced-equals-sign }
+\__kernel_msg_new:nnn { kernel } { misplaced-equals-sign }
   { Misplaced~equals~sign~in~key-value~input~\msg_line_context: }
+%    \end{macrocode}
+%
+% \begin{macro}[internal]{\@@_trim:nN}
+% \begin{macro}[internal]{\@@_trim_auxi:w,\@@_trim_auxii:w,\@@_trim_auxiii:w,\@@_trim_auxiv:w}
+% And an adapted version of \cs{__tl_trim_spaces:nn} which is a bit faster for
+% our use case, as it can strip the braces at the end. This is pretty much the
+% same concept, so I won't comment on it here. The speed gain by using this
+% instead of \cs{tl_trim_spaces_apply:nN} is about 10\,\% of the total time for
+% \cs{keyval_parse:NNn} with one key and one key--value pair, so I think it's
+% worth it.
+%    \begin{macrocode}
+\group_begin:
+\cs_set:Npn \@@_tmp:n #1
   {
-    LaTeX~is~attempting~to~parse~some~key-value~input~but~found~
-    two~equals~signs~not~separated~by~a~comma.
+    \cs_new:Npn \@@_trim:nN ##1
+      {
+        \@@_trim_auxi:w
+          ##1
+          \q_nil
+          \q_mark #1 {}
+          \q_mark \@@_trim_auxii:w
+          \@@_trim_auxiii:w
+          #1 \q_nil
+          \@@_trim_auxiv:w
+        \q_stop
+      }
+    \cs_new:Npn \@@_trim_auxi:w ##1 \q_mark #1 ##2 \q_mark ##3
+      {
+        ##3
+        \@@_trim_auxi:w
+        \q_mark
+        ##2
+        \q_mark #1 {##1}
+      }
+    \cs_new:Npn \@@_trim_auxii:w \@@_trim_auxi:w \q_mark \q_mark ##1
+      {
+        \@@_trim_auxiii:w
+        ##1
+      }
+    \cs_new:Npn \@@_trim_auxiii:w ##1 #1 \q_nil ##2
+      {
+        ##2
+        ##1 \q_nil
+        \@@_trim_auxiii:w
+      }
+%    \end{macrocode}
+%   This is the one macro which differs from the original definition.
+%    \begin{macrocode}
+    \cs_new:Npn \@@_trim_auxiv:w \q_mark ##1 \q_nil ##2 \q_stop ##3
+      {
+        ##3 { ##1 }
+      }
   }
+\@@_tmp:n { ~ }
+\group_end:
 %    \end{macrocode}
+% \end{macro}
+% \end{macro}
+%
+%
 %
 % \subsection{Constants and variables}
 %