[latex3-commits] [git/LaTeX3-latex3-latex2e] utf8andspace: different concept for utf8 handling, so medium size rewrite. We forgot to handle \protected at def and the like (41ea83b)

Tue Feb 12 14:14:34 CET 2019

Repository : https://github.com/latex3/latex2e
On branch  : utf8andspace
Link       : https://github.com/latex3/latex2e/commit/41ea83bb293c9b09688f743f32ee67f8ff503390

>---------------------------------------------------------------

commit 41ea83bb293c9b09688f743f32ee67f8ff503390
Author: Frank Mittelbach <frank.mittelbach at latex-project.org>
Date:   Tue Feb 12 14:14:34 2019 +0100

    different concept for utf8 handling, so medium size rewrite. We forgot to handle \protected at def and the like


>---------------------------------------------------------------

41ea83bb293c9b09688f743f32ee67f8ff503390
 .../utf8-test-001.lvt}                             |   25 +-
 base/testfiles-legacy/utf8-test-001.tlg            |   57 ++++
 base/utf8andspace.tex                              |  276 +++++++++++++++++---
 3 files changed, 322 insertions(+), 36 deletions(-)

diff --git a/base/testfiles-filename/utf-test.lvt b/base/testfiles-legacy/utf8-test-001.lvt
similarity index 85%
copy from base/testfiles-filename/utf-test.lvt
copy to base/testfiles-legacy/utf8-test-001.lvt
index ff39122..a68acb2 100644
--- a/base/testfiles-filename/utf-test.lvt
+++ b/base/testfiles-legacy/utf8-test-001.lvt
@@ -1,19 +1,20 @@
-\input{utf8andspace}
 \documentclass{article}
 
-%\usepackage{textcomp}
+%\usepackage{trace}
 
+\input{test2e}
 
-% ----------------------------------------------------------
 
-\input{utf8andspace}
+% ----------------------------------------------------------
 
+\START
 
 \begin{filecontents*}{one two three}
 1 2 3
 \end{filecontents*}
 
 
+%\traceon
 \begin{filecontents*}{füße.tex}
 Füße file
 \end{filecontents*}
@@ -31,11 +32,13 @@ eureka
 
 \includeonly{foo, füße€€€ , öfoo}
 
+
 \begin{document}
 
+
 \tableofcontents
 
-\section{A with ref: ``\ref{abß}''}
+\section{A with ref: ``\ref{abß}'' and Füßen}
 
 \label{öfoo}
 
@@ -82,5 +85,17 @@ Some refs: \ref{abß} and \ref{öfoo} and \ref{bar€}
 
 \input{one two three}
 
+
+% here we test if protected edef  works
+\makeatletter
+
+\protected at edef\foo{Füße}
+
+\show\foo
+
+\setbox0\hbox{\foo} \foo
+
+\showbox0
+
 \end{document}
 
diff --git a/base/testfiles-legacy/utf8-test-001.tlg b/base/testfiles-legacy/utf8-test-001.tlg
new file mode 100644
index 0000000..030dee5
--- /dev/null
+++ b/base/testfiles-legacy/utf8-test-001.tlg
@@ -0,0 +1,57 @@
+This is a generated file for the LaTeX2e validation system.
+Don't change this file in any respect.
+LaTeX Warning: File `one two three' already exists on the system.
+               Not generating it from this source.
+LaTeX Warning: File `f^^c3^^bc^^c3^^9fe.tex' already exists on the system.
+               Not generating it from this source.
+LaTeX Warning: File `f^^c3^^bc^^c3^^9fe^^e2^^82^^ac^^e2^^82^^ac^^e2^^82^^ac.tex' already exists on the system.
+               Not generating it from this source.
+LaTeX Warning: File `"f^^c3^^bc^^c3^^9fe im sand.tex"' already exists on the system.
+               Not generating it from this source.
+(utf8-test-001.aux (f^^c3^^bc^^c3^^9fe^^e2^^82^^ac^^e2^^82^^ac^^e2^^82^^ac.aux)
+No file ^^e2^^82^^ac^^e2^^82^^ac^^e2^^82^^ac.aux.
+)
+LaTeX Font Info:    Checking defaults for OML/cmm/m/it on input line ....
+LaTeX Font Info:    ... okay on input line ....
+LaTeX Font Info:    Checking defaults for T1/cmr/m/n on input line ....
+LaTeX Font Info:    ... okay on input line ....
+LaTeX Font Info:    Checking defaults for OT1/cmr/m/n on input line ....
+LaTeX Font Info:    ... okay on input line ....
+LaTeX Font Info:    Checking defaults for OMS/cmsy/m/n on input line ....
+LaTeX Font Info:    ... okay on input line ....
+LaTeX Font Info:    Checking defaults for OMX/cmex/m/n on input line ....
+LaTeX Font Info:    ... okay on input line ....
+LaTeX Font Info:    Checking defaults for U/cmr/m/n on input line ....
+LaTeX Font Info:    ... okay on input line ....
+(utf8-test-001.toc
+LaTeX Warning: Reference `ab^^c3^^9f' on page 1 undefined on input line ...
+LaTeX Font Info:    External font `cmex10' loaded for size
+(Font)              <7> on input line ....
+LaTeX Font Info:    External font `cmex10' loaded for size
+(Font)              <5> on input line ....
+)
+\tf at toc=\write...
+[1
+]
+(f^^c3^^bc^^c3^^9fe^^e2^^82^^ac^^e2^^82^^ac^^e2^^82^^ac.tex) [2
+]
+(f^^c3^^bc^^c3^^9fe im sand.tex) (f^^c3^^bc^^c3^^9fe.tex)
+No file ^^e2^^82^^ac^^e2^^82^^ac^^e2^^82^^ac.
+(f^^c3^^bc^^c3^^9fe im sand.tex) (f^^c3^^bc^^c3^^9fe im sand.tex)
+(one two three.tex)
+> \foo=macro:
+->F^^c3^^bc^^c3^^9fe.
+l. ...\show\foo
+> \box...=
+\hbox(6.8872+0.0)x21.52252
+.\T1/cmr/m/n/10 F
+.\T1/cmr/m/n/10 ^^fc
+.\T1/cmr/m/n/10 ^^ff
+.\T1/cmr/m/n/10 e
+! OK.
+l. ...\showbox0
+[3
+] (utf8-test-001.aux (f^^c3^^bc^^c3^^9fe^^e2^^82^^ac^^e2^^82^^ac^^e2^^82^^ac.aux)
+No file ^^e2^^82^^ac^^e2^^82^^ac^^e2^^82^^ac.aux.
+)
+LaTeX Warning: There were undefined references.
diff --git a/base/utf8andspace.tex b/base/utf8andspace.tex
index 7d5772d..c5d6cf6 100644
--- a/base/utf8andspace.tex
+++ b/base/utf8andspace.tex
@@ -1,75 +1,278 @@
-\makeatletter
+%% This is a patch of the LaTeX kernel to support UTF8 characters in
+%% all places where they can be supported by an 8bit engine such as
+%% pdfTeX.
+%%
+%%
+%% This should enable UTF8 not only in ordinary text (as
+%% already provided by a recent LaTeX release) but in addition
+%% supports:
+%%
+%%  - utf8 characters in file names used by \input \includegraphics
+%%    and the like --- this includes spaces and it is no longer necessary
+%%    to quote the file name in this case (not possible is the use of
+%%    the " as part of a file name, this is a restriction of the library
+%%    the TeX engines use).
+%%
+%%  - use of all utf8 characters in labels
+%%
+%%  - in contrast to the utf8 characters that are used in typesetting
+%%    it is not necessary that LaTeX has any knowledge how to render
+%%    the character, e.g., without loading the textcomp package it is
+%%    not possible to typeset € but even then you can have a file or a
+%%    label with that character.
+%%
+%%
+%% The plan is to integrate this patch (or a version of it) into the
+%% kernel. Thus the current external version is intended to invite
+%% tests with real documents beyond the test suite that we have
+%% available at our disposal.
+%%
+%%
+%% If you find any issues, please prepare a short example and submit
+%% it as an issue at
+%%
+%%    https://github.com/latex3/latex2e/issues
+%%
+%% Thanks!
 
 
+\makeatletter
 
 
 
-% quoting spaces
-% a b c     -> "a b c"
-% "a b c"   -> "a b c"
-% a" "b" "c -> "a b c"
-%           -> ""
-\def\quote at name#1{"\quote@@name#1\@gobble""}
-\def\quote@@name#1"{#1\quote@@name}
+% utf8
+%
+%
+%  whenever we encounter a UTF8 char in a non-typesetting situation we make sure it
+%  doesn't expand.
 
+%-------------------------------------------------------------------------
 
-% utf8
+% Approach
+%
+% The utf8 characters are seen by an 8-bit engine as a sequence of octets.
+%
+% We make each starting octet an active character.
+%
+
+%  - When typesetting we pick up the necessary number of additional
+%    octets, check if they form a command that LaTeX knows about
+%    ( \csname u8:\string#1\string#2...\endcsname ) and if so use that
+%    for typesetting.  \string is needed as the octets may (all?) be
+%    active and we want the literal values in the name.
+
+%  - If the utf8 character is going to be part of a label then it is
+%    essentially becoming part of some csname and with the
+%    test \ifincsname we can find this out. If so we render the whole
+%    sequence of octets harmless by using \string too when the
+%    starting octet executes.
+%
+
+%  - Another possible case is that \protect has *not* the meaning
+%    of \@typeset at protect. In that case we may do a write or we may do
+%    a \protected at edef or ...  In all such cases we want to keep the
+%    sequence of octets unchanged, but we can't use \string since at
+%    least in the case of \protected at edef the result may later be
+%    typeset after all (in fact that is quite likely) and so at that
+%    point the starting octet needs to be an active character again
+%    (the others could be stringified). So for those cases we use \noexpand.
+%
+
+%  So the code for a start octet of a two byte sequence would then
+%  look like this:
+
+%
+%
+% \long\def\UTFviii at two@octets{%
+%   \ifincsname
+%     \expandafter
+%     \UTF at twostring@octets
+%   \else
+%     \ifx\protect\@typeset at protect
+%     \else
+%       \expandafter\expandafter\expandafter
+%      \UTF at twoharmless@octets
+%     \fi
+%   \fi
+%   \UTFviii at two@octets at do
+% }
+% 
+
+% \ifincsname is tested first because that can be true even if we are
+%  otherwise doing typesetting. If this is the case use \string on the
+%  whole octet sequence. \UTF at twostring@octets not only does this but
+%  also gets rid of \UTFviii at two@octets at do in the input stream by
+%  picking it up as a first argument and dropping it.
+%
+% If this is not the case and we are doing typesetting (i.e., \protect
+%  is \@typeset at protect) then execute \UTFviii at two@octets at do which
+%  picks up all octets and typesets the character (or generates an
+%  error if it doesn't know how to typeset it).
+%
+% If we are not doing typesetting then we run \UTF at twoharmless@octets
+%  which is like \UTF at twostring@octets but uses \noexpand instead
+%  of \string. This way the sequence is temporarily frozen, e.g., would
+%  display as is or stays put inside a \protected at edef but if the
+%  result is later reused the starting octet is still active.
+%
+% The definitions for the other starting octets are the same except
+% that they pick up more octets after them.
+
+
+
+% In the original all starting octets would be defined as calling such
+%  a \UTFviii at ...@octets command followed by a \string version of the
+%  octet itself (so that it can be used to form the character). We now
+%  need to keep that octet active and so we have to do a slightly
+%  different setup.
+%
+%
+% So here is the new setup loop. Note that for error cases we can and
+%  should of course use a \string version of the octet since there is
+%  no point in doing extra work.
+
+\begingroup
+\catcode`\~13
+\catcode`\"12
+\def\UTFviii at loop{%
+  \uccode`\~\count@
+  \uppercase\expandafter{\UTFviii at tmp}%
+  \advance\count@\@ne
+  \ifnum\count@<\@tempcnta
+  \expandafter\UTFviii at loop
+  \fi}
+    \def\UTFviii at tmp{\xdef~{\noexpand\UTFviii at undefined@err{:\string~}}}
+    \count@"1
+    \@tempcnta9
+\UTFviii at loop
+    \count at 11
+    \@tempcnta12
+\UTFviii at loop
+    \count at 14
+    \@tempcnta32
+\UTFviii at loop
+    \count@"80
+    \@tempcnta"C2
+    \def\UTFviii at tmp{\xdef~{\noexpand\UTFviii at invalid@err\string~}}
+\UTFviii at loop
+    \count@"C2
+    \@tempcnta"E0
+    \def\UTFviii at tmp{\xdef~{\noexpand\UTFviii at two@octets\noexpand~}}
+\UTFviii at loop
+    \count@"E0
+    \@tempcnta"F0
+    \def\UTFviii at tmp{\xdef~{\noexpand\UTFviii at three@octets\noexpand~}}
+\UTFviii at loop
+    \count@"F0
+    \@tempcnta"F5
+    \def\UTFviii at tmp{\xdef~{\noexpand\UTFviii at four@octets\noexpand~}}
+\UTFviii at loop
+    \count@"F5
+    \@tempcnta"100
+    \def\UTFviii at tmp{\xdef~{\noexpand\UTFviii at invalid@err\string~}}
+\UTFviii at loop
+\endgroup
+
+% These are new work macros for the sequences as discussed above.
 
 \long\def\UTFviii at two@octets{%
-  \ifx\protect\relax
-    \ifincsname
+  \ifincsname
+    \expandafter
+    \UTF at twostring@octets
+  \else
+    \ifx\protect\@typeset at protect
+    \else
       \expandafter\expandafter\expandafter
-      \UTF at twoharmless@octets
+     \UTF at twoharmless@octets
     \fi
-  \else
-    \expandafter\UTF at twoharmless@octets
   \fi
   \UTFviii at two@octets at do
 }
 
 
 \long\def\UTFviii at three@octets{%
-  \ifx\protect\relax
-    \ifincsname
+  \ifincsname
+    \expandafter
+    \UTF at threestring@octets
+  \else
+    \ifx\protect\@typeset at protect
+    \else
       \expandafter\expandafter\expandafter
-      \UTF at threeharmless@octets
+     \UTF at threeharmless@octets
     \fi
-  \else
-    \expandafter\UTF at threeharmless@octets
   \fi
   \UTFviii at three@octets at do
 }
 
 
 \long\def\UTFviii at four@octets{%
-  \ifx\protect\relax
-    \ifincsname
+  \ifincsname
+    \expandafter
+    \UTF at fourstring@octets
+  \else
+    \ifx\protect\@typeset at protect
+    \else
       \expandafter\expandafter\expandafter
-      \UTF at fourharmless@octets
+     \UTF at fourharmless@octets
     \fi
-  \else
-    \expandafter\UTF at fourharmless@octets
   \fi
   \UTFviii at four@octets at do
 }
 
 
+% The \... at do are more or less what the original code was doing as
+%  part of \UTFviii at ...@octets. However #1 is now active (wasn't in
+%  the original impl) so we better string that inside the cs. This
+%  is faster than having it figure out by itself that it is in a
+%  csname.
 
 \long\def\UTFviii at two@octets at do#1#2{\expandafter
-    \UTFviii at defined\csname u8:#1\string#2\endcsname}
+    \UTFviii at defined\csname u8:\string#1\string#2\endcsname}
 \long\def\UTFviii at three@octets at do#1#2#3{\expandafter
-    \UTFviii at defined\csname u8:#1\string#2\string#3\endcsname}
+    \UTFviii at defined\csname u8:\string#1\string#2\string#3\endcsname}
 \long\def\UTFviii at four@octets at do#1#2#3#4{\expandafter
-    \UTFviii at defined\csname u8:#1\string#2\string#3\string#4\endcsname}
+    \UTFviii at defined\csname u8:\string#1\string#2\string#3\string#4\endcsname}
+
+
+% These temporarily prevent the active chars from expanding. (Maybe
+%  using \unexpanded would be faster here?)
+
+\long\def\UTF at twoharmless@octets#1#2{\noexpand#2\noexpand}
+\long\def\UTF at threeharmless@octets#1#2#3{\noexpand#2\noexpand#3\noexpand}
+\long\def\UTF at fourharmless@octets#1#2#3#4{\noexpand#2\noexpand#3\noexpand#4\noexpand}
 
+% And the same with \string for use in \csname constructions.
 
-\long\def\UTF at twoharmless@octets#1#2{\string#2\string}
-\long\def\UTF at threeharmless@octets#1#2#3{\string#2\string#3\string}
-\long\def\UTF at fourharmless@octets#1#2#3#4{\string#2\string#3\string#4\string}
+\long\def\UTF at twostring@octets#1#2{\string#2\string}
+\long\def\UTF at threestring@octets#1#2#3{\string#2\string#3\string}
+\long\def\UTF at fourstring@octets#1#2#3#4{\string#2\string#3\string#4\string}
 
 
+% The kernel already has saved away definitions for the starting code so
+%  we have to refresh that (until the day this is properly integrated):
+
+% if used in the kernel we also need this:
+\let\UTFviii at two@octets@@\UTFviii at two@octets
+\let\UTFviii at three@octets@@\UTFviii at three@octets
+\let\UTFviii at four@octets@@\UTFviii at four@octets
+
+% Done :-)
+
 
 %-------------------------------------------------------------------------
+%
+
+% File name handling is done by generating a csname from the provided
+%  file name (which means that utf8 octets get turned into strings
+%  due to the above procedure). By setting \escapechar to -1 we ensure
+%  that we don't get a \ in front. As a result we end up with all
+%  characters as catcode 12 (plus spaces). We then sometimes add
+%  quotes around the construct (removing any existing inner
+%  quotes). Sometimes we only remove the quotes if they have been supplied
+%  by the user. There is clearly some room for improvement.
+%
+% A side effect of the new code is that we will see quotes around file
+%  name displays where there haven't been any before.
 
 \def\set at curr@file#1{%
   \begingroup
@@ -78,6 +281,18 @@
   \endgroup
 }
 
+% quoting spaces
+% a b c     -> "a b c"
+% "a b c"   -> "a b c"
+% a" "b" "c -> "a b c"
+%           -> ""
+\def\quote at name#1{"\quote@@name#1\@gobble""}
+\def\quote@@name#1"{#1\quote@@name}
+
+% removing quotes
+%
+\def\unquote at name#1{\quote@@name#1\@gobble"}
+
 
 %-------------------------------------------------------------------------
 
@@ -210,7 +425,6 @@
 
 % graphics
 
-\def\unquote at name#1{\quote@@name#1\@gobble"}
 
 
 \AtBeginDocument{%