[latex3-commits] [git/LaTeX3-latex3-latex2e] master: improve error messages to distinguish undefined but valid utf8 from invalid utf8 (6903393)

David Carlisle d.p.carlisle at gmail.com
Wed Mar 28 21:03:02 CEST 2018


Repository : https://github.com/latex3/latex2e
On branch  : master
Link       : https://github.com/latex3/latex2e/commit/690339353bae5b6a0fa5b8ebd08f8bf29d423623

>---------------------------------------------------------------

commit 690339353bae5b6a0fa5b8ebd08f8bf29d423623
Author: David Carlisle <d.p.carlisle at gmail.com>
Date:   Wed Mar 28 20:03:02 2018 +0100

    improve error messages to distinguish undefined but valid utf8 from invalid utf8


>---------------------------------------------------------------

690339353bae5b6a0fa5b8ebd08f8bf29d423623
 base/utf8ienc.dtx |   49 +++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 47 insertions(+), 2 deletions(-)

diff --git a/base/utf8ienc.dtx b/base/utf8ienc.dtx
index 4bb47c4..8d41c8d 100644
--- a/base/utf8ienc.dtx
+++ b/base/utf8ienc.dtx
@@ -290,18 +290,32 @@
 \def\UTFviii at defined#1{%
   \ifx#1\relax
 %    \end{macrocode}
+% Test if the sequence is invalid UTF-8 or valid UTF-8 but without
+% a \LaTeX\ definition.
+%    \begin{macrocode}
+      \if\relax\expandafter\UTFviii at checkseq\string#1\relax\relax
+%    \end{macrocode}
 %    The endline character has a special definition within the
 %    inputenc package (it is gobbling spaces). For this reason we
 %    can't produce multiline strings without some precaution.
 % \changes{v1.1b}{2004/02/09}{No newlines allowed in error messages}
 % \changes{v1.1g}{2005/09/27}{Real spaces do not show up so use \cs{space}}
 % \changes{v1.1o}{2015/08/28}{Show Unicode number of character in hex}
-%    \begin{macrocode}
+% \changes{v1.2a}{2018/03/24}{Error message improved for non-UTF-8 sequences}%
       \PackageError{inputenc}{Unicode\space char\space\expandafter
                               \UTFviii at splitcsname\string#1\relax
                               \MessageBreak
                               not\space set\space up\space
                               for\space use\space with\space LaTeX}\@eha
+%    \end{macrocode}
+%
+%    \begin{macrocode}
+     \else
+      \PackageError{inputenc}{Invalid UTF-8 byte sequence }\@eha
+     \fi         
+%    \end{macrocode}
+%
+%    \begin{macrocode}
   \else\expandafter
     #1%
   \fi
@@ -309,7 +323,7 @@
 %    \end{macrocode}
 % \end{macro}
 %
-% \begin{macro}{\def\UTFviii at invalid}
+% \begin{macro}{\UTFviii at invalid}
 % \changes{v1.2a}{2018/03/24}{Macro added}%
 %    \begin{macrocode}
 \def\UTFviii at invalid#1{%
@@ -318,6 +332,37 @@
 % \end{macro}
 %
 %
+% \begin{macro}{\UTFviii at checkseq}
+% \begin{macro}{\UTFviii at check@continue}
+% \changes{v1.2a}{2018/03/24}{Macro added}%
+% Check that the csname consists of a valid UTF-8 sequence.
+%    \begin{macrocode}
+\def\UTFviii at checkseq#1:#2#3{% #1: everything before the first colon; #2: token after the colon; #3: token after #2
+ \ifnum`#2<"80 % #2 has a character code below "80: the next line emits 1 unless #3 is \relax
+   \ifx\relax#3\else1\fi
+ \else
+   \ifnum`#2<"C0 % otherwise, #2 has a code below "C0: emit 1
+     1 % a non-empty expansion; an \if\relax test placed in front of this macro then comes out false
+   \else
+     \expandafter\expandafter\expandafter\UTFviii at check@continue
+     \expandafter\expandafter\expandafter#3% the \expandafter chain closes both \fi before the continuation check reads #3
+   \fi
+  \fi}
+%    \end{macrocode}
+%
+%    \begin{macrocode}
+\def\UTFviii at check@continue#1{% #1: the next token of the sequence being checked
+  \ifx\relax#1% a \relax token ends the scan; otherwise emit 1 when #1 is below "80 or above "BF, then check the next token
+  \else
+  \ifnum`#1<"80 1\else\ifnum`#1>"BF 1\fi\fi
+  \expandafter\UTFviii at check@continue
+  \fi
+}
+%    \end{macrocode}
+% \end{macro}
+% \end{macro}
+%
+%
 % \begin{macro}{\UTFviii at loop}
 %    This wonderful bit of code from Dr Carlisle defines the starting
 %    octets to call |\UTFviii at two@octets| etc as appropriate. The starting





More information about the latex3-commits mailing list