[latex3-commits] [git/LaTeX3-latex3-latex2e] master: improve error messages to distinguish undefined but valid utf8 from invalid utf8 (6903393)
David Carlisle
d.p.carlisle at gmail.com
Wed Mar 28 21:03:02 CEST 2018
Repository : https://github.com/latex3/latex2e
On branch : master
Link : https://github.com/latex3/latex2e/commit/690339353bae5b6a0fa5b8ebd08f8bf29d423623
>---------------------------------------------------------------
commit 690339353bae5b6a0fa5b8ebd08f8bf29d423623
Author: David Carlisle <d.p.carlisle at gmail.com>
Date: Wed Mar 28 20:03:02 2018 +0100
improve error messages to distinguish undefined but valid utf8 from invalid utf8
>---------------------------------------------------------------
690339353bae5b6a0fa5b8ebd08f8bf29d423623
base/utf8ienc.dtx | 49 +++++++++++++++++++++++++++++++++++++++++++++++--
1 file changed, 47 insertions(+), 2 deletions(-)
diff --git a/base/utf8ienc.dtx b/base/utf8ienc.dtx
index 4bb47c4..8d41c8d 100644
--- a/base/utf8ienc.dtx
+++ b/base/utf8ienc.dtx
@@ -290,18 +290,32 @@
\def\UTFviii at defined#1{%
\ifx#1\relax
% \end{macrocode}
+% Test if the sequence is invalid UTF-8 or valid UTF-8 but without
+% a \LaTeX\ definition.
+% \begin{macrocode}
+ \if\relax\expandafter\UTFviii at checkseq\string#1\relax\relax
+% \end{macrocode}
% The endline character has a special definition within the
% inputenc package (it is gobbling spaces). For this reason we
% can't produce multiline strings without some precaution.
% \changes{v1.1b}{2004/02/09}{No newlines allowed in error messages}
% \changes{v1.1g}{2005/09/27}{Real spaces do not show up so use \cs{space}}
% \changes{v1.1o}{2015/08/28}{Show Unicode number of character in hex}
-% \begin{macrocode}
+% \changes{v1.2a}{2018/03/24}{Error message improved for non-UTF-8 sequences}%
\PackageError{inputenc}{Unicode\space char\space\expandafter
\UTFviii at splitcsname\string#1\relax
\MessageBreak
not\space set\space up\space
for\space use\space with\space LaTeX}\@eha
+% \end{macrocode}
+%
+% \begin{macrocode}
+ \else
+ \PackageError{inputenc}{Invalid UTF-8 byte sequence }\@eha
+ \fi
+% \end{macrocode}
+%
+% \begin{macrocode}
\else\expandafter
#1%
\fi
@@ -309,7 +323,7 @@
% \end{macrocode}
% \end{macro}
%
-% \begin{macro}{\def\UTFviii at invalid}
+% \begin{macro}{\UTFviii at invalid}
% \changes{v1.2a}{2018/03/24}{Macro added}%
% \begin{macrocode}
\def\UTFviii at invalid#1{%
@@ -318,6 +332,37 @@
% \end{macro}
%
%
+% \begin{macro}{\UTFviii at checkseq}
+% \begin{macro}{\UTFviii at check@continue}
+% \changes{v1.2a}{2018/03/24}{Macro added}%
+% Check that the csname consists of a valid UTF-8 sequence.
+% \begin{macrocode}
+\def\UTFviii at checkseq#1:#2#3{%
+ \ifnum`#2<"80 %
+ \ifx\relax#3\else1\fi
+ \else
+ \ifnum`#2<"C0 %
+ 1 %
+ \else
+ \expandafter\expandafter\expandafter\UTFviii at check@continue
+ \expandafter\expandafter\expandafter#3%
+ \fi
+ \fi}
+% \end{macrocode}
+%
+% \begin{macrocode}
+\def\UTFviii at check@continue#1{%
+ \ifx\relax#1%
+ \else
+ \ifnum`#1<"80 1\else\ifnum`#1>"BF 1\fi\fi
+ \expandafter\UTFviii at check@continue
+ \fi
+}
+% \end{macrocode}
+% \end{macro}
+% \end{macro}
+%
+%
% \begin{macro}{\UTFviii at loop}
% This wonderful bit of code from Dr Carlisle defines the starting
% octets to call |\UTFviii at two@octets| etc as appropriate. The starting
More information about the latex3-commits
mailing list