[latex3-commits] [l3svn] 01/02: Move all character encoding set up to l3bootstrap

noreply at latex-project.org noreply at latex-project.org
Tue Dec 15 15:20:24 CET 2015


This is an automated email from the git hooks/post-receive script.

joseph pushed a commit to branch master
in repository l3svn.

commit 2cb47f1cf80fe92858aa6aa0fabca30303f8eb28
Author: Joseph Wright <joseph.wright at morningstar2.co.uk>
Date:   Tue Dec 15 09:49:47 2015 +0000

    Move all character encoding set up to l3bootstrap
    
    This puts the Unicode set up (which must be here) in the same place
    as the 8-bit one (which traditionally is in 'final').
    
    Note that this retains the assumption of EC encoding for 8-bit engines.
    Input encoding and hyphenation pattern loading is yet to do.
---
 l3kernel/l3bootstrap.dtx |   99 +++++++++++++++++++++++++++++++++++++++++-----
 l3kernel/l3final.dtx     |   94 -------------------------------------------
 2 files changed, 89 insertions(+), 104 deletions(-)

diff --git a/l3kernel/l3bootstrap.dtx b/l3kernel/l3bootstrap.dtx
index 07883c6..d9f3491 100644
--- a/l3kernel/l3bootstrap.dtx
+++ b/l3kernel/l3bootstrap.dtx
@@ -506,20 +506,99 @@
 %</package>
 %    \end{macrocode}
 %
-% \subsection{Unicode set up}
-%
-% The Unicode engines start in Ini\TeX{} mode with category codes, \emph{etc.},
-% for code points beyond $127$ all treated as \enquote{other} characters.
-% Various properties can be derived from the Unicode data files and this
-% is the same here as it would be for plain or \LaTeXe{}. There are shared
-% loaders available, but these of course need standard (Ini)\TeX{} category
-% codes and primitive availablity. They are therefore loaded \emph{very}
-% early.
-%    \being{macrocode}
+% \subsection{Character data}
+%
+% \TeX{} needs various pieces of data to be set about characters, in particular
+% which ones to treat as letters and which \tn{lccode} values apply as these
+% affect hyphenation. It makes most sense to set this and related information
+% up in one place. Whilst for \LuaTeX{} hyphenation patterns can be read
+% anywhere, other engines have to build them into the format and so we
+% \emph{must} do this set up before reading the patterns. For the Unicode
+% engines, there are shared loaders available to obtain the relevant
+% information directly from the Unicode Consortium data files. These need
+% standard (Ini)\TeX{} category codes and primitive availability and must
+% therefore be loaded \emph{very} early. This has a knock-on effect on the
+% $8$-bit set up: it makes sense to do the definitions for those here as
+% well so it is all in one place.
+%
+% For \XeTeX{} and \LuaTeX{}, which are natively Unicode engines, simply
+% load the Unicode data.
+%    \begin{macrocode}
 %<*initex>
+\ifdefined\Umathcode % engine provides \Umathcode: take the Unicode branch
+  \input load-unicode-data % shared loader for the Unicode data
+\else % no \Umathcode: fall through to the 8-bit set up
+%    \end{macrocode}
+% For the $8$-bit engines an encoding scheme must be chosen. At present,
+% this is the EC (|T1|) scheme, with the assumption that languages for which
+% this is not appropriate will be used with one of the Unicode engines.
+%    \begin{macrocode}
+  \begingroup % keeps \temp and the \count0/\count2 scratch values local
+%    \end{macrocode}
+% Lower case chars: map to themselves when lower casing and down by |"20| when
+% upper casing. (The characters |a|--|z| are set up correctly by Ini\TeX{}.)
+%    \begin{macrocode}
+    \def\temp{% loop: handle slot \count0, then repeat until past \count2
+      \ifnum\count0>\count2 % beyond the last slot of the range: stop
+      \else
+        \global\lccode\count0 = \count0 % lower-case form is the slot itself
+        \global\uccode\count0 = \numexpr\count0 - "20\relax % upper-case form is "20 below
+        \advance\count0 by 1 % move on to the next slot
+        \expandafter\temp % \expandafter closes the conditional (\fi) before looping
+      \fi
+    } % end of loop definition
+    \count0="A0 % first range starts at "A0
+    \count2="BC % first range ends at "BC (inclusive)
+    \temp % run the loop over "A0--"BC
+    \count0="E0 % second range starts at "E0
+    \count2="FF % second range ends at "FF (inclusive)
+    \temp % run the loop over "E0--"FF
+%    \end{macrocode}
+% Upper case chars: map up by |"20| when lower casing, to themselves when upper
+% casing and require an \tn{sfcode} of $999$. (The characters |A|--|Z| are set
+% up correctly by Ini\TeX{}.)
+%    \begin{macrocode}
+    \def\temp{% loop: handle slot \count0, then repeat until past \count2
+      \ifnum\count0>\count2 % beyond the last slot of the range: stop
+      \else
+        \global\lccode\count0 = \numexpr\count0 + "20\relax % lower-case form is "20 above
+        \global\uccode\count0 = \count0 % upper-case form is the slot itself
+        \global\sfcode\count0 = 999 % space factor code used for upper-case letters
+        \advance\count0 by 1 % move on to the next slot
+        \expandafter\temp % \expandafter closes the conditional (\fi) before looping
+      \fi
+    } % end of loop definition
+    \count0="80 % first range starts at "80
+    \count2="9C % first range ends at "9C (inclusive)
+    \temp % run the loop over "80--"9C
+    \count0="C0 % second range starts at "C0
+    \count2="DF % second range ends at "DF (inclusive)
+    \temp % run the loop over "C0--"DF
+%    \end{macrocode}
+% A few special cases where things are not as one might expect using the above
+% pattern: dotless-I, dotless-J, dotted-I and d-bar.
+%    \begin{macrocode}
+    \global\lccode`\^^Y = `\^^Y % dotless-i: its own lower-case form
+    \global\uccode`\^^Y = `\I % dotless-i upper-cases to I
+    \global\lccode`\^^Z = `\^^Z % dotless-j: its own lower-case form
+    \global\uccode`\^^Z = `\J % dotless-j upper-cases to J
+    \global\lccode"9D = `\i % dotted-I lower-cases to i
+    \global\uccode"9D = "9D % dotted-I: its own upper-case form
+    \global\lccode"9E = "9E % d-bar: its own lower-case form
+    \global\uccode"9E = "D0 % d-bar upper-cases to slot "D0
+%    \end{macrocode}
+% Allow hyphenation at a zero-width glyph (used to break up ligatures or
+% to place accents between characters).
+%    \begin{macrocode}
+    \global\lccode23 = 23 % slot 23 maps to itself so it does not block hyphenation
+  \endgroup % closes the group holding \temp and the scratch counts
+\fi % end of the \ifdefined\Umathcode test
+%    \end{macrocode}
+% In all cases it makes sense to set up |-| to map to itself: this allows
+% hyphenation of the rest of a word following it (suggested by
+% Lars Helstr\"om).
+%    \begin{macrocode}
+\global\lccode`\-=`\- % outside the engine test, so it applies to every engine
 %</initex>
 %    \end{macrocode}
 %
diff --git a/l3kernel/l3final.dtx b/l3kernel/l3final.dtx
index 03a643f..32520f3 100644
--- a/l3kernel/l3final.dtx
+++ b/l3kernel/l3final.dtx
@@ -78,100 +78,6 @@
 %<*initex>
 %    \end{macrocode}
 %
-% \subsection{Character data}
-%
-% \TeX{} needs various pieces of data to be set about characters, in particular
-% which ones to treat as letters and which \tn{lccode} values apply as these
-% affect hyphenation. It makes most sense to set this and related information
-% up in one place. This is all done before reading the hyphenation patterns,
-% which engines except \LuaTeX{} can only read during format-building.
-%
-% Hyphenation patterns are nowadays all available in UTF-8 form, and are
-% therefore loaded for \pdfTeX{} with appropriate set up to convert from $8$-bit
-% input. As a result, only a subset of patterns are available with this engine
-% compared with \XeTeX{} and \LuaTeX{}: appropriate set ups require appropriate
-% font encodings. For \pTeX{} and \upTeX{} the situation is more complex:
-% these engines treat characters outside of the \acro{ascii} range in their
-% own way and so require patterns written in $7$-bit input only.
-%
-% Most of this data is taken straight from \LaTeXe{} with appropriate changes
-% for things that are not required here (such as support for non-\eTeX{}
-% engines). In particular, the \tn{uccode} values are set even where they could
-% be skipped (as case changing at the document level is done without using
-% \tn{uppercase}). There is little cost to setting everything and it does mean
-% that the values are logical, so this seems a reasonable approach.
-%
-% For \XeTeX{} and \LuaTeX{}, which are natively Unicode engines, the
-% encoding set up is exactly Unicode. For the other supported engines
-% input has to be $8$-bit and so an encoding scheme must be chosen. At present,
-% this is the EC (|T1|) scheme, with the assumption that languages for which
-% this is not appropriate will be used with one of the Unicode engines.
-% The latter read the Unicode data very early such that the same data
-% files can be used as for plain \TeX{} and \LaTeXe{}. Thus only the $8$-bit
-% engines are of concern here.
-%    \begin{macrocode}
-\bool_if:nF
-  { \sys_if_engine_luatex_p: || \sys_if_engine_xetex_p: }
-%    \end{macrocode}
-% Set up the Cork T1 encoding data. Most of this can be
-% done using a few loops. We don't provide a global interface for setting
-% these codes so it is done at a low level (to avoid code repetition).
-%    \begin{macrocode}
-  {
-    \group_begin:
-%    \end{macrocode}
-% Lower case chars: map to themselves when lower casing and down by |"20| when
-% upper casing. (The characters |a|--|z| are set up correctly by Ini\TeX{}.)
-%    \begin{macrocode}
-      \cs_set_protected:Npn \__cs_tmp:w #1
-        {
-          \tex_global:D \tex_lccode:D #1 = #1 \scan_stop:
-          \tex_global:D \tex_uccode:D #1 =
-            \__int_eval:w #1 - "20 \__int_eval_end:
-        }
-      \int_step_function:nnnN { "A0 } { 1 } { "BC } \__cs_tmp:w
-      \int_step_function:nnnN { "E0 } { 1 } { "FF } \__cs_tmp:w
-%    \end{macrocode}
-% Upper case chars: map up by |"20| when lower casing, to themselves when upper
-% casing and require an \tn{sfcode} of $999$. (The characters |A|--|Z| are set
-% up correctly by Ini\TeX{}.)
-%    \begin{macrocode}
-      \cs_set_protected:Npn \__cs_tmp:w #1
-        {
-          \tex_global:D \tex_lccode:D #1 =
-            \__int_eval:w #1 + "20 \__int_eval_end:
-          \tex_global:D \tex_uccode:D #1 =  #1 \scan_stop:
-          \tex_global:D \tex_sfcode:D #1 = 999 \scan_stop:
-        }
-      \int_step_function:nnnN { "80 } { 1 } { "9C } \__cs_tmp:w
-      \int_step_function:nnnN { "C0 } { 1 } { "DF } \__cs_tmp:w
-    \group_end:
-%    \end{macrocode}
-% A few special cases where things are not as one might expect using the above
-% pattern: dotless-I, dotless-J, dotted-I and d-bar.
-%    \begin{macrocode}
-    \char_set_lccode:nn { `\^^Y } { `\^^Y }
-    \char_set_uccode:nn { `\^^Y } { `\I }
-    \char_set_lccode:nn { `\^^Z } { `\^^Z }
-    \char_set_uccode:nn { `\^^Z } { `\J }
-    \char_set_lccode:nn { "9D } { `\i }
-    \char_set_uccode:nn { "9D } { "9D }
-    \char_set_lccode:nn { "9E } { "9E }
-    \char_set_uccode:nn { "9E } { "D0 }
-%    \end{macrocode}
-% Allow hyphenation at a zero-width glyph (used to break up ligatures or
-% to place accents between characters).
-%    \begin{macrocode}
-    \char_set_lccode:nn { 23 } { 23 }
-  }
-%    \end{macrocode}
-% In all cases it makes sense to set up |-| to map to itself: this allows
-% hyphenation of the rest of a word following it (suggested by
-% Lars Helstr\"om).
-%    \begin{macrocode}
-\char_set_lccode:nn { `\- } { `\- }
-%    \end{macrocode}
-%
 % \subsection{Input encoding}
 %
 % The letters |a|--|z| and |A|--|Z| will be correct directly from Ini\TeX{}

-- 
To stop receiving notification emails like this one, please contact
the administrator of this repository.


More information about the latex3-commits mailing list