[latex3-commits] [l3svn] 03/03: Set up T1/Unicode in format mode

Fri Nov 13 10:16:44 CET 2015

This is an automated email from the git hooks/post-receive script.

joseph pushed a commit to branch master
in repository l3svn.

commit 14c2616251928b4a8fe297e9f9ffdf6389b06bb1
Author: Joseph Wright <joseph.wright at morningstar2.co.uk>
Date:   Fri Nov 13 09:12:51 2015 +0000

    Set up T1/Unicode in format mode
    
    This follows LaTeX2e closely but drops stuff that is there only
    for compatibility reasons.
---
 l3kernel/l3final.dtx |  166 +++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 164 insertions(+), 2 deletions(-)

diff --git a/l3kernel/l3final.dtx b/l3kernel/l3final.dtx
index f60dfbb..a28215b 100644
--- a/l3kernel/l3final.dtx
+++ b/l3kernel/l3final.dtx
@@ -64,8 +64,9 @@
 %
 % \begin{documentation}
 %
-% This module is the end of the \LaTeX3 format file. Currently, there
-% is not a lot happening here.
+% This module is the end of the \LaTeX3 format file. Currently, a lot of this
+% is copy-pasted from the \LaTeXe{} format or is highly unstable (essentially
+% hacks which need revisiting later).
 %
 % \end{documentation}
 %
@@ -77,6 +78,167 @@
 %<*initex>
 %    \end{macrocode}
 %
+% \subsection{Character data}
+%
+% \TeX{} needs various pieces of data to be set about characters, in particular
+% which ones to treat as letters and which \tn{lccode} values apply as these
+% affect hyphenation. It makes most sense to set this and related information
+% up in one place. This is all done before reading the hyphenation patterns,
+% which engines except \LuaTeX{} can only read during format-building.
+%
+% Hyphenation patterns are nowadays all available in UTF-8 form, and are
+% therefore loaded for \pdfTeX{} with appropriate set up to convert from $8$-bit
+% input. As a result, only a subset of patterns are available with this engine
+% compared with \XeTeX{} and \LuaTeX{}: appropriate set ups require appropriate
+% font encodings. For \pTeX{} and \upTeX{} the situation is more complex:
+% these engines treat characters outside of the \acro{ascii} range in their
+% own way and so require patterns written in $7$-bit input only.
+%
+% Most of this data is taken straight from \LaTeXe{} with appropriate changes
+% for things that are not required here (such as support for non-\eTeX{}
+% engines). In particular, the \tn{uccode} values are set even where they could
+% be skipped (as case changing at the document level is done without using
+% \tn{uppercase}). There is little cost to setting everything and it does mean
+% that the values are logical, so this seems a reasonable approach.
+%
+% For \XeTeX{} and \LuaTeX{}, which are natively Unicode engines, the
+% encoding set up is exactly Unicode. For the other supported engines
+% input has to be $8$-bit and so an encoding scheme must be chosen. At present,
+% this is the EC (|T1|) scheme, with the assumption that languages for which this
+% is not appropriate will be used with one of the Unicode engines.
+%    \begin{macrocode}
+\bool_if:nTF
+  {
+       \sys_if_engine_luatex_p:
+    || \sys_if_engine_xetex_p:
+  }
+%    \end{macrocode}
+% For the Unicode engines, the core data used is derived automatically from
+% the master Unicode Consortium files and is thus read here. At present, this
+% is done by reading the \LaTeXe{} data file, setting up so only the data
+% part is extracted. For setting up letter codes, |\L| lines are cased letters,
+% |\l| uncased letters and |\C| cased non-letters. See |ltunicode.dtx| for
+% details of \XeTeX{} inter-character class types.
+%    \begin{macrocode}
+  {
+    \group_begin:
+      \cs_set:Npn \begingroup #1 \fi \fi { }
+      \sys_if_engine_xetex:TF
+        {
+          \cs_set_protected:Npn \endgroup #1 \fi \ID
+            { 
+              \cs_set_eq:NN \endgroup \tex_endinput:D
+              \ID 
+            } 
+        }
+        { \cs_set_eq:NN \endgroup \tex_endinput:D }
+      \cs_set_protected:Npn \C #1 ~ #2 ~ #3 ~
+        {
+          \tex_global:D \tex_lccode:D "#1 = "#3 \scan_stop:
+          \tex_global:D \tex_uccode:D "#1 = "#2 \scan_stop:
+        }
+      \cs_set_protected:Npn \L #1 ~ #2 ~ #3 ~
+        {
+          \C #1 ~ #2 ~ #3 ~
+          \int_compare:nNnF { "#1 } = { "#3 }
+            { \tex_global:D \tex_sfcode:D "#1 = 999 \scan_stop: }
+          \tex_global:D \utex_mathcode:D "#1 =  
+            \int_compare:nNnTF { "#1 } < { "10000 } { "7 } { "0 }
+            "01 "#1 \scan_stop:
+        }
+      \cs_set_protected:Npn \l #1 ~ { \L #1 ~ #1 ~ #1 ~ }
+      \sys_if_engine_xetex:T
+        {  
+          \cs_set_protected:Npn \ID #1 ~ #2 ~
+            { \__char_set_class:nnn {#1} {#2} { 1 } }
+          \cs_set_protected:Npn \OP #1 ~
+            { \__char_set_class:nnn {#1} {#1} { 2 } }
+          \cs_set_protected:Npn \CL #1 ~
+            { \__char_set_class:nnn {#1} {#1} { 3 } }
+          \cs_set_protected:Npn \EX #1 ~
+            { \__char_set_class:nnn {#1} {#1} { 3 } }
+          \cs_set_protected:Npn \IS #1 ~
+            { \__char_set_class:nnn {#1} {#1} { 3 } }
+          \cs_set_protected:Npn \NS #1 ~
+            { \__char_set_class:nnn {#1} {#1} { 3 } }
+          \cs_set_protected:Npn \CM #1 ~
+            { \__char_set_class:nnn {#1} {#1} { 256 } }
+          \cs_set_protected:Npn \__char_set_class:nnn #1#2#3
+            {
+              \int_step_inline:nnnn { "#1 } { 1 } { "#2 }
+                { \tex_global:D \xetex_charclass:D ##1 = #3 \scan_stop: }
+            }
+        }
+      \char_set_catcode_space:n { `\  }%
+      \file_input:n{unicode-letters.def}%
+    \group_end:
+%    \end{macrocode}
+% A couple of special cases that make sense for \TeX{} but don't derive
+% readily from the Unicode files: \tn{sfcode} $0$ for U+2019 and U+201D.
+%    \begin{macrocode}
+    \tex_global:D \tex_sfcode:D "2019 = 0 \scan_stop:
+    \tex_global:D \tex_sfcode:D "201D = 0 \scan_stop:
+  }
+%    \end{macrocode}
+% For the other engines, set up the Cork T1 encoding data. Most of this can be
+% done using a few loops. We don't provide a global interface for setting
+% these codes so it is done at a low level (to avoid code repetition).
+%    \begin{macrocode}
+  {
+    \group_begin:
+%    \end{macrocode}
+% Lower case chars: map to themselves when lower casing and down by |"20| when
+% upper casing. (The characters |a|--|z| are set up correctly by Ini\TeX{}.)
+%    \begin{macrocode}
+      \cs_set_protected:Npn \__cs_tmp:w #1
+        {
+          \tex_global:D \tex_lccode:D #1 = #1 \scan_stop:
+          \tex_global:D \tex_uccode:D #1 =
+            \__int_eval:w #1 - "20 \__int_eval_end:
+        }
+      \int_step_function:nnnN { "A0 } { 1 } { "BC } \__cs_tmp:w
+      \int_step_function:nnnN { "E0 } { 1 } { "FF } \__cs_tmp:w
+%    \end{macrocode}
+% Upper case chars: map up by |"20| when lower casing, to themselves when upper
+% casing and require an \tn{sfcode} of $999$. (The characters |A|--|Z| are set
+% up correctly by Ini\TeX{}.)
+%    \begin{macrocode}
+      \cs_set_protected:Npn \__cs_tmp:w #1
+        {
+          \tex_global:D \tex_lccode:D #1 =
+            \__int_eval:w #1 + "20 \__int_eval_end:
+          \tex_global:D \tex_uccode:D #1 =  #1 \scan_stop:
+          \tex_global:D \tex_sfcode:D #1 = 999 \scan_stop:
+        }
+      \int_step_function:nnnN { "80 } { 1 } { "9C } \__cs_tmp:w
+      \int_step_function:nnnN { "C0 } { 1 } { "DF } \__cs_tmp:w
+    \group_end:
+%    \end{macrocode}
+% A few special cases where things are not as one might expect using the above
+% pattern: dotless-I, dotless-J, dotted-I and d-bar.
+%    \begin{macrocode}
+    \char_set_lccode:nn { `\^^Y } { `\^^Y }
+    \char_set_uccode:nn { `\^^Y } { `\I }
+    \char_set_lccode:nn { `\^^Z } { `\^^Z }
+    \char_set_uccode:nn { `\^^Z } { `\J }
+    \char_set_lccode:nn { "9D } { `\i }
+    \char_set_uccode:nn { "9D } { "9D }
+    \char_set_lccode:nn { "9E } { "9E }
+    \char_set_uccode:nn { "9E } { "D0 }
+%    \end{macrocode}
+% Allow hyphenation at a zero-width glyph (used to break up ligatures or
+% to place accents between characters): this is slot $23$ of the encoding.
+%    \begin{macrocode}
+    \char_set_lccode:nn { 23 } { 23 }
+  }
+%    \end{macrocode}
+% In all cases it makes sense to set up |-| to map to itself: this allows
+% hyphenation of the rest of a word following it (suggested by
+% Lars Hellstr\"om).
+%    \begin{macrocode}
+\char_set_lccode:nn { `\- } { `\- }
+%    \end{macrocode}
+%
 % \subsection{Temporary hacks}
 %
 % \begin{macro}{\T1/lmr/m/n/10}

-- 
To stop receiving notification emails like this one, please contact
the administrator of this repository.