[latex3-commits] [l3svn] 03/03: Set up T1/Unicode in format mode
noreply at latex-project.org
noreply at latex-project.org
Fri Nov 13 10:16:44 CET 2015
This is an automated email from the git hooks/post-receive script.
joseph pushed a commit to branch master
in repository l3svn.
commit 14c2616251928b4a8fe297e9f9ffdf6389b06bb1
Author: Joseph Wright <joseph.wright at morningstar2.co.uk>
Date: Fri Nov 13 09:12:51 2015 +0000
Set up T1/Unicode in format mode
This follows LaTeX2e closely but drops stuff that is there only
for compatibility reasons.
---
l3kernel/l3final.dtx | 166 +++++++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 164 insertions(+), 2 deletions(-)
diff --git a/l3kernel/l3final.dtx b/l3kernel/l3final.dtx
index f60dfbb..a28215b 100644
--- a/l3kernel/l3final.dtx
+++ b/l3kernel/l3final.dtx
@@ -64,8 +64,9 @@
%
% \begin{documentation}
%
-% This module is the end of the \LaTeX3 format file. Currently, there
-% is not a lot happening here.
+% This module is the end of the \LaTeX3 format file. Currently, a lot of this
+% is copy-pasted from the \LaTeXe{} format or is highly unstable (essentially
+% hacks which need revisiting later).
%
% \end{documentation}
%
@@ -77,6 +78,167 @@
%<*initex>
% \end{macrocode}
%
+% \subsection{Character data}
+%
+% \TeX{} needs various pieces of data to be set about characters, in particular
+% which ones to treat as letters and which \tn{lccode} values apply as these
+% affect hyphenation. It makes most sense to set this and related information
+% up in one place. This is all done before reading the hyphenation patterns,
+% which all engines except \LuaTeX{} can only read during format-building.
+%
+% Hyphenation patterns are nowadays all available in UTF-8 form, and are
+% therefore loaded for \pdfTeX{} with appropriate set up to convert from $8$-bit
+% input. As a result, only a subset of patterns are available with this engine
+% compared with \XeTeX{} and \LuaTeX{}: appropriate set ups require appropriate
+% font encodings. For \pTeX{} and \upTeX{} the situation is more complex:
+% these engines treat characters outside of the \acro{ascii} range in their
+% own way and so require patterns written in $7$-bit input only.
+%
+% Most of this data is taken straight from \LaTeXe{} with appropriate changes
+% for things that are not required here (such as support for non-\eTeX{}
+% engines). In particular, the \tn{uccode} values are set even where they could
+% be skipped (as case changing at the document level is done without using
+% \tn{uppercase}). There is little cost to setting everything and it does mean
+% that the values are logical, so this seems a reasonable approach.
+%
+% For \XeTeX{} and \LuaTeX{}, which are natively Unicode engines, the
+% encoding set up is exactly Unicode. For the other supported engines
+% input has to be $8$-bit and so an encoding scheme must be chosen. At present,
+% this is the EC (|T1|) scheme, with the assumption that languages for which this
+% is not appropriate will be used with one of the Unicode engines.
+% \begin{macrocode}
+\bool_if:nTF
+ {
+ \sys_if_engine_luatex_p:
+ || \sys_if_engine_xetex_p:
+ }
+% \end{macrocode}
+% For the Unicode engines, the core data used is derived automatically from
+% the master Unicode Consortium files and is thus read here. At present, this
+% is done by reading the \LaTeXe{} data file, setting up so only the data
+% part is extracted. For setting up letter codes, |\L| lines are cased letters,
+% |\l| uncased letters and |\C| cased non-letters. See |ltunicode.dtx| for
+% details of \XeTeX{} inter-character class types.
+% \begin{macrocode}
+ {
+ \group_begin:
+ \cs_set:Npn \begingroup #1 \fi \fi { }
+ \sys_if_engine_xetex:TF
+ {
+ \cs_set_protected:Npn \endgroup #1 \fi \ID
+ {
+ \cs_set_eq:NN \endgroup \tex_endinput:D
+ \ID
+ }
+ }
+ { \cs_set_eq:NN \endgroup \tex_endinput:D }
+ \cs_set_protected:Npn \C #1 ~ #2 ~ #3 ~
+ {
+ \tex_global:D \tex_lccode:D "#1 = "#3 \scan_stop:
+ \tex_global:D \tex_uccode:D "#1 = "#2 \scan_stop:
+ }
+ \cs_set_protected:Npn \L #1 ~ #2 ~ #3 ~
+ {
+ \C #1 ~ #2 ~ #3 ~
+ \int_compare:nNnF { "#1 } = { "#3 }
+ { \tex_global:D \tex_sfcode:D "#1 = 999 \scan_stop: }
+ \tex_global:D \utex_mathcode:D "#1 =
+ \int_compare:nNnTF { "#1 } < { "10000 } { "7 } { "0 }
+ "01 "#1 \scan_stop:
+ }
+ \cs_set_protected:Npn \l #1 ~ { \L #1 ~ #1 ~ #1 ~ }
+ \sys_if_engine_xetex:T
+ {
+ \cs_set_protected:Npn \ID #1 ~ #2 ~
+ { \__char_set_class:nnn {#1} {#2} { 1 } }
+ \cs_set_protected:Npn \OP #1 ~
+ { \__char_set_class:nnn {#1} {#1} { 2 } }
+ \cs_set_protected:Npn \CL #1 ~
+ { \__char_set_class:nnn {#1} {#1} { 3 } }
+ \cs_set_protected:Npn \EX #1 ~
+ { \__char_set_class:nnn {#1} {#1} { 3 } }
+ \cs_set_protected:Npn \IS #1 ~
+ { \__char_set_class:nnn {#1} {#1} { 3 } }
+ \cs_set_protected:Npn \NS #1 ~
+ { \__char_set_class:nnn {#1} {#1} { 3 } }
+ \cs_set_protected:Npn \CM #1 ~
+ { \__char_set_class:nnn {#1} {#1} { 256 } }
+ \cs_set_protected:Npn \__char_set_class:nnn #1#2#3
+ {
+ \int_step_inline:nnnn { "#1 } { 1 } { "#2 }
+ { \tex_global:D \xetex_charclass:D ##1 = #3 \scan_stop: }
+ }
+ }
+ \char_set_catcode_space:n { `\ }%
+ \file_input:n{unicode-letters.def}%
+ \group_end:
+% \end{macrocode}
+% A couple of special cases that make sense for \TeX{} but don't derive
+% readily from the Unicode files.
+% \begin{macrocode}
+ \tex_global:D \tex_sfcode:D "2019 = 0 \scan_stop:
+ \tex_global:D \tex_sfcode:D "201D = 0 \scan_stop:
+ }
+% \end{macrocode}
+% For the other engines, set up the Cork T1 encoding data. Most of this can be
+% done using a few loops. We don't provide a global interface for setting
+% these codes so it is done at a low level (to avoid code repetition).
+% \begin{macrocode}
+ {
+ \group_begin:
+% \end{macrocode}
+% Lower case chars: map to themselves when lower casing and down by |"20| when
+% upper casing. (The characters |a|--|z| are set up correctly by Ini\TeX{}.)
+% \begin{macrocode}
+ \cs_set_protected:Npn \__cs_tmp:w #1
+ {
+ \tex_global:D \tex_lccode:D #1 = #1 \scan_stop:
+ \tex_global:D \tex_uccode:D #1 =
+ \__int_eval:w #1 - "20 \__int_eval_end:
+ }
+ \int_step_function:nnnN { "A0 } { 1 } { "BC } \__cs_tmp:w
+ \int_step_function:nnnN { "E0 } { 1 } { "FF } \__cs_tmp:w
+% \end{macrocode}
+% Upper case chars: map up by |"20| when lower casing, to themselves when upper
+% casing and require an \tn{sfcode} of $999$. (The characters |A|--|Z| are set
+% up correctly by Ini\TeX{}.)
+% \begin{macrocode}
+ \cs_set_protected:Npn \__cs_tmp:w #1
+ {
+ \tex_global:D \tex_lccode:D #1 =
+ \__int_eval:w #1 + "20 \__int_eval_end:
+ \tex_global:D \tex_uccode:D #1 = #1 \scan_stop:
+ \tex_global:D \tex_sfcode:D #1 = 999 \scan_stop:
+ }
+ \int_step_function:nnnN { "80 } { 1 } { "9C } \__cs_tmp:w
+ \int_step_function:nnnN { "C0 } { 1 } { "DF } \__cs_tmp:w
+ \group_end:
+% \end{macrocode}
+% A few special cases where things are not as one might expect using the above
+% pattern: dotless-I, dotless-J, dotted-I and d-bar.
+% \begin{macrocode}
+ \char_set_lccode:nn { `\^^Y } { `\^^Y }
+ \char_set_uccode:nn { `\^^Y } { `\I }
+ \char_set_lccode:nn { `\^^Z } { `\^^Z }
+ \char_set_uccode:nn { `\^^Z } { `\J }
+ \char_set_lccode:nn { "9D } { `\i }
+ \char_set_uccode:nn { "9D } { "9D }
+ \char_set_lccode:nn { "9E } { "9E }
+ \char_set_uccode:nn { "9E } { "D0 }
+% \end{macrocode}
+% Allow hyphenation at a zero-width glyph (used to break up ligatures or
+% to place accents between characters).
+% \begin{macrocode}
+ \char_set_lccode:nn { 23 } { 23 }
+ }
+% \end{macrocode}
+% In all cases it makes sense to set up |-| to map to itself: this allows
+% hyphenation of the rest of a word following it (suggested by
+% Lars Hellstr\"om).
+% \begin{macrocode}
+\char_set_lccode:nn { `\- } { `\- }
+% \end{macrocode}
+%
% \subsection{Temporary hacks}
%
% \begin{macro}{\T1/lmr/m/n/10}
--
To stop receiving notification emails like this one, please contact
the administrator of this repository.
More information about the latex3-commits
mailing list