[pdftex] Generating CJK in PDF again

Cho, Jin-Hwan chofchof at kias.re.kr
Thu Nov 29 16:24:46 CET 2001


I've read the discussions from May 2001 on the topic "Generating CJK
in PDF" on this list. The solution seems to be to use encoding files (in
the case of truetype fonts) containing glyph codes, for example,

gbsong81 <gbk-81.enc <htsong.ttf

Is there any other progress for embedding CJK truetype fonts in PDF?

Recently, there was the same discussion in the Korean TeX Users Group
(KTUG) because HLaTeX is the best known TeX package for Korean
characters and the package uses the subfont feature (like the CJK
package) implemented in ttf2tfm and ttf2pk.

The solution of the discussion in KTUG was to implement the subfont
feature in pdfTeX, because there are two disadvantages to using
encoding files containing glyph codes as above.

1. For the HLaTeX package, we need 34 separate encoding files for
   each truetype font. (too many!!!)

2. In the map files, we need to record 34 lines for each font.

So the idea to implement subfont feature in pdfTeX is as follows.

1. Use encoding files containing character codes, not glyph codes.
   Instead, we need to put a routine in the source of pdfTeX
   which looks up the cmap table to convert the character codes
   to the actual glyph codes.

2. For the map files, use the following format

gbsong@ <gbk-@.enc <htsong.ttf

   instead of

gbsong81 <gbk-81.enc <htsong.ttf
...(many lines here)
gbsong94 <gbk-94.enc <htsong.ttf

   and put a routine in the source of pdfTeX which replaces the @
   character with the corresponding subfont name.

Actually I wrote a patch on pdftex-0.14f(20000525) to implement
these two features. But the patch reads only CMAP format 4 because
almost all Korean truetype fonts use this format.

I will attach the patch (11K) here. Any comment is welcome on this
patch or on this topic.

Best, ChoF.
-- 
~~~~~~~~~~~~~~~~~~~~~~~~~     ***
| Cho, Jin-Hwan == ChoF |     ^ ^
~~~~~~~~~~~~~~~~~~~~~~~~~      o
| Research Fellow       |     ~~~
| School of Mathematics ~~~~~~~~~~~~~~
| Korea Institute for Advanced Study |
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
| chofchof at kias.re.kr                |
| http://free.kaist.ac.kr/ChoF/      |
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
--- texk/web2c/pdftexdir/ptexmac.h.org	Sun Apr 23 23:10:24 2000
+++ texk/web2c/pdftexdir/ptexmac.h	Mon Nov 26 12:46:04 2001
@@ -113,6 +113,10 @@
 #define F_BASEFONT          0x08
 #define F_NOPARSING         0x10
 #define F_PGCFONT           0x20
+/* patched by ChoF on Nov 23, 2001 to implement the subfont feature */
+#define F_SUBFONT           0x40
+#define F_HAVESUBFONT       0x80
+/* end of patch by ChoF */
 
 #define set_included(fm)    ((fm)->type |= F_INCLUDED)
 #define set_subsetted(fm)   ((fm)->type |= F_SUBSETTED)
@@ -120,6 +124,10 @@
 #define set_basefont(fm)    ((fm)->type |= F_BASEFONT)
 #define set_noparsing(fm)   ((fm)->type |= F_NOPARSING)
 #define set_pcgfont(fm)     ((fm)->type |= F_PGCFONT)
+/* patched by ChoF on Nov 23, 2001 to implement the subfont feature */
+#define set_subfont(fm)     ((fm)->type |= F_SUBFONT)
+#define set_havesubfont(fm) ((fm)->type |= F_HAVESUBFONT)
+/* end of patch by ChoF */
 
 #define unset_included(fm)  ((fm)->type &= ~F_INCLUDED)
 #define unset_subsetted(fm) ((fm)->type &= ~F_SUBSETTED)
@@ -127,12 +135,20 @@
 #define unset_basefont(fm)  ((fm)->type &= ~F_BASEFONT)
 #define unset_noparsing(fm) ((fm)->type &= ~F_NOPARSING)
 #define unset_pcgfont(fm)   ((fm)->type &= ~F_PGCFONT)
+/* patched by ChoF on Nov 23, 2001 to implement the subfont feature */
+#define unset_subfont(fm)   ((fm)->type &= ~F_SUBFONT)
+#define unset_havesubfont(fm) ((fm)->type &= ~F_HAVESUBFONT)
+/* end of patch by ChoF */
 
 #define is_included(fm)     ((fm)->type & F_INCLUDED)
 #define is_subsetted(fm)    ((fm)->type & F_SUBSETTED)
 #define is_truetype(fm)     ((fm)->type & F_TRUETYPE)
 #define is_basefont(fm)     ((fm)->type & F_BASEFONT)
 #define is_noparsing(fm)    ((fm)->type & F_NOPARSING)
+/* patched by ChoF on Nov 23, 2001 to implement the subfont feature */
+#define is_subfont(fm)      ((fm)->type & F_SUBFONT)
+#define is_havesubfont(fm)  ((fm)->type & F_HAVESUBFONT)
+/* end of patch by ChoF */
 
 #define fm_slant(fm)        (fm)->slant
 #define fm_extend(fm)       (fm)->extend
--- texk/web2c/pdftexdir/mapfile.c.org	Mon Apr 17 16:06:30 2000
+++ texk/web2c/pdftexdir/mapfile.c	Mon Nov 26 12:53:08 2001
@@ -69,6 +69,17 @@
     fm_ptr->tfm_num         = getnullfont();
 }
 
+/* patched by ChoF on Nov 23, 2001 to implement the subfont feature */
+static fm_entry *fm_dup_entry(int fm_index)
+{   /* Append a copy of fm_tab[fm_index] at fm_ptr; return the ORIGINAL slot. */
+    fm_entry *slot;
+    entry_room(fm, 1, 25);      /* make room first; the slot is located afterwards */
+    slot = &fm_tab[fm_index];
+    *fm_ptr++ = *slot;          /* whole-entry copy, then advance the end pointer */
+    return slot;
+}
+/* end of patch by ChoF */
+
 void fm_read_info(void)
 {
     float d;
@@ -120,6 +131,16 @@
                     pdftex_warn("entry for `%s' already exists, duplicates ignored", buf);
                     goto bad_line;
                 }
+/*
+ * patched to implement the subfont feature by ChoF on Nov 23, 2001
+ *
+ * Any tfm name with '@' as its last character plays a role as a
+ * pseudo font name on which encoding name will be attached to make
+ * real tfm names. (e.g. kmjm@ ==> kmjm0,...,kmjm29,kmjm04,...,kmjm07)
+ */
+	    if (*(strend(buf) - 1) == '@')
+	        set_havesubfont(fm_ptr);
+/* end of patch by ChoF */
             set_field(tfm_name);
         }
         p = r;
@@ -365,6 +386,22 @@
         fm->tfm_num = f;
 }
 
+/* patched by ChoF on Nov 23, 2001 to implement the subfont feature */
+static char *replace_at(char *src, char *rpl)
+{   /* Return a newly allocated copy of src with its first '@' replaced by rpl. */
+    char *at, *out;
+    size_t head;
+    if (src == 0) return 0;             /* a null name stays null */
+    if ((at = strchr(src, '@')) == 0) return xstrdup(src);  /* nothing to replace */
+    head = at - src;                    /* length of the text before the '@' */
+    /* the dropped '@' leaves exactly the one byte needed for the terminator */
+    out = xtalloc(strlen(src) + strlen(rpl), char);
+    memcpy(out, src, head);
+    strcpy(out + head, rpl);
+    strcat(out, at + 1);
+    return out;
+}
+/* end of patch by ChoF */
+
 integer fmlookup(internalfontnumber f)
 {
     char *tfm, *p;
@@ -378,12 +415,46 @@
         if ((e = getexpandfactor(f)) != 0)
             goto ex_font;
     }
-    for (fm = fm_tab; fm < fm_ptr; fm++)
-        if (fm->tfm_name != nontfm && strcmp(tfm, fm->tfm_name) == 0) {
-            init_fm(fm, f);
-            return fm - fm_tab;
-        }
+/*
+ * patched by ChoF on Nov 23, 2001 to implement the subfont
+ * feature used in ttf2tfm.
+ *
+ * In the following code we should note that the fm_entry of
+ * a pseudo font will be swapped by its subfonts so that the
+ * fm_entry of a pseudo font goes to the end (fm_ptr).
+ */
+    for (fm = fm_tab; fm < fm_ptr; fm++) {
+        if (fm->tfm_name != nontfm && strcmp(tfm, fm->tfm_name) == 0)
+            goto tfm_found;
+        else if (!is_havesubfont(fm))
+            continue;
+	if ((l = strlen(fm->tfm_name) - 1) >= strlen(tfm))
+            continue;
+        if (strncmp(tfm, fm->tfm_name, l) != 0)
+            continue;
+        /* in the following we assume that subfont tags consist of digits */
+        if (!isdigitstr(tfm + l))
+            continue;
+        fm = fm_dup_entry(fm - fm_tab);
+        set_subfont(fm);
+        unset_havesubfont(fm);
+        if (is_truetype(fm) && is_included(fm))
+            set_subsetted(fm);	/* truetype subfonts should be subsetted always */
+        fm->tfm_name = xstrdup(tfm);
+        fm->ps_name = replace_at(fm->ps_name, fm->tfm_name + l);
+        fm->ff_name = replace_at(fm->ff_name, fm->tfm_name + l);
+        if (fm->encoding >= 0)
+            if ((p = replace_at(enc_tab[fm->encoding].name, fm->tfm_name + l)) != enc_tab[fm->encoding].name) {
+                fm->encoding = add_enc(p);
+                free(p);
+            }
+        goto tfm_found;
+    }
     return -2;
+tfm_found:
+    init_fm(fm, f);
+    return fm - fm_tab;
+/* end of patch by ChoF */
 ex_font:
     l = strlen(tfm);
     /* look up for expanded fonts in reversed direction, as they are are
--- texk/web2c/pdftexdir/writettf.c.org	Fri Apr 14 04:55:04 2000
+++ texk/web2c/pdftexdir/writettf.c	Mon Nov 26 12:43:42 2001
@@ -116,6 +116,29 @@
     TTF_USHORT format;
 } cmap_entry;
 
+/* TCMap4Segment and TCMap4 come from the freetype library (ttcmap.h) */
+typedef struct {
+    TTF_USHORT endCount;	/* highest character code accepted by the segment */
+    TTF_USHORT startCount;	/* lowest character code accepted by the segment */
+    TTF_SHORT  idDelta;	/* added to the code (mod 65536) to form the glyph index */
+    TTF_USHORT idRangeOffset;	/* 0: idDelta alone; else locates a stored glyph id */
+} TCMap4Segment;
+
+typedef struct {
+    TTF_USHORT length;	/* length field read from the subtable header */
+    TTF_USHORT segCountX2;	/* number of segments * 2	*/
+    TTF_USHORT searchRange;	/* these parameters can be used */
+    TTF_USHORT entrySelector;	/* for a binary search		*/
+    TTF_USHORT rangeShift;
+    TCMap4Segment *segments;	/* segCountX2 / 2 entries, allocated by ttf_read_cmap4 */
+    TTF_USHORT *glyphIdArray;	/* NOTE(review): never assigned in the visible code */
+    TTF_USHORT numGlyphId;	/* count of stored glyph ids; bounds the index lookup */
+} TCMap4;
+
+static TCMap4 cmap4;	/* filled in by ttf_read_cmap4 */
+static int segCount;	/* cmap4.segCountX2 / 2, set by ttf_read_cmap4 */
+/* end of patch by ChoF */
+
 static TTF_USHORT ntabs;
 static TTF_ULONG checksum;
 static TTF_USHORT upem;
@@ -268,19 +291,122 @@
     xfseek(INFILE, offset, SEEK_SET, cur_file_name);
 }
 
+/* ttf_code_to_index4 comes from the freetype library (ttcmap.c) */
+static TTF_USHORT ttf_code_to_index4(TTF_USHORT charcode)
+{   /* Map a character code to a glyph index via the loaded format 4 cmap. */
+    TTF_USHORT s = 0, slot;
+    TCMap4Segment *seg;
+
+    /* Locate the first segment whose endCount is not below charcode. */
+    while (s < segCount && charcode > cmap4.segments[s].endCount)
+        s++;
+
+    /* Safety net: nothing matched (the last endCount should be 0xFFFF). */
+    if (s >= segCount)
+        return 0;
+
+    seg = &cmap4.segments[s];
+    if (charcode < seg->startCount)
+        return 0;               /* code lies below the start of that segment */
+
+    /* With a zero idRangeOffset the result is simply charcode + idDelta. */
+    if (seg->idRangeOffset == 0)
+        return (charcode + seg->idDelta) & 0xFFFF;
+
+    /* Otherwise compute which stored glyph id belongs to this code. */
+    slot = seg->idRangeOffset / 2 + (charcode - seg->startCount) -
+           (segCount - s);
+    if (slot >= cmap4.numGlyphId)
+        return 0;
+
+    /* The ids are not kept in memory: seek into "cmap" and read entry `slot'. */
+    ttf_seek_tab("cmap", (12L + slot) * TTF_USHORT_SIZE +
+        (3L * TTF_USHORT_SIZE + TTF_SHORT_SIZE) * segCount +
+        TTF_ULONG_SIZE);
+    charcode = get_ushort();
+    if (charcode == 0)
+        return 0;               /* a stored 0 is passed on as 0 */
+
+    /* idDelta is applied to the stored id as well, modulo 65536. */
+    return (charcode + seg->idDelta) & 0xFFFF;
+}
+/* end of patch by ChoF */
+
+/* ttf_read_cmap4 comes from the freetype library (ttcmap.c) */
+static void ttf_read_cmap4(void)
+{   /* Read a format 4 cmap subtable into cmap4/segCount; other formats load nothing. */
+    int i;
+    TCMap4Segment *segments;
+
+    /* Drop state left by a previous font; the cleanup code frees its segments. */
+    segCount = 0;
+    cmap4.segments = 0;
+    cmap4.numGlyphId = 0;
+    ttf_seek_tab("cmap", 4 * TTF_USHORT_SIZE + TTF_ULONG_SIZE);
+    if (get_ushort() != 4)
+        return;                 /* not format 4: lookups will all return 0 */
+    cmap4.length = get_ushort();
+    ttf_skip(TTF_USHORT_SIZE);	/* language field not used here */
+    cmap4.segCountX2 = get_ushort();
+    cmap4.searchRange = get_ushort();
+    cmap4.entrySelector = get_ushort();
+    cmap4.rangeShift = get_ushort();
+    segCount = cmap4.segCountX2 / 2;
+    segments = cmap4.segments = xtalloc(segCount, TCMap4Segment);
+    for (i = 0; i < segCount; i++)
+        segments[i].endCount = get_ushort();
+    ttf_skip(TTF_USHORT_SIZE);	/* reservedPad not used here */
+    for (i = 0; i < segCount; i++)
+        segments[i].startCount = get_ushort();
+    for (i = 0; i < segCount; i++)
+        segments[i].idDelta = get_short();
+    for (i = 0; i < segCount; i++)
+        segments[i].idRangeOffset = get_ushort();
+    /* whatever the length leaves after 16 bytes + 8 per segment holds glyph ids */
+    cmap4.numGlyphId = ((cmap4.length - (16L + 8L * segCount)) & 0xFFFF) / 2;
+}
+/* end of patch by ChoF */
+
+/*
+ * ttf_copy_encoding was patched by ChoF on Nov 23, 2001
+ * to implement the subfont feature used by ttf2tfm.
+ *
+ * Each subfont has corresponding encoding tables which
+ * contain character codes (e.g. Unicode or Korean Wansung).
+ * However the actual glyph codes are different in each
+ * truetype font so that we need to convert the character
+ * codes to the glyph codes recorded in the cmap entry of
+ * truetype fonts.
+ */
 static void ttf_copy_encoding(void)
 {
     int i;
     char **glyph_names = enc_tab[fm_cur->encoding].glyph_names;
     ttfenc_entry *e = ttfenc_tab;
+    int charcode;	/* int, because sscanf's %i stores through an int pointer */
+    static char glyph_names_buf[MAX_CHAR_CODE + 1][12];
+		/* need 12 characters to store "index0xFFFF" */
+
     pdfmarkchar(tex_font, 'a'); /* workaround for a bug of AcroReader 4.0 */
     for (i = 0; i <= MAX_CHAR_CODE; i++, e++) {
-        if (pdfcharmarked(tex_font, i))
-            e->name = glyph_names[i];
-        else
-            e->name = notdef;
+        if (pdfcharmarked(tex_font, i)) {
+            if (!is_subfont(fm_cur)) {
+                e->name = glyph_names[i];
+                continue;
+            }
+            /* subfont: the name holds a character code that must go through the cmap */
+            if (sscanf(glyph_names[i], INDEXED_GLYPH_PREFIX "%i", &charcode) == 1 &&
+                charcode >= 0 && charcode <= 0xFFFF &&
+                (charcode = ttf_code_to_index4((TTF_USHORT) charcode)) > 0) {
+                sprintf(e->name = glyph_names_buf[i], "index0x%04X", (unsigned) charcode);
+                continue;
+            }
+            /* unparsable, out-of-range or unmapped codes fall through to notdef */
+        }
+        e->name = notdef;
     }
 }
+/* end of patch by ChoF */
 
 #define ttf_append_byte(B)\
 do {\
@@ -480,6 +606,9 @@
     ttf_read_post();
     ttf_read_loca();
     ttf_read_name();
+/* patched by ChoF on Nov 23, 2001 to implement the subfont feature */
+    ttf_read_cmap4();
+/* end of patch by ChoF */
 }
 
 #define ttf_reset_chksm() do {  \
@@ -1012,6 +1141,9 @@
     xfree(glyph_name_buf);
     xfree(name_tab);
     xfree(name_buf);
+/* patched by ChoF on Nov 23, 2001 to implement the subfont feature */
+    xfree(cmap4.segments);
+/* end of patch by ChoF */
     ttf_close();
     if (!is_included(fm_cur))
         tex_printf("}");


--- StripMime Report -- processed MIME parts ---
multipart/mixed
  text/plain (text body -- kept)
  text/plain (text body -- kept)
---



More information about the pdftex mailing list