[pdftex] Patch for better performance of map file reading
Heiko Oberdiek
oberdiek at uni-freiburg.de
Wed Jul 31 02:44:03 CEST 2002
Hello,
in the mailing list TEX-D-L someone has compared
old teTeX with TeX Live 7, 10 runs:
7s vs. 48s.
The main reason is the increased map file:
800 vs 3000 lines
Because the difference of performance is very high,
I have looked at the sources and detected the problem:
pdfTeX parses the map file and compares the current
entry with all previous entries to detect duplicate
entries. But the complexity is then N(N-1)/2 = O(N^2).
Therefore I have written a patch, that uses a
hashtable to increase the performance.
Yours sincerely
Heiko <oberdiek at uni-freiburg.de>
Directory: source/TeX/texk/web2c/pdftexdir/
%%% cut %%% mapfile.c.diff %%% cut %%%
*** mapfile.c.org Wed Jul 31 00:59:42 2002
--- mapfile.c Wed Jul 31 01:12:20 2002
***************
*** 1,8 ****
--- 1,13 ----
+ #include <math.h>
#include "ptexlib.h"
#include <kpathsea/c-auto.h>
#include <kpathsea/c-memstr.h>
#define FM_BUF_SIZE 1024
+ #define FM_MAX_INC 256
+ #define FMHT_SIZE_INC 256
+ #define FMHT_MAX_INC 386
+ #define FMHT_THETA 0.61803398875 /* (sqrt(5) - 1) / 2 */
static FILE *fm_file;
***************
*** 47,55 ****
goto done; \
} while (0)
static void fm_new_entry(void)
{
! entry_room(fm, 1, 25);
fm_ptr->tfm_name = 0;
fm_ptr->ps_name = 0;
fm_ptr->flags = 0;
--- 52,231 ----
goto done; \
} while (0)
+ /*
+ Ordered hashing with linear probing
+
+ The map file "pdftex.map" of TeX Live 7 has about 3000 entries.
+ Function fm_read_info uses most of the time for comparing
+ the current tfm file name with the old entries.
+ Complexity: N(N-1)/2 = O(N^2)
+ For better performance a data structure is needed with
+ the following properties:
+ * Efficient insertion.
+ * Fast check for entries that are not stored. If an entry
+ is already available, then pdfTeX will generate a warning,
+ therefore this is not the normal case.
+ Hashtables provide nearly O(1) for insertion and lookup.
+ But unfortunately the number of entries is not known
+ at start. The reorganization is implemented here by
+ reinsertion of all old entries. Therefore the complexity
+ remains O(N^2), but in practice the performance is much
+ better because of the factor of the square term:
+ N + N*N/chunk_size
+
+ Constants and variables:
+
+ fmht_tab: Array that contains pointers to the tfm file
+ names.
+ FMHT_MAX_INC: chunk size by which the hash table size
+ is increased.
+ FMHT_SIZE_INC: size of a chunk that may be filled by
+ entries.
+ FMHT_SIZE_INC = 256 means FMHT_MAX_INC = 386 that
+ the maximal fill of the hash table is 2/3.
+ fmht_max: current maximal size of the hash table
+ fmht_size: maximal allowed number of entries in the
+ hash table
+ fmht_curr_size: number of current entries in the hash table
+
+ Functions:
+
+ int fmht_get_hash_pos(char *str):
+ Parameter str: tfm font file name
+ Return value: position in hash table
+ Description:
+ Most tfm file names obey the Karl-Berry-schema, so the
+ string length is not greater than 8. The string is
+ converted into an int by dividing the first eight
+ bytes into two byte parts. These parts are then
+ combined by xor. As hashing function the multiplicative
+ method is used.
+
+ void fmht_entry_room():
+ Description:
+ The hashtable is increased. The old entries are
+ inserted in the larger new hashtable.
+
+ boolean fmht_check_and_insert(char *str):
+ Parameter str: tfm font file name
+ Return value: true if tfm file is inserted,
+ false if the name is already in the hashtable
+ Description:
+ The hash position is calculated and the entry
+ inserted, if this position is free.
+ Otherwise linear probing with ordering is used:
+ * Linear probing is easy to implement and it ensures,
+ that a free place will be found.
+ * Ordering improves the performance for
+ unsuccessful lookups. If an entry is found
+ that is greater, then the search can be aborted.
+ That means for inserting, that smaller entries
+ drive out larger ones.
+
+ void fmht_new_place(int pos):
+ Parameter pos: entry in hashtable that have to move away
+ Description:
+ The current entry in the hashtable is moved
+ to a free place.
+ */
+
+ int fmht_max = 0;
+ int fmht_size = 0;
+ int fmht_curr_size = 0;
+ char **fmht_tab = 0;
+
+ int fmht_get_hash_pos(char *str) {
+ double a;
+ int hash_code = 0;
+ int i;
+ for (i=0; *str && i<4; i++, str++) {
+ hash_code ^= (*str << 8) + *++str;
+ if (!*str) {
+ break;
+ }
+ }
+ a = hash_code * FMHT_THETA;
+ return fmht_max * (a - floor(a));
+ }
+
+ void fmht_entry_room() {
+ int i;
+ if (fmht_tab == 0) {
+ fmht_max = FMHT_MAX_INC;
+ fmht_size = FMHT_SIZE_INC;
+ fmht_tab = xtalloc(fmht_max, char*);
+ for (i=0; i<fmht_max; i++) {
+ fmht_tab[i] = 0;
+ }
+ }
+ else {
+ char **fmht_tab_old = fmht_tab;
+ int fmht_max_old = fmht_max;
+
+ fmht_max += FMHT_MAX_INC;
+ fmht_size += FMHT_SIZE_INC;
+ fmht_tab = xtalloc(fmht_max, char*);
+ for (i=0; i<fmht_max; i++) {
+ fmht_tab[i] = 0;
+ }
+ for (i=0; i<fmht_max_old; i++) {
+ if (fmht_tab_old[i]) {
+ fmht_check_and_insert(fmht_tab_old[i]);
+ }
+ }
+ xfree(fmht_tab_old);
+ }
+ }
+
+ void fmht_new_place(int pos) {
+ char *str = fmht_tab[pos];
+ int cmp;
+ do {
+ pos++;
+ pos %= fmht_max;
+ if (fmht_tab[pos] == 0) {
+ fmht_tab[pos] = str;
+ return;
+ }
+ cmp = strcmp(fmht_tab[pos], str);
+ if (cmp > 0) {
+ fmht_new_place(pos);
+ fmht_tab[pos] = str;
+ return;
+ }
+ if (cmp == 0) {
+ return;
+ }
+ }
+ while (1);
+ }
+
+ boolean fmht_check_and_insert(char *str) {
+ int pos = fmht_get_hash_pos(str);
+ int cmp;
+ do {
+ if (fmht_tab[pos] == 0) {
+ fmht_tab[pos] = str;
+ return true;
+ }
+ cmp = strcmp(fmht_tab[pos], str);
+ if (cmp > 0) {
+ fmht_new_place(pos);
+ fmht_tab[pos] = str;
+ return true;
+ }
+ if (cmp == 0) {
+ return false;
+ }
+ pos++;
+ pos %= fmht_max;
+ }
+ while (1);
+ }
+
static void fm_new_entry(void)
{
! entry_room(fm, 1, FM_MAX_INC);
fm_ptr->tfm_name = 0;
fm_ptr->ps_name = 0;
fm_ptr->flags = 0;
***************
*** 76,81 ****
--- 252,258 ----
char fm_line[FM_BUF_SIZE], buf[FM_BUF_SIZE];
fm_entry *e;
char *p, *q, *r, *s, *n = mapfiles;
+ fmht_entry_room();
for (;;) {
if (fm_file == 0) {
if (*n == 0) {
***************
*** 83,88 ****
--- 260,266 ----
cur_file_name = 0;
if (fm_tab == 0)
fm_new_entry();
+ xfree(fmht_tab);
return;
}
s = strchr(n, '\n');
***************
*** 117,128 ****
if (strcmp(buf, nontfm) == 0)
fm_ptr->tfm_name = nontfm;
else {
! for (e = fm_tab; e < fm_ptr; e++)
! if (e->tfm_name != nontfm && strcmp(e->tfm_name, buf) == 0) {
pdftex_warn("entry for `%s' already exists, duplicates ignored", buf);
goto bad_line;
}
! set_field(tfm_name);
}
p = r;
read_field(r, q, buf);
--- 295,312 ----
if (strcmp(buf, nontfm) == 0)
fm_ptr->tfm_name = nontfm;
else {
! if (fmht_curr_size >= fmht_size) {
! /* increase hashtable if maximal fill is reached */
! fmht_entry_room();
! }
! fm_ptr->tfm_name = xstrdup(buf);
! if (fmht_check_and_insert(fm_ptr->tfm_name) == false) {
pdftex_warn("entry for `%s' already exists, duplicates ignored", buf);
goto bad_line;
}
! fmht_curr_size++;
! if (*r == 10)
! goto done;
}
p = r;
read_field(r, q, buf);
%%% cut %%% mapfile.c.diff %%% cut %%%
More information about the pdftex
mailing list